In [40]:
%pip install langchain langchain-community langchain-ollama pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [47]:
import pandas as pd
import os
# Load the BERTopic topic table that the renaming function below takes as input.
df = pd.read_csv("memento_data/bertopic_topics.csv")

In [45]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


import pandas as pd
from tqdm import tqdm
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


def _build_doc_excerpt(raw_docs, max_chars):
    """
    Turn a topic's 'Representative_Docs' cell into the excerpt text used in the prompt.

    Parameters:
    - raw_docs: The cell value; either a list of documents or the string form of such a
      list (which is what the column holds after being read back from CSV).
    - max_chars: Maximum number of characters kept from each document.

    Returns:
    - The (possibly truncated) documents, each followed by a newline, or "" when the
      value is missing, empty, or cannot be parsed as a list.
    """
    import ast  # local import: keeps this block independent of the notebook's import cell

    if isinstance(raw_docs, str):
        try:
            # literal_eval only accepts Python literals, so a crafted CSV cell cannot
            # execute code the way eval() would.
            raw_docs = ast.literal_eval(raw_docs)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return ""
    if not isinstance(raw_docs, list):
        return ""

    pieces = []
    for doc in raw_docs:
        if not isinstance(doc, str):
            # Skip non-text entries instead of discarding the whole excerpt.
            continue
        if len(doc) > max_chars:
            doc = doc[:max_chars] + "..."
        pieces.append(doc + "\n")
    return "".join(pieces)


def rename_topics_with_langchain(df, model_name="llama3", max_chars=500):
    """
    Rename BERTopic topics using LangChain and Ollama with a progress bar.

    Each topic's keywords and representative documents are sent to the model, which is
    asked for a short name. The outlier topic (Topic == -1) is not sent to the model and
    is always named "Outlier". If the model call fails for a topic, a warning is logged
    and the name "Error" is stored so the remaining topics are still processed.

    Parameters:
    - df: pandas.DataFrame with BERTopic output, containing 'Topic', 'Representation', and 'Representative_Docs'.
    - model_name: Name of the Ollama model to use (e.g., 'llama3').
    - max_chars: Maximum number of characters to use from each representative document.

    Returns:
    - A copy of the DataFrame with a new column 'Generated_Name'. The input is not modified.
    """
    import logging

    logger = logging.getLogger(__name__)

    llm = ChatOllama(model=model_name)

    prompt_template = ChatPromptTemplate.from_template(
        "Given the topic keywords: {keywords}\n"
        'And this document excerpt:\n"{doc}"\n\n'
        "That is generated from BertTopic, a topic modelling framework\n"
        "Suggest a short and descriptive topic name (1–5 words). Output only the name, no other text.\n"
    )

    chain = prompt_template | llm | StrOutputParser()

    df_copy = df.copy()
    generated_names = []

    rows = zip(df_copy["Topic"], df_copy["Representation"], df_copy["Representative_Docs"])
    for topic, rep_words, raw_docs in tqdm(rows, total=len(df_copy), desc="Renaming topics"):
        if topic == -1:
            generated_names.append("Outlier")
            continue

        doc_excerpt = _build_doc_excerpt(raw_docs, max_chars)
        try:
            name = chain.invoke({"keywords": rep_words, "doc": doc_excerpt}).strip()
        except Exception as exc:
            # Best effort: one failed call must not abort the whole run, but the reason
            # should be visible instead of vanishing behind the "Error" placeholder.
            logger.warning("Could not generate a name for topic %s: %s", topic, exc)
            name = "Error"
        generated_names.append(name)

    df_copy["Generated_Name"] = generated_names
    return df_copy

In [48]:
# Generate a name for every topic with the "llama3" Ollama model
# (the recorded run below took about a minute for 202 topics).
df_renamed = rename_topics_with_langchain(df, model_name="llama3", max_chars=500)

Renaming topics: 100%|██████████| 202/202 [01:03<00:00,  3.17it/s]


In [49]:
# Remove double-quote characters from the newly generated names.
df_renamed["Generated_Name"] = df_renamed["Generated_Name"].str.replace('"', '', regex=False)

In [50]:
# Persist the renamed topic table (the DataFrame index is not written).
df_renamed.to_csv("memento_data/bertopic_topics_renamed.csv", index=False)

In [51]:
# Reload the renamed topics from disk (presumably so the cells below can be run
# without repeating the LLM step — confirm).
df_renamed = pd.read_csv("memento_data/bertopic_topics_renamed.csv")

In [52]:
# Load the news articles together with their assigned BERTopic labels.
# NOTE: `df` is rebound here — from this cell on it holds the articles, not the topic table.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which looks like a saved
# index — confirm whether it should be kept.
df = pd.read_csv("memento_data/newsCorpora_with_topic.csv")

In [54]:
# Preview the first ten articles; assigned_topic_name still holds the original BERTopic labels here.
df.head(10)

Unnamed: 0,id,title,url,source,category,story_id,hostname,timestamp,body,assigned_topic_name
0,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,2014-03-10 16:52:51.207,Paris/London/Atlanta: Federal Reserve Bank of ...,80_fed_bank_yellen_rates
1,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,2014-03-10 16:52:52.027,The Federal Reserve meeting in the coming week...,"110_jobs_job_unemployment_labor, 125_spending_..."
2,280701,Only 2 people allowed in the delivery room: Pr...,http://indiatoday.intoday.in/story/only-2-peop...,India Today,e,dQEXtwpswiNW-4MJkBLcTFNZCQ42M,indiatoday.intoday.in,2014-06-13 18:58:17.020,The 'Third Person' actress is expecting her fi...,39_kunis_baby_pregnant_mila
3,285435,Actress And Civil Rights Activist Ruby Dee Die...,http://wegotthiscovered.com/movies/actress-civ...,We Got This Covered,e,duy-eYWM3GLFeSMlLjm7WEMy_7YJM,wegotthiscovered.com,2014-06-14 14:41:05.421,"Award-winning actress Ruby Dee, best known for...","188_elsa_anna_broadway_frozen, 23_lindsay_loha..."
4,233727,VIDEO: Pat Sajak spins climate change on Twitter,http://www.tele-management.ca/2014/05/video-pa...,Telemanagement,e,dqLZi-5Q-qdz3LMgOFGiDO9R2KyNM,www.tele-management.ca,2014-05-21 21:18:17.155,This movie is about a woman who visits her boy...,92_tries_boyfriend_movie_like middle
5,363681,Jessica Simpson Ties Knot No. 2,http://www.newser.com/story/190511/jessica-sim...,Newser,e,d0SpyDazKjniBFMJ5oGEaDQQpbDTM,www.newser.com,2014-07-07 03:15:50.876,After a nearly-four-year engagement that produ...,113_simpson_jessica_jessica simpson_eric johnson
6,338022,Liza Minnelli's Publicist Sends Labeouf Dvd Co...,http://www.contactmusic.com/story/liza-minnell...,Contactmusic.com,e,dSxCH6eKHibCnTML3kb-li9KO9EUM,www.contactmusic.com,2014-07-01 00:08:46.441,404 - page not found.\n\nThe page you are tryi...,0_page_404_404 page_page page
7,369165,"Jessica Simpson was a ""beautiful golden girl"" ...",http://home.nzcity.co.nz/news/article.aspx\?id...,NZ City,e,d7CcpRc8eODUyyMB8NI5CmhL3xJeM,home.nzcity.co.nz,2014-07-08 16:21:10.206,"Jessica Simpson was a ""beautiful golden girl"" ...",113_simpson_jessica_jessica simpson_eric johnson
8,279212,Amazon Prime Music streaming service launches ...,http://www.theguardian.com/technology/2014/jun...,The Guardian,e,dHSyTLPkx34SoNME4mlWRFx5XNCUM,www.theguardian.com,2014-06-13 15:50:47.241,Amazon is the latest internet company to enter...,"89_chromecast_tv_android tv_android, 63_amazon..."
9,342245,Zac Efron dances atop table to Jason Derulo's ...,http://www.nydailynews.com/entertainment/gossi...,New York Daily News,e,dBaSUi97m7Lmj8MXr3ccrYJutCOlM,www.nydailynews.com,2014-07-01 20:53:06.222,Zac Efron is living it up on his Italia vacati...,106_zac_efron_zac efron_michelle


In [55]:
# Replace the old BERTopic labels in df.assigned_topic_name (df_renamed.Name) with the
# generated names (df_renamed.Generated_Name). A cell value holds one label or several
# labels joined by ", ".
# Topics without a usable generated name are left out of the mapping so they keep their
# original label instead of breaking the join below.
topic2name_map = {
    old_name: new_name
    for old_name, new_name in zip(df_renamed["Name"], df_renamed["Generated_Name"])
    if pd.notna(old_name) and pd.notna(new_name)
}


def _rename_assigned_topics(value):
    """Translate a ", "-separated string of old topic labels into generated names.

    Missing values are returned unchanged. A label with no entry in `topic2name_map`
    is kept as it is rather than dropped, so running this cell again on a column that
    has already been renamed leaves the values in place instead of emptying them.
    """
    if pd.isna(value):
        return value
    return ", ".join(
        topic2name_map.get(topic, topic) for topic in str(value).split(", ")
    )


df["assigned_topic_name"] = df["assigned_topic_name"].apply(_rename_assigned_topics)

In [56]:
# Preview the first rows to check that assigned_topic_name now shows the generated names.
df.head()

Unnamed: 0,id,title,url,source,category,story_id,hostname,timestamp,body,assigned_topic_name
0,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,2014-03-10 16:52:51.207,Paris/London/Atlanta: Federal Reserve Bank of ...,Fed Interest Rate Policy
1,5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,2014-03-10 16:52:52.027,The Federal Reserve meeting in the coming week...,"Job Market Trends, US Economic Downturn, Fed I..."
2,280701,Only 2 people allowed in the delivery room: Pr...,http://indiatoday.intoday.in/story/only-2-peop...,India Today,e,dQEXtwpswiNW-4MJkBLcTFNZCQ42M,indiatoday.intoday.in,2014-06-13 18:58:17.020,The 'Third Person' actress is expecting her fi...,Mila Kunis Pregnancy Update
3,285435,Actress And Civil Rights Activist Ruby Dee Die...,http://wegotthiscovered.com/movies/actress-civ...,We Got This Covered,e,duy-eYWM3GLFeSMlLjm7WEMy_7YJM,wegotthiscovered.com,2014-06-14 14:41:05.421,"Award-winning actress Ruby Dee, best known for...","Frozen Broadway Musicals, Lindsay Lohan's Real..."
4,233727,VIDEO: Pat Sajak spins climate change on Twitter,http://www.tele-management.ca/2014/05/video-pa...,Telemanagement,e,dqLZi-5Q-qdz3LMgOFGiDO9R2KyNM,www.tele-management.ca,2014-05-21 21:18:17.155,This movie is about a woman who visits her boy...,Relationship Drama Movie


In [57]:
# Export both tables to the parent directory (the DataFrame index is not written):
# the articles with their renamed topics, and the topic table with the generated names.
df.to_csv("../data.csv", index=False)
df_renamed.to_csv("../topic_data.csv", index=False)