In [None]:
import pandas as pd

# Read the topic-info table from disk.
file_path = "results/topic_info_df.csv"
df = pd.read_csv(file_path)


def _count_whitespace_tokens(text):
    """Return how many whitespace-separated chunks `text` contains."""
    return len(text.split())


# Missing documents are treated as empty text, so they count as zero tokens.
docs_text = df["Representative_Docs"].fillna("")
df["token_count"] = docs_text.apply(_count_whitespace_tokens)

# Largest token count found in any single row.
max_tokens = df["token_count"].max()
print(f"Maximum number of tokens in a single document: {max_tokens}")

# Show every document next to its token count.
print(df[["Representative_Docs", "token_count"]])

In [None]:
# Column names of the loaded table, as a plain Python list.
list(df.columns)

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv.
# NOTE(review): presumably this supplies the OpenAI API key that ChatOpenAI
# needs later in the notebook — confirm a .env file is expected here.
load_dotenv()

In [41]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# Chat model settings: GPT-4o mini, temperature 0, replies capped at 50 tokens.
llm_settings = {
    "model": "gpt-4o-mini",
    "temperature": 0,
    "max_tokens": 50,
}
llm = ChatOpenAI(**llm_settings)

# Prompt text; {name}, {representation} and {document} are filled in per topic.
topic_prompt_text = """
You are a topic modeling assistant. Given the following information:

Name: {name}
Representation: {representation}
Representative Document: {document}

Generate a concise topic title that best summarizes the content above. The topic should be no more than 10 words. Return only the topic title.
"""
prompt_template = ChatPromptTemplate.from_template(topic_prompt_text)

# Tie the prompt and the model together into a single callable chain.
# NOTE(review): newer LangChain releases are understood to deprecate LLMChain
# in favour of `prompt | llm` — confirm against the pinned version.
chain = LLMChain(prompt=prompt_template, llm=llm)

In [42]:
import ast


def _as_prompt_text(value):
    """Return `value` as a string, mapping None and float NaN to ""."""
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _docs_to_text(raw_docs):
    """Collapse one Representative_Docs cell into a single prompt-ready string.

    Accepts a real list/tuple of documents, a string holding a Python list
    literal, or any other scalar. List entries are joined with single spaces;
    anything else is returned as plain stripped text.

    NOTE(review): presumably the column holds stringified lists after the
    CSV round-trip — confirm against results/topic_info_df.csv.
    """
    if isinstance(raw_docs, (list, tuple)):
        return " ".join(str(doc) for doc in raw_docs)

    text = _as_prompt_text(raw_docs).strip()
    if text.startswith("["):
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a valid literal (e.g. prose that merely starts with "[").
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return " ".join(str(doc) for doc in parsed)
    return text


# Store results (reset on every run so re-executing the cell is idempotent)
topics = []

# Iterate through each row and ask the chain for one topic title per row
for i, row in df.iterrows():
    name = _as_prompt_text(row.get("Name", ""))
    representation = _as_prompt_text(row.get("Representation", ""))
    # Join whole documents, not characters: " ".join(<str>) would put a space
    # between every character and garble the prompt.
    document = _docs_to_text(row.get("Representative_Docs", ""))

    try:
        result = chain.run({
            "name": name,
            "representation": representation,
            "document": document
        })
        topics.append(result.strip())
    except Exception as e:
        # Best effort: report the failure and keep `topics` aligned with df rows.
        print(f"Error at row {i}: {e}")
        topics.append("[ERROR]")

In [None]:
# Attach the generated titles to the table (one per row, in row order),
# then write the augmented table to disk without the index column.
output_path = "results/topic_info_with_topics_GPT.csv"
df["GPT_topic"] = topics
df.to_csv(output_path, index=False)

In [None]:
# Preview the first five rows, including the new GPT_topic column.
df.head(n=5)

In [None]:
# Number of rows (topics) in the table.
len(df.index)

In [None]:
# Distinct generated titles, in order of first appearance.
df.loc[:, "GPT_topic"].unique()

In [None]:
# How many distinct titles were generated (missing values count as one value).
df["GPT_topic"].nunique(dropna=False)