In [1]:
%pip install tomotopy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import tomotopy
import numpy as np
import ast
import re

# === Load full CSVs
trump_df = pd.read_csv("trump_speaker_rows.csv")
obama_df = pd.read_csv("obama_speaker_rows.csv")

# === Clean function (lowercase + remove short words)
def clean_dialogue(utt_str):
    utts = ast.literal_eval(utt_str) if isinstance(utt_str, str) else utt_str
    text = " ".join(utts).lower()
    words = re.findall(r"\b\w{5,}\b", text)
    return " ".join(words)

# === Add cleaned text + source tag
trump_df["clean_dialogue"] = trump_df["utt"].apply(clean_dialogue)
trump_df["source"] = 1

obama_df["clean_dialogue"] = obama_df["utt"].apply(clean_dialogue)
obama_df["source"] = 0

# === Combine into one full DataFrame (with all columns!)
full_df = pd.concat([trump_df, obama_df], ignore_index=True)

# === LDA training using cleaned dialogue only
model = tomotopy.LDAModel(k=25)
for text in full_df["clean_dialogue"]:
    model.add_doc(text.split())
model.train(1000)

# === Compute dominant topics
dominant_topics = [np.argmax(doc.get_topic_dist()) for doc in model.docs]
full_df["dominant_topic"] = dominant_topics

# === Save ALL columns, correctly
full_df.to_csv("dialogues_with_topics_labeled.csv", index=False)
print("✅ All original columns + topic labels saved to 'dialogues_with_topics_labeled.csv'")


✅ All original columns + topic labels saved to 'dialogues_with_topics_labeled.csv'


In [3]:
# Print top 10 words per topic
print("\n🧠 Topics learned by the LDA model:\n" + "="*40)

for topic_id in range(model.k):
    top_words = model.get_topic_words(topic_id, top_n=10)
    words_str = ", ".join([word for word, _ in top_words])
    print(f"Topic {topic_id:>2}: {words_str}")




🧠 Topics learned by the LDA model:
Topic  0: trump, donald, money, business, estate, great, million, apprentice, company, beach
Topic  1: trade, countries, world, united, russia, climate, china, states, change, agreement
Topic  2: country, president, american, their, people, which, about, america, those, political
Topic  3: president, obama, house, white, congress, office, first, united, today, states
Topic  4: morning, martin, trayvon, defense, airport, george, baghdad, troops, denver, zimmerman
Topic  5: terrorism, terrorist, attack, laden, muslim, muslims, attacks, terror, terrorists, killed
Topic  6: immigration, governor, country, people, senator, would, america, trump, states, thank
Topic  7: republican, party, debate, candidates, senator, republicans, voters, presidential, mccain, political
Topic  8: think, about, people, going, right, because, would, really, there, thing
Topic  9: nuclear, world, north, korea, united, israel, sanctions, states, would, china
Topic 10: larry, ni