In [None]:
import polars as pl
import bertopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from dotenv import dotenv_values
import pathlib
from gensim.corpora.dictionary import Dictionary
import numpy as np

config = dotenv_values("../.env")
FILE = f"../{config["DATA_DIR"]}/communications_preprocessed.csv"
WORKERS = 4
MIN_TOPICS = 5
MAX_TOPICS = 30

In [None]:
# Read the pre-processed corpus from FILE and display it.
df = pl.read_csv(source=FILE)
df

In [None]:
# Build the document-term count matrix from the stemmed text column.
vectorizer = CountVectorizer()
doc_matrix = vectorizer.fit_transform(df.get_column("stemmed_text"))
# Show the vocabulary learned by the vectorizer.
vectorizer.get_feature_names_out()

In [None]:
def generate_topics(
    n: int,
    feature_names: list[str],
    doc_matrix,
    n_top_words=10,
    random_state=None,
) -> list[list[str]]:
    """Fit an LDA model with ``n`` topics and return each topic's top words.

    Args:
        n: Number of topics (passed as ``n_components``).
        feature_names: Vocabulary indexed by feature id; ``feature_names[i]``
            must name column ``i`` of ``doc_matrix``.
        doc_matrix: Document-term matrix the model is fitted on.
        n_top_words: How many of the highest-weighted words to keep per topic.
        random_state: Optional seed forwarded to ``LatentDirichletAllocation``.
            The default ``None`` keeps the previous, unseeded behaviour.

    Returns:
        One list per topic, each holding up to ``n_top_words`` words ordered
        from highest to lowest topic weight.
    """
    lda = LatentDirichletAllocation(n_components=n, random_state=random_state)
    lda.fit(doc_matrix)
    topics = []
    for weights in lda.components_:
        # argsort is ascending, so reverse it before taking the leading ids.
        top_ids = weights.argsort()[::-1][:n_top_words]
        topics.append([feature_names[i] for i in top_ids])
    return topics

In [None]:
feature_names = vectorizer.get_feature_names_out()
n_top_words = 10
for n in range(5, 31):
    topics = generate_topics(n, feature_names, doc_matrix, n_top_words=n_top_words)
    rows = [[i + 1] + t for i, t in enumerate(topics)]
    columns = ["topic"] + [f"word_{i + 1}" for i in range(n_top_words)]
    topics_df = pl.DataFrame(rows, schema=columns, orient="row")
    topics_df.write_csv(f"../{config["OUTPUT_DIR"]}/lda_{n:02}_topics.csv")

In [None]:
dfs = []
for i in range(5, 31):
    df = pl.read_csv(f"../{config["OUTPUT_DIR"]}/lda_{i:02}_topics.csv")
    dfs.append(df)
dfs

In [None]:
def get_topic_for_row(df: pl.DataFrame, i: int) -> list[str]:
    """Return the values of row ``i`` of ``df`` without its first column.

    Args:
        df: Topic table whose first column is skipped.
        i: Zero-based row index.

    Returns:
        The remaining row values as a list, matching the annotation
        (``DataFrame.row`` itself yields a tuple).
    """
    return list(df.row(i)[1:])

# Quick check on the second row of the first loaded topic table.
get_topic_for_row(dfs[0], 1)

In [None]:
# Load the communications corpus under its own name; FILE already holds this
# exact path, so reuse it rather than repeating the literal.
communications_df = pl.read_csv(FILE)
communications_df

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Tokenised documents and the gensim dictionary used to score coherence.
texts = [s.split() for s in communications_df["stemmed_text"]]
gensim_dict = Dictionary(documents=texts)

# One coherence score per reloaded topic table, in the same order as `dfs`.
coherence_metrics = []
# The loop variable is deliberately not called `df`, so the global `df`
# is left untouched by this cell.
for topic_table in dfs:
    topics = [get_topic_for_row(topic_table, i) for i in range(len(topic_table))]
    cm = CoherenceModel(topics=topics, texts=texts, dictionary=gensim_dict)
    coherence_metrics.append(cm.get_coherence())
coherence_metrics

In [None]:
coherence_df = pl.DataFrame({"topic": [i for i in range(5, 31)], "coherence": coherence_metrics})
coherence_df.write_csv(f"../{config["OUTPUT_DIR"]}/topic_coherence_lda.csv")
coherence_df

In [None]:
# Rank the rows from highest to lowest coherence.
coherence_df.sort(by="coherence", descending=True)

In [None]:
from gensim.models.ldamulticore import LdaMulticore

# Tokenise the corpus from `communications_df`: by this point the name `df`
# has been reused for topic tables, which have no "stemmed_text" column.
texts = [s.split() for s in communications_df["stemmed_text"]]
gensim_dict = Dictionary(documents=texts)
# Bag-of-words representation of every document.
corpus = [gensim_dict.doc2bow(t) for t in texts]

In [None]:
# Fit a 10-topic gensim LDA model on the bag-of-words corpus.
lda = LdaMulticore(
    corpus=corpus,
    id2word=gensim_dict,
    num_topics=10,
    workers=WORKERS,
)

In [None]:
def get_topics_df(lda, num_words: int = 10) -> pl.DataFrame:
    """Flatten a gensim topic model into a long word/topic/probability table.

    Args:
        lda: Fitted model exposing ``show_topics``.
        num_words: Number of words to list per topic.

    Returns:
        A frame with columns ``word``, ``topic`` (1-based) and ``prob``,
        one row per (topic, word) pair.
    """
    # num_topics=-1 requests every topic; the library default of 10 would
    # silently drop topics for models fitted with more than 10.
    topics = lda.show_topics(num_topics=-1, num_words=num_words, formatted=False)
    words, topic_ids, probs = [], [], []
    for topic_num, word_probs in topics:
        for word, prob in word_probs:
            words.append(word)
            topic_ids.append(topic_num + 1)
            probs.append(prob)
    return pl.DataFrame({"word": words, "topic": topic_ids, "prob": probs})

In [None]:
# Show the word/topic/probability table for the model fitted above.
get_topics_df(lda)

**Source for the exclusivity calculation:**
The formula was suggested by ChatGPT and corroborated by the [STM for Open Ended Survey Responses Online Appendix](https://scholar.harvard.edu/files/dtingley/files/ajpsappendix.pdf).

In [None]:
import gensim.models.basemodel

def get_exclusivity(model: gensim.models.basemodel.BaseTopicModel) -> np.ndarray:
    """Divide each topic-word probability by that word's total across topics.

    Args:
        model: Topic model whose ``get_topics()`` returns the topic-word matrix.

    Returns:
        An array of the same shape as ``model.get_topics()``.
    """
    topic_word_probs: np.ndarray = model.get_topics()
    column_totals = np.sum(topic_word_probs, axis=0)
    # Swap zero totals for a tiny constant so the division below is defined.
    tiny = column_totals.dtype.type(1e-10)
    safe_totals = np.where(column_totals == 0, tiny, column_totals)
    return topic_word_probs / safe_totals
get_exclusivity(lda)

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the fitted model. `corpus` must be the bag-of-words corpus, not the
# Dictionary object, matching how the sweep cell calls CoherenceModel.
cm = CoherenceModel(model=lda, corpus=corpus, texts=texts)
coherence = cm.get_coherence()
coherence

In [None]:
from gensim.models.ldamulticore import LdaMulticore
import warnings

texts = [s.split() for s in df["stemmed_text"]]
gensim_dict = Dictionary(documents=texts)
corpus = [gensim_dict.doc2bow(t) for t in texts]

metrics_dict = {"n_topics": [], "exclusivity": [], "coherence": []}

warnings.warn("Exclusivity metrics are calculated improperly for now")
for n in range(MIN_TOPICS, 6):
    print(f"Training model with {n} topics")
    lda = LdaMulticore(corpus, num_topics=n, workers=WORKERS, id2word=gensim_dict)
    topics = get_topics_df(lda)

    exclusivity = get_exclusivity(lda).sum()

    print("Computing coherence")
    cm = CoherenceModel(model=lda, corpus=corpus, texts=texts)
    coherence = cm.get_coherence()

    metrics_dict["n_topics"].append(n)
    metrics_dict["exclusivity"].append(exclusivity)
    metrics_dict["coherence"].append(coherence)

    filename = f"../{config["OUTPUT_DIR"]}/lda_{n:02}_topics.csv"
    topics.write_csv(filename)
    print(f"Results saved to {filename}")

metrics_df = pl.DataFrame(metrics_dict)
metrics_filename = f"../{config["OUTPUT_DIR"]}/lda_metrics.csv"
metrics_df.write_csv(metrics_filename)
print(f"Metrics saved to {metrics_filename}")