In [None]:
import re
from pathlib import Path

import bertopic
import nltk
import polars as pl
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Path of the raw communications CSV that the notebook loads first.
FILE = "data/communications.csv"
# Name of the CSV column that holds the raw message text.
TEXT_COLUMN = "Text"

In [None]:
# Download the NLTK stop-word corpus that stopwords.words("english") reads from.
nltk.download('stopwords')

In [None]:
# Load the raw communications table from FILE and display it.
df = pl.read_csv(FILE)
df

In [None]:
# Preprocessing
# English stop words, stored as a set so each per-token membership test is a hash lookup.
stop_words = set(stopwords.words("english"))

def preprocess_text(text: str) -> str:
    """Normalise one document for bag-of-words modelling.

    Every character that is not a word character (letter, digit or
    underscore) is replaced by a space, the result is lower-cased and split
    on whitespace, and tokens found in the module-level ``stop_words`` set
    are dropped.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    tokens = re.sub(r'[^\w]', ' ', text).lower().split()
    return " ".join(token for token in tokens if token not in stop_words)

# Add a `clean_text` column holding the preprocessed version of the raw text column.
# The source column is taken from TEXT_COLUMN so it is configured in one place.
df = df.with_columns(
    pl.col(TEXT_COLUMN)
    .map_elements(preprocess_text, return_dtype=pl.String)
    .alias("clean_text")
)
df

In [None]:
# Build the bag-of-words document-term matrix from the cleaned text,
# using CountVectorizer with its default settings.
vectorizer = CountVectorizer()
doc_matrix = vectorizer.fit_transform(df["clean_text"])
# Display the learned vocabulary.
vectorizer.get_feature_names_out()

In [None]:
def generate_topics(
    n: int,
    feature_names: list[str],
    doc_matrix,
    n_top_words=10,
    random_state=0,
) -> list[list[str]]:
    """Fit an LDA model with ``n`` topics and return each topic's top words.

    Args:
        n: Number of topics (passed to LDA as ``n_components``).
        feature_names: Vocabulary, indexed by the column position of
            ``doc_matrix``.
        doc_matrix: Document-term matrix the model is fitted on.
        n_top_words: How many of the highest-weighted words to keep per topic.
        random_state: Seed forwarded to ``LatentDirichletAllocation`` so that
            repeated runs produce the same topics. Pass ``None`` to get an
            unseeded, run-to-run varying fit.

    Returns:
        One list per topic, in the order of ``lda.components_``; each inner
        list holds up to ``n_top_words`` words sorted by descending weight.
    """
    lda = LatentDirichletAllocation(n_components=n, random_state=random_state)
    lda.fit(doc_matrix)
    topics = []
    for topic in lda.components_:
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        topics.append([feature_names[i] for i in top_indices])
    return topics

In [None]:
# Fit one LDA model per candidate topic count (5..30) and save each model's top words.
feature_names = vectorizer.get_feature_names_out()
n_top_words = 10
# write_csv does not create missing directories, so make sure the target exists
# before the first (slow) model fit rather than failing after it.
Path("output").mkdir(parents=True, exist_ok=True)
# The header is identical for every file: the topic number followed by its ranked words.
columns = ["topic"] + [f"word_{i + 1}" for i in range(n_top_words)]
for n in range(5, 31):
    topics = generate_topics(n, feature_names, doc_matrix, n_top_words=n_top_words)
    # One row per topic: 1-based topic number, then the words in rank order.
    rows = [[i + 1] + t for i, t in enumerate(topics)]
    topics_df = pl.DataFrame(rows, schema=columns, orient="row")
    topics_df.write_csv(f"output/lda_{n:02}_topics.csv")

In [None]:
# Reload the saved per-model topic tables, ordered by topic count (5..30).
# A comprehension keeps the loop variable out of the notebook namespace, so the
# `df` frame loaded earlier in the notebook is not overwritten by this cell.
dfs = [pl.read_csv(f"output/lda_{n_topics:02}_topics.csv") for n_topics in range(5, 31)]
dfs

In [None]:
def get_topic_for_row(df: pl.DataFrame, i: int) -> list[str]:
    """Return the words of the topic stored in row ``i`` of a topic table.

    Args:
        df: Topic table whose first column is the topic number and whose
            remaining columns are that topic's words.
        i: Zero-based row index.

    Returns:
        The row's values after the first column, as a list (``df.row``
        itself yields a tuple, which did not match the declared return type).
    """
    return list(df.row(i)[1:])

# Quick check: the words in the second row (index 1) of the first loaded topic table.
get_topic_for_row(dfs[0], 1)

In [None]:
# Load the preprocessed communications; its `clean_text` column is the reference
# corpus for the coherence scoring further down.
# NOTE(review): no cell visible here writes this file — confirm how it is produced and
# that it matches the `clean_text` column computed earlier in this notebook.
communications_df = pl.read_csv("data/communications_preprocessed.csv")
communications_df

In [None]:
# Score every saved LDA model with gensim's topic coherence. No `coherence=` argument
# is passed, so gensim's default measure is used.
# polars reads empty CSV fields as null; drop those documents so .split() never sees None.
texts = [doc.split() for doc in communications_df["clean_text"].drop_nulls()]
gensim_dict = Dictionary(documents=texts)
coherence_metrics = []
# The loop variable is deliberately not called `df`, so the notebook-level frame of
# that name is left untouched.
for lda_topics_df in dfs:
    topics = [get_topic_for_row(lda_topics_df, i) for i in range(len(lda_topics_df))]
    cm = CoherenceModel(topics=topics, texts=texts, dictionary=gensim_dict)
    coherence = cm.get_coherence()
    coherence_metrics.append(coherence)
coherence_metrics

In [None]:
# Pair each candidate topic count (5..30) with its coherence score, save the table
# to disk, and display it.
coherence_df = pl.DataFrame(
    {
        "topic": list(range(5, 31)),
        "coherence": coherence_metrics,
    }
)
coherence_df.write_csv("output/topic_coherence_lda.csv")
coherence_df

In [None]:
# Rank the candidate topic counts with the highest coherence score first.
coherence_df.sort("coherence", descending=True)