In [51]:
import os
import glob
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd


# Download NLTK stopwords if not already done
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nishithreddy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nishithreddy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
data_dir = "./test/"  

# Load transcripts
files = glob.glob(os.path.join(data_dir, "*.txt"))
documents = []

In [53]:
for file_path in files:
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
        documents.append(text)


In [54]:
stop_words = set(stopwords.words("english"))
def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return tokens



In [55]:
texts = [preprocess(doc) for doc in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [56]:

# Train LDA model
num_topics = 15  # you can tune this
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto'
)

In [57]:

# Print the topics
print("\nGlobal Topics:")
for i, topic in lda_model.show_topics(num_topics=num_topics, num_words=8, formatted=False):
    print(f"Topic {i}: {[word for word, _ in topic]}")


Global Topics:
Topic 0: ['really', 'like', 'things', 'know', 'lot', 'used', 'people', 'think']
Topic 1: ['andrews', 'people', 'research', 'kind', 'time', 'work', 'forest', 'get']
Topic 2: ['climate', 'think', 'health', 'like', 'really', 'one', 'change', 'work']
Topic 3: ['well', 'know', 'would', 'people', 'go', 'said', 'one', 'get']
Topic 4: ['know', 'people', 'one', 'think', 'time', 'would', 'like', 'well']
Topic 5: ['think', 'like', 'climate', 'people', 'get', 'know', 'really', 'going']
Topic 6: ['like', 'people', 'one', 'know', 'would', 'really', 'lot', 'think']
Topic 7: ['said', 'know', 'got', 'one', 'go', 'going', 'yeah', 'like']
Topic 8: ['one', 'people', 'students', 'get', 'would', 'time', 'think', 'school']
Topic 9: ['history', 'observational', 'natural', 'question', 'think', 'know', 'time', 'science']
Topic 10: ['forest', 'think', 'would', 'people', 'yeah', 'going', 'really', 'right']
Topic 11: ['like', 'know', 'really', 'think', 'people', 'would', 'yeah', 'lot']
Topic 12: ['

In [58]:
# Get topic distribution per transcript
print("\nPer-transcript topic distributions:")
for i, bow in enumerate(corpus):
    topic_probs = lda_model.get_document_topics(bow)
    print(f"Transcript {i+1} ({os.path.basename(files[i])}): {topic_probs}")



Per-transcript topic distributions:
Transcript 1 (34626.txt): [(1, np.float32(0.39001068)), (7, np.float32(0.50932324)), (10, np.float32(0.042088628)), (11, np.float32(0.058416758))]
Transcript 2 (35538.txt): [(11, np.float32(0.9998669))]
Transcript 3 (30871.txt): [(4, np.float32(0.5013479)), (7, np.float32(0.08184737)), (8, np.float32(0.37927026)), (11, np.float32(0.03351583))]
Transcript 4 (34961.txt): [(4, np.float32(0.019559208)), (11, np.float32(0.98025787))]
Transcript 5 (31616.txt): [(5, np.float32(0.06254819)), (12, np.float32(0.9371716))]
Transcript 6 (35316.txt): [(5, np.float32(0.19727597)), (10, np.float32(0.8019096))]
Transcript 7 (34551.txt): [(1, np.float32(0.4650713)), (4, np.float32(0.1372068)), (10, np.float32(0.3975661))]
Transcript 8 (34586.txt): [(1, np.float32(0.3699137)), (4, np.float32(0.19433506)), (10, np.float32(0.41605687)), (11, np.float32(0.01696086))]
Transcript 9 (35539.txt): [(11, np.float32(0.9996087))]
Transcript 10 (31011.txt): [(4, np.float32(0.175

In [59]:
for i, bow in enumerate(corpus):
    topic_probs = lda_model.get_document_topics(bow)
    top_topic = max(topic_probs, key=lambda x: x[1])  # topic with highest probability
    print(f"{os.path.basename(files[i])}: Topic {top_topic[0]} (weight={top_topic[1]:.2f})")


34626.txt: Topic 7 (weight=0.51)
35538.txt: Topic 11 (weight=1.00)
30871.txt: Topic 4 (weight=0.50)
34961.txt: Topic 11 (weight=0.98)
31616.txt: Topic 12 (weight=0.94)
35316.txt: Topic 10 (weight=0.80)
34551.txt: Topic 1 (weight=0.47)
34586.txt: Topic 10 (weight=0.42)
35539.txt: Topic 11 (weight=1.00)
31011.txt: Topic 11 (weight=0.82)
33821.txt: Topic 13 (weight=1.00)
34816.txt: Topic 10 (weight=0.87)
31761.txt: Topic 11 (weight=0.55)
34631.txt: Topic 1 (weight=0.44)
30721.txt: Topic 4 (weight=0.82)
34976.txt: Topic 13 (weight=0.65)
35315.txt: Topic 7 (weight=0.74)
31601.txt: Topic 5 (weight=1.00)
34546.txt: Topic 1 (weight=0.55)
35116.txt: Topic 10 (weight=0.99)
34591.txt: Topic 10 (weight=0.41)
33611.txt: Topic 13 (weight=0.97)
31006.txt: Topic 11 (weight=0.99)
35506.txt: Topic 11 (weight=1.00)
33836.txt: Topic 13 (weight=1.00)
33826.txt: Topic 13 (weight=1.00)
31016.txt: Topic 11 (weight=1.00)
34436.txt: Topic 4 (weight=0.49)
35066.txt: Topic 5 (weight=0.32)
34581.txt: Topic 10 (wei

In [68]:
transcript_tags = {}

for i, bow in enumerate(corpus):
    topics = lda_model.get_document_topics(bow)
    sorted_topics = sorted(topics, key=lambda x: x[1], reverse=True)
    top_topic = sorted_topics[0][0]
    topic_words = [w for w, _ in lda_model.show_topic(top_topic, topn=5)]
    transcript_tags[os.path.basename(files[i])] = topic_words

df_tags = pd.DataFrame(
    transcript_tags.items(), 
    columns=['Filename', 'Topic Words']
)

# 2. Convert the list of topic words into a comma-separated string for better CSV readability
# e.g., ['jewish', 'community', 'family'] -> "jewish, community, family"
df_tags['Topic Words'] = df_tags['Topic Words'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
df_tags['ID'] = df_tags['Filename'].str.extract(r'(\d+)\.txt')
df_tags.head()

Unnamed: 0,Filename,Topic Words,ID
0,34626.txt,"said, know, got, one, go",34626
1,35538.txt,"like, know, really, think, people",35538
2,30871.txt,"know, people, one, think, time",30871
3,34961.txt,"like, know, really, think, people",34961
4,31616.txt,"yeah, forest, think, andrews, well",31616


In [69]:
output_filename = 'lda_transcript_topics.csv'
df_tags.to_csv(output_filename, index=False)

In [62]:
from bertopic import BERTopic
from umap import UMAP
import re
from hdbscan import HDBSCAN


In [63]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers
    text = ' '.join([w for w in text.split() if w not in stop_words])
    return text

In [64]:


data_dir = "./test/"  

# Load transcripts
files = glob.glob(os.path.join(data_dir, "*.txt"))

docs = [clean_text(open(f, encoding="utf-8").read()) for f in files]


# Create a smaller, more stable UMAP model
umap_model = UMAP(
    n_neighbors=10,      # reduce neighbor count (default=15)
    n_components=5,     # 2D projection
    min_dist=0.1,       
    metric='cosine', 
    random_state=42
)

hdb = HDBSCAN(
    min_cluster_size=2,    # small clusters allowed
    min_samples=1,         # sensitive to small clusters
    cluster_selection_method="eom",
    prediction_data=True   # << needed for probabilities
)

topic_model = BERTopic(
    embedding_model="all-mpnet-base-v2",
    umap_model=umap_model,
    hdbscan_model=hdb,
    calculate_probabilities=True
)

topics, probs = topic_model.fit_transform(docs)

topic_model.get_topic_info().head()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,23,-1_people_one_would_well,"[people, one, would, well, got, said, go, time...",[introductory remarks max geier purpose study ...
1,0,10,0_like_know_gender_really,"[like, know, gender, really, feel, im, dont, y...",[okay today march th im lilith lilith wikstein...
2,1,10,1_know_community_irco_asian,"[know, community, irco, asian, family, like, s...",[name natalia fernndez oregon state university...
3,2,9,2_like_gay_im_really,"[like, gay, im, really, people, lesbian, would...",[good morning history oregon state university ...
4,3,8,3_andrews_people_work_forest,"[andrews, people, work, forest, research, lot,...",[andrews talking little bit worked andrews sin...


In [65]:
topic_model.visualize_topics()


In [66]:
import pandas as pd

df = pd.DataFrame({
    "file": files,
    "topic": topics
})

df.head()


Unnamed: 0,file,topic
0,./test/34626.txt,32
1,./test/35538.txt,0
2,./test/30871.txt,-1
3,./test/34961.txt,7
4,./test/31616.txt,25


In [71]:
topic_info = topic_model.get_topic_info()
topic_info = topic_info[["Topic", "Name"]]  # keeps only topic number and name

# Merge topic names into df
df = df.merge(topic_info, left_on="topic", right_on="Topic", how="left")
df = df[["file", "topic"]]
df['ID'] = df['file'].str.extract(r'/(\d+)\.txt')
df.head()

Unnamed: 0,file,topic,ID
0,./test/34626.txt,32,34626
1,./test/35538.txt,0,35538
2,./test/30871.txt,-1,30871
3,./test/34961.txt,7,34961
4,./test/31616.txt,25,31616


In [72]:
def get_topic_words(topic_num):
    if topic_num == -1:
        return ["outlier"]  # handle unassigned transcripts
    # Returns only the words, not their scores
    return [word for word, _ in topic_model.get_topic(topic_num)]


In [73]:
df["Top_Words"] = df["topic"].apply(get_topic_words)
df.head()


Unnamed: 0,file,topic,ID,Top_Words
0,./test/34626.txt,32,34626,"[time, work, district, kind, al, would, one, g..."
1,./test/35538.txt,0,35538,"[like, know, gender, really, feel, im, dont, y..."
2,./test/30871.txt,-1,30871,[outlier]
3,./test/34961.txt,7,34961,"[students, campus, color, osu, think, student,..."
4,./test/31616.txt,25,31616,"[climate, change, carbon, ocean, model, econom..."


In [74]:
df.to_csv("bertopic_transcript_topics.csv", index=False)