# Step 0: Imports

In [1]:
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups # We use the 20 news groups text dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd

# Step 1: Fetching data and preprocessing

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [3]:
print(len(newsgroups_train.target_names))

20


In [17]:
# Split into smaller training sets in percentage
percentage = 0.5
split_index = int(len(newsgroups_train.data) * percentage)
train_data_small = newsgroups_train.data[:split_index]
train_targets_small = newsgroups_train.target[:split_index]

In [18]:
print(train_data_small[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [19]:
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

# Initialize structures for the preprocessed corpus
filtered_train = [[] for _ in range(len(train_data_small))]  # Preprocessed articles
flattened_train = []  # A single list of all words in the corpus

# Tokenizing and removing stopwords
print("Processing Articles")
for i, article in tqdm(enumerate(train_data_small), total=len(train_data_small)):
    word_tokens = word_tokenize(article)  # Tokenize article
    # Remove stop words and add to both filtered_train and flattened_train
    filtered_words = [w.lower() for w in word_tokens if w.lower() not in stop_words]
    filtered_train[i] = filtered_words
    flattened_train.extend(filtered_words)

# Create a vocabulary mapping
unique_words = sorted(set(flattened_train))  # Get unique words
word_to_id = {word: idx for idx, word in enumerate(unique_words)}  # Map word to ID
id_to_word = {idx: word for word, idx in word_to_id.items()}  # Reverse mapping

# Map the filtered articles to integer IDs
int_corpus = [[word_to_id[word] for word in article] for article in filtered_train]

# Display mappings and a small example
print(f"Total unique words: {len(unique_words)}")
print("Word-to-ID mapping example:", {k: word_to_id[k] for k in list(word_to_id)[:5]})
print("Integer Corpus example (first article):", int_corpus[0][100:110])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Processing Articles


100%|██████████| 5657/5657 [00:17<00:00, 327.17it/s]


Total unique words: 120769
Word-to-ID mapping example: {'\x03\x03\x1b': 0, '\x1a': 1, '!': 2, '#': 3, '$': 4}
Integer Corpus example (first article): []


In [20]:
word_count = Counter(flattened_train)

In [21]:
# Extract low-frequency words (occurrence <= 10) into a set
low_frequency_words = {word for word, count in word_count.items() if count <= 10}

In [22]:
# Filter articles efficiently using set operations
corpus_hf = []
for article in tqdm(int_corpus, desc="Removing LF words"):
    article_set = set(article)
    filtered_article = list(article_set - low_frequency_words)
    corpus_hf.append(filtered_article)

Removing LF words: 100%|██████████| 5657/5657 [00:00<00:00, 22785.28it/s]


In [23]:
flattened_train = [word for word in flattened_train if word not in low_frequency_words]
voc_size = len(sorted(set(flattened_train)))
print("Vocabulary size after removing LF words", voc_size)

# Step 2: Gibbs sampling

In [24]:
from collections import defaultdict

def lda_gibbs_sampling(corpus, V, K, alpha, beta, iterations):
    """
    Implements Collapsed Gibbs Sampling for LDA.

    :param corpus: List of lists, where each inner list contains word IDs in a document.
    :param K: Number of topics.
    :param alpha: Dirichlet prior for document-topic distribution.
    :param beta: Dirichlet prior for topic-word distribution.
    :param iterations: Number of Gibbs sampling iterations.
    :return: Topic assignments, document-topic counts, topic-word counts, topic totals.
    """
    # Initialize variables
    D = len(corpus)  # Number of documents

    # Count matrices
    ndk = np.zeros((D, K))  # Document-topic counts
    nkw = np.zeros((K, V))  # Topic-word counts
    nk = np.zeros(K)        # Total words in each topic

    # Topic assignments for each word
    z = []  # Topic assignment for each word in corpus
    for d, doc in enumerate(corpus):
        doc_topics = []
        for word in doc:
            topic = np.random.randint(K)  # Randomly assign a topic
            doc_topics.append(topic)
            ndk[d, topic] += 1
            nkw[topic, word] += 1
            nk[topic] += 1
        z.append(doc_topics)

    # Gibbs sampling
    for _ in tqdm(range(iterations)):
        for d, doc in enumerate(corpus):
            for i, word in enumerate(doc):
                current_topic = z[d][i]

                # Decrement counts
                ndk[d, current_topic] -= 1
                nkw[current_topic, word] -= 1
                nk[current_topic] -= 1

                # Compute topic probabilities (Maybe do a for loop here instead)
                topic_probs = (ndk[d] + alpha) * (nkw[:, word] + beta) / (nk + beta * V)
                topic_probs /= np.sum(topic_probs)  # Normalize

                # Sample new topic
                new_topic = np.random.choice(K, p=topic_probs)
                z[d][i] = new_topic

                # Increment counts
                ndk[d, new_topic] += 1
                nkw[new_topic, word] += 1
                nk[new_topic] += 1

    return z, ndk, nkw, nk

In [25]:
# Initialize data
corpus = corpus_hf.copy()
targets = train_targets_small.copy()
topics = newsgroups_train.target_names.copy()

# First parameter combo
z, ndk, nkw, nk = lda_gibbs_sampling(corpus, alpha = 0.1, beta = 0.1, V=voc_size, K=len(topics), iterations=50)

100%|██████████| 50/50 [27:34<00:00, 33.10s/it]


In [27]:
print(ndk.shape, nkw.shape, nk.shape)

(5657, 20) (20, 120769) (20,)


In [44]:
import pandas as pd
def get_top_words(nkw, id_to_word, top_n=20, method="raw", beta=0.1):
    """
    Get the top words for each topic.

    :param nkw: Topic-word counts (K x V matrix).
    :param id_to_word: Dictionary mapping word IDs to their original words.
    :param top_n: Number of top words to retrieve per topic.
    :param method: "raw" for raw counts, "relative" for relative frequencies.
    :param beta: Dirichlet prior for smoothing (used in relative frequency).
    :return: Dictionary of top words for each topic.
    """
    K, V = nkw.shape
    top_words_per_topic = {}

    if method == "raw":
        # Use raw counts
        for k in range(K):
            top_word_indices = np.argsort(nkw[k, :])[::-1][:top_n]  # Top N words by count
            top_words_per_topic[k] = [id_to_word[idx] for idx in top_word_indices]

    elif method == "relative":
        # Compute relative frequencies
        word_totals = np.sum(nkw, axis=0)  # Total count of each word across all topics
        for k in range(K):
            relative_freqs = (nkw[k, :] + beta) / (word_totals + beta * K)  # Smoothed relative frequency
            top_word_indices = np.argsort(relative_freqs)[::-1][:top_n]  # Top N words by relative frequency
            top_words_per_topic[k] = [
                id_to_word[idx] for idx in top_word_indices
            ]

    return top_words_per_topic


def top_words_to_df(top_words_per_topic, method="raw"):
    """
    Display the top words for each topic in a table format.

    :param top_words_per_topic: Dictionary of top words for each topic.
    :param method: Description of the method used ("raw" or "relative").
    """
    df_top_words = pd.DataFrame.from_dict(top_words_per_topic, orient="index")
    df_top_words.columns = [f"Topic {i+1}" for i in range(df_top_words.shape[1])]
    return df_top_words



In [45]:
top_words_per_topic = get_top_words(nkw, id_to_word, top_n=20)
df_top_words = top_words_to_df(top_words_per_topic)
df_top_words

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,``,one,people,'s,'',would,n't,even,think,also,many,us,way,see,make,well,say,),],could
1,subject,god,",",lines,.,organization,:,(,@,>,),jesus,christians,writes,christian,bible,time,jews,?,israel
2,x,program,using,code,window,application,information,;,via,user,server,=,programs,display,data,files,motif,systems,file,function
3,disease,medical,treatment,patients,medicine,doctor,sensitivity,cause,patient,food,diet,effects,msg,hospital,severe,superstition,health,foods,syndrome,symptoms
4,subject,@,:,(,lines,),.,",",organization,>,?,writes,<,n't,--,article,'s,'',``,nntp-posting-host
5,:,@,lines,subject,organization,),.,(,",",?,--,nntp-posting-host,university,thanks,-,!,>,distribution,please,$
6,x-newsreader,tin,1.1,version,],[,wrote,subject,pl8,{,@,},",",lines,circuit,),writes,(,use,?
7,99,m+,96,qb,dt,+9,lq,a2,dk,k,rv,02,=/,nl,vu,x8,1y,+=,e.,3k
8,terrorist,cia,neptunium,nsc,fema,drew,66,mi5,deuterium,gozer.idbsu.edu,mi6,semtex,betz,idaho,insurrection,fodder,kgb,outlaws,81,gozer
9,organization,lines,:,subject,",",game,(,@,),.,team,university,!,play,year,games,players,baseball,--,win
