<a href="https://colab.research.google.com/github/MahdiTheGreat/Intro-to-language-modeling/blob/main/Assignment_3_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 0: Imports

In [None]:
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups # We use the 20 news groups text dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd
import re

# Step 1: Fetching data and preprocessing

In [None]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [None]:
print(len(newsgroups_train.target_names))

20


In [None]:
# Split into smaller training sets in percentage
percentage = 0.8
split_index = int(len(newsgroups_train.data) * percentage)
train_data_small = newsgroups_train.data[:split_index]
train_targets_small = newsgroups_train.target[:split_index]

In [None]:
print(train_data_small[445])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [None]:
def extract_body(text):
    # Extract only words using regex
    words = re.findall(r'\b[a-zA-Z]+\b', text)

    # Join the words with spaces (optional)
    cleaned_text = " ".join(words)

    return cleaned_text

removed_headers = extract_body(train_data_small[445])
print(removed_headers)

From mliggett silver ucs indiana edu matthew liggett Subject Re Opel owners Nntp Posting Host silver ucs indiana edu Organization Indiana University Lines In DG news cso uiuc edu uxa cso uiuc edu OrioleFan uiuc writes boyle cactus org Craig Boyle writes In article news cso uiuc edu uxa cso uiuc edu OrioleFan uiuc writes gibbonsa fraser sfu ca Darren Gibbons writes I m looking for information on Opel cars Now you ask which model Well the sad truth is I m not entirely sure but it s a two seater with roll over headlights hard top and really sporty looking My friend has one sitting in his yard in really nice condition body wise but he transmission has seized up on him so it hasn t run for a while Does anyone have any info on these cars The engine compartment looks really tight to work on but it is in fine shape and I am quite interested in it Thanks Darren Gibbons gibbonsa sfu ca This would be the manta would it not Sold through Buick dealers in the mid s as the price leader Sounds a lot m

In [None]:
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

# Initialize structures for the preprocessed corpus
filtered_train = [[] for _ in range(len(train_data_small))]  # Preprocessed articles
flattened_train = []  # A single list of all words in the corpus

# Tokenizing and removing stopwords
print("Processing Articles")
for i, article in tqdm(enumerate(train_data_small), total=len(train_data_small)):
    article_body = extract_body(article) # Only use body and remove headers and footers
    word_tokens = word_tokenize(article_body)  # Tokenize article
    # Remove stop words and add to both filtered_train and flattened_train
    filtered_words = [w.lower() for w in word_tokens if w.lower() not in stop_words]
    filtered_train[i] = filtered_words
    flattened_train.extend(filtered_words)

# Create a vocabulary mapping
unique_words = sorted(set(flattened_train))  # Get unique words
word_to_id = {word: idx for idx, word in enumerate(unique_words)}  # Map word to ID
id_to_word = {idx: word for word, idx in word_to_id.items()}  # Reverse mapping

# Map the filtered articles to integer IDs
int_corpus = [[word_to_id[word] for word in article] for article in filtered_train]

# Display mappings and a small example
print(f"Total unique words: {len(unique_words)}")
print("Word-to-ID mapping example:", {k: word_to_id[k] for k in list(word_to_id)[:5]})
print("Integer Corpus example (first article):", int_corpus[0][:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ANv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Processing Articles


100%|██████████| 9051/9051 [00:10<00:00, 843.86it/s] 


Total unique words: 109209
Word-to-ID mapping example: {'\x03': 0, '\x03\x03\x1b': 1, '\x1a': 2, '!': 3, '#': 4}
Integer Corpus example (first article): []


In [None]:
word_count = Counter(flattened_train)

In [None]:
# Extract low-frequency words (occurrence <= 10) into a set
low_frequency_words = {word for word, count in word_count.items() if count <= 10}

In [None]:
# Filter articles efficiently using set operations
corpus_hf = []
for article in tqdm(int_corpus, desc="Removing LF words"):
    article_set = set(article)
    filtered_article = list(article_set - low_frequency_words)
    corpus_hf.append(filtered_article)

Removing LF words: 100%|██████████| 9051/9051 [00:00<00:00, 80738.78it/s]


In [None]:
flattened_train = [word for word in flattened_train if word not in low_frequency_words]
voc_size = len(sorted(set(flattened_train)))
print("Number of words in corpus", len(flattened_train))
print("Vocabulary size after removing LF words", voc_size)

Number of words in corpus 1550053
Vocabulary size after removing LF words 11537


# Step 2: Gibbs sampling

In [None]:
from collections import defaultdict

def lda_gibbs_sampling(corpus, K, alpha, beta, iterations):
    """
    Implements Collapsed Gibbs Sampling for LDA.

    :param corpus: List of lists, where each inner list contains word IDs in a document.
    :param K: Number of topics.
    :param alpha: Dirichlet prior for document-topic distribution.
    :param beta: Dirichlet prior for topic-word distribution.
    :param iterations: Number of Gibbs sampling iterations.
    :return: Topic assignments, document-topic counts, topic-word counts, topic totals.
    """
    # Initialize variables
    D = len(corpus)  # Number of documents
    V = len(list(set(word for doc in corpus for word in doc)))

    # Count matrices
    ndk = np.zeros((D, K))  # Document-topic counts
    nkw = np.zeros((K, V))  # Topic-word counts
    nk = np.zeros(K)        # Total words in each topic

    # Topic assignments for each word
    z = []  # Topic assignment for each word in corpus
    for d, doc in enumerate(corpus):
        doc_topics = []
        for word in doc:
            topic = np.random.randint(K)  # Randomly assign a topic
            doc_topics.append(topic)
            ndk[d, topic] += 1
            nkw[topic, word] += 1
            nk[topic] += 1
        z.append(doc_topics)

    # Gibbs sampling
    for _ in tqdm(range(iterations)):
        for d, doc in enumerate(corpus):
            for i, word in enumerate(doc):
                current_topic = z[d][i]

                # Decrement counts
                ndk[d, current_topic] -= 1
                nkw[current_topic, word] -= 1
                nk[current_topic] -= 1

                # Compute topic probabilities (Maybe do a for loop here instead)
                topic_probs = (ndk[d] + alpha) * (nkw[:, word] + beta) / (nk + beta * V)
                topic_probs /= np.sum(topic_probs)  # Normalize

                # Sample new topic
                new_topic = np.random.choice(K, p=topic_probs)
                z[d][i] = new_topic

                # Increment counts
                ndk[d, new_topic] += 1
                nkw[new_topic, word] += 1
                nk[new_topic] += 1

    return z, ndk, nkw, nk

In [None]:
# Initialize data
corpus = corpus_hf.copy()
targets = train_targets_small.copy()
topics = newsgroups_train.target_names.copy()

# First parameter combo
z, ndk, nkw, nk = lda_gibbs_sampling(corpus, alpha = 0.1, beta = 0.1, K=len(topics), iterations=200)

100%|██████████| 200/200 [1:03:52<00:00, 19.16s/it]


In [None]:
print(ndk.shape, nkw.shape, nk.shape)

(9051, 20) (20, 109209) (20,)


In [None]:
import pandas as pd
def get_top_words(nkw, id_to_word, top_n=20, method="raw", beta=0.1):
    """
    Get the top words for each topic.

    :param nkw: Topic-word counts (K x V matrix).
    :param id_to_word: Dictionary mapping word IDs to their original words.
    :param top_n: Number of top words to retrieve per topic.
    :param method: "raw" for raw counts, "relative" for relative frequencies.
    :param beta: Dirichlet prior for smoothing (used in relative frequency).
    :return: Dictionary of top words for each topic.
    """
    K, V = nkw.shape
    top_words_per_topic = {}

    if method == "raw":
        # Use raw counts
        for k in range(K):
            top_word_indices = np.argsort(nkw[k, :])[::-1][:top_n]  # Top N words by count
            top_words_per_topic[k] = [f"{id_to_word[idx]}, {int(nkw[k, idx])}" for idx in top_word_indices]

    elif method == "relative":
        # Compute relative frequencies
        word_totals = np.sum(nkw, axis=0)  # Total count of each word across all topics
        for k in range(K): # for k in topics
            relative_freqs = (nkw[k, :] + beta) / (word_totals + beta * K)  # Smoothed relative frequency
            top_word_indices = np.argsort(relative_freqs)[::-1][:top_n]  # Top N words by relative frequency
            top_words_per_topic[k] = [
                f"{id_to_word[idx]}, {int(nkw[k,idx])}" for idx in top_word_indices
            ]

    return top_words_per_topic


def top_words_to_df(top_words_per_topic):
    """
    Display the top words for each topic in a table format.

    :param top_words_per_topic: Dictionary of top words for each topic.
    :param method: Description of the method used ("raw" or "relative").
    """
    df_top_words = pd.DataFrame.from_dict(top_words_per_topic)
    df_top_words.columns = [f"Topic {i+1}" for i in range(df_top_words.shape[1])]
    return df_top_words



In [None]:
top_words_per_topic = get_top_words(nkw, id_to_word, top_n=20, method="raw")
df_top_words = top_words_to_df(top_words_per_topic)
df_top_words

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,"disease, 80","bike, 196","., 546","de, 15","georgia, 7","., 1549","., 805","viking, 18","\/, 10","+, 9","11, 47","apr, 172","car, 236","space, 188","., 3400","$, 407","curve, 10","israel, 160","., 1941","key, 224"
1,"medical, 78","dod, 131",",, 498","comp, 12","huey, 7",",, 1537",",, 792","1066, 18","o\, 9","#, 9","16, 44","1993, 111","cars, 147","earth, 92",",, 3316","., 405","bezier, 9","israeli, 129",",, 1779","encryption, 168"
2,"doctor, 61","ride, 109","), 412","van, 9","athens, 7",":, 1369",":, 735","sorenson, 17","q, 9","u, 9","6, 42","gmt, 109","engine, 62","nasa, 88","(, 3032",",, 341","calculating, 7","turkish, 110","), 1343","chip, 162"
3,"food, 61","riding, 90","(, 411","howard, 8","mcovingt, 7","), 1354","(, 709","distant, 17","x, 8","g, 9","12, 41","date, 77","miles, 61","launch, 84","), 3019","), 281","curves, 7","jews, 103","(, 1295","clipper, 161"
4,"treatment, 58","#, 88",":, 401","marc, 8","aisun3.ai.uga.edu, 7","(, 1302","), 708","isu, 17","'', 7","k, 8","19, 38","93, 72","driver, 48","orbit, 77",":, 2795","(, 277","cubic, 7","turks, 101","?, 1260","information, 151"
5,"medicine, 56","motorcycle, 72","@, 351","south, 8","todamhyp, 7",">, 1225",">, 648","machines, 17","jd, 7","mr, 8","33, 37","organization, 28","driving, 46","moon, 76","@, 2706","drive, 247","3-, 7","policy, 95",":, 1133","system, 149"
6,"cause, 55","bikes, 69","'s, 333","le, 8","charles.unlv.edu, 7","@, 1217","@, 618","exnet.iastate.edu, 16","/, 7","e, 8","8, 37","newsgroups, 28","ford, 45","shuttle, 59",">, 2571","card, 212","2-, 7","armenia, 95","@, 1042","government, 139"
7,"patients, 54","road, 45","game, 318","=, 8","40, 6","``, 1146","'', 613","promiscuous, 15","//, 7","l, 8","13, 36","subject, 27","drive, 43","flight, 59","?, 2362","@, 207","cusp, 6","armenians, 93","thanks, 790","public, 132"
8,"symptoms, 46","front, 45","!, 314","il, 7","photography, 6","'', 1116","?, 562","exotic, 15","ms, 7","aj, 8","14, 35","wilson, 24","driven, 42","pat, 58","writes, 2214",":, 201","bj, 6","arab, 91",">, 655","keys, 129"
9,"diet, 44","bmw, 44","team, 309","smtp, 7",">, 6","writes, 1109","writes, 549","z1dan, 14","-, 7","mp, 8","26, 34","fri, 22","honda, 41","science, 44","n't, 2075","sale, 201","detecting, 6","war, 90","anyone, 627","security, 124"
