# Step 0: Imports

In [24]:
from tqdm import tqdm
from sklearn.datasets import fetch_20newsgroups # We use the 20 news groups text dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd

# Step 1: Fetching data and preprocessing

In [25]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [26]:
print(newsgroups_train.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [37]:
# Split into smaller training sets in percentage
percentage = 0.2
split_index = int(len(newsgroups_train.data) * percentage)
train_data_small = newsgroups_train.data[:split_index]
train_targets_small = newsgroups_train.target[:split_index]

In [38]:
print(len(train_data_small))

2262


In [39]:
nltk.download('stopwords')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
# Tokenizing and removing stop words
filtered_train = [[]] * len(train_data_small) # First index is the article
flattened_train = []
for i, article in tqdm(enumerate(train_data_small), total=len(train_data_small), desc="Processing Articles"):
    word_tokens = word_tokenize(article)

    text = [w for w in word_tokens if w.lower() not in stop_words]

    for w in word_tokens:
        if w not in stop_words:
            flattened_train.append(w)
            filtered_train[i].append(w)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Processing Articles: 100%|██████████| 2262/2262 [00:08<00:00, 267.94it/s]


In [40]:
print(len(flattened_train))

761817


In [41]:
word_count = Counter(flattened_train)

In [42]:
# Extract low-frequency words (occurrence <= 10) into a set
low_frequency_words = {word for word, count in word_count.items() if count <= 10}

In [43]:
# Filter articles efficiently using set operations
filtered_hf_train = []
for article in tqdm(filtered_train, desc="Removing LF words"):
    article_set = set(article)
    filtered_article = list(article_set - low_frequency_words)
    filtered_hf_train.append(filtered_article)



Removing LF words: 100%|██████████| 2262/2262 [02:03<00:00, 18.28it/s]


In [44]:
flattened_train = [word for word in flattened_train if word not in low_frequency_words]

In [45]:
print(len(flattened_train))

620610


In [46]:
print(flattened_train[0])

From


# Step 2: Gibbs sampling

In [None]:
def gibbs_sampling(data, alpha, beta, K, iterations = 150):
  q = np.zeros(K)
  p = np.zeros(K)
  z = np.zeros(len(data))
  for i in tqdm(iterations):
    # Start with arbitrary z:
    z = data[0][0]

    # Iterate sequentially through the data:
    for j in data[:]:
      for d in data[j][:]:
        for k in range(K):
          q[k] = (alpha +
          p[k] = q[k]/sum(q)
        # Update z[d][j]
        z[d][j] =


In [None]:
# First parameter combo
gibbs_sampling(filtered_train, alpha = 0.1, beta = 0.1, K=10)

In [None]:
# Second parameter combo
gibbs_sampling(filtered_train, alpha = 0.01, beta = 0.01, K=10)

In [None]:
# Third parameter combo
gibbs_sampling(filtered_train, alpha = 0.1, beta = 0.1, K=50)

In [None]:
# Fourth parameter combo
gibbs_sampling(filtered_train, alpha = 0.01, beta = 0.01, K=50)