## simple 2.1 (for 2.2) (remove if needed)

In [1]:
import re
import nltk
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# If you haven't already downloaded NLTK stopwords (and other corpora), run:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

from sklearn.feature_extraction.text import CountVectorizer

def custom_preprocessor(text):
    """Return *text* converted to lowercase."""
    return text.lower()


def custom_tokenizer(text):
    """Split *text* into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# 2.2 dimension reduction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from scipy.linalg import norm

# Restrict the corpus to two newsgroups for the binary classification tasks
categories = ['comp.graphics', 'sci.med']

# Options shared by the train and test splits; headers, footers and quotes
# are removed so that only the message text itself is kept.
_fetch_options = dict(
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_train = fetch_20newsgroups(subset='train', **_fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **_fetch_options)

print("Number of training samples:", len(newsgroups_train.data))
print("Number of testing samples:", len(newsgroups_test.data))
print("Categories:", newsgroups_train.target_names)

# Bag-of-words counts: at most 5000 terms, English stop words removed,
# and terms kept only if they appear in at least 2 documents and in no
# more than 95% of the documents.
vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# The vocabulary is learned from the training documents only and then
# reused unchanged for the test documents.
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

# Class labels for each split
y_train, y_test = newsgroups_train.target, newsgroups_test.target

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


Performing NMF for various values of r (the number of components)

In [None]:
from sklearn.decomposition import NMF
import numpy as np
import matplotlib.pyplot as plt


def calculate_reconstruction_error(X, W, H):
    """Return the squared Frobenius norm of ``X - W @ H``.

    Parameters
    ----------
    X : sparse matrix or array-like
        The matrix being approximated. Anything exposing ``toarray()``
        (e.g. a SciPy sparse matrix) is densified first; other inputs are
        converted with ``np.asarray``.
    W : array-like
        Left factor of the approximation.
    H : array-like
        Right factor of the approximation.

    Returns
    -------
    float
        ``norm(X - W @ H, 'fro') ** 2``.
    """
    # Only sparse containers provide ``toarray``; calling it unconditionally
    # would raise AttributeError for a plain ndarray or nested list.
    X_dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    residual = X_dense - np.dot(W, H)
    return norm(residual, 'fro') ** 2

# X_train and X_test are the bag-of-words count matrices produced by the
# CountVectorizer cell above.
r_values = [1, 10, 50, 100, 200, 500, 1000, 2000]
train_errors, test_errors = [], []

# For each candidate number of components: factorise the training matrix,
# project the test matrix onto the fitted components, and record the
# reconstruction error of both splits.
for r in r_values:
    print(f"Analyzing NMF with {r} components")
    nmf = NMF(n_components=r, init='random', random_state=42)

    W_train = nmf.fit_transform(X_train)
    H = nmf.components_
    W_test = nmf.transform(X_test)  # reuses the components fitted on X_train

    train_error = calculate_reconstruction_error(X_train, W_train, H)
    test_error = calculate_reconstruction_error(X_test, W_test, H)
    train_errors.append(train_error)
    test_errors.append(test_error)

# Both curves on log-log axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(r_values, train_errors, marker='o', label='Training Error')
ax.plot(r_values, test_errors, marker='s', label='Test Error')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Number of Topics (r)')
ax.set_ylabel('Reconstruction Error')
ax.set_title('NMF Reconstruction Error vs Number of Topics')
ax.legend()
ax.grid(True)
plt.show()

Analyzing topic coherence

In [None]:
def print_top_words(H, feature_names, n_top_words = 10):
    """Print the highest-weighted words of every topic in ``H``.

    Each row of ``H`` is treated as one topic's weight vector, and
    ``feature_names`` is indexed by the positions within that row. One line
    is printed per topic, listing its ``n_top_words`` words in order of
    decreasing weight.
    """
    for topic_idx, topic_weights in enumerate(H):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = topic_weights.argsort()[::-1][:n_top_words]
        label = ', '.join(feature_names[i] for i in ranked)
        print(f"Topic {topic_idx}: {label}")

# Example: analyzing topics for r = 10
# Fits a separate 10-component NMF on the training matrix (same init and
# seed as the sweep above) so that its topics can be printed.
r = 10
nmf_10 = NMF(n_components=r, init='random', random_state=42)
W_10 = nmf_10.fit_transform(X_train)  # kept for reference; not used further in this cell
H_10 = nmf_10.components_  # one row per topic; passed to print_top_words below

# list of feature names from vectorizer
# (print_top_words indexes this by the column positions of H_10)
feature_names = vectorizer.get_feature_names_out()

print_top_words(H_10, feature_names, n_top_words=10)
