# Topic Modeling Using NMF

In [2]:
import numpy as np
import pandas as pd
from __future__ import print_function
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Sizing knobs shared by the cells below.
n_samples, n_features = 2000, 1000  # posts to vectorize, cap on vocabulary size
n_topics, n_top_words = 10, 20      # components to extract, terms printed per topic

In [4]:
# Start the wall-clock timer; the vectorization cell reports elapsed time
# relative to this t0.
t0 = time()
print("Loading dataset and extracting TF-IDF features...")

# Fetch the shuffled 20 Newsgroups corpus, asking scikit-learn to remove
# headers, footers and quoted replies from the posts.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)

Loading dataset and extracting TF-IDF features...


In [10]:
# Turn the first n_samples posts into a TF-IDF matrix, keeping at most
# n_features terms and dropping English stop words as well as terms that are
# too common (max_df) or too rare (min_df).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,
    min_df=2,
)
tfidf = vectorizer.fit_transform(dataset.data[:n_samples])

# Report the matrix shape and the time elapsed since t0 was set.
print(tfidf.shape)
print("done in %0.3fs." % (time() - t0))

(2000, 1000)
done in 96.598s.


In [6]:
# Fit scikit-learn's NMF on the TF-IDF matrix to extract n_topics components.
# NOTE: the name `nmf` is rebound to a function in the "Building NMF from
# Scratch" section further down, so the cell that reads `nmf.components_`
# has to run before that definition.
print("Fitting the NMF model with n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# Use a dedicated timer so the message reports the fit duration only, rather
# than everything that happened since t0 was set in the dataset-loading cell.
fit_t0 = time()
nmf = NMF(n_components=n_topics, random_state=1).fit(tfidf)
print("done in %0.3fs." % (time() - fit_t0))

Fitting the NMF model with n_samples=2000 and n_features=1000...
done in 8.720s.


In [12]:
# Terms learned by the vectorizer; the topic-printing loop indexes into this
# with feature positions taken from each component row.
feature_names = vectorizer.get_feature_names_out()

In [13]:
# For every fitted component, list its n_top_words highest-weighted terms.
for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # argsort is ascending: reverse it, then keep the leading n_top_words indices.
    print(" ".join(feature_names[j]
                   for j in topic.argsort()[::-1][:n_top_words]))
    print()

Topic #0:
just people don think like know say did make really time way ve right sure good going want got wrong

Topic #1:
windows use using window dos program application os drivers software help screen running ms code motif pc work ve mode

Topic #2:
god jesus bible faith does christian christians christ believe life heaven sin lord church religion true mary human belief love

Topic #3:
thanks know does mail advance hi info interested anybody email like looking help appreciated card information list send need post

Topic #4:
car new 00 bike 10 price space power cars sale good year engine years used cost miles condition great 000

Topic #5:
edu soon com send university internet ftp mail mit information article cc pub address hope program email mac blood contact

Topic #6:
file problem files format ftp win space sound read pub available program site help version image book copy save memory

Topic #7:
game team year games win play season players nhl runs toronto ll flyers division goal h

# Building NMF from Scratch

In [14]:
def nmf(X, k, max_iterations=100, tol=1e-6, random_state=None):
    """Factorize a non-negative matrix as ``X ~= W @ H`` with multiplicative updates.

    Alternates the multiplicative update rules for the Frobenius-norm
    objective: ``H *= (W.T X) / (W.T W H)`` followed by
    ``W *= (X H.T) / (W H H.T)``.

    Parameters
    ----------
    X : 2-D array-like of shape (n, m)
        Matrix to factorize. Anything exposing ``.shape`` and supporting ``@``
        and subtraction with NumPy arrays works (the notebook also passes the
        TF-IDF matrix returned by the vectorizer).
    k : int
        Number of components, i.e. the inner dimension of the factorization.
    max_iterations : int, default 100
        Upper bound on the number of update sweeps.
    tol : float, default 1e-6
        Absolute threshold on ``norm(X - W @ H)``; iteration stops early once
        the reconstruction error drops below it.
    random_state : int or None, default None
        Seed for the random initialisation of ``W`` and ``H``. ``None`` keeps
        the previous behaviour of drawing from NumPy's global random state
        (so ``np.random.seed`` still applies); an integer gives a
        reproducible, self-contained initialisation.

    Returns
    -------
    W : ndarray of shape (n, k)
    H : ndarray of shape (k, m)
    X_approx : ndarray of shape (n, m)
        The reconstruction ``W @ H``.
    """
    n, m = X.shape

    # Both np.random (global state) and RandomState expose .rand().
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Random non-negative starting point.
    W = rng.rand(n, k)
    H = rng.rand(k, m)

    # Stand-in for non-positive denominators so the division never hits zero.
    eps = np.finfo(float).eps

    for _ in range(max_iterations):
        # Update H with W fixed.
        numerator = W.T @ X
        denominator = (W.T @ W) @ H
        H *= numerator / np.where(denominator > 0, denominator, eps)

        # Update W with the new H fixed. Grouping as W @ (H @ H.T) only needs
        # a k-by-k intermediate instead of materialising the n-by-m product.
        numerator = X @ H.T
        denominator = W @ (H @ H.T)
        W *= numerator / np.where(denominator > 0, denominator, eps)

        # Stop as soon as the absolute reconstruction error is below tol.
        if np.linalg.norm(X - W @ H) < tol:
            break

    return W, H, W @ H
    

# Smoke test: factorize a constant 3x3 matrix of fives with 4 components and
# print the (W, H, W @ H) tuple returned by nmf.
print(nmf(np.full((3, 3), 5), 4, 1000, 1e-6))

(array([[0.77089507, 0.78488755, 0.24218167, 0.6263597 ],
       [0.617991  , 0.06527078, 1.00185752, 0.34611508],
       [0.70174678, 0.17367148, 0.65180909, 0.65116993]]), array([[2.28595137, 0.10370124, 1.1785538 ],
       [1.69070024, 2.55975804, 2.13171273],
       [2.78917804, 3.64076982, 3.22132596],
       [1.97214778, 3.23968354, 2.61536308]]), array([[4.99999989, 4.99999989, 5.00000022],
       [4.99999978, 4.99999979, 5.00000043],
       [5.00000033, 5.00000032, 4.99999935]]))


In [15]:
# Factorize the TF-IDF matrix with the from-scratch implementation, using the
# same n_topics constant as the scikit-learn fit instead of a hard-coded 10.
np.random.seed(1)  # nmf() draws its initial W and H from NumPy's global RNG
W, H, X_approx = nmf(tfidf, n_topics, max_iterations=100, tol=1e-6)

In [17]:
# Same lookup as in the scikit-learn section; repeated here so the loop below
# has the vectorizer's terms available when labelling the rows of H.
feature_names = vectorizer.get_feature_names_out()

In [19]:
# Each row of H is treated as one topic: list its n_top_words highest-weighted terms.
for topic_idx, topic in enumerate(H):
    print("Topic #%d:" % topic_idx)
    # argsort is ascending: reverse it, then keep the leading n_top_words indices.
    print(" ".join(feature_names[j]
                   for j in topic.argsort()[::-1][:n_top_words]))
    print()

Topic #0:
like ve don got ll know bike sounds didn thing good say right time look way little doing maybe looks

Topic #1:
god jesus bible faith does christian christians christ believe life heaven sin lord church mary religion love human atheism belief

Topic #2:
thanks know does mail advance hi info interested anybody email looking help card appreciated information list send video need reply

Topic #3:
just thought don think sure wrong really heard want wondering listen bad doesn book mean does read way argument work

Topic #4:
people think don did law government israel make rights time said say case state going evidence know point gun mr

Topic #5:
game team year games win play season players nhl toronto division runs flyers goal think player hockey won teams better

Topic #6:
drive car new 00 power 10 price sale drives software hard card speed computer disk used 16 condition old high

Topic #7:
edu soon com send university internet ftp mail mit information article pub cc hope contac