# Exploring documents with NMF-based topic modeling

Adapted from: https://stackabuse.com/python-for-nlp-topic-modeling/

## Load the docs

Document loading is explained in detail in `clustering.ipynb`; here we simply read every file in the data directory into a list of strings.

In [1]:
import os

# Directory holding one text file per document (relative to this notebook).
DATADIR = '../data/DocumentCloud/subset'

def documents(datadir=DATADIR):
    """Yield the text of every file in ``datadir``, one document at a time.

    Parameters
    ----------
    datadir : str
        Directory whose entries are each opened and read as text.
        Defaults to ``DATADIR``.

    Yields
    ------
    str
        The full contents of one file, in sorted filename order.
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and therefore row order downstream) reproducible.
    for fn in sorted(os.listdir(datadir)):
        # Read inside a context manager so the handle is closed right away
        # instead of lingering until garbage collection.
        with open(os.path.join(datadir, fn)) as fh:
            text = fh.read()
        # Yield after the file is closed so a paused generator holds no handle.
        yield text
# Materialise the generator so the corpus can be passed to the vectorizer.
docs = list(documents())

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF weighted document-term matrix from the raw texts.
tfidf_vect = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.8,            # ignore terms found in more than 80% of documents
)
# Learn the vocabulary and vectorise the corpus in a single pass.
doc_term_matrix = tfidf_vect.fit_transform(docs)

In [3]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF matrix into 5 components (topics); the fixed seed
# keeps the factorisation repeatable between runs.
nmf = NMF(
    n_components=5,
    random_state=42,
)
# Left as a bare expression so the cell displays the fitted estimator.
nmf.fit(doc_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [4]:
# Resolve the vocabulary once instead of rebuilding it for every printed word.
# Newer scikit-learn exposes it as get_feature_names_out(); older releases only
# have get_feature_names(), so fall back to that when the new name is missing.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

for i, topic in enumerate(nmf.components_):
    print(f'Topic #{i}:')
    # argsort() is ascending, so the last 10 indices are the 10 largest
    # weights in this component, listed from weakest to strongest.
    # The inner index gets its own name so it no longer shadows the counter.
    print([feature_names[term_idx] for term_idx in topic.argsort()[-10:]])
    print('\n')

Topic #0:
['boardroom', 'considering', 'chairman', 'general', 'secretary', 'authority', 'board', 'transit', 'chicago', 'matters']


Topic #1:
['silva', 'report', 'asked', 'moved', 'peterson', 'chairman', 'agenda', 'board', 'approved', 'motion']


Topic #2:
['room', 'scheduled', 'discuss', 'authority', 'secretary', 'transit', 'review', 'chicago', 'retirement', 'employee']


Topic #3:
['months', 'period', 'agreement', 'number', '00', 'approve', 'contract', 'authorizing', 'ordinance', '000']


Topic #4:
['pm', '2nd', 'retirement', 'business', 'pending', 'adjournment', 'old', '2019', 'bills', 'litigation']


