# Exploring documents with LDA-based topic modeling

Adapted from: https://stackabuse.com/python-for-nlp-topic-modeling/

## Load the docs

Document loading is detailed in `clustering.ipynb`. Here we simply read every file in the data directory into a list of strings.

In [4]:
import os

DATADIR = '../data/DocumentCloud/subset'

def documents(datadir=DATADIR):
    """Yield the text of each regular file found directly inside ``datadir``.

    Parameters
    ----------
    datadir : str
        Directory to read from. Defaults to ``DATADIR``.

    Yields
    ------
    str
        The full contents of one file, read in text mode with the
        platform's default encoding.
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and therefore the matrix row order) stable across runs.
    for fn in sorted(os.listdir(datadir)):
        path = os.path.join(datadir, fn)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
        if not os.path.isfile(path):
            continue
        # Read inside a context manager so the handle is closed before we
        # yield, instead of leaking one open file per document.
        with open(path) as fh:
            text = fh.read()
        yield text
# Materialise the generator so the corpus can be reused by later cells.
docs = list(documents())

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts for the corpus.
count_vect = CountVectorizer(
    max_df=0.8,            # drop terms that occur in more than 80% of documents
    min_df=2,              # drop terms that occur in fewer than 2 documents
    stop_words='english',  # use scikit-learn's built-in English stop-word list
)
doc_term_matrix = count_vect.fit_transform(docs)
# Bare expression: display the sparse matrix summary as the cell output.
doc_term_matrix

<380x3400 sparse matrix of type '<class 'numpy.int64'>'
	with 38817 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model on the term counts; random_state is pinned so the
# fit is repeatable for the same input matrix.
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

# Look the vocabulary up once, rather than rebuilding it for every word printed.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = list(count_vect.get_feature_names_out())
else:
    feature_names = count_vect.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f'Topic #{i}:')
    # argsort() is ascending, so the last 10 indices are the 10 highest-weighted
    # terms for this topic, with the strongest term printed last.
    print([feature_names[idx] for idx in topic.argsort()[-10:]])
    print('\n')

Topic #0:
['service', 'vs', '2017', 'mr', 'complaints', '2018', 'bus', 'ada', 'ms', 'cta']


Topic #1:
['approve', 'regular', 'report', 'matters', 'authority', 'motion', 'chairman', 'transit', 'chicago', 'board']


Topic #2:
['stations', 'facilitator', 'complaints', 'work', 'members', 'cta', '2017', 'ms', '2016', 'station']


Topic #3:
['planning', 'delivery', 'strategic', '2018', 'approval', '2019', 'service', 'agenda', 'business', 'minutes']


Topic #4:
['agreement', 'review', 'finance', 'motion', 'number', 'authorizing', 'contract', 'ordinance', '000', '00']


