# Exploring documents with LDA-based topic modeling

Adapted from: https://stackabuse.com/python-for-nlp-topic-modeling/

## Load the docs

Document loading is detailed in `clustering.ipynb`

In [12]:
import os
import time
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()

DATADIR = '../data/DocumentCloud/subset'

# The visualization section below can be very slow. Not recommended for larger data sets, but tweaking some
# of these values might be helpful
N_TOPICS = 5
TERMS_PER_TOPIC = 30
NGRAM_RANGE = (1,3)
LAMBDA_STEP = 0.05

def documents(datadir=DATADIR):
    for fn in os.listdir(datadir):
        yield open(os.path.join(datadir, fn)).read()
docs = [doc for doc in documents()]

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english', ngram_range=NGRAM_RANGE)
doc_term_matrix = count_vect.fit_transform(docs)
doc_term_matrix

<380x16454 sparse matrix of type '<class 'numpy.int64'>'
	with 124359 stored elements in Compressed Sparse Row format>

In [14]:
doc_term_matrix.indices

array([10288,  7482,  3545, ...,  5781,  4989,  2498], dtype=int32)

In [15]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=N_TOPICS, random_state=42)
LDA.fit(doc_term_matrix)

for i,topic in enumerate(LDA.components_):
    print(f'Topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-TERMS_PER_TOPIC:]])
    print('\n')

Topic #0:
['budget', '000 00 contract', 'adjourn', 'ordinance authorizing', 'review', 'authorizing', 'finance audit', 'business', 'audit', 'items', '00 contract number', '00 contract', '2017', 'approval', 'chairman', '2018', 'ordinance', 'finance', '2019', 'board', '000 00', 'approved', '000', 'agenda', 'minutes', 'contract number', 'motion', 'number', 'contract', '00']


Topic #1:
['work', 'motion', '2016', 'rail', 'information', 'customer', 'elevator', 'stated', 'facilitator', 'stations', 'serpe', 'minutes', 'asked', '2019', 'fuller', 'service', 'chairman', 'new', 'people', 'members', 'vs', 'ada', '2018', 'station', '2017', 'complaints', 'bus', 'cta', 'mr', 'ms']


Topic #2:
['longhini assistant', 'offices', 'longhini', 'gregory longhini', 'assistant secretary', 'gregory', 'west', '567', 'assistant', 'west lake', '567 west lake', '567 west', 'time', 'second', 'notice', 'regular', 'business', 'general', 'chicago transit board', 'chairman', 'transit board', 'secretary', 'transit author

In [16]:
start = time.time()
viz = pyLDAvis.sklearn.prepare(LDA, doc_term_matrix, count_vect, R=TERMS_PER_TOPIC, lambda_step=LAMBDA_STEP, sort_topics=False)
print('Visualization prepared in: ', round(time.time() - start), 'seconds')
viz

Visualization prepared in:  178 seconds


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
