# Dependencies

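This notebook depends on the following third-party packages:
* scikit-learn
* pyLDAvis

It also expects the Word document `Hearing-Day-2.docx` in the working directory.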

In [1]:
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile

In [2]:
# A .docx file is a zip archive; the body text lives in word/document.xml.
# The context manager closes the archive even if the read fails.
with zipfile.ZipFile('Hearing-Day-2.docx') as document:
    xml_content = document.read('word/document.xml')
tree = XML(xml_content)

# Namespace-qualified tag names for paragraph (<w:p>) and text-run (<w:t>) elements.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'

# Collect one string per non-empty paragraph by concatenating its text runs.
# Element.iter() replaces getiterator(), which was removed in Python 3.9.
paragraphs = []
for paragraph in tree.iter(PARA):
    texts = [node.text
             for node in paragraph.iter(TEXT)
             if node.text]
    if texts:
        paragraphs.append(''.join(texts))

In [3]:
# Show every extracted paragraph string for a quick visual check of the parse.
print(paragraphs)



In [4]:
import re

# Split each paragraph into a key and a body:
#   group(1) = everything from the start of the paragraph up to and including
#              the LAST ')' on its first line (the '.*' is greedy),
#   group(2) = the rest of that line.
# Paragraphs that contain no ')' are skipped.  If two paragraphs produce the
# same key, the later one overwrites the earlier one.
# NOTE(review): if the key was meant to end at the FIRST ')', the pattern
# needs a non-greedy '.*?' -- confirm against the document layout.
topicDict = {}
for para in paragraphs:
    # Raw string: '\)' in a normal string literal is an invalid escape sequence.
    m = re.match(r'(^.*\))(.*)', para)
    if m is not None:
        topicDict[m.group(1)] = m.group(2)

In [5]:
# Sanity check: report how many bodies and how many keys were collected.
n_values = len(topicDict.values())
n_keys = len(topicDict.keys())
print(n_values, n_keys, sep='\n')

1647
1647


In [11]:
%%time

from sklearn.feature_extraction.text import CountVectorizer


# Using no stop words gives useless results in this case
# tf_vectorizer = CountVectorizer(min_df=2, stop_words=None)

tf_vectorizer = CountVectorizer(min_df=2, stop_words='english')
# tf_vectorizer = CountVectorizer(stop_words='english')
# tf_vectorizer = CountVectorizer(min_df=2, ngram_range=(1,2), stop_words='english')

default_tf = tf_vectorizer.fit_transform(topicDict.values())
tf_feature_names = tf_vectorizer.get_feature_names()
# Materialize the sparse data
dense = default_tf.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Data Sparsicity: ", ((dense > 0).sum()/dense.size)*100, "%")

print(len(tf_vectorizer.stop_words_))

Data Sparsicity:  0.6223847167541827 %
2473
Wall time: 217 ms


In [12]:
from sklearn.decomposition import LatentDirichletAllocation

# Run LDA
%time lda = LatentDirichletAllocation(n_components=15, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(default_tf)

print("\nLog Likelihood: %d ## Perplexity: %d\n" % ((lda.score(default_tf)), lda.perplexity(default_tf)))

# Print the top 5 words in each topic
num_words = 5
print('LDA Topics: ')
for topic_idx, topic in enumerate(lda.components_):
    print ("Topic %d:" % (topic_idx), " ".join([tf_feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]))


Wall time: 6.59 s

Log Likelihood: -301453 ## Perplexity: 1633

LDA Topics: 
Topic 0: abortion pain fifth talk month
Topic 1: think justice case said court
Topic 2: based race employee lateral employment
Topic 3: court president care people supreme
Topic 4: senator state rights want religious
Topic 5: medicare wedding wonderful sold interfaith
Topic 6: yes correct appeared chosen university
Topic 7: english mean interrupting button created
Topic 8: serve refuse couple yep wedding
Topic 9: women woman pro choice right
Topic 10: heller ambiguous construct defiance lying
Topic 11: challenged cert renewable oil epa
Topic 12: right amendment case court okay
Topic 13: daughter dame notre court vivian
Topic 14: sorry jimmy look hosanna tabor


In [13]:
# Build one label per topic of the form "Topic <index>: <highest-weighted word>"
# and echo each label as it is created.
topic_labels = []
for topic_idx, topic in enumerate(lda.components_):
    best_word_ids = topic.argsort()[::-1][:1]
    best_word = " ".join(tf_feature_names[i] for i in best_word_ids)
    label_string = "Topic %d: %s" % (topic_idx, best_word)
    topic_labels.append(label_string)
    print(label_string)


Topic 0: abortion
Topic 1: think
Topic 2: based
Topic 3: court
Topic 4: senator
Topic 5: medicare
Topic 6: yes
Topic 7: english
Topic 8: serve
Topic 9: women
Topic 10: heller
Topic 11: challenged
Topic 12: right
Topic 13: daughter
Topic 14: sorry


In [14]:
import pyLDAvis
# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new module first and fall back to the old one.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

# Render the interactive topic visualisation inline in the notebook.
pyLDAvis.enable_notebook()
panel = pyldavis_sklearn.prepare(lda, default_tf, tf_vectorizer, mds='tsne')
pyLDAvis.display(panel)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [10]:
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Disabled experiment: grid search over a CountVectorizer + LDA pipeline.
# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('lda', LatentDirichletAllocation())
# ])

# # this is where you define the values for
# # GridSearchCV to iterate over
# param_grid = {
#     'vect__max_df':[0.8,0.9,1.0],
#     'lda__n_components':[5,10,15]
# }

# # do 3-fold cross validation for each of the 9 possible
# # combinations of the parameter values above
# grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
# grid.fit(topicDict.values())

# Define Search Param
search_params = {'n_components': [5, 10, 15], 'learning_decay': [.5, .7, .9]}
# Init the Model.
# It gets its own name on purpose: rebinding `lda` here would replace the
# fitted model from the earlier LDA cell with a fresh estimator, breaking
# the topic-label and pyLDAvis cells if they are re-run after this one.
lda_search_base = LatentDirichletAllocation(max_iter=10, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class

# n_jobs=-1 runs the candidate fits in parallel on all available CPU cores.
model = GridSearchCV(lda_search_base, n_jobs=-1, param_grid=search_params)
# Do the Grid Search
model.fit(default_tf)


# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(default_tf))



Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -178758.18807989755
Model Perplexity:  4061.4089494332416
