# Dependencies

This notebook depends on:
* Scikit Learn
* pyLDAvis

In [6]:
try:
    from xml.etree.cElementTree import XML
except ImportError:
    print('secondary')
    from xml.etree.ElementTree import XML
import zipfile

In [13]:
document = zipfile.ZipFile('../data/Hearing-Day-2.docx')
xml_content = document.read('word/document.xml')
document.close()
tree = XML(xml_content)
paragraphs = []

namespace = {'w': "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
text_elements = tree.findall('.//w:t', namespace)
for t_element in text_elements:
    paragraphs.append(''.join(t_element.text))
    print(t_element.text)
    
# WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
# PARA = WORD_NAMESPACE + 'p'
# TEXT = WORD_NAMESPACE + 't'

# for paragraph in tree.getiterator(PARA):
#     texts = [node.text
#              for node in paragraph.getiterator(TEXT)
#              if node.text]
#     if texts:
#         paragraphs.append(''.join(texts))

Oct 13, 2020
Amy Coney Barrett Senate Confirmation Hearing Day 2 Transcript
Day 2 of the Supreme Court confirmation hearing of Amy Coney Barrett took place before the Senate on October 13. Barrett answered questions about Roe v. Wade, the Affordable Care Act, gun regulation, and more. Read the transcript of the full hearing with Barrett’s testimony here.
Chairman Lindsey Graham: (
01:43
)
[crosstalk 00:01:43] … 30 minute period today, then come back Wednesday and finish up. Then we’ll go on about our business. 
So
 I will try, I’ll make sure I stay within 30 minutes for sure, and if I can shorten it up, I will. 
So
 let’s get to it. You can start the clock. 
So
 you can relax a bit here, judge, and take your mask off. So yesterday we had a lot of the discussion about the Affordable Health Care Act. What I’m going to try to do very briefly this morning is to demonstrate the difference between politics and judging. 
All of
 my colleagues on the other side had very emotional pleas about O

In [14]:
print(paragraphs)



In [15]:
import re

topicDict = {}
text = ''
topicId = ''
for para in paragraphs:
    m = re.match('(^.*\))(.*)', para)
    if m is not None:
        topicDict[m.group(1)] = m.group(2)

In [16]:
print(len(topicDict.values()))
print(len(topicDict.keys()))

3
3


In [19]:
%%time

from sklearn.feature_extraction.text import CountVectorizer


# Using no stop words gives useless results in this case
# tf_vectorizer = CountVectorizer(min_df=2, stop_words=None)

tf_vectorizer = CountVectorizer(min_df=2, stop_words='english')
# tf_vectorizer = CountVectorizer(stop_words='english')
# tf_vectorizer = CountVectorizer(min_df=2, ngram_range=(1,2), stop_words='english')

default_tf = tf_vectorizer.fit_transform(paragraphs)
tf_feature_names = tf_vectorizer.get_feature_names_out()
# Materialize the sparse data
dense = default_tf.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Data Sparsicity: ", ((dense > 0).sum()/dense.size)*100, "%")

print(len(tf_vectorizer.stop_words_))

Data Sparsicity:  0.15364782546418626 %
2414
CPU times: user 139 ms, sys: 130 ms, total: 268 ms
Wall time: 290 ms


In [20]:
from sklearn.decomposition import LatentDirichletAllocation

# Run LDA
%time lda = LatentDirichletAllocation(n_components=15, max_iter=10, learning_method='online', learning_offset=50.,random_state=0).fit(default_tf)

print("\nLog Likelihood: %d ## Perplexity: %d\n" % ((lda.score(default_tf)), lda.perplexity(default_tf)))

# Print the top 5 words in each topic
num_words = 5
print('LDA Topics: ')
for topic_idx, topic in enumerate(lda.components_):
    print ("Topic %d:" % (topic_idx), " ".join([tf_feature_names[i] for i in topic.argsort()[:-num_words - 1:-1]]))


CPU times: user 6.02 s, sys: 0 ns, total: 6.02 s
Wall time: 6.52 s

Log Likelihood: -353775 ## Perplexity: 1264

LDA Topics: 
Topic 0: president kennedy 25 trump 11
Topic 1: 02 01 13 24 43
Topic 2: americans people state million healthcare
Topic 3: think court case justice said
Topic 4: booker 23 cory 08 38
Topic 5: amy barrett coney judge act
Topic 6: 00 hawley letter josh school
Topic 7: 04 10 40 07 28
Topic 8: election wrote 50 words sasse
Topic 9: senator john feinstein 20 harris
Topic 10: 05 47 true nfib ernst
Topic 11: 03 16 blumenthal richard sen
Topic 12: graham lindsey chairman 22 32
Topic 13: amendment colleagues thank judge like
Topic 14: precedent court cases supreme law


In [21]:
topic_labels = []
for topic_idx, topic in enumerate(lda.components_):
    label_string = "Topic "+str(topic_idx)+ ": "+" ".join([tf_feature_names[i] for i in topic.argsort()[:-1 - 1:-1]])
    topic_labels.append(label_string)
    print(label_string)


Topic 0: president
Topic 1: 02
Topic 2: americans
Topic 3: think
Topic 4: booker
Topic 5: amy
Topic 6: 00
Topic 7: 04
Topic 8: election
Topic 9: senator
Topic 10: 05
Topic 11: 03
Topic 12: graham
Topic 13: amendment
Topic 14: precedent


In [23]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Collecting numexpr (from pyLDAvis)
  Downloading numexpr-2.8.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting gensim (from pyLDAvis)
  Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.3 kB)
Collecting smart-open>=1.8.1 (from gensim->pyLDAvis)
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Downloading gensim-4.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading 

In [24]:
import pyLDAvis

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, default_tf, tf_vectorizer, mds='tsne')
pyLDAvis.display(panel)


ModuleNotFoundError: No module named 'pyLDAvis.sklearn'

In [10]:
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# pipeline = Pipeline([
#     ('vect', CountVectorizer()),
#     ('lda', LatentDirichletAllocation())
# ])

# # this is where you define the values for
# # GridSearchCV to iterate over
# param_grid = {
#     'vect__max_df':[0.8,0.9,1.0],
#     'lda__n_components':[5,10,15]
# }

# # do 3-fold cross validation for each of the 6 possible
# # combinations of the parameter values above
# grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
# grid.fit(topicDict.values())

# Define Search Param
search_params = {'n_components': [5, 10, 15], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=10, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class

# add njobs=-1 to make this multithreaded.
model = GridSearchCV(lda, n_jobs=-1, param_grid=search_params)
# Do the Grid Search
model.fit(default_tf)


# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(default_tf))



Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -178758.18807989755
Model Perplexity:  4061.4089494332416
