# Read in Syracuse University Commencement speeches from document repot

In [8]:
import numpy as np
import pandas as pd
import re, nltk, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
import glob #module used to easily read in documents

path = '/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/*.txt'   

corpus = []

for f in glob.glob(path):
    r = open(f, "r", encoding = "UTF-8")
    print(r)
    text = r.read() 
    corpus.append(text)

<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2004_phylicia_rashad.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2016_donald_newhouse.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2015_mary_karr.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2013_nicholas_kristof.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2011_j_craig_venter.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2005_jane_goodall.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2003_

# Vectorize the documents

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text 

my_additional_stop_words = frozenset(['ve',]) #add in additional stop words to the vanilla dictionary

# NMF is able to use tf-idf
#tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
#tfidf = tfidf_vectorizer.fit_transform(documents)
#tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
vectorizer = CountVectorizer(min_df=2, stop_words=text.ENGLISH_STOP_WORDS.union(my_additional_stop_words), lowercase=True, ngram_range=(1,2))
data_vectorized = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()

# Then call the LDA algorithm to fit a topic model, and transform all documents to their topic distrinbutions

In [28]:
no_topics = 5

# Run NMF
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
lda_output = lda_model.fit_transform(data_vectorized)
#lda_z = lda.fit_transform(data_vectorized)

In [29]:
lda_output

array([[  1.74546655e-03,   1.74671681e-03,   1.74547547e-03,
          9.93015841e-01,   1.74649985e-03],
       [  3.54994360e-04,   3.55322315e-04,   3.55361122e-04,
          9.98578845e-01,   3.55476877e-04],
       [  2.50109557e-04,   2.50377339e-04,   2.50122125e-04,
          9.98999046e-01,   2.50344930e-04],
       [  2.33152294e-04,   2.33234862e-04,   2.33175231e-04,
          9.99067012e-01,   2.33425957e-04],
       [  2.67399080e-04,   2.67772423e-04,   2.67450282e-04,
          9.98929784e-01,   2.67594226e-04],
       [  1.91719201e-04,   1.91881857e-04,   1.91786840e-04,
          9.99232745e-01,   1.91867535e-04],
       [  1.93193204e-04,   1.93522658e-04,   1.93327840e-04,
          9.99226543e-01,   1.93413525e-04],
       [  1.16290456e-03,   1.16475170e-03,   1.16464211e-03,
          9.95342936e-01,   1.16476519e-03],
       [  2.59382383e-04,   2.59555573e-04,   2.59483740e-04,
          9.98962183e-01,   2.59394952e-04],
       [  2.37470434e-04,   2.3775824

In [27]:
# Create Document - Topic Matrix
lda_output = lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_topics)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    color = 'green' if val > .1 else 'black'
    return 'color: {col}'.format(col=color)

def make_bold(val):
    weight = 700 if val > .1 else 400
    return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

TypeError: 'NoneType' object cannot be interpreted as an integer

In [42]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 20

In [43]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
people life come going years know love million country good america try 11 doing courage thing lives thank need said
Topic 1:
hope family syracuse human know work parents look maybe america life change lives good happy make university need today help
Topic 2:
syracuse friends home don today love said wasn place future hope wish people students let feel year ll times good
Topic 3:
mother heart syracuse family live great work good life years asked education people wish know long congratulations came happy ll
Topic 4:
people life know today years going hope good said don make syracuse things work little right year thank human love


In [44]:
print(lda_z.shape)
print(lda_z[0])

(13, 5)
[ 0.00429435  0.00429837  0.00429642  0.98278714  0.00432372]


In [45]:
doc_topic_dist = lda.transform(tf)
doc_topic_dist

array([[  4.29434750e-03,   4.29837443e-03,   4.29642234e-03,
          9.82787138e-01,   4.32371739e-03],
       [  1.16068435e-03,   1.16295759e-03,   1.16271744e-03,
          1.17292429e-03,   9.95340716e-01],
       [  8.35796029e-04,   8.35582542e-04,   8.37851976e-04,
          8.41533591e-04,   9.96649236e-01],
       [  7.81124074e-04,   7.81802852e-04,   7.81564724e-04,
          7.84786886e-04,   9.96870721e-01],
       [  1.08213875e-03,   1.07982661e-03,   1.08116378e-03,
          1.08385655e-03,   9.95673014e-01],
       [  6.30201467e-04,   6.30836260e-04,   6.31533258e-04,
          6.32499521e-04,   9.97474929e-01],
       [  7.33648763e-04,   7.33062324e-04,   7.33414707e-04,
          7.35449920e-04,   9.97064424e-01],
       [  2.39306798e-03,   2.38624379e-03,   2.39669682e-03,
          2.39162751e-03,   9.90432364e-01],
       [  1.20833085e-03,   1.21238231e-03,   1.20997661e-03,
          1.21600166e-03,   9.95153309e-01],
       [  7.86331630e-04,   7.8389250

In [30]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel