# Import Necessary Packages

In [1]:
import numpy as np
import pandas as pd
import glob
import nltk
import spacy
import gensim
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Read in Syracuse University Commencement Speeches from the Document Repot

In [2]:
path = '/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/*.txt'   

data = []

for f in glob.glob(path):
    r = open(f, "r", encoding = "UTF-8")
    print(r)
    text = r.read() 

    data.append(text)

<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2004_phylicia_rashad.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2016_donald_newhouse.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2015_mary_karr.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2013_nicholas_kristof.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2011_j_craig_venter.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2005_jane_goodall.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2003_

# Tokenize Phrases

In [3]:
def tokenize(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data_words = list(tokenize(data))

print(data_words) #check some of the tokens from the sentences



# Lemmatization

In [4]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

['congratulation happy mother day time be precious short so will get right point ask come so here ask what could say august body that meaningful personal true want offer suggestion good way live world be mother heart mother heart be brave mother heart be keenly intelligent mother heart be resourceful quick skilled action mother heart be flexible mother heart be sustain empower purity  intentiont soul intentiono see family encourage member family  diverse individual personality embrace family whole love respect unyield effort mother heart sacrifice  own pleasure well being family  great wish be would understand take great effort sustain renew commitment would come regard world  inhabitant  family would embrace mother heart may  day be fill brilliant sunrise magnificent sunset may take time regard just mother heart may live constant remembrance gratitude one who create may good fortune always attend  endeavor tonight may throw celebrate celebrate celebrate world that want live need creat

# Create Document-Word Matrix

In [5]:
from sklearn.feature_extraction import text

my_additional_stop_words = frozenset(['ve','don'])
stop_words=text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

vectorizer = CountVectorizer(       
                             stop_words=stop_words, #remove stop words
                             lowercase=True,#convert all words to lowercase
                             ngram_range=(1,2), #utilize uni and bigrams
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)
feature_names=vectorizer.get_feature_names()

# Build LDA model with sklearn

In [6]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=3, #set number of topics
                                      max_iter=10, #set max learning iterations
                                      learning_method='online',   
                                      random_state=0, #set the random state
                                      n_jobs=-1, #use all processors
                                      batch_size=128,
                                      evaluate_every=-1,
                                      total_samples=1000,
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=3, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None, total_samples=1000,
             verbose=0)


# Utilize Grid Search to Find Best Model

In [7]:
## Define Search Param
search_params = {'n_components': [3,4,5,6], 'learning_decay': [.2, .3, .5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(data_vectorized)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_components': [3, 4, 5, 6], 'learning_decay': [0.2, 0.3, 0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [8]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'n_components': 3, 'learning_decay': 0.7}
Best Log Likelihood Score:  -85361.77423120577
Model Perplexity:  10861.659392700554


# View Dominant Topic per Document

In [9]:
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 5), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.00143,0.99716,0.00142,1
Doc1,0.00025,0.9995,0.00025,1
Doc2,0.99968,0.00016,0.00016,0
Doc3,0.99966,0.00017,0.00017,0
Doc4,0.00018,0.00018,0.99964,2
Doc5,0.99972,0.00014,0.00014,0
Doc6,0.99972,0.00014,0.00014,0
Doc7,0.00121,0.00116,0.99763,2
Doc8,0.00017,0.00017,0.99967,2
Doc9,0.0002,0.00019,0.99961,2


# Topic Distribution Across Documents

In [10]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,0,5
1,2,4
2,1,4


# Visualize the LDA model with pyLDAvis

In [11]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

# Show Top 20 Tokens Per Topic

In [12]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 20

topics=display_topics(lda_model, feature_names, no_top_words)

Topic 0:
new, year, change, life, synthetic, cell, people, know, future, energy, genomic, computer, dna, genome, make, world, include, disease, today, just
Topic 1:
say, year, time, good, today, know, make, look, day, life, think, thing, people, way, just, friend, fear, play, feel, want
Topic 2:
people, world, think, life, say, good, time, know, day, year, just, hope, make, work, come, want, thing, today, great, help


In [13]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(data)
words = vectorizer.get_feature_names()
similarity_matrix = cosine_similarity(tfidf)

In [14]:
pd.DataFrame(similarity_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.565308,0.59452,0.562681,0.524732,0.605995,0.581889,0.45788,0.604543,0.564165,0.589296,0.613979,0.380591
1,0.565308,1.0,0.760047,0.735249,0.700376,0.742985,0.763223,0.467204,0.763731,0.732466,0.766106,0.748926,0.522532
2,0.59452,0.760047,1.0,0.74398,0.697208,0.78884,0.763824,0.558464,0.768324,0.782586,0.793119,0.764298,0.516398
3,0.562681,0.735249,0.74398,1.0,0.687701,0.783582,0.747183,0.493209,0.753331,0.760012,0.754109,0.745386,0.519264
4,0.524732,0.700376,0.697208,0.687701,1.0,0.737847,0.779874,0.448934,0.778922,0.727649,0.728258,0.728097,0.600915
5,0.605995,0.742985,0.78884,0.783582,0.737847,1.0,0.818923,0.531658,0.813114,0.825664,0.81092,0.787688,0.572043
6,0.581889,0.763223,0.763824,0.747183,0.779874,0.818923,1.0,0.500075,0.852018,0.811021,0.796368,0.813006,0.641649
7,0.45788,0.467204,0.558464,0.493209,0.448934,0.531658,0.500075,1.0,0.503363,0.548231,0.594711,0.588064,0.2726
8,0.604543,0.763731,0.768324,0.753331,0.778922,0.813114,0.852018,0.503363,1.0,0.794663,0.794306,0.804527,0.650728
9,0.564165,0.732466,0.782586,0.760012,0.727649,0.825664,0.811021,0.548231,0.794663,1.0,0.814784,0.798708,0.560985
