# Import Necessary Packages

In [1]:
import numpy as np
import pandas as pd
import glob
import nltk
import spacy
import gensim
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Read in Syracuse University Commencement Speeches as Single Text File

In [2]:
path = '/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/all_documents.txt'

def text_to_sentences(text):
    sentences = re.split(r"[.!?]", text)
    sentences = [sentence.strip(" ") for sentence in sentences]
    return sentences

r = open(path, "r", encoding = "UTF-8")

text = r.read()

data=text_to_sentences(text)

data

['"I am proud that a Newhouse graduate was selected student speaker, but Kaitlyn, you sure are a hard act to follow',
 'Thank you, Chancellor Syverud, for your kind words, but I need to correct one thing',
 'A giant I am not',
 'Good about morning, trustees, Deans, members of the faculty, families of the graduates, and an especially good morning to graduates',
 'I grew up in the newspaper business',
 'One of the greats of my newspaper world, the late Salsburger of the New York Times once told me a universal truth which I am happy to share with you today, and I quote, there is no such thing as a too short speech',
 "Happily for you I've taken that dictum to the heart",
 'But this occasion, which I know has special meaning for you, has great meaning for me too',
 '\nI arrived at Syracuse as a freshman in 1947',
 "And while I did not achieve what you have, it took me 69 years to get a degree, I am extremely moved to be here alongside you today and if you will indulge me, I'd like to tell 

In [3]:
path = '/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/*.txt'   

data = []

for f in glob.glob(path):
    r = open(f, "r", encoding = "UTF-8")
    print(r)
    text = r.read() 

    data.append(text)

<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2004_phylicia_rashad.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2016_donald_newhouse.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2015_mary_karr.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2013_nicholas_kristof.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2011_j_craig_venter.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2005_jane_goodall.txt' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='/Users/Ryan/Dropbox/college/7_Semester_V/IST_736/final_project/documents/2003_

# Tokenize Phrases

In [4]:
def tokenize(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data_words = list(tokenize(data))

print(data_words) #check some of the tokens from the senteces



# Lemmatization

In [5]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# Run in terminal: python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

['congratulation happy mother day time be precious short so will get right point ask come so here ask what could say august body that meaningful personal true want offer suggestion good way live world be mother heart mother heart be brave mother heart be keenly intelligent mother heart be resourceful quick skilled action mother heart be flexible mother heart be sustain empower purity  intentiont soul intentiono see family encourage member family  diverse individual personality embrace family whole love respect unyield effort mother heart sacrifice  own pleasure well being family  great wish be would understand take great effort sustain renew commitment would come regard world  inhabitant  family would embrace mother heart may  day be fill brilliant sunrise magnificent sunset may take time regard just mother heart may live constant remembrance gratitude one who create may good fortune always attend  endeavor tonight may throw celebrate celebrate celebrate world that want live need creat

# Create Document-Word Matrix

In [6]:
vectorizer = CountVectorizer(       
                             stop_words='english', #remove stop words
                             lowercase=True,#convert all words to lowercase
                             ngram_range=(1,2), #utilize uni and bigrams
                            )

data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Build LDA model with sklearn

In [7]:
# Build LDA Model
lda_model = LatentDirichletAllocation(n_components=3, #set number of topics
                                      max_iter=10, #set max learning iterations
                                      learning_method='online',   
                                      random_state=0, #set the random state
                                      n_jobs=-1, #use all processors
                                      batch_size=128,
                                      evaluate_every=-1,
                                      total_samples=1000,
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=3, n_jobs=-1, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None, total_samples=1000,
             verbose=0)


# View Dominant Topic per Document

In [8]:
# Create Document - Topic Matrix
lda_output = lda_model.transform(data_vectorized)

# column names
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.0,1.0,0.0,1
Doc1,0.0,0.0,1.0,2
Doc2,1.0,0.0,0.0,0
Doc3,1.0,0.0,0.0,0
Doc4,0.0,0.0,1.0,2
Doc5,1.0,0.0,0.0,0
Doc6,1.0,0.0,0.0,0
Doc7,1.0,0.0,0.0,0
Doc8,0.0,0.0,1.0,2
Doc9,0.0,0.0,1.0,2


# Topic Distribution Across Documents

In [9]:
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,0,6
1,2,5
2,1,2


# Visualize the LDA model with pyLDAvis

In [10]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel