In [2]:
# Plotting tools
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Other tools
from nltk.tokenize import RegexpTokenizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from gensim.models import CoherenceModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
Collecting funcy
  Downloading funcy-1.18-py2.py3-none-any.whl (33 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.18 pyLDAvis-3.4.0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  from collections import Mapping, defaultdict
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [4]:
# Load in state of the union address data
import pandas as pd
# Read in data
url = "/content/state_ofthe_union_texts.csv"
df = pd.read_csv(url)

# Subset df to modern era
modern = df[(df["Year"] >1945) & (df["Year"] < 2020)]

In [5]:
# Tokenize 

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
stop_words = stopwords.words('english')
stop_words.extend(['shall', 'applause', 'nation', 'year', 'must', 'world', 'people', 'america', 'american', 'us', 'congress','u', 'y'])
#stop_words.extend(['applause','year', 'u', 'y', 'shall'])
#stop_words.extend(['applause','year', 'u', 'y', 'per', 'cent', 'nof', 'nto', 'nthe', 'nand', 'nin', 'ni', 'nit', 'na'])
#stop_words.extend(['applause','year', 'u', 'y', 'per', 'cent', 'tonight', 'today'])

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# compile sample documents into a list
# doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]
doc_set = modern['Text'].values.tolist()

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stop_words]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)


# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# filter extremes
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, 
                                           num_topics=3, 
                                           id2word = dictionary, 
                                           passes=100)

for idx, topic in ldamodel.print_topics():
  print("Topic: {} \nWords: {}".format(idx, topic ))
  print("\n")
# print(ldamodel.print_topics(num_topics=10, num_words=10))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

Topic: 0 
Words: 0.006*"terrorist" + 0.006*"parent" + 0.005*"iraq" + 0.004*"compani" + 0.004*"republican" + 0.004*"terror" + 0.004*"medicar" + 0.003*"someth" + 0.003*"class" + 0.003*"immigr"


Topic: 1 
Words: 0.006*"communist" + 0.004*"oil" + 0.003*"vietnam" + 0.003*"1980" + 0.003*"strateg" + 0.003*"demonstr" + 0.003*"transport" + 0.003*"space" + 0.003*"object" + 0.002*"agricultur"


Topic: 2 
Words: 0.016*"expenditur" + 0.009*"estim" + 0.009*"1947" + 0.009*"agricultur" + 0.007*"appropri" + 0.007*"1946" + 0.007*"veteran" + 0.006*"adequ" + 0.006*"1945" + 0.005*"enterpris"




In [6]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldamodel, texts=texts, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.3758757723690527


In [7]:
# Compute Perplexity
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt


Perplexity:  -7.518868732121896


  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)


In [8]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
vis

  default_term_info = default_term_info.sort_values(
