In [None]:
# Install packages
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.0-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
Collecting funcy
  Downloading funcy-1.18-py2.py3-none-any.whl (33 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.18 pyLDAvis-3.4.0


In [None]:
# Import plotting packages
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# Import other packages
from nltk.tokenize import RegexpTokenizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from gensim.models import CoherenceModel
import re
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  from collections import Mapping, defaultdict
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [None]:
# Read the data file into Google Colab.
# Only run this cell when the notebook is running on Google Colab; it imports
# the Colab-specific `google.colab` package.
from google.colab import files
# The return value of the upload call is kept in `uploades`; no other cell
# visible in this notebook reads it (the data is loaded from disk instead).
uploades = files.upload()

Saving state_ofthe_union_texts.csv to state_ofthe_union_texts.csv


In [None]:
# Load in State of the Union address data
import pandas as pd
# Read in data
# NOTE: despite its name, `url` holds a local file path, not a web address.
# Presumably this is where the Colab upload cell saved the CSV; adjust the
# path when running outside Colab.
url = "/content/state_ofthe_union_texts.csv"
# Later cells read the speech text from the 'Text' column of this frame.
df = pd.read_csv(url)

In [None]:
# Create a WordCloud to see possible stop words

# See reference [1]

# Remove basic punctuation (, . ! ?) and convert each speech to lowercase in
# a single pass. The pattern is a raw string: inside a character class "."
# needs no backslash, and the earlier non-raw '\.' was an invalid string
# escape that newer Python versions warn about.
df['Text_processed'] = \
df['Text'].map(lambda speech: re.sub(r'[,.!?]', '', speech).lower())

# Join all processed speeches into one long string for the word cloud
long_string = ','.join(df['Text_processed'])

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(long_string)

# Visualize the word cloud (last expression, so the notebook displays the image)
wordcloud.to_image()

In [None]:
# See reference [2]
# Tokenize
tokenizer = RegexpTokenizer(r'\w+') # Separate the sentence into words without punctuation

# Use English stop words from the stopwords package
stop_words = stopwords.words('english')
# Append stop words that are specific to the State of the Union Addresses
stop_words.extend(['shall', 'applause', 'nation', 'year', 'must', 'world', 
                   'people', 'america', 'american', 'us', 'congress','u', 'y'])
# Set copy of the stop words so each per-token membership test is a hash
# lookup instead of a scan of the whole list
stop_word_set = set(stop_words)

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# Model settings
NUM_TOPICS = 3
NUM_PASSES = 100
# Fixed seed: LDA training is stochastic, so without it the topics (and the
# coherence score computed from them) change on every run.
RANDOM_STATE = 42


def preprocess_speech(speech):
    """Turn one speech string into a list of stemmed, stop-word-free tokens.

    The speech is lowercased, split into word tokens with `tokenizer`,
    filtered against `stop_word_set`, and each remaining token is stemmed
    with `p_stemmer`.
    """
    tokens = tokenizer.tokenize(speech.lower())
    return [p_stemmer.stem(token) for token in tokens if token not in stop_word_set]


# Topic Modeling for all SOTU addresses (SOTU addresses as a whole)

# Compile speech documents into a list
doc_set = df['Text'].values.tolist()

# Clean, tokenize, and stem every speech
texts = [preprocess_speech(speech) for speech in doc_set]


# Turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# Filter extremes
# No tokens appearing in less than 5 speeches, no tokens in above 50% of speeches
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)

# Convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# Generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=NUM_TOPICS,
                                           id2word=dictionary,
                                           passes=NUM_PASSES,
                                           random_state=RANDOM_STATE)

# Print topics from the model
for idx, topic in ldamodel.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

In [None]:
# Coherence is reported once for the model over the entire corpus,
# not separately for each topic of each speech.

# Set up the c_v coherence evaluation from the fitted LDA model and the
# tokenized speeches
coherence_model_lda = CoherenceModel(
    coherence='c_v',
    texts=texts,
    model=ldamodel,
)

# Compute the score and report it
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.5303897251003954


In [None]:
# Interactive topic visualization

# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()

# Prepare the visualization data from the fitted model, the bag-of-words
# corpus, and the id <-> term dictionary
vis = pyLDAvis.gensim.prepare(
    topic_model=ldamodel,
    corpus=corpus,
    dictionary=dictionary,
)

# Last expression of the cell, so the notebook renders it
vis

  default_term_info = default_term_info.sort_values(


## References
[1] S. Kapadia. "Topic Modeling in Python: Latent Dirichlet Allocation (LDA)." Medium. https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0 (retrieved Feb. 19, 2023).

[2] J. Barber. "Latent Dirichlet Allocation (LDA) with Python." AmazonAWS. https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html (retrieved Feb. 19, 2023).