In [1]:
# Clear the interactive namespace before anything else runs
# (-f: do not ask for confirmation, -s: soft reset that keeps the history).
%reset -fs

In [2]:
import pandas as pd
import numpy as np
import string
import re
import spacy
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [3]:
# Custom stop words layered on top of scikit-learn's built-in English list.
# Most extras are ordinary words with a stray leading "n" ('nthe', 'nand', ...);
# presumably these are left over from escaped newlines in the raw text once the
# backslash has been stripped -- TODO confirm against the cleaning step.
# 'applause' is dropped as well.
#
# `frozenset.union` returns a frozenset, but scikit-learn releases that validate
# estimator parameters only accept 'english', a list or None for `stop_words`,
# so the result is converted to a list (sorted to keep it deterministic).
stop_words = sorted(
    text.ENGLISH_STOP_WORDS.union([
        'nof', 'nthe', 'nand', 'nto', 'nin', 'nit', 'nfor', 'na', 'nthat',
        'nbe', 'applause', 'nwhich', 'nis', 'nare', 'ni', 'nnot', 'nby',
        'nhave', 'nbut', 'nwe', 'nwith', 'nfrom', 'nwill', 'nhas', 'nan',
        'nif', 'nour',
    ])
)

In [4]:
# Read the State of the Union corpus from disk and echo the frame as the
# cell's output.
df = pd.read_csv(filepath_or_buffer='sotu_texts.csv')
df

Unnamed: 0,President,Year,Title,Text
0,George Washington,1790,First State of the Union Address,['I embrace with great satisfaction the opport...
1,George Washington,1790,Second State of the Union Address,['Fellow-Citizens of the Senate and the House ...
2,George Washington,1791,Third State of the Union Address,['Fellow-Citizens of the Senate and the House ...
3,George Washington,1792,Fourth State of the Union Address,['Fellow-Citizens of the Senate and of the Hou...
4,George Washington,1793,Fifth State of the Union Address,['Fellow Citizens of the Senate and of the Hou...
...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,"['Madam Speaker, Mr. Vice President, Members o..."


In [5]:
def strip_escaped_newlines(x):
    """Replace every literal backslash-n sequence in ``x`` with a space.

    The ``Text`` preview starts with ``['...``, i.e. the column appears to hold
    the string representation of a list, in which line breaks show up as the
    two characters ``\\`` + ``n``. Without this step the punctuation pass below
    removes only the backslash and leaves the ``n`` glued to the next word.
    """
    return re.sub(r'\\n', ' ', x)


def alphanumeric(x):
    """Replace every token of ``x`` that contains a digit with a single space."""
    # Raw string: '\w' and '\d' are invalid escapes in a plain string literal.
    return re.sub(r'\w*\d\w*', ' ', x)


def punc_lower(x):
    """Lower-case ``x`` and replace each ASCII punctuation character with a space."""
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())


# Overwrites the raw `Text` column with the cleaned text (re-running is a no-op
# on already cleaned text).
df['Text'] = (
    df.Text
    .map(strip_escaped_newlines)
    .map(alphanumeric)
    .map(punc_lower)
)

In [None]:
# Load spaCy's small English pipeline and parse every cleaned address.
nlp = spacy.load("en_core_web_sm")
# `nlp.pipe` yields documents lazily and can be consumed only once. Materialise
# the result so that cells iterating over `docs` can be re-run without silently
# seeing an exhausted generator.
docs = list(nlp.pipe(df.Text))

In [None]:
# One list of lower-cased lemmas per document. Stop words, punctuation and
# number-like tokens are dropped, as are whitespace-only tokens: the cleaning
# step replaces punctuation with spaces, and runs of extra spaces would
# otherwise end up in the token lists as blank "words".
docs_clean = [
    [
        w.lemma_.lower()
        for w in doc
        if not (w.is_stop or w.is_punct or w.like_num or w.is_space)
    ]
    for doc in docs
]
# Keep the token lists next to the source rows.
df['docs_clean'] = docs_clean

In [None]:
# Re-assemble each token list into one space-separated string, the input
# format the vectorizers below are fitted on.
docs_list_clean = list(map(' '.join, docs_clean))


In [None]:
# Bag-of-words counts over unigrams, bigrams and trigrams.
# NOTE(review): min_df=1 / max_df=220 are integers and therefore absolute
# document counts; with the ~219 addresses shown in the preview they do not
# appear to filter anything -- confirm that this is intended.
count_vec = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 3),
    min_df=1,
    max_df=220,
)
X = count_vec.fit_transform(docs_list_clean)

In [None]:
# Truncated SVD of the document-term matrix `X`, keeping `num_topics`
# components.
num_topics = 10
# The default solver is randomized; pin the seed so a Restart & Run All
# reproduces the same components.
topics = TruncatedSVD(n_components=num_topics, random_state=42)
# One row per document, one column per component.
doc_topic = topics.fit_transform(X)

In [None]:
# Component loadings rounded to three decimals: one row per component, one
# column per vocabulary term of `count_vec`.
topic_word = pd.DataFrame(
    data=topics.components_.round(3),
    columns=count_vec.get_feature_names_out(),
)

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every component of ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row must support ``argsort()``.
    feature_names : sequence
        Feature labels, indexed by the positions ``argsort()`` returns.
    no_top_words : int
        How many of the largest-weight features to list per topic.
    topic_names : sequence, optional
        Labels indexed by topic number. A missing sequence or a falsy entry
        falls back to printing the topic's index.
    """
    for ix, weights in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending: reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[i] for i in ranked))

In [None]:
# Ten strongest n-grams per component, labelled with the count vocabulary.
display_topics(
    model=topics,
    feature_names=count_vec.get_feature_names_out(),
    no_top_words=10,
)

In [None]:
# TF-IDF weighting with the same vocabulary settings as the count vectorizer.
# Rebinds `X` to the TF-IDF matrix.
tfidf_vec = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 3),
    min_df=1,
    max_df=220,
)
X = tfidf_vec.fit_transform(docs_list_clean)

In [None]:
# Refit the current `topics` model on the current `X` (rebound to the TF-IDF
# matrix in the preceding cell) and overwrite `doc_topic` with the result.
doc_topic = topics.fit_transform(X)

In [None]:
# Ten strongest n-grams per component, labelled with the TF-IDF vocabulary.
display_topics(
    model=topics,
    feature_names=tfidf_vec.get_feature_names_out(),
    no_top_words=10,
)

In [None]:
# Switch `topics` to an LDA model with the same number of topics.
# LDA fitting is stochastic; pin the seed so the topics are reproducible
# across kernel restarts.
topics = LatentDirichletAllocation(n_components=num_topics, random_state=42)

In [None]:
# Rebind `X` to the raw count matrix (it was last bound to the TF-IDF matrix).
X = count_vec.fit_transform(docs_list_clean)

In [None]:
# Fit the current `topics` model on the current `X` (the count matrix from the
# preceding cell) and overwrite `doc_topic` with the result.
doc_topic = topics.fit_transform(X)

In [None]:
# Ten highest-weighted n-grams per topic, labelled with the count vocabulary.
display_topics(
    model=topics,
    feature_names=count_vec.get_feature_names_out(),
    no_top_words=10,
)

In [None]:
# Rebind `X` to the TF-IDF matrix again.
X = tfidf_vec.fit_transform(docs_list_clean)

In [None]:
# Refit the current `topics` model on the current `X` and overwrite `doc_topic`.
# NOTE(review): if `topics` is still the LDA model and `X` the TF-IDF matrix,
# this feeds real-valued weights to a model usually fitted on term counts --
# confirm that this is intended.
doc_topic = topics.fit_transform(X)

In [None]:
# Ten highest-weighted n-grams per topic, labelled with the TF-IDF vocabulary.
display_topics(
    model=topics,
    feature_names=tfidf_vec.get_feature_names_out(),
    no_top_words=10,
)

In [None]:
# Inputs for CorEx: the document-term count matrix plus its column labels as a
# plain Python list.
doc_word = count_vec.fit_transform(docs_list_clean)
words = list(count_vec.get_feature_names_out())

In [None]:
# Anchored CorEx model: ten hidden topics, each seeded with one of the anchor
# word lists below.
# NOTE(review): 'people' and 'ukraine' each appear twice within a single list,
# and 'congress' / 'people' are shared between lists; kept verbatim here.
topic_model = ct.Corex(n_hidden=10, words=words, seed=1)
topic_model.fit(
    doc_word,
    words=words,
    docs=docs_list_clean,
    anchors=[
        # 0: economy and employment
        ['economy', 'job', 'work', 'program', 'employment'],
        # 1: industry and agriculture
        ['manufacturing', 'production', 'build', 'commerce', 'business',
         'farmer', 'agriculture', 'crop', 'private business', 'farm', 'land'],
        # 2: government and nation
        ['government', 'america', 'american', 'people', 'constitution',
         'country', 'congress', 'united', 'states', 'people', 'nation',
         'federal', 'state'],
        # 3: money and public finance
        ['money', 'expenditure', 'tax', 'fiscal', 'treasury', 'stimulus',
         'income', 'dollar', 'currency', 'tariff', 'debt', 'cent', 'bank',
         'home'],
        # 4: civil rights and immigration
        ['protest', 'race', 'movement', 'strike', 'black', 'indian', 'latino',
         'slave', 'civil', 'right', 'liberty', 'immigration', 'mexico',
         'dreamer', 'citizenship'],
        # 5: foreign affairs, war and defense
        ['germany', 'france', 'spain', 'nato', 'europe', 'hitler', 'japan',
         'japanese', 'nazi', 'putin', 'ukraine', 'korea', 'soviet', 'ukraine',
         'british', 'cuba', 'venezuela', 'iran', 'china', 'russia', 'canada',
         'war', 'fight', 'vietnam', 'alliance', 'adversary', 'enemy',
         'international', 'german', 'navy', 'military', 'troop', 'sea power',
         'admiral', 'general', 'iraqi', 'iraq', 'insurgent', 'terrorist',
         'saddam', 'kuwait', 'syria', 'israel', 'panama', 'army', 'peace',
         'afghanistan', 'al qaeda', 'taliban', 'islamic', 'defense', 'treaty'],
        # 6: energy, science and health
        ['energy', 'atomic', 'renewable', 'oil', 'research', 'science',
         'innovation', 'cyber', 'space', 'moon', 'rocket', 'cancer', 'health',
         'industrial', 'covid', 'pandemic'],
        # 7: law and order
        ['law', 'order', 'court', 'crime', 'gang', 'police', 'safe', 'gun',
         'sedition', 'communist', 'communism'],
        # 8: legislation
        ['legislation', 'congress', 'action', 'service', 'provide'],
        # 9: rhetoric of purpose and hardship
        ['people', 'great', 'new', 'opportunity', 'providence', 'sacrifice',
         'favor', 'fate', 'purpose', 'hardship', 'endure', 'overcome'],
    ],
    anchor_strength=2,
)

In [None]:
# Print the words of every CorEx topic, one comma-separated line per topic.
# NOTE: this rebinds `topics`, which earlier cells use for the fitted
# SVD / LDA model, to the list returned by `get_topics()`.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # The word is the first field of each entry. Taking it per entry, rather
    # than unpacking `zip(*topic)` into exactly three names, keeps an empty
    # topic (or entries with a different number of fields) from raising.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

In [None]:
# Per-document CorEx predictions, one column per topic ('topic0' .. 'topic9'),
# echoed as the cell's output.
predictions = pd.DataFrame(
    data=topic_model.predict(doc_word),
    columns=[f'topic{i}' for i in range(10)],
)
predictions