In [1]:
%reset -fs

In [2]:
import pandas as pd
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
import spacy
from spacy import displacy


In [3]:
df = pd.read_csv('sotu_texts.csv')
df

Unnamed: 0,President,Year,Title,Text
0,George Washington,1790,First State of the Union Address,['I embrace with great satisfaction the opport...
1,George Washington,1790,Second State of the Union Address,['Fellow-Citizens of the Senate and the House ...
2,George Washington,1791,Third State of the Union Address,['Fellow-Citizens of the Senate and the House ...
3,George Washington,1792,Fourth State of the Union Address,['Fellow-Citizens of the Senate and of the Hou...
4,George Washington,1793,Fifth State of the Union Address,['Fellow Citizens of the Senate and of the Hou...
...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,"['Madam Speaker, Mr. Vice President, Members o..."


In [4]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

df['Text'] = df.Text.map(alphanumeric).map(punc_lower)
df

Unnamed: 0,President,Year,Title,Text
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...
...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...


In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [7]:
df['Lemma_Text'] = df.Text.apply(lemmatize_text)

In [8]:
df.Lemma_Text[0]

['i',
 'embrace',
 'with',
 'great',
 'satisfaction',
 'the',
 'opportunity',
 'which',
 'now',
 'present',
 'itself',
 'of',
 'congratulating',
 'you',
 'on',
 'the',
 'present',
 'favourable',
 'prospect',
 'of',
 'our',
 'public',
 'affair',
 'the',
 'recent',
 'accession',
 'of',
 'the',
 'important',
 'state',
 'of',
 'north',
 'carolina',
 'to',
 'the',
 'constitution',
 'of',
 'the',
 'united',
 'state',
 'of',
 'which',
 'official',
 'information',
 'ha',
 'been',
 'received',
 '—',
 'the',
 'rising',
 'credit',
 'and',
 'respectability',
 'of',
 'our',
 'country',
 '—',
 'the',
 'general',
 'and',
 'increasing',
 'good',
 'will',
 'towards',
 'the',
 'government',
 'of',
 'the',
 'union',
 '—',
 'and',
 'the',
 'concord',
 'peace',
 'and',
 'plenty',
 'with',
 'which',
 'we',
 'are',
 'blessed',
 'are',
 'circumstance',
 'auspicious',
 'in',
 'an',
 'eminent',
 'degree',
 'to',
 'our',
 'national',
 'prosperity',
 'in',
 'resuming',
 'your',
 'consultation',
 'for',
 'the',
 '

In [9]:
df

Unnamed: 0,President,Year,Title,Text,Lemma_Text
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...,"[i, embrace, with, great, satisfaction, the, o..."
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h..."
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h..."
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th..."
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th..."
...,...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...,"[madam, speaker, mr, vice, president, member, ..."


In [10]:
stop_words = stopwords.words('english')

In [11]:
df['text_no_stopwords'] = df['Lemma_Text'].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))
df

Unnamed: 0,President,Year,Title,Text,Lemma_Text,text_no_stopwords
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...,"[i, embrace, with, great, satisfaction, the, o...",embrace great satisfaction opportunity present...
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...",fellow citizen senate house representative mee...
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...",fellow citizen senate house representative mee...
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...",fellow citizen senate house representative aba...
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...",fellow citizen senate house representative sin...
...,...,...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...",mr speaker mr vice president member congress f...
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...",mr speaker mr vice president member congress f...
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...",mr speaker mr vice president member congress f...
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...,"[madam, speaker, mr, vice, president, member, ...",madam speaker mr vice president member congres...


In [12]:
df['text_no_stopwords'] = [word_tokenize(i) for i in df.text_no_stopwords]

In [13]:
df

Unnamed: 0,President,Year,Title,Text,Lemma_Text,text_no_stopwords
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...,"[i, embrace, with, great, satisfaction, the, o...","[embrace, great, satisfaction, opportunity, pr..."
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...","[fellow, citizen, senate, house, representativ..."
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...","[fellow, citizen, senate, house, representativ..."
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...","[fellow, citizen, senate, house, representativ..."
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...","[fellow, citizen, senate, house, representativ..."
...,...,...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...,"[madam, speaker, mr, vice, president, member, ...","[madam, speaker, mr, vice, president, member, ..."


In [14]:
df['tagged_text'] = df.text_no_stopwords.apply(lambda x: pos_tag(x))
df

Unnamed: 0,President,Year,Title,Text,Lemma_Text,text_no_stopwords,tagged_text
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...,"[i, embrace, with, great, satisfaction, the, o...","[embrace, great, satisfaction, opportunity, pr...","[(embrace, NN), (great, JJ), (satisfaction, NN..."
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
...,...,...,...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con...","[(mr, NN), (speaker, NN), (mr, JJ), (vice, NN)..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con...","[(mr, NN), (speaker, NN), (mr, JJ), (vice, NN)..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con...","[(mr, NN), (speaker, NN), (mr, JJ), (vice, NN)..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...,"[madam, speaker, mr, vice, president, member, ...","[madam, speaker, mr, vice, president, member, ...","[(madam, NN), (speaker, NN), (mr, JJ), (vice, ..."


In [15]:
nlp = spacy.load("en_core_web_sm")
docs = nlp.pipe(df.Text)
df

Unnamed: 0,President,Year,Title,Text,Lemma_Text,text_no_stopwords,tagged_text
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...,"[i, embrace, with, great, satisfaction, the, o...","[embrace, great, satisfaction, opportunity, pr...","[(embrace, NN), (great, JJ), (satisfaction, NN..."
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...","[fellow, citizen, senate, house, representativ...","[(fellow, JJ), (citizen, NN), (senate, NN), (h..."
...,...,...,...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con...","[(mr, NN), (speaker, NN), (mr, JJ), (vice, NN)..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con...","[(mr, NN), (speaker, NN), (mr, JJ), (vice, NN)..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...","[mr, speaker, mr, vice, president, member, con...","[(mr, NN), (speaker, NN), (mr, JJ), (vice, NN)..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...,"[madam, speaker, mr, vice, president, member, ...","[madam, speaker, mr, vice, president, member, ...","[(madam, NN), (speaker, NN), (mr, JJ), (vice, ..."


In [16]:
docs_clean = [[w.lemma_.lower() for w in doc if (not w.is_stop and not w.is_punct and not w.like_num)] for doc in docs]
df['docs_clean'] = docs_clean

In [17]:
docs_list_clean = [' '.join(doc) for doc in docs_clean]
docs_list_clean[0]

'   embrace great satisfaction opportunity   present   congratulate present favourable prospect public affair   recent accession important state north carolina constitution united states   official information receive rise credit respectability country general increase good government union concord   peace plenty   bless   circumstance   auspicious eminent degree national prosperity      resume consultation general good   derive encouragement reflection measure session satisfactory constituent   novelty difficulty work allow hope       expectation   secure blessing gracious providence place reach   course present important session   cool deliberate exertion patriotism   firmness wisdom      interesting object   engage attention   provide common defence merit particular regard prepare war effectual mean preserve peace     free people ought armed   discipline end uniform digest plan requisite   safety interest require promote manufactory   tend render independent   essential   particular

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

count_vec = CountVectorizer(stop_words='english')
tfidf_vec = TfidfVectorizer(stop_words='english')
X = count_vec.fit_transform(docs_list_clean)

df_X = pd.DataFrame(X.toarray(), columns=count_vec.get_feature_names_out())
df_X.head()

Unnamed: 0,aa,aaa,aana,aar,aaron,abandon,abandoning,abandonment,abate,abatement,...,zimbabwe,zimbabwean,zinc,zion,zollverein,zone,zoological,zuloaga,ôtil,ѕў
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
from sklearn.decomposition import TruncatedSVD, NMF

num_topics = 10
topics = TruncatedSVD(num_topics)
#topics = NMF(num_topics)
doc_topic = topics.fit_transform(X)
#topics.explained_variance_ratio_

In [20]:
topic_word = pd.DataFrame(topics.components_.round(3),
             columns = count_vec.get_feature_names_out())
topic_word

Unnamed: 0,aa,aaa,aana,aar,aaron,abandon,abandoning,abandonment,abate,abatement,...,zimbabwe,zimbabwean,zinc,zion,zollverein,zone,zoological,zuloaga,ôtil,ѕў
0,0.0,0.0,0.0,0.0,0.0,0.007,0.0,0.001,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,0.0
1,-0.0,0.0,-0.0,0.0,0.0,-0.003,0.0,0.0,0.0,-0.0,...,0.001,0.001,-0.0,0.0,-0.0,0.001,-0.0,-0.0,-0.0,0.0
2,0.0,0.001,0.0,-0.0,-0.0,0.001,-0.0,0.001,0.001,0.0,...,0.001,0.001,0.0,-0.0,0.0,-0.001,-0.0,0.0,-0.0,0.0
3,-0.0,0.001,0.0,0.0,-0.001,-0.008,0.0,0.001,-0.0,-0.0,...,-0.0,0.0,-0.0,-0.0,-0.0,0.007,0.001,-0.001,-0.0,0.0
4,0.0,0.001,-0.0,0.0,-0.0,-0.003,0.0,-0.002,-0.001,-0.0,...,-0.002,-0.001,0.0,-0.0,0.0,-0.005,-0.001,0.001,0.0,0.0
5,0.0,-0.001,-0.0,0.0,-0.001,0.005,0.0,0.002,0.001,-0.0,...,0.003,0.001,-0.0,-0.0,0.0,-0.005,-0.0,0.0,0.0,0.0
6,0.0,0.0,-0.0,-0.0,-0.0,0.009,0.0,-0.001,0.0,-0.0,...,-0.0,-0.0,0.0,-0.0,0.0,0.009,0.0,-0.001,-0.0,-0.0
7,0.0,-0.0,-0.0,0.0,0.0,0.007,0.0,-0.0,-0.001,-0.0,...,0.0,0.0,-0.0,0.0,0.0,0.007,0.002,-0.0,0.0,-0.0
8,-0.0,0.001,-0.001,-0.0,0.0,-0.006,-0.0,-0.003,-0.001,-0.0,...,0.0,0.0,-0.0,0.0,-0.0,0.015,0.002,0.002,-0.0,0.0
9,0.0,-0.0,-0.0,0.0,-0.001,-0.004,-0.0,-0.006,0.002,0.0,...,-0.002,-0.001,0.0,-0.0,0.0,0.011,0.001,0.0,-0.0,0.0


In [21]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [22]:
display_topics(topics, count_vec.get_feature_names_out(), 10)


Topic  0
government, year, congress, united, states, country, state, great, law, people

Topic  1
program, year, world, new, work, need, help, america, nation, federal

Topic  2
program, dollar, year, fiscal, united, war, expenditure, policy, administration, states

Topic  3
man, law, court, service, business, department, dollar, legislation, national, need

Topic  4
war, dollar, man, expenditure, power, people, great, peace, public, state

Topic  5
nation, administration, state, policy, man, energy, effort, continue, program, power

Topic  6
mexico, united, war, states, american, texas, mexican, man, peace, army

Topic  7
mexico, country, nof, nthe, texas, mexican, nto, nand, army, public

Topic  8
state, dollar, constitution, american, government, department, program, business, canal, united

Topic  9
world, government, nof, nthe, american, free, shall, nand, nto, great
