In [1]:
%reset -fs

In [2]:
import pandas as pd
import numpy as np
import string
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [3]:
df = pd.read_csv('sotu_texts.csv')
df

Unnamed: 0,President,Year,Title,Text
0,George Washington,1790,First State of the Union Address,['I embrace with great satisfaction the opport...
1,George Washington,1790,Second State of the Union Address,['Fellow-Citizens of the Senate and the House ...
2,George Washington,1791,Third State of the Union Address,['Fellow-Citizens of the Senate and the House ...
3,George Washington,1792,Fourth State of the Union Address,['Fellow-Citizens of the Senate and of the Hou...
4,George Washington,1793,Fifth State of the Union Address,['Fellow Citizens of the Senate and of the Hou...
...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,"['Madam Speaker, Mr. Vice President, Members o..."


In [4]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

df['Text'] = df.Text.map(alphanumeric).map(punc_lower)

In [5]:
nlp = spacy.load("en_core_web_sm")
docs = nlp.pipe(df.Text)

In [6]:
docs_clean = [[w.lemma_.lower() for w in doc if (not w.is_stop and not w.is_punct and not w.like_num)] for doc in docs]
df['docs_clean'] = docs_clean

In [7]:
docs_list_clean = [' '.join(doc) for doc in docs_clean]


In [8]:
count_vec = CountVectorizer(stop_words='english')
X = count_vec.fit_transform(docs_list_clean)

In [9]:
num_topics = 10
topics = TruncatedSVD(num_topics)
doc_topic = topics.fit_transform(X)

In [10]:
topic_word = pd.DataFrame(topics.components_.round(3),
             columns = count_vec.get_feature_names_out())

In [11]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [12]:
display_topics(topics, count_vec.get_feature_names_out(), 10)


Topic  0
government, year, congress, united, states, country, state, great, law, people

Topic  1
program, year, world, new, work, need, help, america, nation, federal

Topic  2
program, dollar, year, fiscal, united, war, expenditure, policy, administration, states

Topic  3
man, law, court, service, business, department, dollar, legislation, national, need

Topic  4
war, dollar, man, expenditure, power, people, great, peace, public, state

Topic  5
nation, administration, state, policy, man, energy, effort, continue, program, power

Topic  6
mexico, united, war, states, american, texas, mexican, man, peace, army

Topic  7
mexico, country, nof, nthe, texas, mexican, nto, nand, army, public

Topic  8
state, dollar, constitution, american, government, program, department, business, canal, united

Topic  9
world, government, nof, nthe, american, free, shall, nand, nto, great


In [13]:
tfidf_vec = TfidfVectorizer(stop_words='english')
X = tfidf_vec.fit_transform(docs_list_clean)

In [14]:
doc_topic = topics.fit_transform(X)

In [15]:
display_topics(topics, tfidf_vec.get_feature_names_out(), 10)


Topic  0
government, year, congress, united, states, country, great, people, state, nation

Topic  1
america, world, job, program, help, tonight, americans, people, new, year

Topic  2
nof, nthe, nand, nto, nin, nit, nfor, na, nthat, nbe

Topic  3
nof, nthe, nand, tonight, applause, nto, america, job, ve, child

Topic  4
year, cent, department, government, secretary, report, silver, fiscal, work, legislation

Topic  5
man, applause, war, fight, enemy, world, japanese, great, corporation, people

Topic  6
applause, soviet, united, states, mexico, year, world, free, iraq, congress

Topic  7
applause, public, ve, iraq, federal, terrorist, iraqi, bank, recovery, economic

Topic  8
government, state, people, constitution, mexico, texas, power, bank, duty, union

Topic  9
states, united, spain, treaty, gentleman, government, commissioner, island, article, nation


In [16]:
topics = LatentDirichletAllocation(n_components=num_topics)

In [17]:
X = count_vec.fit_transform(docs_list_clean)

In [18]:
doc_topic = topics.fit_transform(X)

In [19]:
display_topics(topics, count_vec.get_feature_names_out(), 10)


Topic  0
man, law, nation, government, people, good, great, state, case, american

Topic  1
year, nation, world, people, congress, new, government, great, america, program

Topic  2
year, program, world, government, nation, federal, economic, increase, need, war

Topic  3
government, great, law, country, man, congress, people, public, work, nation

Topic  4
mexico, government, states, united, war, texas, country, congress, mexican, territory

Topic  5
armistice, canada, detroit, impressment, hull, brigadier, recompense, insufficiency, non, capricious

Topic  6
states, united, great, spain, government, power, treaty, law, vessel, article

Topic  7
state, government, united, public, states, congress, great, power, country, war

Topic  8
government, year, states, united, congress, country, law, state, great, people

Topic  9
year, america, people, work, american, new, world, job, country, help


In [20]:
X = tfidf_vec.fit_transform(docs_list_clean)

In [21]:
doc_topic = topics.fit_transform(X)

In [22]:
display_topics(topics, tfidf_vec.get_feature_names_out(), 10)


Topic  0
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  1
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  2
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  3
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  4
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  5
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  6
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  7
otto, kenton, cj, seong, licentiousness, indispensible, defence, vigilence, chearful, oeconomy

Topic  8
stacy, kayla, holet, rebecca, elyria, marcher, kent, allentown, scorekeeper, shouting

Topic  9
government, year, congress, united, states, people, country, great, nation, state


In [23]:
doc_word = count_vec.fit_transform(docs_list_clean)
words = list(np.asarray(count_vec.get_feature_names_out()))

In [24]:
topic_model = ct.Corex(n_hidden=10, words=words, seed=1)
topic_model.fit(doc_word, words=words, docs=docs_list_clean)



<corextopic.corextopic.Corex at 0x16685f8b0>

In [25]:
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

0: postmaster,mail,commercial,commissioner,navy,furnish,suggestion,mexico,refer,judgment
1: program,today,job,intercourse,vessel,budget,economic,goal,object,disposition
2: worker,start,student,cut,middle,big,problem,americans,fight,dollar
3: agricultural,advise,volume,enactment,limitation,requirement,revision,amend,chinese,arbitration
4: nand,nwith,nwe,nwhich,nis,nwill,nby,nbe,nhave,nit
5: association,response,wide,safeguard,outside,unit,aside,fear,item,corporation
6: charge,situate,accept,manufacturer,violate,ecuador,trial,premium,naturalization,spare
7: rule,vote,fall,invest,file,rival,opposition,pursuit,sign,criminal
8: minded,physically,forefather,crossing,bespeak,null,seriousness,devoutly,marketing,deference
9: human,science,tremendous,amazing,interim,spread,overwhelming,deliver,set,archive


In [26]:
topic_model.get_top_docs(topic=4, n_docs=2)

[('   madam speaker   madam vice president   lady gentleman   member congress cabinet   justice supreme court   fellow americans   year covid    keep apart   year finally   tonight   meet democrats republican independent   importantly americans   duty american people constitution   unwavering resolve freedom triumph tyranny   day ago   russia vladimir putin seek shake foundation free world thinking bend menacing way   badly miscalculate   think roll ukraine world roll   instead meet wall strength imagine   meet ukrainian people   president zelenskyy ukrainian   fearlessness   courage   determination   inspire world   group citizen block tank body   student retiree teacher turn soldier defend homeland   struggle president zelenskyy say speech european parliament light win darkness ukrainian ambassador united states tonight   let tonight chamber send unmistakable signal ukraine world   rise able   yes   united states america stand ukrainian people   history learn lesson dictator pay pric

In [27]:
predictions = pd.DataFrame(topic_model.predict(doc_word), columns=['topic'+str(i) for i in range(10)])
predictions

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,False,True,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
215,False,True,True,False,False,False,False,False,True,True
216,False,True,True,False,False,False,False,False,False,False
217,False,True,True,False,True,True,False,True,True,True
218,False,True,True,False,True,False,False,False,True,True
