In [1]:
%reset -fs

In [2]:
import pandas as pd
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [3]:
df = pd.read_csv('sotu_texts.csv')
df

Unnamed: 0,President,Year,Title,Text
0,George Washington,1790,First State of the Union Address,['I embrace with great satisfaction the opport...
1,George Washington,1790,Second State of the Union Address,['Fellow-Citizens of the Senate and the House ...
2,George Washington,1791,Third State of the Union Address,['Fellow-Citizens of the Senate and the House ...
3,George Washington,1792,Fourth State of the Union Address,['Fellow-Citizens of the Senate and of the Hou...
4,George Washington,1793,Fifth State of the Union Address,['Fellow Citizens of the Senate and of the Hou...
...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,"['Mr. Speaker, Mr. Vice President, Members of ..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,"['Madam Speaker, Mr. Vice President, Members o..."


In [4]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

df['Text'] = df.Text.map(alphanumeric).map(punc_lower)
df

Unnamed: 0,President,Year,Title,Text
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...
...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...


In [5]:
lemmatizer = WordNetLemmatizer()

In [6]:
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in word_tokenize(text)]

In [7]:
df['Lemma_Text'] = df.Text.apply(lemmatize_text)

In [8]:
df.Lemma_Text[0]

['i',
 'embrace',
 'with',
 'great',
 'satisfaction',
 'the',
 'opportunity',
 'which',
 'now',
 'present',
 'itself',
 'of',
 'congratulating',
 'you',
 'on',
 'the',
 'present',
 'favourable',
 'prospect',
 'of',
 'our',
 'public',
 'affair',
 'the',
 'recent',
 'accession',
 'of',
 'the',
 'important',
 'state',
 'of',
 'north',
 'carolina',
 'to',
 'the',
 'constitution',
 'of',
 'the',
 'united',
 'state',
 'of',
 'which',
 'official',
 'information',
 'ha',
 'been',
 'received',
 '—',
 'the',
 'rising',
 'credit',
 'and',
 'respectability',
 'of',
 'our',
 'country',
 '—',
 'the',
 'general',
 'and',
 'increasing',
 'good',
 'will',
 'towards',
 'the',
 'government',
 'of',
 'the',
 'union',
 '—',
 'and',
 'the',
 'concord',
 'peace',
 'and',
 'plenty',
 'with',
 'which',
 'we',
 'are',
 'blessed',
 'are',
 'circumstance',
 'auspicious',
 'in',
 'an',
 'eminent',
 'degree',
 'to',
 'our',
 'national',
 'prosperity',
 'in',
 'resuming',
 'your',
 'consultation',
 'for',
 'the',
 '

In [9]:
df

Unnamed: 0,President,Year,Title,Text,Lemma_Text
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...,"[i, embrace, with, great, satisfaction, the, o..."
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h..."
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h..."
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th..."
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th..."
...,...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,..."
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,..."
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,..."
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...,"[madam, speaker, mr, vice, president, member, ..."


In [10]:
stop_words = stopwords.words('english')

In [11]:
df['text_no_stopwords'] = df['Lemma_Text'].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))
df

Unnamed: 0,President,Year,Title,Text,Lemma_Text,text_no_stopwords
0,George Washington,1790,First State of the Union Address,i embrace with great satisfaction the opport...,"[i, embrace, with, great, satisfaction, the, o...",embrace great satisfaction opportunity present...
1,George Washington,1790,Second State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...",fellow citizen senate house representative mee...
2,George Washington,1791,Third State of the Union Address,fellow citizens of the senate and the house ...,"[fellow, citizen, of, the, senate, and, the, h...",fellow citizen senate house representative mee...
3,George Washington,1792,Fourth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...",fellow citizen senate house representative aba...
4,George Washington,1793,Fifth State of the Union Address,fellow citizens of the senate and of the hou...,"[fellow, citizen, of, the, senate, and, of, th...",fellow citizen senate house representative sin...
...,...,...,...,...,...,...
215,Barack Obama,2016,Barack Obama's Eighth State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...",mr speaker mr vice president member congress f...
216,Donald Trump,2017,Donald Trump's First State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...",mr speaker mr vice president member congress f...
217,Donald Trump,2018,Donald Trump's Second State of the Union Address,mr speaker mr vice president members of ...,"[mr, speaker, mr, vice, president, member, of,...",mr speaker mr vice president member congress f...
218,Donald Trump,2019,Donald Trump's Third State of the Union Address,madam speaker mr vice president members o...,"[madam, speaker, mr, vice, president, member, ...",madam speaker mr vice president member congres...
