In [1]:
# Path to dataset
dataset_folder = 'C:/Users/aaady/Desktop/Mira/datasets/us_presidential_speeches/democrat/Barack Obama' # FILL OUT FIRST!!!

In [2]:
import os
import io
import pandas as pd

In [3]:
# Function to run through the dataset folder and add all text to one variable
def get_all_text_files(folder):
    
    files = os.listdir(folder)
    text = ""
    
    for path in files:
        file_path = os.path.join(folder, path)
        file_read = io.open(file_path, mode="r", encoding="utf-8")
        file_text = file_read.read()
        text += file_text
    
    return text

In [4]:
all_text = get_all_text_files(dataset_folder)

In [5]:
speech_dict = {
    'obama': all_text
}

data_df = pd.DataFrame.from_dict(speech_dict, orient='index')
data_df.columns = ['transcript']
data_df.head()

Unnamed: 0,transcript
obama,"Thank you, everybody. Thank you. (Applause.) T..."


In [6]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\([^()]*\)', '', text) # Removes anything between square and regular brackets
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [7]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
obama,thank you everybody thank you thank you so mu...


In [8]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [9]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
obama,thank you everybody thank you thank you so mu...


In [10]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,abandon,abandoned,abandonment,abbas,abbottabad,abdallahs,abernathy,aberration,abess,abhor,...,zakatlikewise,zardari,zero,zigs,zika,zimbabweans,zimmerman,zionist,zip,zones
obama,6,1,1,3,1,1,1,1,1,1,...,1,1,2,1,1,1,2,2,1,1


In [23]:
data = data_dtm.transpose()

In [24]:
data.head()

Unnamed: 0,obama
abandon,6
abandoned,1
abandonment,1
abbas,3
abbottabad,1


In [25]:
# Find the top 30 words said by each president
top_dict = {}
for c in data.columns:
    top = data[c].sort_values(ascending=False).head(30)
    top_dict[c]= list(zip(top.index, top.values))

top_dict['obama']

[('people', 854),
 ('thats', 648),
 ('just', 530),
 ('america', 517),
 ('new', 497),
 ('make', 477),
 ('american', 450),
 ('know', 442),
 ('time', 430),
 ('years', 418),
 ('work', 412),
 ('world', 386),
 ('country', 376),
 ('right', 361),
 ('americans', 352),
 ('want', 351),
 ('like', 349),
 ('weve', 343),
 ('jobs', 341),
 ('going', 316),
 ('economy', 309),
 ('need', 296),
 ('im', 293),
 ('dont', 279),
 ('states', 277),
 ('think', 275),
 ('united', 253),
 ('future', 251),
 ('care', 248),
 ('year', 247)]

# Add Donald Trump speeches to the pandas dataframe: