# Set up

In [1]:
# data processing
import pandas as pd

# project-local module that provides the `clean` class (used below as data_processing_oop.clean)
import data_processing_oop

# lemmatisation
from nltk import WordNetLemmatizer

# stopwords
from nltk.corpus import stopwords

#read/write Excel files
import openpyxl

# Data processing

In [2]:
# Load the speeches workbook; the first spreadsheet column is used as the row index.
df_speech = pd.read_excel(
    io="output/obama_speeches_dataframe.xlsx",
    index_col=0,
)
df_speech.head()

Unnamed: 0,date,title,content
0,20 Jan 2009,First Presidential Inaugural Addre,[Chief Justice John G. \r\n\t\tRoberts adminis...
1,24 Jan 2009,>American Rhetoric: Barack Obama: First Presid...,: First Presidential Weekly Address \r\n(01-24...
2,26 Jan 2009,Al-Arabiya Television Intervi,- Al-Arabiya Television InterviewBarackObamaAl...
3,04 Feb 2009,Remarks on Executive Compensati,- Remarks on Executive CompensationBarackObama...
4,09 Feb 2009,First Presidential Prime Time Press \r\nConferen,"Good \r\n\t\tevening, everybody. Please be sea..."


In [3]:
# Remove the redundant boilerplate surrounding the first two speeches.
def _trim_between(text, start_marker=None, end_marker=None):
    """Return the part of ``text`` after ``start_marker`` and before ``end_marker``.

    Parameters
    ----------
    text : str
        The string to trim.
    start_marker : str, optional
        Everything up to and including the first occurrence is dropped.
    end_marker : str, optional
        Everything from the first occurrence onwards is dropped.

    Returns
    -------
    str
        The trimmed text. A marker that does not occur leaves its side of the
        text untouched, so applying the function to already-trimmed text is a
        no-op.
    """
    start = 0
    if start_marker is not None:
        found = text.find(start_marker)
        # str.find returns -1 when the marker is absent; using that value
        # directly as a slice bound would silently cut the wrong characters.
        if found != -1:
            start = found + len(start_marker)

    end = len(text)
    if end_marker is not None:
        found = text.find(end_marker)
        if found != -1:
            end = found

    return text[start:end]


# Positional columns: 1 is the title and 2 is the speech text (per the frame preview).
# Row 0, text: keep what lies between the salutation and the trailing source credit.
df_speech.iloc[0, 2] = _trim_between(
    df_speech.iloc[0, 2], "My fellow citizens:", "(Drudge Report)"
)
# Row 1, text: drop everything up to and including the delivery note.
df_speech.iloc[1, 2] = _trim_between(
    df_speech.iloc[1, 2], "[as prepared for delivery]"
)
# Row 1, title: drop the site prefix and the trailing date fragment, then trim whitespace.
df_speech.iloc[1, 1] = _trim_between(
    df_speech.iloc[1, 1], ">American Rhetoric: Barack Obama:", "(01-24-0"
).strip()

# Clean data

In [4]:
# Run every raw speech through `clean.clean_text` from data_processing_oop.
# Per the author's notes for this step, that routine is expected to:
#   - split the text into tokens
#   - strip punctuation
#   - lower-case the tokens
#   - drop stopwords
#   - lemmatize
# (its implementation lives in the project module and is not shown here)

clean_content = list(
    map(data_processing_oop.clean.clean_text, df_speech["content"].values.tolist())
)
df_speech["content_clean"] = clean_content
df_speech.head()

Unnamed: 0,date,title,content,content_clean
0,20 Jan 2009,First Presidential Inaugural Addre,I stand here today humbled \r\n\t\tby the task...,stand today humble task u grateful trust besto...
1,24 Jan 2009,First Presidential Weekly Address,We begin this year and this \r\n\t\tAdministra...,begin year administration midst unprecedented ...
2,26 Jan 2009,Al-Arabiya Television Intervi,- Al-Arabiya Television InterviewBarackObamaAl...,television television interview hisham melhemd...
3,04 Feb 2009,Remarks on Executive Compensati,- Remarks on Executive CompensationBarackObama...,remark executive compensationbarackobamaremark...
4,09 Feb 2009,First Presidential Prime Time Press \r\nConferen,"Good \r\n\t\tevening, everybody. Please be sea...",good even everybody please take question tonig...


In [5]:
# Reduce each cleaned speech with `clean.nouns_extract` and store the result
# in a new column next to the cleaned text.
nouns = list(map(data_processing_oop.clean.nouns_extract, clean_content))
df_speech["content_nouns"] = nouns
df_speech.head()

Unnamed: 0,date,title,content,content_clean,content_nouns
0,20 Jan 2009,First Presidential Inaugural Addre,I stand here today humbled \r\n\t\tby the task...,stand today humble task u grateful trust besto...,stand today task u trust bestow sacrifice bear...
1,24 Jan 2009,First Presidential Weekly Address,We begin this year and this \r\n\t\tAdministra...,begin year administration midst unprecedented ...,year administration midst crisis call action w...
2,26 Jan 2009,Al-Arabiya Television Intervi,- Al-Arabiya Television InterviewBarackObamaAl...,television television interview hisham melhemd...,television television interview hisham house w...
3,04 Feb 2009,Remarks on Executive Compensati,- Remarks on Executive CompensationBarackObama...,remark executive compensationbarackobamaremark...,remark executive compensationbarackobamaremark...
4,09 Feb 2009,First Presidential Prime Time Press \r\nConferen,"Good \r\n\t\tevening, everybody. Please be sea...",good even everybody please take question tonig...,question tonight speak briefly state economy n...


# Save data

In [6]:
# Select the noun-only column for export.
content_nouns = df_speech.loc[:, 'content_nouns']

In [7]:
# Save the noun-only speeches as a UTF-8 text file, writing a line break
# after each speech to keep the speeches separate.
with open("output/cleaned_nouns.txt", "w", encoding="utf-8") as nouns_file:
    # The `with` block closes the handle on exit, so no explicit close() is needed.
    nouns_file.writelines("%s\n" % text for text in content_nouns)

In [8]:
# Select the cleaned-text column for export.
content_clean = df_speech.loc[:, 'content_clean']

In [9]:
# Save the cleaned speeches as a UTF-8 text file, writing a line break
# after each speech to keep the speeches separate.
with open("output/cleaned_speeches.txt", "w", encoding="utf-8") as speeches_file:
    # The `with` block closes the handle on exit, so no explicit close() is needed.
    speeches_file.writelines("%s\n" % text for text in content_clean)

In [10]:
# Export the full frame (original columns plus the derived ones) to an Excel workbook.
df_speech.to_excel(excel_writer="output/obama_speeches_clean.xlsx")