##  Tweets From Obama, Trump, and All U.S. Senators

#### Importing Libraries 

In [1]:
import numpy as np
import pandas as pd 
import re

#### Reading in CSV & making initial data frames

In [2]:
obama_data = pd.read_csv("./twitter-ratio/BarackObama_e.csv")
df_obama = pd.DataFrame(obama_data)

trump_data = pd.read_csv("./twitter-ratio/realDonaldTrump_e.csv")
df_trump = pd.DataFrame(trump_data)

senator_data = pd.read_csv("./twitter-ratio/senators_e.csv")
df_senator = pd.DataFrame(senator_data)

##### Initial Look at Obama's Tweets

In [6]:
print(df_obama.shape,'\n')
print('\n',df_obama.info())
df_obama.head()

(3207, 7) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3207 entries, 0 to 3206
Data columns (total 7 columns):
created_at    3207 non-null object
text          3207 non-null object
url           3207 non-null object
replies       3207 non-null int64
retweets      3207 non-null int64
favorites     3207 non-null int64
user          3207 non-null object
dtypes: int64(3), object(4)
memory usage: 175.5+ KB

 None


Unnamed: 0,created_at,text,url,replies,retweets,favorites,user
0,10/16/17 22:59,I'm grateful to @SenJohnMcCain for his lifetim...,https://twitter.com/BarackObama/status/9200615...,17064,89916,641842,BarackObama
1,10/2/17 12:41,Michelle &amp; I are praying for the victims i...,https://twitter.com/BarackObama/status/9148326...,21588,405895,1715753,BarackObama
2,9/30/17 2:22,Proud to cheer on Team USA at the Invictus Gam...,https://twitter.com/BarackObama/status/9139520...,8476,60651,503255,BarackObama
3,9/26/17 17:03,We're expanding our efforts to help Puerto Ric...,https://twitter.com/BarackObama/status/9127244...,10657,113807,334901,BarackObama
4,9/25/17 23:36,"Prosecutor, soldier, family man, citizen. Beau...",https://twitter.com/BarackObama/status/9124608...,3886,58449,328106,BarackObama


##### Initial Look at Trumps's Tweets

In [7]:
print(df_trump.shape,'\n')
print('\n',df_trump.info())
df_trump.head()

(3232, 7) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3232 entries, 0 to 3231
Data columns (total 7 columns):
created_at    3232 non-null object
text          3232 non-null object
url           3232 non-null object
replies       3232 non-null int64
retweets      3232 non-null int64
favorites     3232 non-null int64
user          3232 non-null object
dtypes: int64(3), object(4)
memory usage: 176.8+ KB

 None


Unnamed: 0,created_at,text,url,replies,retweets,favorites,user
0,10/23/17 12:30,I had a very respectful conversation with the ...,https://twitter.com/realDonaldTrump/status/922...,46228,10243,49468,realDonaldTrump
1,10/23/17 11:53,Two dozen NFL players continue to kneel during...,https://twitter.com/realDonaldTrump/status/922...,31419,14006,62406,realDonaldTrump
2,10/23/17 11:42,There will be NO change to your 401(k). This h...,https://twitter.com/realDonaldTrump/status/922...,9552,13719,62662,realDonaldTrump
3,10/22/17 12:08,It is finally sinking through. 46% OF PEOPLE B...,https://twitter.com/realDonaldTrump/status/922...,56238,25102,112890,realDonaldTrump
4,10/22/17 12:02,Wacky Congresswoman Wilson is the gift that ke...,https://twitter.com/realDonaldTrump/status/922...,32136,21573,97145,realDonaldTrump


##### Initial Look @ Sentaor Tweets

In [8]:
print(df_senator.shape,'\n')
print('\n',df_senator.info())
df_senator.head()

(288615, 10) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288615 entries, 0 to 288614
Data columns (total 10 columns):
created_at     288615 non-null object
text           288615 non-null object
url            288615 non-null object
replies        288615 non-null int64
retweets       288615 non-null int64
favorites      288615 non-null int64
user           288615 non-null object
bioguide_id    288615 non-null object
party          288615 non-null object
state          288615 non-null object
dtypes: int64(3), object(7)
memory usage: 22.0+ MB

 None


Unnamed: 0,created_at,text,url,replies,retweets,favorites,user,bioguide_id,party,state
0,10/19/17 21:47,We released bipartisan healthcare bill today &...,https://twitter.com/amyklobuchar/status/921130...,21,129,533,amyklobuchar,K000367,D,MN
1,10/19/17 18:48,I spoke with @Morning_Joe team abt #HonestAds ...,https://twitter.com/amyklobuchar/status/921085...,8,46,150,amyklobuchar,K000367,D,MN
2,10/19/17 18:14,Lots of interest in my bill with Senators Warn...,https://twitter.com/amyklobuchar/status/921077...,36,227,932,amyklobuchar,K000367,D,MN
3,10/19/17 18:04,"Today's the day @MarkWarner, @SenJohnMcCain &a...",https://twitter.com/amyklobuchar/status/921074...,17,167,550,amyklobuchar,K000367,D,MN
4,10/19/17 16:33,".@MarkWarner, @SenJohnMcCain &amp; I put toget...",https://twitter.com/amyklobuchar/status/921051...,31,279,893,amyklobuchar,K000367,D,MN


##### Initial Data Cleaning

In [9]:
#'bioguide_id' is an official ID, assigned by Congress, that works as a unique identifier 
#'user' field is unique for each senator; dropping bioguide_id so it's easier to merge data frames 

df_senator.drop(columns = 'bioguide_id', # column I want to drop
                axis = 1, # specifying column
                inplace = True) # will ensure drop sticks 

In [10]:
# Adding 'party' feature to df_obama and populating with 'D' for Democrat 
df_obama['party'] = 'D'

# Adding 'party' feature to df_trump and populating with 'R' for Republican 
df_trump['party'] = 'R'

# Adding 'state' feature to df_trump & df_obama. State will be equal to 'US'
df_obama['state'] = 'US'
df_trump['state'] = 'US'

# Sanity Check 
print("Columns for Obama df:", df_obama.columns)
print("\nColumns for Trump df:", df_trump.columns)
print("\nColumns for Senators df:", df_senator.columns)

Columns for Obama df: Index(['created_at', 'text', 'url', 'replies', 'retweets', 'favorites', 'user',
       'party', 'state'],
      dtype='object')

Columns for Trump df: Index(['created_at', 'text', 'url', 'replies', 'retweets', 'favorites', 'user',
       'party', 'state'],
      dtype='object')

Columns for Senators df: Index(['created_at', 'text', 'url', 'replies', 'retweets', 'favorites', 'user',
       'party', 'state'],
      dtype='object')


##### Creating Master Data Frame

In [11]:
# creating variable called 'frames', which is a list of the 3 dataframes 
frames = [df_obama, df_senator, df_trump]

# converting 'frames' to single dataframe
df = pd.concat(frames)
type(df)

# resetting index
df.reset_index(inplace = True)

print(df.shape)
df.head(3)

(295054, 10)


Unnamed: 0,index,created_at,text,url,replies,retweets,favorites,user,party,state
0,0,10/16/17 22:59,I'm grateful to @SenJohnMcCain for his lifetim...,https://twitter.com/BarackObama/status/9200615...,17064,89916,641842,BarackObama,D,US
1,1,10/2/17 12:41,Michelle &amp; I are praying for the victims i...,https://twitter.com/BarackObama/status/9148326...,21588,405895,1715753,BarackObama,D,US
2,2,9/30/17 2:22,Proud to cheer on Team USA at the Invictus Gam...,https://twitter.com/BarackObama/status/9139520...,8476,60651,503255,BarackObama,D,US


##### Creating President Data Frame

In [15]:
# variable of just presidential tweets 
frame_pres = [df_obama, df_trump]

# making a president only data frame 
df_pres = pd.concat(frame_pres)

# Resetting the index 
df_pres.reset_index(inplace = True)

df_pres.drop(columns = 'index', axis = 1, inplace = True)

print(df_pres.shape)

(6439, 9)


In [16]:
# converting target variable ('user') binary 
df_pres['user'] = df_pres['user'].map({'BarackObama':1, 'realDonaldTrump':0})

# Binary Description 
    # Trump == 0 
    # Obama == 1
# are my classes balanced? (yes)
print(df_pres['user'].value_counts(normalize = True))

0    0.501941
1    0.498059
Name: user, dtype: float64


##### Cleaning 'text' using lambda 

In [17]:
# replacing '&amp;' with '&'
df['text'] = df['text'].map(lambda ampersand: ampersand.replace("&amp;", "&"))
df_pres['text'] = df_pres['text'].map(lambda ampersand: ampersand.replace("&amp;", "&"))

# replacing '‰ÛÒ' with '–'
df['text'] = df['text'].map(lambda dash: dash.replace("‰ÛÒ", '–'))
df_pres['text'] = df_pres['text'].map(lambda dash: dash.replace("‰ÛÒ", '–'))

# replacing '‰ÏÓ•üè' with nothing
df['text'] = df['text'].map(lambda check: check.replace('‰ÏÓ•üè', ''))
df_pres['text'] = df_pres['text'].map(lambda check: check.replace('‰ÏÓ•üè', ''))

# replacing '\r' with nothing 
df['text'] = df['text'].map(lambda r: r.replace('\r', ''))
df_pres['text'] = df_pres['text'].map(lambda r: r.replace('\r', ''))

#### Function to remove non ASCII characters that are within words with no spaces 

In [None]:
# This is going to allow me to replace any non-ascii characters with '-'

def _removeNonAscii(s):
    my_str=''
    for i in s:
        if ord(i)<128:
            my_str=my_str+i
        else:
            my_str=my_str+'-'
    return my_str

#### For loops to iterate over df['text']

--> Using REGEX, I am iterating over df['text'] and looking for any non ASCII characters and removing them.
--> This for loop is calling the _removeNonAscii function, replacing each non ASCII character with a '-'. 
--> This was a time-intensive process, so I am exporting the cleaned data to a new CSV 'ALLDATA_2'

In [None]:
for i in range(0,df_all.shape[0]):
    if len(re.findall(r'([^0-9A-z\r:.\'\/@\s])', df_all[['text']].iloc[i,0]) )>0:
        df_all.iloc[i,1] = _removeNonAscii(df_all[['text']].iloc[i,0])

In [2]:
# exporting cleaned data to csv 
df_all.to_csv('../capstone/ALLDATA_2.csv')

NameError: name 'df_all' is not defined

In [19]:
# reading in cleaned CSV
data_all = pd.read_csv('../capstone/ALLDATA_2.csv')

# creating a new DF 
df_all = pd.DataFrame(data_all)

# Dropping uncessary column 
df_all.drop(columns = 'Unnamed: 0', axis = 1, inplace = True)

# Sanity check 
print(df_all.shape)
df_all.head(2)

(295054, 9)


Unnamed: 0,created_at,text,url,replies,retweets,favorites,user,party,state
0,10/16/17 22:59,I'm grateful to @SenJohnMcCain for his lifetim...,https://twitter.com/BarackObama/status/9200615...,17064,89916,641842,BarackObama,D,US
1,10/2/17 12:41,Michelle & I are praying for the victims in La...,https://twitter.com/BarackObama/status/9148326...,21588,405895,1715753,BarackObama,D,US


#### For loops to iterate over df_pres['text']

In [None]:
for i in range(0,df_pres.shape[0]):
    if len(re.findall(r'([^0-9A-z\r:.\'\/@\s])', df_pres[['text']].iloc[i,0]) )>0:
        df_pres.iloc[i,1] = _removeNonAscii(df_pres[['text']].iloc[i,0])

In [None]:
# exporting cleaned data to CSV 
df_pres.to_csv('../capstone/PRESDATA.csv')

In [21]:
pres_data_2 = pd.read_csv('../capstone/PRESDATA.csv')

df_pres_2 = pd.DataFrame(pres_data_2)

df_pres_2.drop(columns = 'Unnamed: 0', axis = 1, inplace = True)

print(df_pres_2.shape)
df_pres_2.head(3)

(6439, 9)


Unnamed: 0,created_at,text,url,replies,retweets,favorites,user,party,state
0,10/16/17 22:59,I'm grateful to @SenJohnMcCain for his lifetim...,https://twitter.com/BarackObama/status/9200615...,17064,89916,641842,1,D,US
1,10/2/17 12:41,Michelle & I are praying for the victims in La...,https://twitter.com/BarackObama/status/9148326...,21588,405895,1715753,1,D,US
2,9/30/17 2:22,Proud to cheer on Team USA at the Invictus Gam...,https://twitter.com/BarackObama/status/9139520...,8476,60651,503255,1,D,US
