### Notebook for cleaning, completing, and merging the meta data

In [None]:
import pandas as pd
from datetime import datetime

## Reformat the date

In [None]:
agg_tweets = pd.read_csv('Data/aggregated_tweets.csv')

# convert created time from string to datetime
agg_tweets['created'] = [datetime.strptime(date, '%Y-%m-%d %H:%M:%S') for date in agg_tweets['created']]
# subset to days on or after Nov 6th
agg_tweets[agg_tweets['created'] >= pd.Timestamp(year=2019, month=11, day=6)]

# subset to tweets occuring during the pre-registration time frame
agg_tweets[agg_tweets['created'] >= pd.Timestamp(year=2019, month=11, day=6)]

## Complete meta data

In [78]:
# read in meta data scraped from twitter
meta_data = pd.read_csv('Meta Data/congress_meta_data.csv')
meta_data = meta_data[['screen_name', 'id', 'location', 'description', 'url']]
meta_data = meta_data.rename(columns = {'id':'user_id'})
meta_data.head()

Unnamed: 0,screen_name,user_id,location,description,url
0,RepRossSpano,1090328229548826627,"Dover, FL",Proudly Representing #FL15 in the U.S. House o...,https://t.co/s70X5B8YiV
1,RepJimBaird,1086316494450032640,"Indiana, USA",Lifelong Hoosier proudly representing the 4th ...,https://t.co/8voGTCDqS3
2,RepHagedorn,1083474782602125318,"Washington, DC",This is the official Twitter account for Congr...,https://t.co/D2A7FksK6h
3,RepCasten,1083472286089396224,,"Official twitter account for Rep. Sean Casten,...",https://t.co/hJKrtCC1y8
4,CongressmanJVD,1083469084648505344,South Jersey,Congressman for New Jersey's 2nd District. Rep...,https://t.co/Tp26SFAMd6


In [87]:
# read in meta data pulled from github
git_meta = pd.read_csv('Meta Data/legislators-current.csv')
git_meta = git_meta[['gender', 'party', 'state', 'district', 'twitter']]
git_meta = git_meta.rename(columns = {'twitter': 'screen_name'})
git_meta.head()

Unnamed: 0,gender,party,state,district,screen_name
0,M,Democrat,OH,,SenSherrodBrown
1,F,Democrat,WA,,SenatorCantwell
2,M,Democrat,MD,,SenatorCardin
3,M,Democrat,DE,,SenatorCarper
4,M,Democrat,PA,,SenBobCasey


In [90]:
# join with twitter meta data
merged_meta = pd.merge(meta_data, git_meta, how = 'left', on = 'screen_name')

In [116]:
# correcting formatting errors in twitter ID introduced by libre office
merged_meta = pd.read_csv('Meta Data/meta_data.csv')
merged_meta = pd.merge(meta_data, merged_meta, how = 'left', on = 'screen_name')

Unnamed: 0,screen_name,user_id,location,description,url,gender,party,state,district
0,RepRossSpano,1090328229548826627,"Dover, FL",Proudly Representing #FL15 in the U.S. House o...,https://t.co/s70X5B8YiV,M,Republican,FL,15.0
1,RepJimBaird,1086316494450032640,"Indiana, USA",Lifelong Hoosier proudly representing the 4th ...,https://t.co/8voGTCDqS3,M,Republican,IN,4.0
2,RepHagedorn,1083474782602125318,"Washington, DC",This is the official Twitter account for Congr...,https://t.co/D2A7FksK6h,M,Republican,MN,1.0
3,RepCasten,1083472286089396224,,"Official twitter account for Rep. Sean Casten,...",https://t.co/hJKrtCC1y8,M,Democrat,IL,6.0
4,CongressmanJVD,1083469084648505344,South Jersey,Congressman for New Jersey's 2nd District. Rep...,https://t.co/Tp26SFAMd6,M,Democrat,NJ,2.0
...,...,...,...,...,...,...,...,...,...
95,SenHydeSmith,983348251972816896,Brookhaven,Office of U.S. Senator Cindy Hyde-Smith (R-Mis...,https://t.co/cwhXwzYx2R,F,Republican,MS,
96,SenDougJones,941080085121175552,"Birmingham, AL",Latest news from the Office of U.S. Senator Do...,https://t.co/ybmvFhggOr,M,Democrat,AL,
97,SenTinaSmith,941000686275387392,,"Mom, wife, daughter, & loving but distracted f...",https://t.co/ijpKqkvEVA,F,Democrat,MN,
98,RepJohnCurtis,931614483050414080,Utah and DC,Official account for United States Congressman...,https://t.co/EXDQ281Hrl,M,Republican,UT,3.0


In [129]:
# write to csv
merged_meta.to_csv('Meta Data/meta_data.csv', index = False)

In [127]:
# correcting individual cells
merged_meta['party'][merged_meta['screen_name'] == 'RepJenniffer'] = "Republican"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Merge with Meta Data

In [135]:
# read in meta data
meta_data = pd.read_csv('Meta Data/meta_data.csv')
# drop columns not needed
meta_data = meta_data[['user_id', 'screen_name', 'gender', 'party', 'state', 'district']]

# read in the tweet data
agg_tweets = pd.read_csv('Data/aggregated_tweets.csv')

In [136]:
pd.merge(agg_tweets, meta_data, how = 'left', on = 'user_id')

Unnamed: 0,tweet_id,text,created,retweet,user_id,screen_name,gender,party,state,district
0,1190038962376720384,That didn’t take long. After a vote to make th...,2019-10-31 22:52:53,False,1090328229548826627,RepRossSpano,M,Republican,FL,15.0
1,1189921234592636928,Looking forward to showing @RepMarkGreen aroun...,2019-10-31 15:05:04,False,1090328229548826627,RepRossSpano,M,Republican,FL,15.0
2,1189909515032055808,A yes vote on this resolution gives a stamp of...,2019-10-31 14:18:30,False,1090328229548826627,RepRossSpano,M,Republican,FL,15.0
3,1189896903640723456,The House will vote on a resolution that will ...,2019-10-31 13:28:23,False,1090328229548826627,RepRossSpano,M,Republican,FL,15.0
4,1189627626484310022,My DC team and I were happy to help @the_USO t...,2019-10-30 19:38:23,False,1090328229548826627,RepRossSpano,M,Republican,FL,15.0
...,...,...,...,...,...,...,...,...,...,...
79358,1192130149086646273,The far left’s latest cure for health care... ...,2019-11-06 17:22:30,False,7270292,JimInhofe,M,Republican,OK,
79359,1192108203095535617,Democrats are still in denial about the 2016 e...,2019-11-06 15:55:18,False,7270292,JimInhofe,M,Republican,OK,
79360,1192511047007186945,My staff hosted Mobile Office Hours in Charles...,2019-11-07 18:36:04,False,5558312,JohnBoozman,M,Republican,AR,
79361,1192435983733739520,Getting ready to participate in @GCIOMedia’s V...,2019-11-07 13:37:47,False,5558312,JohnBoozman,M,Republican,AR,


In [10]:
# change party labels to R and D
meta_data = pd.read_csv('Meta Data/meta_data.csv')
meta_data['party'] = meta_data['party'].map({'Republican':'R', 'Democrat':'D'})
meta_data.to_csv('Meta Data/meta_data.csv', index = False)