# Data Cleaning (Demographic Data)

**Overview of Notebook:**
- Data sourced from [MIT Election Data and Science Lab](https://github.com/MEDSL/2018-elections-unoffical)
- Cleaned FIPS codes
- Created voting result variables for 2012 and 2016 presidential elections
- Rescaled percentage columns from the 0–100 range to proportions between 0 and 1

In [1]:
# Set up: third-party imports and notebook display configuration
import pandas as pd
import numpy as np

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

# Use the shortest traceback format when an exception is reported
%xmode Minimal

Exception reporting mode: Minimal


### Load Data

In [2]:
# Read the election-context CSV; the converter passes `fips` through str()
# so the codes are held as strings rather than parsed as numbers.
local_dg = pd.read_csv('final_data/election-context-2018.csv', 
                       converters = {'fips': str})

### Explore Data

In [3]:
# Quick look at the raw frame. ast_node_interactivity is set to "all" in the
# setup cell, so each of the four expressions below produces its own output.
local_dg.head()           # first rows
local_dg.info()           # dtypes and non-null counts per column
local_dg.columns          # full list of column names
local_dg.state.unique()   # distinct state names present in the data

Unnamed: 0,state,county,fips,trump16,clinton16,otherpres16,romney12,obama12,otherpres12,demsen16,...,age29andunder_pct,age65andolder_pct,median_hh_inc,clf_unemploy_pct,lesshs_pct,lesscollege_pct,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc
0,Alabama,Autauga,1001,18172,5936,865,17379,6363,190,6331.0,...,40.037058,13.978456,53099.0,5.591657,12.417046,75.407229,10.002112,74.065601,42.002162,2.0
1,Alabama,Baldwin,1003,72883,18458,3874,66016,18424,898,19145.0,...,35.474412,18.714851,51365.0,6.286843,9.972418,70.452889,7.842227,68.405607,42.279099,3.0
2,Alabama,Barbour,1005,5454,4871,144,5550,5912,47,4777.0,...,37.664387,16.528895,33956.0,12.824738,26.235928,87.132213,19.579752,81.364746,67.789635,6.0
3,Alabama,Bibb,1007,6738,1874,207,6132,2202,86,2082.0,...,37.329435,14.885699,39776.0,7.146827,19.301587,88.0,15.02049,87.471774,68.352607,1.0
4,Alabama,Blount,1009,22859,2156,573,20757,2970,279,2980.0,...,37.240053,17.192916,46212.0,5.953833,19.968585,86.950243,16.643368,86.16361,89.951502,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3114 entries, 0 to 3113
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3114 non-null   object 
 1   county                  3114 non-null   object 
 2   fips                    3114 non-null   object 
 3   trump16                 3114 non-null   int64  
 4   clinton16               3114 non-null   int64  
 5   otherpres16             3114 non-null   int64  
 6   romney12                3114 non-null   int64  
 7   obama12                 3114 non-null   int64  
 8   otherpres12             3114 non-null   int64  
 9   demsen16                1942 non-null   float64
 10  repsen16                1942 non-null   float64
 11  othersen16              1942 non-null   float64
 12  demhouse16              2862 non-null   float64
 13  rephouse16              2862 non-null   float64
 14  otherhouse16            2862 non-null   

Index(['state', 'county', 'fips', 'trump16', 'clinton16', 'otherpres16',
       'romney12', 'obama12', 'otherpres12', 'demsen16', 'repsen16',
       'othersen16', 'demhouse16', 'rephouse16', 'otherhouse16', 'demgov16',
       'repgov16', 'othergov16', 'repgov14', 'demgov14', 'othergov14',
       'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct',
       'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct',
       'age65andolder_pct', 'median_hh_inc', 'clf_unemploy_pct', 'lesshs_pct',
       'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct',
       'rural_pct', 'ruralurban_cc'],
      dtype='object')

array(['Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado',
       'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
       'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
       'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [4]:
# Restore the leading zeros that were dropped from the FIPS codes.
# `fips` is read as a string, so left-pad every code with '0' up to a width of
# five characters; codes that are already five characters long are unchanged.
# (zfill pads to the full width, whereas prepending a single '0' would leave a
# code shorter than four characters still too short.)
local_dg['fips_clean'] = local_dg['fips'].str.zfill(5)

local_dg.head()

Unnamed: 0,state,county,fips,trump16,clinton16,otherpres16,romney12,obama12,otherpres12,demsen16,...,age65andolder_pct,median_hh_inc,clf_unemploy_pct,lesshs_pct,lesscollege_pct,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc,fips_clean
0,Alabama,Autauga,1001,18172,5936,865,17379,6363,190,6331.0,...,13.978456,53099.0,5.591657,12.417046,75.407229,10.002112,74.065601,42.002162,2.0,1001
1,Alabama,Baldwin,1003,72883,18458,3874,66016,18424,898,19145.0,...,18.714851,51365.0,6.286843,9.972418,70.452889,7.842227,68.405607,42.279099,3.0,1003
2,Alabama,Barbour,1005,5454,4871,144,5550,5912,47,4777.0,...,16.528895,33956.0,12.824738,26.235928,87.132213,19.579752,81.364746,67.789635,6.0,1005
3,Alabama,Bibb,1007,6738,1874,207,6132,2202,86,2082.0,...,14.885699,39776.0,7.146827,19.301587,88.0,15.02049,87.471774,68.352607,1.0,1007
4,Alabama,Blount,1009,22859,2156,573,20757,2970,279,2980.0,...,17.192916,46212.0,5.953833,19.968585,86.950243,16.643368,86.16361,89.951502,1.0,1009


### Create Derived Political Voting History Variables

In [5]:
# Presidential Winners (2016)
# Label each row with the candidate who received strictly more 2016 votes than
# both alternatives. Rows where no candidate is strictly ahead (ties) get the
# placeholder 'check' so they can be reviewed below.
clinton_16 = local_dg['clinton16']
trump_16 = local_dg['trump16']
other_16 = local_dg['otherpres16']

# Vectorised equivalent of a row-by-row if/elif chain: the conditions are
# mutually exclusive, and `default` covers every row that matches none of them.
local_dg['pres_16'] = np.select(
    [
        (clinton_16 > trump_16) & (clinton_16 > other_16),
        (trump_16 > clinton_16) & (trump_16 > other_16),
        (other_16 > clinton_16) & (other_16 > trump_16),
    ],
    ['CLINTON', 'TRUMP', 'THIRD'],
    default='check',
)

local_dg.pres_16.unique()

# In the current data the only 'check' row is Bedford, Virginia (fips 51515),
# which reports 0 votes for every 2016 candidate, so no winner can be assigned.
# Show the flagged rows, then drop them. The explicit .copy() makes the filtered
# frame independent of the original, so later column assignments are unambiguous.
local_dg[local_dg['pres_16']=='check']
local_dg = local_dg[local_dg['pres_16']!='check'].copy()
local_dg.pres_16.unique()

array(['TRUMP', 'CLINTON', 'THIRD', 'check'], dtype=object)

Unnamed: 0,state,county,fips,trump16,clinton16,otherpres16,romney12,obama12,otherpres12,demsen16,...,median_hh_inc,clf_unemploy_pct,lesshs_pct,lesscollege_pct,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc,fips_clean,pres_16
2887,Virginia,Bedford,51515,0,0,0,1527,1225,53,,...,,,,,,,2.057216,2.0,51515,check


array(['TRUMP', 'CLINTON', 'THIRD'], dtype=object)

In [6]:
# Presidential Winners (2012)
# Label each row with the candidate who received strictly more 2012 votes than
# both alternatives. Rows where no candidate is strictly ahead (ties) get the
# placeholder 'check'.
romney_12 = local_dg['romney12']
obama_12 = local_dg['obama12']
other_12 = local_dg['otherpres12']

# Vectorised equivalent of a row-by-row if/elif chain: the conditions are
# mutually exclusive, and `default` covers every row that matches none of them.
local_dg['pres_12'] = np.select(
    [
        (romney_12 > obama_12) & (romney_12 > other_12),
        (obama_12 > romney_12) & (obama_12 > other_12),
        (other_12 > obama_12) & (other_12 > romney_12),
    ],
    ['ROMNEY', 'OBAMA', 'THIRD'],
    default='check',
)

# Check the labels that were produced. If 'check' ever shows up here, inspect
# local_dg[local_dg['pres_12'] == 'check'] before continuing; in the current
# data no row needed review.
local_dg.pres_12.unique()

array(['ROMNEY', 'OBAMA'], dtype=object)

In [7]:
# Now turn the presidential winners into dummies.
# One indicator column is created per label that actually occurs in the data
# (drop_first=False keeps every label). The dtype is pinned to an integer type
# so the indicators are 0/1 regardless of the pandas version: newer pandas
# releases default to boolean dummies.
local_dg = pd.get_dummies(local_dg, columns=['pres_16', 'pres_12'],
                          drop_first=False, dtype='uint8')

In [8]:
# Rescale the percentage columns from the 0-100 range to 0-1 proportions
# (rounded to three decimals), working on a copy so local_dg is left untouched.
local_dgcl = local_dg.copy()

pct_columns = [
    'white_pct', 'black_pct', 'hispanic_pct', 'nonwhite_pct',
    'foreignborn_pct', 'female_pct', 'age29andunder_pct',
    'age65andolder_pct', 'clf_unemploy_pct', 'lesshs_pct',
    'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct',
    'rural_pct',
]

# Divide and round all listed columns in one block operation
local_dgcl[pct_columns] = local_dgcl[pct_columns].div(100).round(3)

local_dgcl.head()

Unnamed: 0,state,county,fips,trump16,clinton16,otherpres16,romney12,obama12,otherpres12,demsen16,...,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc,fips_clean,pres_16_CLINTON,pres_16_THIRD,pres_16_TRUMP,pres_12_OBAMA,pres_12_ROMNEY
0,Alabama,Autauga,1001,18172,5936,865,17379,6363,190,6331.0,...,0.1,0.741,0.42,2.0,1001,0,0,1,0,1
1,Alabama,Baldwin,1003,72883,18458,3874,66016,18424,898,19145.0,...,0.078,0.684,0.423,3.0,1003,0,0,1,0,1
2,Alabama,Barbour,1005,5454,4871,144,5550,5912,47,4777.0,...,0.196,0.814,0.678,6.0,1005,0,0,1,1,0
3,Alabama,Bibb,1007,6738,1874,207,6132,2202,86,2082.0,...,0.15,0.875,0.684,1.0,1007,0,0,1,0,1
4,Alabama,Blount,1009,22859,2156,573,20757,2970,279,2980.0,...,0.166,0.862,0.9,1.0,1009,0,0,1,0,1


### Create and Save Small DF

In [9]:
# List every available column so the subset for the small frame can be chosen
local_dgcl.columns

Index(['state', 'county', 'fips', 'trump16', 'clinton16', 'otherpres16',
       'romney12', 'obama12', 'otherpres12', 'demsen16', 'repsen16',
       'othersen16', 'demhouse16', 'rephouse16', 'otherhouse16', 'demgov16',
       'repgov16', 'othergov16', 'repgov14', 'demgov14', 'othergov14',
       'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct',
       'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct',
       'age65andolder_pct', 'median_hh_inc', 'clf_unemploy_pct', 'lesshs_pct',
       'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct',
       'rural_pct', 'ruralurban_cc', 'fips_clean', 'pres_16_CLINTON',
       'pres_16_THIRD', 'pres_16_TRUMP', 'pres_12_OBAMA', 'pres_12_ROMNEY'],
      dtype='object')

In [10]:
# Columns kept in the small frame: identifiers, population counts, the
# demographic proportions, and the presidential-winner dummies.
small_columns = [
    # identifiers
    'state', 'county', 'fips', 'fips_clean',
    # population
    'total_population', 'cvap',
    # demographic shares
    'white_pct', 'black_pct', 'hispanic_pct', 'nonwhite_pct',
    'foreignborn_pct', 'female_pct', 'age29andunder_pct',
    'age65andolder_pct', 'clf_unemploy_pct', 'lesshs_pct',
    'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct',
    'rural_pct', 'ruralurban_cc', 'median_hh_inc',
    # presidential winner dummies
    'pres_16_CLINTON', 'pres_16_THIRD', 'pres_16_TRUMP',
    'pres_12_OBAMA', 'pres_12_ROMNEY',
]

local_dg_sm = local_dgcl[small_columns]

local_dg_sm.head()

local_dg_sm.info()

Unnamed: 0,state,county,fips,fips_clean,total_population,cvap,white_pct,black_pct,hispanic_pct,nonwhite_pct,...,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc,median_hh_inc,pres_16_CLINTON,pres_16_THIRD,pres_16_TRUMP,pres_12_OBAMA,pres_12_ROMNEY
0,Alabama,Autauga,1001,1001,55049.0,40690.0,0.757,0.184,0.026,0.243,...,0.1,0.741,0.42,2.0,53099.0,0,0,1,0,1
1,Alabama,Baldwin,1003,1003,199510.0,151770.0,0.832,0.092,0.044,0.168,...,0.078,0.684,0.423,3.0,51365.0,0,0,1,0,1
2,Alabama,Barbour,1005,1005,26614.0,20375.0,0.459,0.479,0.043,0.541,...,0.196,0.814,0.678,6.0,33956.0,0,0,1,1,0
3,Alabama,Bibb,1007,1007,22572.0,17590.0,0.748,0.212,0.022,0.252,...,0.15,0.875,0.684,1.0,39776.0,0,0,1,0,1
4,Alabama,Blount,1009,1009,57704.0,42430.0,0.877,0.016,0.087,0.123,...,0.166,0.862,0.9,1.0,46212.0,0,0,1,0,1


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3113 entries, 0 to 3113
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state                   3113 non-null   object 
 1   county                  3113 non-null   object 
 2   fips                    3113 non-null   object 
 3   fips_clean              3113 non-null   object 
 4   total_population        3111 non-null   float64
 5   cvap                    3111 non-null   float64
 6   white_pct               3111 non-null   float64
 7   black_pct               3111 non-null   float64
 8   hispanic_pct            3111 non-null   float64
 9   nonwhite_pct            3111 non-null   float64
 10  foreignborn_pct         3111 non-null   float64
 11  female_pct              3111 non-null   float64
 12  age29andunder_pct       3111 non-null   float64
 13  age65andolder_pct       3111 non-null   float64
 14  clf_unemploy_pct        3111 non-null   

In [11]:
# Drop every row that has a null in any column.
# (Indexing a frame with the boolean frame ~isna() only masks individual cells
# and removes no rows, so the nulls would remain; dropna() removes the rows.)
local_dg_sm = local_dg_sm.dropna()

In [12]:
# Save the small frame with IPython's %store magic so that it persists beyond
# this kernel session and can be restored elsewhere with `%store -r local_dg_sm`
%store local_dg_sm

Stored 'local_dg_sm' (DataFrame)
