## We're going to collect and clean all the data to create a final DataFrame for clustering

In [1]:
# First, set everything up
import pyodbc
import graphlab as gl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import config #server information and credentials
from pullODBC import pullODBC
from pullODBCall import pullODBCall
from matplotlib.backends.backend_pdf import PdfPages
from __future__ import division # convert integer division to floating point
%matplotlib inline
# Plot styling shared by the figures in this notebook: seaborn-white theme
# with 7-point body text and 7-point tick labels on both axes.
plt.style.use('seaborn-white')
plt.rcParams.update({
    'font.size': 7,
    'xtick.labelsize': 7,
    'ytick.labelsize': 7,
})
# NOTE: plt.tight_layout only has an effect when it is *called* on a live
# figure; a bare reference to it in this setup cell does nothing, so none is
# made here. Call plt.tight_layout() after drawing a figure that needs it.

We create a utility function that standardizes all columns except the first one (which should contain a key) to a mean of 0 and a standard deviation of 1.

In [2]:
def standardize(df):
    """Return a copy of ``df`` with every column after the first as z-scores.

    The first column is treated as a key and is passed through untouched.
    Each remaining column has its own mean subtracted and is then divided by
    its own sample standard deviation (the pandas default, ``ddof=1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Key column first, followed by the numeric columns to rescale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index, column names and column order as
        ``df``. The frame passed in is not modified.

    Notes
    -----
    A column with zero variance is divided by zero and comes back as NaN;
    that case is left visible to the caller rather than masked here.
    """
    key_part = df.iloc[:, :1]
    value_part = df.iloc[:, 1:]
    # Statistics come from the value columns only, so a non-numeric key column
    # never takes part in the mean/std reduction.
    scaled = (value_part - value_part.mean()) / value_part.std()
    # Build a fresh frame instead of writing back into ``df``: the caller's
    # data stays intact and integer columns can become float without an
    # in-place dtype change.
    return pd.concat([key_part, scaled], axis=1)
    

Next, we read the file containing election results by county/precinct combination, add a column 'trump_premium' that contains the fraction that voted for Trump minus the fraction that voted for Newton, standardize the result, and inspect the resulting data frame to ensure correctness.

In [3]:
# Load the per-precinct results, append the Trump-minus-Newton vote-share
# difference as 'trump_premium', then z-score every column after the key.
results = pd.read_csv('results_V3.csv')
results = standardize(
    results.assign(
        trump_premium=results['fraction_trump'] - results['fraction_newton']
    )
)

In [4]:
# Preview the first five rows of the standardized results.
results.head(n=5)

Unnamed: 0,county_pct,fraction_newton,fraction_trump,trump_premium
0,ALAMANCE01,1.288102,1.260423,0.3445
1,ALAMANCE02,1.381115,1.3422,0.316352
2,ALAMANCE035,0.327215,0.252114,-0.30321
3,ALAMANCE03C,0.438941,0.263283,-0.836746
4,ALAMANCE03N,0.141841,-0.069324,-1.15666


In [5]:
# Number of county/precinct rows in the results frame.
results.shape[0]

3209

In [6]:
# Sanity check: after standardizing, every numeric column should have a mean
# of ~0 and a standard deviation of 1. numeric_only=True keeps the string key
# column out of the reduction; newer pandas no longer drops it implicitly.
print('The means are ' + str(results.mean(numeric_only=True)))
print('The standard deviations are ' + str(results.std(numeric_only=True)))

The means are fraction_newton   -1.781761e-15
fraction_trump     4.073754e-15
trump_premium      5.252155e-16
dtype: float64
The standard deviations are fraction_newton    1.0
fraction_trump     1.0
trump_premium      1.0
dtype: float64


Next, we read the file containing precinct demographics by county/precinct combination, standardize it, and inspect the resulting data frame to ensure correctness.

In [7]:
# Load the precinct demographics and z-score every column after the key.
demographics = standardize(pd.read_csv('precinct_demographics_nov2.csv'))

In [8]:
# Preview the first five rows of the standardized demographics.
demographics.head(n=5)

Unnamed: 0,county_pct,population,asian,black,indian,mixed,other_race,white,DEM,LIB,REP,hispanic_latino,not_latino,female,male,has_license,NC_born,No_birth_state,registr_post_2000,median_age
0,ALAMANCE01,0.467571,-0.533549,-0.669621,-0.137957,-0.429933,-0.924418,0.784178,-0.987228,-0.401748,1.256634,-0.487041,0.452812,-0.828705,0.958648,-0.026753,1.274739,-0.831857,-0.523867,0.204141
1,ALAMANCE02,0.461559,-0.492953,-0.850421,-0.111205,-0.686231,-1.035001,0.957634,-1.149127,-0.634491,1.596286,-0.877865,0.589629,-0.858605,1.074886,-0.363345,1.306023,-0.926017,-1.203044,0.204141
2,ALAMANCE035,0.678631,-0.208847,-0.523804,-0.135987,0.673398,-0.618577,0.438694,-0.53572,0.12527,0.434837,-0.098868,-0.210479,-0.11947,-1.027064,-1.429859,0.319688,-0.111455,0.137051,-0.906541
3,ALAMANCE03C,-0.164274,0.293179,-0.656287,-0.147208,-0.196617,-0.091742,0.671241,-0.605625,-0.861391,0.984857,-0.636771,0.544773,-0.485154,0.885192,1.193507,0.619407,-0.900591,-1.570889,0.838816
4,ALAMANCE03N,0.554399,0.296825,-0.563337,-0.149422,0.406754,-0.296681,0.375567,-0.535883,0.566979,0.374761,-0.276754,-0.43895,-1.242591,-0.43743,-2.364532,-0.313338,0.177579,-0.093403,-0.589203


In [9]:
# Number of county/precinct rows in the demographics frame.
demographics.shape[0]

2706

In [10]:
# Sanity check: after standardizing, every numeric column should have a mean
# of ~0 and a standard deviation of 1. numeric_only=True keeps the string key
# column out of the reduction; newer pandas no longer drops it implicitly.
print('The means are ' + str(demographics.mean(numeric_only=True)))
print('The standard deviations are ' + str(demographics.std(numeric_only=True)))

The means are population          -1.136891e-16
asian               -3.406571e-16
black               -1.518905e-15
indian              -1.632512e-16
mixed               -1.832648e-15
other_race          -3.142760e-17
white               -1.405380e-15
DEM                  2.177202e-15
LIB                 -8.522069e-16
REP                 -2.895032e-15
hispanic_latino     -2.098264e-15
not_latino          -7.210172e-15
female               4.888058e-15
male                -2.534771e-14
has_license          1.537815e-14
NC_born              5.051310e-15
No_birth_state      -1.203193e-15
registr_post_2000    1.503733e-14
median_age           1.228630e-15
dtype: float64
The standard deviations are population           1.0
asian                1.0
black                1.0
indian               1.0
mixed                1.0
other_race           1.0
white                1.0
DEM                  1.0
LIB                  1.0
REP                  1.0
hispanic_latino      1.0
not_latino           1

Now we do an outer join and write to csv to find errors

In [11]:
# Diagnostic outer join: precincts present in only one of the two frames are
# kept, so mismatched keys show up as rows with missing values in the CSV.
# The join key is named explicitly instead of being inferred from whichever
# column names the two frames happen to share.
combined_pct = pd.merge(demographics, results, how='outer', on='county_pct')
combined_pct.to_csv('combined_pct.csv')

After observing and fixing errors, we now do a left join with demographics on the left and save the result to a CSV file.

In [12]:
# Left join: keep every demographics row and attach the matching results.
# The join key is named explicitly instead of being inferred from whichever
# column names the two frames happen to share.
combined_pct = pd.merge(demographics, results, how='left', on='county_pct')
combined_pct.to_csv('combined_pct.csv')

Next, we read the file containing election turnout by county/precinct combination, standardize this, and inspect the resulting data frame to ensure correctness.

In [13]:
# Load the per-precinct turnout figures and z-score every column after the key.
turnout = standardize(pd.read_csv('turnout_allV3.csv'))

In [14]:
# Preview the first five rows of the standardized turnout.
turnout.head(n=5)

Unnamed: 0,county_pct,2016,2015,2014
0,ALAMANCE01,0.599096,-0.812651,0.249069
1,ALAMANCE02,0.999924,-0.705549,0.802573
2,ALAMANCE035,-1.179707,-0.6969,-0.955271
3,ALAMANCE03C,1.529294,2.305511,2.027266
4,ALAMANCE03N,-0.044367,0.290954,0.208969


In [15]:
# Number of county/precinct rows in the turnout frame.
turnout.shape[0]

2777

In [16]:
# Sanity check: after standardizing, every numeric column should have a mean
# of ~0 and a standard deviation of 1. numeric_only=True keeps the string key
# column out of the reduction; newer pandas no longer drops it implicitly.
print('The means are ' + str(turnout.mean(numeric_only=True)))
print('The standard deviations are ' + str(turnout.std(numeric_only=True)))

The means are 2016    1.840490e-15
2015   -1.922284e-15
2014    2.904042e-15
dtype: float64
The standard deviations are 2016    1.0
2015    1.0
2014    1.0
dtype: float64


We need to strip leading and trailing whitespace from the 'county_pct' column.

In [17]:
# Trim leading/trailing whitespace from the join key in both frames so that
# otherwise identical precinct codes compare equal. The vectorized .str
# accessor passes missing values through as NaN instead of raising on them.
turnout['county_pct'] = turnout['county_pct'].str.strip()
combined_pct['county_pct'] = combined_pct['county_pct'].str.strip()

Now we do an outer join and write to csv to find errors

In [18]:
# Diagnostic outer join against turnout: rows that exist on only one side are
# kept, so unmatched precinct codes can be spotted in the CSV.
# The join key is named explicitly instead of being inferred from whichever
# column names the two frames happen to share.
combined_pct_test = pd.merge(combined_pct, turnout, how='outer', on='county_pct')
combined_pct_test.to_csv('combined_pct_test.csv')

After observing and fixing errors, we now do a left join with the combined demographics/results frame on the left, replace all NaNs with 0 (the mean of every standardized column), and save the result to a CSV file.

In [19]:
# Final table: keep every row of the combined demographics/results frame and
# attach turnout where the precinct code matches. The join key is named
# explicitly instead of being inferred from shared column names.
# Missing values are filled with 0, which is the mean of each standardized
# column. fillna is chained rather than applied in place so that re-running
# this cell always rebuilds the frame from its inputs.
combined_pct_final = pd.merge(
    combined_pct, turnout, how='left', on='county_pct'
).fillna(value=0)
combined_pct_final.to_csv('combined_pct_final.csv')