##  We're going calculate turnout by county/precinct combination to use as 3 dimensions for our clustering work

In [1]:
#First set everthing up
import pyodbc
import graphlab as gl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import local #server information and credentials
from pullODBC import pullODBC
from pullODBCall import pullODBCall
from matplotlib.backends.backend_pdf import PdfPages
from __future__ import division # convert integer division to floating point
%matplotlib inline
plt.style.use('seaborn-white')
plt.rcParams['font.size'] = 7
plt.tight_layout;
plt.rc('xtick', labelsize=7) 
plt.rc('ytick', labelsize=7) 

Read in the full voter history

In [2]:
elections_flattened = pd.read_csv('elections_flattened.csv')

In [3]:
print len(elections_flattened)
print elections_flattened.head()
print elections_flattened.columns

6490208
   Unnamed: 0  2007  2008  2009  2010  2011  2012  2013  2014  2015  2016  \
0           0   0.0   1.0   0.0   1.0   1.0   1.0   0.0   1.0   0.0   1.0   
1           1   1.0   1.0   1.0   1.0   1.0   1.0   0.0   1.0   0.0   0.0   
2           2   0.0   1.0   1.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   
3           3   0.0   1.0   0.0   1.0   0.0   1.0   1.0   1.0   1.0   1.0   
4           4   0.0   1.0   0.0   1.0   0.0   1.0   1.0   1.0   1.0   1.0   

       ncid  
0  AA100007  
1   AA10001  
2  AA100023  
3  AA100062  
4  AA100074  
Index([u'Unnamed: 0', u'2007', u'2008', u'2009', u'2010', u'2011', u'2012',
       u'2013', u'2014', u'2015', u'2016', u'ncid'],
      dtype='object')


Since we know that muliple voting records are present in some elections, we need to see if any number is > 1.  Note that we already eliminated these for odd years, so we should only see these for even years

In [4]:
### First we set up a list of column names for the years
year_columns = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2015', '2015', '2016']
### Next, we iterate over all the columns to find any numbers > 1
for col in year_columns:
    bool_vector = elections_flattened[col] > 1
    print('For ' + str(col) + ' the number of voters that had more than one voting record is ' + str(sum(bool_vector)))

For 2007 the number of voters that had more than one voting record is 0
For 2008 the number of voters that had more than one voting record is 521
For 2009 the number of voters that had more than one voting record is 0
For 2010 the number of voters that had more than one voting record is 142
For 2011 the number of voters that had more than one voting record is 0
For 2012 the number of voters that had more than one voting record is 239
For 2013 the number of voters that had more than one voting record is 0
For 2015 the number of voters that had more than one voting record is 0
For 2015 the number of voters that had more than one voting record is 0
For 2016 the number of voters that had more than one voting record is 67


First, let's count how many votes were cast in each year counting multiple votes

In [5]:
for col in year_columns:
    print('For ' + str(col) + ' the number of votes cast were ' + str(int(sum(elections_flattened[col]))))

For 2007 the number of votes cast were 721800
For 2008 the number of votes cast were 4348017
For 2009 the number of votes cast were 593535
For 2010 the number of votes cast were 2699178
For 2011 the number of votes cast were 599494
For 2012 the number of votes cast were 4540926
For 2013 the number of votes cast were 620519
For 2015 the number of votes cast were 487360
For 2015 the number of votes cast were 487360
For 2016 the number of votes cast were 4768149


Now we need to change all of these numbers > 1 to 1

In [6]:
for col in year_columns:
    elections_flattened[col] = elections_flattened[col] >= 1

Let's once again look at how many votes were cast.  Odd years should be the same as before, and even years should be less.

In [7]:
for col in year_columns:
    print('For ' + str(col) + ' the number of votes cast were ' + str(int(sum(elections_flattened[col]))))

For 2007 the number of votes cast were 721800
For 2008 the number of votes cast were 4347492
For 2009 the number of votes cast were 593535
For 2010 the number of votes cast were 2699036
For 2011 the number of votes cast were 599494
For 2012 the number of votes cast were 4540687
For 2013 the number of votes cast were 620519
For 2015 the number of votes cast were 487360
For 2015 the number of votes cast were 487360
For 2016 the number of votes cast were 4768082


Now we need to pull in the demographics files for 2016, 2015, and 2015 to calculate turnout for each of these years

## First we do 2014

In [8]:
local.database = 'registration2014'
local.table = 'VR_Snapshot_20141104'

I want all rows with status codes 'A' and 'I'

In [9]:
condition1 = "[status_cd] = 'A'"
condition2 = "[status_cd] = 'I'"
condition = "(" + condition1 + " OR " + condition2 + ")"  
print condition

([status_cd] = 'A' OR [status_cd] = 'I')


In [10]:
#Set up the fields that I need to pull from the database 
fields = '[county_desc],[ncid],[precinct_abbrv]'
print fields

[county_desc],[ncid],[precinct_abbrv]


In [11]:
#Pull the data from the database
demographics=pullODBC(local.driver,
                      local.server,
                      local.database,
                      local.username,
                      local.password,
                      local.table,
                      fields,
                      condition)

In [12]:
print(len(demographics))
print demographics.tail(n=15)
raw_len = len(demographics)# Save the number of rows

6627320
        county_desc     ncid precinct_abbrv
6627305      WILSON  EP87415           PRWR
6627306      WILSON  EP87416           PRTO
6627307      WILSON  EP87417           PRTA
6627308      WILSON  EP87418           PRWL
6627309      WILSON  EP87419           PRWJ
6627310      WILSON  EP87420           PRTO
6627311      WILSON  EP87421           PRWE
6627312      WILSON  EP87422           PRCR
6627313      WILSON  EP87423           PRGA
6627314      WILSON  EP87424           PRTO
6627315      WILSON  EP87425           PRTA
6627316      WILSON  EP87427           PRWL
6627317      WILSON  EP87428           PRSP
6627318      WILSON  EP87429           PROL
6627319      WILSON  EP87430           PRWL


We now need to delete any rows with blank county_desc, ncid, or precinct_abbrv.  We do that by first changing blans to NaNs, and the deleting any rows with NaNs

In [13]:
def fix_blanks_NaN(x):
    if x == "":
        return np.nan
    elif x == " ":
        return np.nan
    else:
        return x

In [14]:
#Change blank fields to Nan
fields = demographics.columns
for col in fields:
    demographics[col] = demographics[col].apply(fix_blanks_NaN)
#Next, delete any rows with NaN and disply counts
demographics.dropna(inplace=True, how='any') #drop all rows with NaN
clean_len=len(demographics) #how many rows to I have left?
difference = raw_len - clean_len #how many rows have I dropped?
fraction = float(difference)/float(raw_len) #what fraction of rows have I dropped?
print "The number of rows I have dropped is " + str(difference) 
print "The number of rows remaining in the database is " + str(clean_len)
print "The precentage of rows that I have dropped is: " + str('{:.3%}'.format(fraction))

The number of rows I have dropped is 6316
The number of rows remaining in the database is 6621004
The precentage of rows that I have dropped is: 0.095%


Now we do a left join of elections_flattened to demographics since we use the 2014 registration data as our denominator for 2014 turnout

In [15]:
combined_2014 = pd.merge(demographics, elections_flattened, how='left', on='ncid')
print(len(combined_2014), len(demographics), len(elections_flattened))# combined should have the same number of rows as demographics

(6621004, 6621004, 6490208)


Let's take a look at combined_2014

In [16]:
print(combined_2014.tail(n=15))

        county_desc     ncid precinct_abbrv  Unnamed: 0   2007   2008   2009  \
6620989      WILSON  EP87415           PRWR         NaN    NaN    NaN    NaN   
6620990      WILSON  EP87416           PRTO   6449374.0  False  False  False   
6620991      WILSON  EP87417           PRTA   6449375.0  False  False  False   
6620992      WILSON  EP87418           PRWL   6449376.0  False  False  False   
6620993      WILSON  EP87419           PRWJ   6449377.0  False  False  False   
6620994      WILSON  EP87420           PRTO   6449378.0  False  False  False   
6620995      WILSON  EP87421           PRWE         NaN    NaN    NaN    NaN   
6620996      WILSON  EP87422           PRCR         NaN    NaN    NaN    NaN   
6620997      WILSON  EP87423           PRGA   6449379.0  False  False  False   
6620998      WILSON  EP87424           PRTO   6449380.0  False  False  False   
6620999      WILSON  EP87425           PRTA   6449381.0  False  False  False   
6621000      WILSON  EP87427           P

We now need to fill all the NaNs with False.  The only NaNs are for registered voters without voting records that were created by the left join

In [17]:
combined_2014.fillna(value=False, method=None, axis=None, inplace=True, limit=None, downcast=None)

In [18]:
print(combined_2014.tail(n=15))

        county_desc     ncid precinct_abbrv  Unnamed: 0   2007   2008   2009  \
6620989      WILSON  EP87415           PRWR         0.0  False  False  False   
6620990      WILSON  EP87416           PRTO   6449374.0  False  False  False   
6620991      WILSON  EP87417           PRTA   6449375.0  False  False  False   
6620992      WILSON  EP87418           PRWL   6449376.0  False  False  False   
6620993      WILSON  EP87419           PRWJ   6449377.0  False  False  False   
6620994      WILSON  EP87420           PRTO   6449378.0  False  False  False   
6620995      WILSON  EP87421           PRWE         0.0  False  False  False   
6620996      WILSON  EP87422           PRCR         0.0  False  False  False   
6620997      WILSON  EP87423           PRGA   6449379.0  False  False  False   
6620998      WILSON  EP87424           PRTO   6449380.0  False  False  False   
6620999      WILSON  EP87425           PRTA   6449381.0  False  False  False   
6621000      WILSON  EP87427           P

Next, we create a new field that combines county_desc with precinct_abbrv

In [19]:
combined_2014['county_pct'] = combined_2014['county_desc']+combined_2014['precinct_abbrv']

In [20]:
print(combined_2014.head(n=15))

   county_desc    ncid precinct_abbrv  Unnamed: 0   2007   2008   2009   2010  \
0       YANCEY  ES4286         06 JAC   6487514.0  False  False  False  False   
1       YANCEY  ES4299         06 JAC   6487521.0  False   True  False   True   
2       YANCEY  ES4231         06 JAC   6487477.0  False   True  False   True   
3       YANCEY  ES4244         06 JAC   6487486.0  False   True  False   True   
4       YANCEY  ES7365         06 JAC   6489106.0  False  False  False  False   
5       YANCEY  ES7352         02 CAN   6489096.0  False   True  False   True   
6       YANCEY  ES7322         02 CAN   6489080.0  False   True  False  False   
7       YANCEY  ES7323         02 CAN   6489081.0  False   True  False  False   
8       YANCEY  ES7294         11 PRI   6489067.0  False   True  False   True   
9       YANCEY  ES7167         01 BUR   6489008.0  False   True  False  False   
10      YANCEY  ES6891         09 SOU   6488861.0  False   True  False   True   
11      YANCEY  ES7153      

Subset the dataframe to keep only the needed columns

In [21]:
turnout = combined_2014.ix[:,['county_desc',  'county_pct', '2014']]

Convert '2014' to 1s and 0s

In [22]:
turnout['2014'] = turnout['2014'].astype(int)

In [23]:
print(turnout.head())

  county_desc    county_pct  2014
0      YANCEY  YANCEY06 JAC     0
1      YANCEY  YANCEY06 JAC     1
2      YANCEY  YANCEY06 JAC     0
3      YANCEY  YANCEY06 JAC     1
4      YANCEY  YANCEY06 JAC     1


Strip out blanks in 'county_pct'

In [24]:
turnout['county_pct'] = turnout['county_pct'].map(lambda x: x.replace(" ", ""))
print(turnout.head())

  county_desc   county_pct  2014
0      YANCEY  YANCEY06JAC     0
1      YANCEY  YANCEY06JAC     1
2      YANCEY  YANCEY06JAC     0
3      YANCEY  YANCEY06JAC     1
4      YANCEY  YANCEY06JAC     1


Use pandas pivot table functionality to create a table with turnout by count_precinct combination

First, we create a custome aggfunc to yield the fraction voted

In [25]:
def fraction_one(x):
    frac = np.sum(x)/np.size(x)
    return(frac)

Create a Pivot Table with index of county_precinct and column and aggregation '2014'

In [26]:
turnout_2014 = pd.pivot_table(turnout, index=['county_pct'], values=['2014'],  
                                 aggfunc=fraction_one,  margins = False) 

In [27]:
print len(turnout_2014)
print(turnout_2014.head())

2726
                 2014
county_pct           
ALAMANCE01   0.475307
ALAMANCE02   0.523978
ALAMANCE035  0.369405
ALAMANCE03C  0.631670
ALAMANCE03N  0.471780


Convert to a dataframe

In [28]:
turnout_2014 = pd.DataFrame.from_records(turnout_2014.to_records())
print turnout_2014.head()

    county_pct      2014
0   ALAMANCE01  0.475307
1   ALAMANCE02  0.523978
2  ALAMANCE035  0.369405
3  ALAMANCE03C  0.631670
4  ALAMANCE03N  0.471780


## Now we do the same thing for 2015

In [29]:
local.database = 'registration2015'
local.table = 'VR_Snapshot_20151103'

I want all rows with status codes 'A' and 'I'

In [30]:
condition1 = "[status_cd] = 'A'"
condition2 = "[status_cd] = 'I'"
condition = "(" + condition1 + " OR " + condition2 + ")"  
print condition

([status_cd] = 'A' OR [status_cd] = 'I')


In [31]:
#Set up the fields that I need to pull from the database 
fields = '[county_desc],[ncid],[precinct_abbrv]'
print fields

[county_desc],[ncid],[precinct_abbrv]


In [32]:
#Pull the data from the database
demographics=pullODBC(local.driver,
                      local.server,
                      local.database,
                      local.username,
                      local.password,
                      local.table,
                      fields,
                      condition)

In [33]:
print(len(demographics))
print demographics.tail(n=15)
raw_len = len(demographics)# Save the number of rows

6404860
        county_desc    ncid precinct_abbrv
6404845      WILSON  EP6063           PRWQ
6404846      WILSON  EP6048           PRGA
6404847      WILSON  EP4667           PRWJ
6404848      WILSON  EP5979           PRTA
6404849      WILSON  EP5989           PRWB
6404850      WILSON  EP7455           PRWM
6404851      WILSON  EP7569           PRWB
6404852      WILSON  EP7556           PRWE
6404853      WILSON  EP7557           PRWN
6404854      WILSON  EP7465           PRTA
6404855      WILSON  EP7599           PRBL
6404856      WILSON  EP7392           PRWM
6404857      WILSON  EP7420           PROL
6404858      WILSON  EP7104           PRWC
6404859      WILSON  EP7263           PRWK


We now need to delete any rows with blank county_desc, ncid, or precinct_abbrv.  We do that by first changing blans to NaNs, and the deleting any rows with NaNs

In [34]:
def fix_blanks_NaN(x):
    if x == "":
        return np.nan
    elif x == " ":
        return np.nan
    else:
        return x

In [35]:
#Change blank fields to Nan
fields = demographics.columns
for col in fields:
    demographics[col] = demographics[col].apply(fix_blanks_NaN)
#Next, delete any rows with NaN and disply counts
demographics.dropna(inplace=True, how='any') #drop all rows with NaN
clean_len=len(demographics) #how many rows to I have left?
difference = raw_len - clean_len #how many rows have I dropped?
fraction = float(difference)/float(raw_len) #what fraction of rows have I dropped?
print "The number of rows I have dropped is " + str(difference) 
print "The number of rows remaining in the database is " + str(clean_len)
print "The precentage of rows that I have dropped is: " + str('{:.3%}'.format(fraction))

The number of rows I have dropped is 4690
The number of rows remaining in the database is 6400170
The precentage of rows that I have dropped is: 0.073%


Now we do a left join of elections_flattened to demographics since we use the 2015 registration data as our denominator for 2015 turnout

In [36]:
combined_2015 = pd.merge(demographics, elections_flattened, how='left', on='ncid')
print(len(combined_2015), len(demographics), len(elections_flattened))# combined should have the same number of rows as demographics

(6400170, 6400170, 6490208)


Let's take a look at combined_2015

In [37]:
print(combined_2015.tail(n=15))

        county_desc    ncid precinct_abbrv  Unnamed: 0   2007  2008   2009  \
6400155      WILSON  EP6063           PRWQ   1447440.0  False  True  False   
6400156      WILSON  EP6048           PRGA   6432558.0  False  True  False   
6400157      WILSON  EP4667           PRWJ   1445871.0   True  True  False   
6400158      WILSON  EP5979           PRTA   6432263.0  False  True  False   
6400159      WILSON  EP5989           PRWB   1447350.0   True  True  False   
6400160      WILSON  EP7455           PRWM   1449191.0  False  True  False   
6400161      WILSON  EP7569           PRWB   1449329.0   True  True   True   
6400162      WILSON  EP7556           PRWE   1449311.0   True  True  False   
6400163      WILSON  EP7557           PRWN   1449312.0   True  True  False   
6400164      WILSON  EP7465           PRTA   6440465.0  False  True  False   
6400165      WILSON  EP7599           PRBL   6441291.0  False  True  False   
6400166      WILSON  EP7392           PRWM         NaN    NaN   

We now need to fill all the NaNs with False.  The only NaNs are for registered voters without voting records that were created by the left join

In [38]:
combined_2015.fillna(value=False, method=None, axis=None, inplace=True, limit=None, downcast=None)

In [39]:
print(combined_2015.tail(n=15))

        county_desc    ncid precinct_abbrv  Unnamed: 0   2007   2008   2009  \
6400155      WILSON  EP6063           PRWQ   1447440.0  False   True  False   
6400156      WILSON  EP6048           PRGA   6432558.0  False   True  False   
6400157      WILSON  EP4667           PRWJ   1445871.0   True   True  False   
6400158      WILSON  EP5979           PRTA   6432263.0  False   True  False   
6400159      WILSON  EP5989           PRWB   1447350.0   True   True  False   
6400160      WILSON  EP7455           PRWM   1449191.0  False   True  False   
6400161      WILSON  EP7569           PRWB   1449329.0   True   True   True   
6400162      WILSON  EP7556           PRWE   1449311.0   True   True  False   
6400163      WILSON  EP7557           PRWN   1449312.0   True   True  False   
6400164      WILSON  EP7465           PRTA   6440465.0  False   True  False   
6400165      WILSON  EP7599           PRBL   6441291.0  False   True  False   
6400166      WILSON  EP7392           PRWM         0

Next, we create a new field that combines county_desc with precinct_abbrv

In [40]:
combined_2015['county_pct'] = combined_2015['county_desc']+combined_2015['precinct_abbrv']

In [41]:
print(combined_2015.head(n=15))

   county_desc      ncid precinct_abbrv  Unnamed: 0   2007   2008   2009  \
0     ALAMANCE  AA108595            063       686.0  False   True  False   
1     ALAMANCE  AA108596             13   1458376.0  False   True  False   
2     ALAMANCE  AA108599           1210   1458377.0  False   True  False   
3     ALAMANCE  AA108604            09S         0.0  False  False  False   
4     ALAMANCE  AA108605            08N   1458379.0  False  False  False   
5     ALAMANCE  AA108607            09S   1458380.0  False   True  False   
6     ALAMANCE  AA108610            10N   1458381.0  False   True  False   
7     ALAMANCE  AA108611            126   1458382.0  False   True  False   
8     ALAMANCE  AA108612            10N   1458383.0  False   True  False   
9     ALAMANCE  AA108613            10N   1458384.0  False   True  False   
10    ALAMANCE  AA108616           03N2       687.0   True   True  False   
11    ALAMANCE  AA108619             07   1458387.0  False   True  False   
12    ALAMAN

Subset the dataframe to keep only the needed columns

In [42]:
turnout = combined_2015.ix[:,['county_desc',  'county_pct', '2015']]

Convert '2015' to 1s and 0s

In [43]:
turnout['2015'] = turnout['2015'].astype(int)

In [44]:
print(turnout.head())

  county_desc    county_pct  2015
0    ALAMANCE   ALAMANCE063     0
1    ALAMANCE    ALAMANCE13     0
2    ALAMANCE  ALAMANCE1210     0
3    ALAMANCE   ALAMANCE09S     0
4    ALAMANCE   ALAMANCE08N     0


Strip out blanks in 'county_pct'

In [45]:
turnout['county_pct'] = turnout['county_pct'].map(lambda x: x.replace(" ", ""))
print(turnout.head())

  county_desc    county_pct  2015
0    ALAMANCE   ALAMANCE063     0
1    ALAMANCE    ALAMANCE13     0
2    ALAMANCE  ALAMANCE1210     0
3    ALAMANCE   ALAMANCE09S     0
4    ALAMANCE   ALAMANCE08N     0


Use pandas pivot table functionality to create a table with turnout by count_precinct combination

Create a Pivot Table with index of county_precinct and column and aggregation '2015'

In [46]:
turnout_2015 = pd.pivot_table(turnout, index=['county_pct'], values=['2015'],  
                                 aggfunc=fraction_one,  margins = False) # Use the aggfunc created for 2014

In [47]:
print len(turnout_2015)
print(turnout_2015.head())

2710
                 2015
county_pct           
ALAMANCE01   0.000000
ALAMANCE02   0.009596
ALAMANCE035  0.010371
ALAMANCE03C  0.279386
ALAMANCE03N  0.098882


Convert to a dataframe

In [48]:
turnout_2015 = pd.DataFrame.from_records(turnout_2015.to_records())
print turnout_2015.head()

    county_pct      2015
0   ALAMANCE01  0.000000
1   ALAMANCE02  0.009596
2  ALAMANCE035  0.010371
3  ALAMANCE03C  0.279386
4  ALAMANCE03N  0.098882


## Now we do the same thing for 2016

In [49]:
local.database = 'registration2016'
local.table = 'VR_Snapshot_20161108'

I want all rows with status codes 'A' and 'I'

In [50]:
condition1 = "[status_cd] = 'A'"
condition2 = "[status_cd] = 'I'"
condition = "(" + condition1 + " OR " + condition2 + ")"  
print condition

([status_cd] = 'A' OR [status_cd] = 'I')


In [51]:
#Set up the fields that I need to pull from the database 
fields = '[county_desc],[ncid],[precinct_abbrv]'
print fields

[county_desc],[ncid],[precinct_abbrv]


In [52]:
#Pull the data from the database
demographics=pullODBC(local.driver,
                      local.server,
                      local.database,
                      local.username,
                      local.password,
                      local.table,
                      fields,
                      condition)

In [53]:
print(len(demographics))
print demographics.tail(n=15)
raw_len = len(demographics)# Save the number of rows

6918379
        county_desc     ncid precinct_abbrv
6918364      YADKIN  ER28291           EBND
6918365      YADKIN  ER28351           NLIB
6918366      YADKIN  ER28436           NKNB
6918367      YADKIN  ER28439           BNVL
6918368      YADKIN  ER28378           EBND
6918369      YADKIN  ER28380           NLIB
6918370      YADKIN  ER28408           NKNB
6918371      YADKIN  ER28409           EBND
6918372      YADKIN  ER28615           EBND
6918373      YADKIN  ER28616           FBSH
6918374      YADKIN  ER28587           SLIB
6918375      YADKIN  ER28614           DCRK
6918376      YADKIN  ER28644           BNVL
6918377      YADKIN  ER28646           NLIB
6918378      YADKIN  ER28647           EBND


We now need to delete any rows with blank county_desc, ncid, or precinct_abbrv.  We do that by first changing blans to NaNs, and the deleting any rows with NaNs

In [54]:
def fix_blanks_NaN(x):
    if x == "":
        return np.nan
    elif x == " ":
        return np.nan
    else:
        return x

In [55]:
#Change blank fields to Nan
fields = demographics.columns
for col in fields:
    demographics[col] = demographics[col].apply(fix_blanks_NaN)
#Next, delete any rows with NaN and disply counts
demographics.dropna(inplace=True, how='any') #drop all rows with NaN
clean_len=len(demographics) #how many rows to I have left?
difference = raw_len - clean_len #how many rows have I dropped?
fraction = float(difference)/float(raw_len) #what fraction of rows have I dropped?
print "The number of rows I have dropped is " + str(difference) 
print "The number of rows remaining in the database is " + str(clean_len)
print "The precentage of rows that I have dropped is: " + str('{:.3%}'.format(fraction))

The number of rows I have dropped is 3896
The number of rows remaining in the database is 6914483
The precentage of rows that I have dropped is: 0.056%


Now we do a left join of elections_flattened to demographics since we use the 2016 registration data as our denominator for 2016 turnout

In [56]:
combined_2016 = pd.merge(demographics, elections_flattened, how='left', on='ncid')
print(len(combined_2016), len(demographics), len(elections_flattened))# combined should have the same number of rows as demographics

(6914483, 6914483, 6490208)


Let's take a look at combined_2016

In [57]:
print(combined_2016.tail(n=15))

        county_desc     ncid precinct_abbrv  Unnamed: 0   2007   2008   2009  \
6914468      YADKIN  ER28291           EBND   6464824.0  False   True  False   
6914469      YADKIN  ER28351           NLIB   6464851.0  False   True  False   
6914470      YADKIN  ER28436           NKNB         NaN    NaN    NaN    NaN   
6914471      YADKIN  ER28439           BNVL   6464894.0  False   True  False   
6914472      YADKIN  ER28378           EBND   6464864.0  False   True  False   
6914473      YADKIN  ER28380           NLIB   6464865.0  False   True  False   
6914474      YADKIN  ER28408           NKNB         NaN    NaN    NaN    NaN   
6914475      YADKIN  ER28409           EBND         NaN    NaN    NaN    NaN   
6914476      YADKIN  ER28615           EBND   6464970.0  False   True  False   
6914477      YADKIN  ER28616           FBSH   6464971.0  False   True  False   
6914478      YADKIN  ER28587           SLIB   6464954.0  False  False  False   
6914479      YADKIN  ER28614           D

We now need to fill all the NaNs with False.  The only NaNs are for registered voters without voting records that were created by the left join

In [58]:
combined_2016.fillna(value=False, method=None, axis=None, inplace=True, limit=None, downcast=None)

In [59]:
print(combined_2016.tail(n=15))

        county_desc     ncid precinct_abbrv  Unnamed: 0   2007   2008   2009  \
6914468      YADKIN  ER28291           EBND   6464824.0  False   True  False   
6914469      YADKIN  ER28351           NLIB   6464851.0  False   True  False   
6914470      YADKIN  ER28436           NKNB         0.0  False  False  False   
6914471      YADKIN  ER28439           BNVL   6464894.0  False   True  False   
6914472      YADKIN  ER28378           EBND   6464864.0  False   True  False   
6914473      YADKIN  ER28380           NLIB   6464865.0  False   True  False   
6914474      YADKIN  ER28408           NKNB         0.0  False  False  False   
6914475      YADKIN  ER28409           EBND         0.0  False  False  False   
6914476      YADKIN  ER28615           EBND   6464970.0  False   True  False   
6914477      YADKIN  ER28616           FBSH   6464971.0  False   True  False   
6914478      YADKIN  ER28587           SLIB   6464954.0  False  False  False   
6914479      YADKIN  ER28614           D

Next, we create a new field that combines county_desc with precinct_abbrv

In [60]:
combined_2016['county_pct'] = combined_2016['county_desc']+combined_2016['precinct_abbrv']

In [61]:
print(combined_2016.head(n=15))

   county_desc     ncid precinct_abbrv  Unnamed: 0   2007   2008   2009  \
0     ALAMANCE  AA95530            063     14867.0   True   True   True   
1     ALAMANCE  AA95533            06N         0.0  False  False  False   
2     ALAMANCE  AA95534            03C     14868.0  False   True   True   
3     ALAMANCE  AA95538             01   1531195.0  False   True  False   
4     ALAMANCE  AA95549            03W   1531197.0  False   True  False   
5     ALAMANCE  AA95553            10S   1531198.0  False   True  False   
6     ALAMANCE  AA95972            127   1531344.0  False   True  False   
7     ALAMANCE  AA95980            063   1531346.0  False   True  False   
8     ALAMANCE  AA95981            063   1531347.0  False   True  False   
9     ALAMANCE  AA95982             11   1531348.0  False   True  False   
10    ALAMANCE  AA95983            063     14906.0  False   True  False   
11    ALAMANCE  AA95985            127   1531349.0  False   True  False   
12    ALAMANCE  AA95987  

Subset the dataframe to keep only the needed columns

In [62]:
turnout = combined_2016.ix[:,['county_desc',  'county_pct', '2016']]

Convert '2016' to 1s and 0s

In [63]:
turnout['2016'] = turnout['2016'].astype(int)

In [64]:
print(turnout.head())

  county_desc   county_pct  2016
0    ALAMANCE  ALAMANCE063     1
1    ALAMANCE  ALAMANCE06N     0
2    ALAMANCE  ALAMANCE03C     1
3    ALAMANCE   ALAMANCE01     1
4    ALAMANCE  ALAMANCE03W     1


Strip out blanks in 'county_pct'

In [65]:
turnout['county_pct'] = turnout['county_pct'].map(lambda x: x.replace(" ", ""))
print(turnout.head())

  county_desc   county_pct  2016
0    ALAMANCE  ALAMANCE063     1
1    ALAMANCE  ALAMANCE06N     0
2    ALAMANCE  ALAMANCE03C     1
3    ALAMANCE   ALAMANCE01     1
4    ALAMANCE  ALAMANCE03W     1


Use pandas pivot table functionality to create a table with turnout by count_precinct combination

Create a Pivot Table with index of county_precinct and column and aggregation '2016'

In [66]:
turnout_2016 = pd.pivot_table(turnout, index=['county_pct'], values=['2016'],  
                                 aggfunc=fraction_one,  margins = False) # Use the aggfunc created for 2014

In [67]:
print len(turnout_2016)
print(turnout_2016.head())

2704
                 2016
county_pct           
ALAMANCE01   0.733687
ALAMANCE02   0.764010
ALAMANCE035  0.599120
ALAMANCE03C  0.804057
ALAMANCE03N  0.685009


Convert to a dataframe

In [68]:
turnout_2016 = pd.DataFrame.from_records(turnout_2016.to_records())
print turnout_2016.head()

    county_pct      2016
0   ALAMANCE01  0.733687
1   ALAMANCE02  0.764010
2  ALAMANCE035  0.599120
3  ALAMANCE03C  0.804057
4  ALAMANCE03N  0.685009


## Now we will use outer joins to combine the 3 dataframes

In [69]:
turnout_all = pd.merge(turnout_2016, turnout_2015, how='outer', on='county_pct')
print(len(turnout_all), len(turnout_2016), len(turnout_2015))# turnout_all  should have more rows than 2016 and 2015
turnout_all = pd.merge(turnout_all, turnout_2014, how='outer', on='county_pct')
print(len(turnout_all), len(turnout_2014))# the new turnout_all should have more rows than the old one and 2014

(2718, 2704, 2710)
(2777, 2726)


Now we write this to a csv for analysis and inclusion as 3 dimensions in precinct clustering

In [70]:
turnout_all.to_csv('turnout_all.csv')