##  We're going to load the turnout data, flatten it, and add it to key demographic data to create a comprehensive database table for query and predictions

In [1]:
#First set everything up
import pyodbc
import graphlab as gl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import local #server information and credentials
from pullODBC import pullODBC
from pullODBCall import pullODBCall
from matplotlib.backends.backend_pdf import PdfPages
from __future__ import division # convert integer division to floating point
%matplotlib inline
# Global matplotlib defaults for every figure drawn in this notebook.
plt.style.use('seaborn-white')
plt.rcParams['font.size'] = 7
# The original statement `plt.tight_layout;` only referenced the function and never
# called it, so it had no effect. Turn on automatic tight layout for all figures instead.
plt.rcParams['figure.autolayout'] = True
plt.rc('xtick', labelsize=7)
plt.rc('ytick', labelsize=7)

### 0.1.1 Section A: Extracting Voter History Data for each Registered Voter

In [2]:
# Set up the fields that I need to pull from the database.
# Each column name is wrapped as ["name"]; presumably the stored column names include
# the double quotes, since they are stripped from the result after the pull.
history_field_names = ['ncid', 'election_lbl', 'county_desc']
fields = ', '.join('["%s"]' % name for name in history_field_names)
print(fields)  # single-argument print() behaves the same on Python 2 and 3

["ncid"], ["election_lbl"], ["county_desc"]


Change the database to 'history' and the table to 'ncvhis_Statewide'

In [3]:
# Repoint the shared connection settings; the pull in the next cell reads
# local.database and local.table.
local.database = 'history'
local.table = 'ncvhis_Statewide'

In [4]:
# Pull the selected fields from the history table (no row filter is passed).
# The following cells use the result as a pandas DataFrame.
history = pullODBCall(
    local.driver, local.server, local.database,
    local.username, local.password,
    local.table, fields,
)

Get rid of all the double quotes from the dataframe

In [5]:
# Column names and values still carry literal double quotes; remove them so the
# rest of the notebook can use plain names and values.
history.columns = history.columns.str.strip('"')
for column_name in history.columns:
    # every column is expected to hold strings (the .str accessor raises otherwise)
    history[column_name] = history[column_name].str.replace('"', '')
print(history.head(n=15))

        ncid election_lbl county_desc
0    AA10737   11/02/2010    ALAMANCE
1    AA10737   11/08/2016    ALAMANCE
2    AA10737   03/15/2016    ALAMANCE
3    AA10737   05/06/2008    ALAMANCE
4    AA10737   11/05/2013    ALAMANCE
5    AA10737   11/04/2008    ALAMANCE
6    AA10737   11/03/2015    ALAMANCE
7   AA107373   11/06/2012    ALAMANCE
8   AA107373   05/08/2012    ALAMANCE
9   AA107373   05/06/2008    ALAMANCE
10  AA107373   05/04/2010    ALAMANCE
11  AA107373   07/17/2012    ALAMANCE
12  AA107373   11/08/2016    ALAMANCE
13  AA107373   03/15/2016    ALAMANCE
14  AA107373   11/04/2014    ALAMANCE


In [6]:
#Save the total number of rows to raw_len and print. 
raw_len=len(history)
print "The number of rows pulled from the database is " + str(raw_len)

The number of rows pulled from the database is 32482824


Create a generic function to change blank fields to NaN for all categorical fields 

In [7]:
def fix_blanks_NaN(x):
    """Return NaN for an empty string or a single space; return any other value unchanged."""
    return np.nan if x == "" or x == " " else x

In [8]:
# Turn blank strings into NaN in each history column so they count as missing.
for column_name in ('election_lbl', 'ncid', 'county_desc'):
    history[column_name] = history[column_name].apply(fix_blanks_NaN)

Clean the data by dropping rows with missing data and print a summary

In [9]:
history.dropna(inplace=True, how='any')  # drop every row that has a NaN in any column
clean_len = len(history)  # how many rows do I have left?
difference = raw_len - clean_len  # how many rows have I dropped?
# what fraction of rows have I dropped? (guarded so an empty pull does not
# raise ZeroDivisionError)
fraction = float(difference) / float(raw_len) if raw_len else 0.0
print("The number of rows I have dropped is " + str(difference))
print("The number of rows remaining in the database is " + str(clean_len))
print("The precentage of rows that I have dropped is: " + '{:.3%}'.format(fraction))

The number of rows I have dropped is 0
The number of rows remaining in the database is 32482824
The precentage of rows that I have dropped is: 0.000%


Now we need to discard all elections not in November, since we only want to analyze general elections

In [10]:
# election_lbl is an MM/DD/YYYY string, so a leading '11' means a November election.
history = history.loc[history['election_lbl'].astype(str).str.startswith('11')]

We need to flatten the database to a single row per NCID and massage the data

In [11]:
# Pivot to one row per ncid and one column per November election date; each cell is
# the number of history rows for that voter and election (NaN when there are none).
elections_by_ncid = pd.pivot_table(history[['ncid', 'election_lbl']], index=['ncid'],
                                   aggfunc=len, columns='election_lbl', margins=False)
elections_flattened = pd.DataFrame(elections_by_ncid.to_records())  # ncid becomes a regular column
election_cols = elections_flattened.columns.delete(0)  # every column except the leading 'ncid'
# No history row means the voter did not vote in that election: NaN -> 0, then integers.
# NOTE(review): cells are row counts, so a voter with duplicate history rows for one
# election would show a value above 1 - confirm whether that can occur.
elections_flattened[election_cols] = elections_flattened[election_cols].fillna(0).astype(int)
# Rename every election column to its 4-digit year in a single call.
# NOTE(review): two November elections in the same year would collide on one name.
elections_flattened = elections_flattened.rename(
    columns={col: col[-4:] for col in election_cols})
# Sort the columns by name (the years first, then 'ncid'); plain column selection
# replaces the deprecated reindex_axis.
elections_flattened = elections_flattened[sorted(elections_flattened.columns)]

print(elections_flattened.head())

   2007  2008  2009  2010  2011  2012  2013  2014  2015  2016  2018      ncid
0     0     1     0     0     0     0     0     0     0     1     0  AA100000
1     0     1     0     1     0     1     0     1     0     1     0  AA100006
2     0     1     0     1     1     1     0     1     0     1     0  AA100007
3     0     1     0     0     0     1     0     0     0     0     0  AA100008
4     0     1     0     1     0     1     0     1     0     1     0  AA100009


In [12]:
# index=False: the row index carries no information here, and writing it would make
# a later pd.read_csv() bring it back as a spurious 'Unnamed: 0' column.
elections_flattened.to_csv('elections_flattened.csv', index=False)

In [13]:
# Release both pivot frames from memory; elections_flattened is read back from its CSV later.
del elections_flattened, elections_by_ncid

In [14]:
# Save the November-only history rows to disk, then release the frame from memory.
history.to_csv('history.csv')
del history

We now have a flat dataframe with one row for each ncid and a column for each year.  The value in that column is 1 if the person with the ncid voted in the general election that year, 0 if they didn't vote

### 0.1.2 Section B: Extracting Voter Demographics (we've done this many times)

First change the database and table names back to get registration data

In [15]:
# Repoint the shared connection settings at the registration data; the pull further
# down reads local.database and local.table.
local.database = 'registration'
local.table = 'ncvoter_Statewide'

We now do the standard extraction and cleaning.  I've excluded comments since this task has been done many times

In [16]:
# Set up the fields that I need to pull from the database.
# The literal is split across lines for readability only; adjacent string literals
# are concatenated, so the value is exactly the original single-line string
# (including its uneven spacing after some commas).
fields = ('["birth_age"], ["gender_code"], ["race_code"], ["ethnic_code"],'
          '["party_cd"], ["status_cd"], ["zip_code"], ["birth_state"], '
          '["drivers_lic"], ["county_desc"], ["registr_dt"],'
          '["ncid"],["precinct_abbrv"]')
print(fields)

["birth_age"], ["gender_code"], ["race_code"], ["ethnic_code"],["party_cd"], ["status_cd"], ["zip_code"], ["birth_state"], ["drivers_lic"], ["county_desc"], ["registr_dt"],["ncid"],["precinct_abbrv"]


In [17]:
def _quoted_equals(field, value):
    """Build '["field"] = <open quote>value<close quote>' using the quote strings from `local`."""
    return '["' + field + '"] = ' + local.single_double_quote + value + local.double_single_quote


condition1 = _quoted_equals('status_cd', 'A')
condition2 = _quoted_equals('status_cd', 'I')
condition3 = _quoted_equals('county_desc', 'Chatham')  # only used by the local variant below
condition = "(" + condition1 + " OR " + condition2 + ")"  # This line will be run on Azure
#condition = "(" + condition1 + " OR " + condition2 + ")" + " AND " + condition3# This line will be run on local
print(condition)

(["status_cd"] = '"A"' OR ["status_cd"] = '"I"')


In [18]:
# Pull the selected registration fields, restricted to rows matching `condition`.
# The following cells use the result as a pandas DataFrame.
demographics = pullODBC(
    local.driver, local.server, local.database,
    local.username, local.password,
    local.table, fields, condition,
)

Get rid of all the double quotes from the dataframe

In [19]:
# Column names and values still carry literal double quotes; remove them so the
# rest of the notebook can use plain names and values.
demographics.columns = demographics.columns.str.strip('"')
for column_name in demographics.columns:
    # every column is expected to hold strings (the .str accessor raises otherwise)
    demographics[column_name] = demographics[column_name].str.replace('"', '')
print(demographics.head())

  birth_age gender_code race_code ethnic_code party_cd status_cd zip_code  \
0        93           F         W          NL      DEM         A    27215   
1        51           F         W          NL      UNA         A    27253   
2        59           M         W          NL      REP         A    27253   
3        68           F         W          NL      REP         A    27217   
4        76           M         W          UN      REP         A    27217   

  birth_state drivers_lic county_desc  registr_dt      ncid precinct_abbrv  
0          SC           Y    ALAMANCE  07/02/1998  AA110814           03N2  
1          PA           Y    ALAMANCE  05/12/2014   DN40123            103  
2          WV           Y    ALAMANCE  05/13/2014   DN37217            103  
3          PA           Y    ALAMANCE  08/14/2001  AA124216             13  
4          IA           Y    ALAMANCE  10/23/2001  AA125111             13  


In [20]:
#Save the total number of rows to raw_len and print. 
raw_len=len(demographics)
print "The number of rows pulled from the database is " + str(raw_len)

The number of rows pulled from the database is 6762419


Create a generic function to change blank fields to NaN for all categorical fields 

In [21]:
# NOTE: this redefines fix_blanks_NaN, which is already defined with the same
# behavior in the history section of this notebook.
def fix_blanks_NaN(x):
    """Return NaN when x is "" or " "; otherwise hand x back untouched."""
    is_blank = x == "" or x == " "
    return np.nan if is_blank else x

Create a generic function to change blank fields to NaN for all categorical fields 

In [22]:
# NOTE: this is a repeated definition of fix_blanks_NaN; earlier cells of this
# notebook already define the same function.
def fix_blanks_NaN(x):
    """Replace a blank value ("" or " ") with NaN; pass every other value through."""
    if x == "" or x == " ":
        return np.nan
    return x

In [23]:
def fix_blanks(x):
    """Return the string "None" for an empty string or a single space; else return x unchanged."""
    return "None" if x == "" or x == " " else x

In [25]:
# birth_state is the one column where a blank is kept, recoded as the string "None".
demographics['birth_state'] = demographics['birth_state'].apply(fix_blanks)
# In the remaining listed columns a blank becomes NaN, which marks the row for removal
# by the dropna that follows. birth_state does not need this second pass: fix_blanks
# has already replaced its blanks, so fix_blanks_NaN could never match there.
columns_with_required_values = ['gender_code', 'race_code', 'ethnic_code', 'zip_code',
                                'drivers_lic', 'county_desc', 'registr_dt', 'ncid',
                                'precinct_abbrv']
for column_name in columns_with_required_values:
    demographics[column_name] = demographics[column_name].apply(fix_blanks_NaN)

In [26]:
demographics.dropna(inplace=True, how='any')  # drop every row that has a NaN in any column
clean_len = len(demographics)  # how many rows do I have left?
difference = raw_len - clean_len  # how many rows have I dropped?
# what fraction of rows have I dropped? (guarded so an empty pull does not
# raise ZeroDivisionError)
fraction = float(difference) / float(raw_len) if raw_len else 0.0
print("The number of rows I have dropped is " + str(difference))
print("The number of rows remaining in the database is " + str(clean_len))
print("The precentage of rows that I have dropped is: " + '{:.3%}'.format(fraction))

The number of rows I have dropped is 2963
The number of rows remaining in the database is 6759456
The precentage of rows that I have dropped is: 0.044%


In [27]:
# Ages were pulled as strings; convert them to floats.
demographics['birth_age'] = demographics['birth_age'].astype('float64')
# Registration dates are MM/DD/YYYY strings (e.g. 07/02/1998, 05/13/2014). Passing the
# format explicitly avoids per-value format guessing on millions of rows and removes
# any day/month ambiguity.
demographics['registr_dt'] = pd.to_datetime(demographics['registr_dt'], format='%m/%d/%Y')

In [28]:
# Reload the per-voter election table saved earlier in this notebook; its 'ncid'
# column is the join key for the merge in the next cell.
elections_flattened = pd.read_csv('elections_flattened.csv')

In [29]:
# Left join on ncid: every demographics row is kept, and voters with no matching row
# in elections_flattened get NaN in the election-year columns.
combined = demographics.merge(elections_flattened, on='ncid', how='left')

In [30]:
# Sanity check: the left join should leave exactly one combined row per demographics row.
print("%d %d %d" % (len(combined), len(demographics), len(elections_flattened)))

6759456 6759456 6485857


Now save the dataframe 'combined' in a .csv

In [31]:
# Write the merged demographics + election table to combined.csv
# (default to_csv settings, so the row index is written as the first column).
combined.to_csv('combined.csv')

In [32]:
# Preview the first rows of the merged table.
print(combined.head())

   birth_age gender_code race_code ethnic_code party_cd status_cd zip_code  \
0       93.0           F         W          NL      DEM         A    27215   
1       51.0           F         W          NL      UNA         A    27253   
2       59.0           M         W          NL      REP         A    27253   
3       68.0           F         W          NL      REP         A    27217   
4       76.0           M         W          UN      REP         A    27217   

  birth_state drivers_lic county_desc  ...  2008 2009 2010  2011  2012  2013  \
0          SC           Y    ALAMANCE  ...   1.0  0.0  1.0   1.0   1.0   0.0   
1          PA           Y    ALAMANCE  ...   1.0  0.0  0.0   0.0   1.0   0.0   
2          WV           Y    ALAMANCE  ...   1.0  0.0  1.0   0.0   1.0   0.0   
3          PA           Y    ALAMANCE  ...   1.0  0.0  1.0   0.0   1.0   0.0   
4          IA           Y    ALAMANCE  ...   1.0  0.0  1.0   0.0   1.0   0.0   

   2014  2015  2016  2018  
0   1.0   0.0   1.0   