##  We're going to load the turnout data, flatten it, and add it to key demographic data to create a comprehensive database table for query and predictions

In [1]:
#First set everthing up
import pyodbc
import graphlab as gl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import local #server information and credentials
from pullODBC import pullODBC
from pullODBCall import pullODBCall
from matplotlib.backends.backend_pdf import PdfPages
from __future__ import division # convert integer division to floating point
%matplotlib inline
plt.style.use('seaborn-white')
plt.rcParams['font.size'] = 7
plt.tight_layout;
plt.rc('xtick', labelsize=7) 
plt.rc('ytick', labelsize=7) 

### 0.1.1 Section A: Extracting Voter History Data for each Registered Voter

This is now in a static csv, so we don't need to pull this from the database any more.

## 0.1.2 Section B: Extracting Voter Demographics (2008)

First change the database and table names back to get registration data

In [2]:
local.database = 'registration2008'
local.table = 'ncvoter_Statewide'

We now do the standard extraction and cleaning.  I've excluded comments since task has been done many times

In [3]:
#Set up the fields that I need to pull from the database 
fields = "birth_age, " + "gender_code, " + "race_code, " + "ethnic_code, " + "party_cd, " + "status_cd, " + "zip_code, " + "birth_state, " + "county_desc, " + "registr_dt, " + "ncid, " + "precinct_abbrv"
print fields

birth_age, gender_code, race_code, ethnic_code, party_cd, status_cd, zip_code, birth_state, county_desc, registr_dt, ncid, precinct_abbrv


In [4]:
condition1 = "status_cd = " + "'A'"
condition2 = "status_cd = " + "'I'" 
condition3 = "county_desc = " + "'Chatham'"
condition = "(" + condition1 + " OR " + condition2 + ")"  # This line will be run on Azure
#condition = "(" + condition1 + " OR " + condition2 + ")" + " AND " + condition3# This line will be run on local
print condition

(status_cd = 'A' OR status_cd = 'I')


In [5]:
demographics=pullODBC(local.driver,
                      local.server,
                      local.database,
                      local.username,
                      local.password,
                      local.table,
                      fields,
                      condition)

Get rid of all the double quotes from the dataframe

In [6]:
demographics.columns = demographics.columns.str.strip('"')
for i, col in enumerate(demographics.columns):
    demographics.iloc[:, i] = demographics.iloc[:, i].str.replace('"', '')
print demographics.head()

  birth_age gender_code race_code ethnic_code party_cd status_cd zip_code  \
0        74           M       B           NL       DEM        A     27207   
1        70           F       W           NL       DEM        A     28086   
2        64           F       W           NL       DEM        A     28472   
3        52           M       W           NL       DEM        A     28586   
4        50           F       W           NL       REP        A     28348   

                      birth_state county_desc           registr_dt    ncid  \
0                                     CHATHAM  1969-09-15 00:00:00  AX2144   
1  NC                               CLEVELAND  1964-10-17 00:00:00  BB2130   
2  NC                                COLUMBUS  1978-03-30 00:00:00  BC2200   
3  NC                                  CRAVEN  1984-04-09 00:00:00  BD2222   
4  MI                              CUMBERLAND  1993-03-11 00:00:00  BE2235   

  precinct_abbrv  
0         45      
1         KM 2    
2         P

In [7]:
#Save the total number of rows to raw_len and print. 
raw_len=len(demographics)
print "The number of rows pulled from the database is " + str(raw_len)

The number of rows pulled from the database is 6264730


Create a generic function to change blank fields to NaN for all categorical fields 

In [8]:
def fix_blanks_NaN(x):
    if x == "":
        return np.nan
    elif x == " ":
        return np.nan
    else:
        return x

In [9]:
def fix_blanks(x):
    if x == "":
        return "None"
    elif x == " ":
        return "None"
    else:
        return x

In [10]:
demographics['birth_state'] = demographics['birth_state'].apply(fix_blanks) # change blanks to "None" in birth_state
demographics['gender_code'] = demographics['gender_code'].apply(fix_blanks_NaN) # change blanks to NaN in gender_code
demographics['race_code'] = demographics['race_code'].apply(fix_blanks_NaN) # etc.
demographics['ethnic_code'] = demographics['ethnic_code'].apply(fix_blanks_NaN) 
demographics['zip_code'] = demographics['zip_code'].apply(fix_blanks_NaN)
demographics['birth_state'] = demographics['birth_state'].apply(fix_blanks_NaN) 
demographics['county_desc'] = demographics['county_desc'].apply(fix_blanks_NaN) 
demographics['registr_dt'] = demographics['registr_dt'].apply(fix_blanks_NaN)
demographics['ncid'] = demographics['ncid'].apply(fix_blanks_NaN)
demographics['precinct_abbrv'] = demographics['precinct_abbrv'].apply(fix_blanks_NaN)

In [11]:
demographics.dropna(inplace=True, how='any') #drop all rows with NaN
clean_len=len(demographics) #how many rows to I have left?
difference = raw_len - clean_len #how many rows have I dropped?
fraction = float(difference)/float(raw_len) #what fraction of rows have I dropped?
print "The number of rows I have dropped is " + str(difference) 
print "The number of rows remaining in the database is " + str(clean_len)
print "The precentage of rows that I have dropped is: " + str('{:.3%}'.format(fraction))

The number of rows I have dropped is 103
The number of rows remaining in the database is 6264627
The precentage of rows that I have dropped is: 0.002%


In [12]:
demographics['birth_age'] = demographics['birth_age'].astype('float64')
demographics['registr_dt'] = pd.to_datetime(demographics['registr_dt'],yearfirst=True, errors='ignore')

In [13]:
elections_flattened = pd.read_csv('elections_flattened.csv')

In [14]:
combined = pd.merge(demographics, elections_flattened, how='left', on='ncid')

In [15]:
print len(combined), len(demographics), len(elections_flattened)

6264627 6264627 6485857


Now save the dataframe 'combined' in a .csv

In [16]:
combined.to_csv('combined2008.csv')