##  We're now going to read the turnout data, make predictions, and analyze turnout by precinct for 2016 based upon voter preferences

In [1]:
# First set everything up
import pyodbc
import graphlab as gl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import local #server information and credentials
from pullODBC import pullODBC
from pullODBCall import pullODBCall
from matplotlib.backends.backend_pdf import PdfPages
from __future__ import division # convert integer division to floating point
%matplotlib inline
# Plot styling shared by every figure drawn in this notebook.
plt.style.use('seaborn-white')
plt.rcParams['font.size'] = 7
# The original line here was `plt.tight_layout;`, which only references the
# function without calling it and therefore has no effect. Enabling the
# 'figure.autolayout' rcParam applies tight_layout to each figure when it is drawn.
plt.rcParams['figure.autolayout'] = True
plt.rc('xtick', labelsize=7)
plt.rc('ytick', labelsize=7)

### Section A: Analyzing Turnout by Precinct for Voters Affiliated with a Party

Read in the csv for the combined data: county, precinct, party code, and 2016 turnout

In [2]:
# Load the merged voter/turnout file (the latest dataset, so we can predict).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave a column with mixed
# types (the saved cell output looks like the tail of a pandas DtypeWarning).
combined = pd.read_csv('C:/voter/combined.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Number of rows loaded from the combined file
print(len(combined))

6759456


In [4]:
# Sum of the '2016' column over all rows
print(combined['2016'].sum())

4736106.0


Print out the columns of this dataframe

In [5]:
# Show the column index of the loaded frame
print(combined.columns)

Index([u'Unnamed: 0', u'birth_age', u'gender_code', u'race_code',
       u'ethnic_code', u'party_cd', u'status_cd', u'zip_code', u'birth_state',
       u'drivers_lic', u'county_desc', u'registr_dt', u'ncid',
       u'precinct_abbrv', u'Unnamed: 0.1', u'2007', u'2008', u'2009', u'2010',
       u'2011', u'2012', u'2013', u'2014', u'2015', u'2016', u'2018'],
      dtype='object')


Let's get rid of two of the columns added during the merge: 'Unnamed: 0' and 'Unnamed: 0.1'

In [6]:
# Leftover index columns from the earlier merge; dropped in place.
# A missing label raises (the pandas default), so this cell fails if re-run.
merge_artifact_columns = ['Unnamed: 0.1', 'Unnamed: 0']
combined.drop(merge_artifact_columns, axis=1, inplace=True)

Let's count the rows and put the number into raw_len

In [7]:
# Row count before any filtering is applied
raw_len = combined.shape[0]

We need to predict sentiment and add columns to reflect this.  Let's create a function to help calculate years since registered.

In [8]:
def convert_to_float(x):
    """Convert a timedelta-like object to a float number of years.

    x must expose total_seconds(); a year is taken as 365.2422 days.
    """
    seconds_per_year = 60 * 60 * 24 * 365.2422
    elapsed_seconds = x.total_seconds()
    return elapsed_seconds / seconds_per_year

In [9]:
# Age as a 64-bit float, registration date as a pandas datetime
combined['birth_age'] = combined['birth_age'].astype(np.float64)
registration_dates = pd.to_datetime(combined['registr_dt'])
combined['registr_dt'] = registration_dates

Now, we create the interaction variables

In [10]:
# Interaction features: concatenate pairs of the categorical code columns.
combined['race_gender'] = combined['race_code'] + combined['gender_code']
combined['race_ethnicity'] = combined['race_code'] + combined['ethnic_code']
combined['gender_ethnicity'] = combined['gender_code'] + combined['ethnic_code']

# Polynomial terms of age.
combined['age_squared'] = combined['birth_age'] ** 2
combined['age_cubed'] = combined['birth_age'] ** 3

# Years since registration, measured from the day the notebook is run
# (so the value changes with the run date).
now = pd.to_datetime('today')  # get today's date
seconds_per_year = 60 * 60 * 24 * 365.2422
# Vectorised timedelta -> float years; replaces a per-row Python .apply()
# and assumes registr_dt has already been converted to datetime.
combined['years_since_registration'] = (now - combined['registr_dt']).dt.total_seconds() / seconds_per_year
combined['years_squared'] = combined['years_since_registration'] ** 2
combined['years_cubed'] = combined['years_since_registration'] ** 3

# The raw code columns are no longer needed once the interactions exist.
combined = combined.drop(['race_code', 'gender_code', 'ethnic_code'], axis=1)

Next, remove the rows with invalid ages and registration dates

In [11]:
# Keep only rows with age <= 99 and at most 61 years since registration
age_is_valid = combined['birth_age'] <= 99
registration_is_valid = combined['years_since_registration'] <= 61
combined = combined[age_is_valid & registration_is_valid]

Now, we need to change the values in all 'year' fields to numeric.  We start by using pd.to_numeric on each column, coercing every value that cannot be converted to NaN

In [12]:
# Election-year columns; values that cannot be parsed as numbers become NaN
year_columns = ('2007', '2008', '2009', '2010', '2011', '2012',
                '2013', '2014', '2015', '2016', '2018')
for year_col in year_columns:
    numeric_values = pd.to_numeric(combined[year_col], errors='coerce')
    combined[year_col] = numeric_values

We now have NaN values in some of the 'year' columns.  We now need to change these all to 0.

In [13]:
def clean_NaN(x):
    """Return 0 when x is null according to pd.isnull, otherwise x unchanged."""
    return 0 if pd.isnull(x) else x

In [14]:
# Replace the NaN entries in every election-year column with 0
turnout_columns = ('2007', '2008', '2009', '2010', '2011', '2012',
                   '2013', '2014', '2015', '2016', '2018')
for year_col in turnout_columns:
    cleaned_values = combined[year_col].apply(clean_NaN)
    combined[year_col] = cleaned_values

Convert precinct_abbrv to a string

In [16]:
# Store precinct abbreviations as strings
combined['precinct_abbrv'] = combined['precinct_abbrv'].astype(str)

In [17]:
# The model predicts on GraphLab SFrames, so wrap the pandas DataFrame in one
combinedS = gl.SFrame(combined)

This non-commercial license of GraphLab Create for academic use is assigned to scottsmi@live.unc.edu and will expire on May 15, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Scott\AppData\Local\Temp\graphlab_server_1500990486.log.0


Load model file for predictions

In [18]:
# Load the saved GraphLab model from the relative path 'party_model_file'
# (resolved against the kernel's working directory).
model = gl.load_model('party_model_file')

Let's create two new functions to make predictions based upon a specific threshold (our new decision boundary)

In [19]:
def return_actual_or_predicted_party(x):
    """Return the row's party: the registered one if DEM/REP, else the predicted one.

    x is a mapping with 'party_cd' and, for rows that are neither DEM nor REP,
    'predict_positive'. A positive prediction maps to "REP", anything else to "DEM".
    """
    registered_party = x['party_cd']
    if registered_party == "DEM":
        return "DEM"
    if registered_party == "REP":
        return "REP"
    # Not affiliated with either major party: fall back on the model's prediction
    return "REP" if x['predict_positive'] == True else "DEM"
    

def get_model_predictions(model, sframe, threshold):
    """Add prediction columns to sframe and return it.

    Three columns are written onto the passed-in sframe itself (it is not
    copied): 'percent', 'predict_positive' and 'actual_or_predicted'.

    model     -- object whose predict(sframe, output_type='probability') yields
                 one probability per row
    sframe    -- frame to score; presumably a GraphLab SFrame with a 'party_cd'
                 column, as return_actual_or_predicted_party reads it -- TODO confirm
    threshold -- decision boundary; rows with probability >= threshold are positive
    """
    # Model-generated probability for each row.
    # NOTE(review): presumably the probability of "REP", since a positive
    # prediction is mapped to "REP" downstream -- confirm against the model.
    sframe['percent'] = model.predict(sframe, output_type='probability') # add a column with the model generated probabilities
    # Flag rows whose probability reaches the threshold.
    sframe['predict_positive'] = sframe['percent'] >= threshold # add a column that is true when a positive is predicted, false otherwise
    # Registered party where it is DEM/REP, otherwise the predicted party.
    sframe['actual_or_predicted']= sframe.apply(return_actual_or_predicted_party)# add this column to contain the actual or predicted party preference
    return sframe

Let's make predictions for the entire state of North Carolina

In [20]:
# Decision boundary used to turn the model's probability into a positive prediction
decision_threshold = 0.56
new_sframe = get_model_predictions(model, combinedS, decision_threshold)

Now convert back to a pandas dataframe

In [21]:
# Rebind `combined` to a pandas DataFrame built from the scored SFrame,
# which now carries the prediction columns.
combined = new_sframe.to_dataframe()

Create a custom aggfunc to compute the fraction of rows with '1' in column '2016'

In [22]:
def percent_non_zero(x):
    """Return the sum of x divided by its number of elements.

    For a column holding only 0/1 values this is the fraction of rows equal to 1.
    """
    total = np.sum(x)
    count = np.size(x)
    return total / count

Create a pivot table indexed by county and precinct, with one column per actual_or_predicted party, aggregating the 2016 turnout column

In [23]:
# One row per (county, precinct), one column per actual-or-predicted party;
# each cell is percent_non_zero applied to that group's '2016' values.
registered_by_precinct = combined.pivot_table(
    values=['2016'],
    index=['county_desc', 'precinct_abbrv'],
    columns=['actual_or_predicted'],
    aggfunc=percent_non_zero,
)


In [24]:
# Render floats as percentages with one decimal place
pd.set_option('display.float_format', '{:.1%}'.format)
# Round-trip through records to flatten the pivot table: the index levels
# become ordinary columns and the column tuples become plain string labels.
pivot_records = registered_by_precinct.to_records()
new_dataframe = pd.DataFrame.from_records(pivot_records)
new_dataframe.columns




Index([u'county_desc', u'precinct_abbrv', u'('2016', 'DEM')',
       u'('2016', 'REP')'],
      dtype='object')

In [25]:
# Save the per-precinct turnout table to an absolute local path.
# NOTE(review): with to_csv's defaults the row index is written as an unnamed
# first column, which reads back as 'Unnamed: 0' -- confirm this is intended.
new_dataframe.to_csv('C:/voter/turnout_predicted.csv')

In [29]:
# Show the flattened column labels
print(new_dataframe.columns)

Index([u'county_desc', u'precinct_abbrv', u'('2016', 'DEM')',
       u'('2016', 'REP')'],
      dtype='object')


In [31]:
# Index the table by (county, precinct) and keep the two party columns, REP first.
# The columns are selected with a list: selecting several columns from a
# groupby with a bare tuple (gb["a", "b"]) is deprecated in pandas and
# rejected by current releases, while the list form works on every version.
turnout_columns = ["('2016', 'REP')", "('2016', 'DEM')"]
new_dataframe = new_dataframe.groupby(['county_desc', 'precinct_abbrv'])[turnout_columns].mean()

In [33]:
# Swap the stringified tuple labels for short party names
party_labels = {
    "('2016', 'DEM')": "DEM",
    "('2016', 'REP')": "REP",
}
new_dataframe = new_dataframe.rename(columns=party_labels)

In [37]:
# Two views of the same table, highest value first (NaN rows sort last by default)
by_republican = new_dataframe.sort_values('REP', ascending=False)
by_democrat = new_dataframe.sort_values('DEM', ascending=False)

In [39]:
pd.set_option('max_rows', 2000)
%store by_democrat > precinct_turnout_prediction_sorted_democrat.txt
%store by_republican > precinct_turnout_prediction_sorted_republican.txt

Writing 'by_democrat' (DataFrame) to file 'precinct_turnout_prediction_sorted_democrat.txt'.
Writing 'by_republican' (DataFrame) to file 'precinct_turnout_prediction_sorted_republican.txt'.
