##  We're going calculate vote fraction for Trump and Newton by county/precinct combination to use as 2 dimensions for our clustering work

In [1]:
#First set everthing up
import pyodbc
import graphlab as gl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import config #server information and credentials
from pullODBC import pullODBC
from pullODBCall import pullODBCall
from matplotlib.backends.backend_pdf import PdfPages
from __future__ import division # convert integer division to floating point
%matplotlib inline
plt.style.use('seaborn-white')
plt.rcParams['font.size'] = 7
plt.tight_layout;
plt.rc('xtick', labelsize=7) 
plt.rc('ytick', labelsize=7) 

In [2]:
#Set up the fields that I need to pull from the database 
fields = 'candidate, contest_name, election_date, precinct, total_votes, county'
print fields

candidate, contest_name, election_date, precinct, total_votes, county


In [3]:
#Set up the condition
condition0 = "contest_name = 'NC ATTORNEY GENERAL'"
condition1 = "contest_name = 'US PRESIDENT'"
condition2 = "election_date='2016-11-08'"
condition = "(" + condition0 + " OR " + condition1 + ") AND " + condition2
print condition

(contest_name = 'NC ATTORNEY GENERAL' OR contest_name = 'US PRESIDENT') AND election_date='2016-11-08'


Read in the results for President and Attorney General for 2016 General Election

In [4]:
#Pull the data from the database
results=pullODBC(config.driver,
                      config.server,
                      config.database,
                      config.username,
                      config.password,
                      config.table,
                      fields,
                      condition)

In [5]:
print(len(results))
print results.tail(n=15)
raw_len = len(results)# Save the number of rows

22459
                      candidate         contest_name election_date precinct  \
22444           Hillary Clinton         US PRESIDENT    2016-11-08       W3   
22445                Josh Stein  NC ATTORNEY GENERAL    2016-11-08     50.1   
22446                Josh Stein  NC ATTORNEY GENERAL    2016-11-08     KM N   
22447                Josh Stein  NC ATTORNEY GENERAL    2016-11-08     MARB   
22448           Hillary Clinton         US PRESIDENT    2016-11-08     PR32   
22449              Gary Johnson         US PRESIDENT    2016-11-08       SH   
22450     Jill Stein (Write-In)         US PRESIDENT    2016-11-08     EKWD   
22451     Jill Stein (Write-In)         US PRESIDENT    2016-11-08       T1   
22452           Donald J. Trump         US PRESIDENT    2016-11-08     GC09   
22453              Gary Johnson         US PRESIDENT    2016-11-08       30   
22454               Buck Newton  NC ATTORNEY GENERAL    2016-11-08     ROWA   
22455                Josh Stein  NC ATTORNEY G

We now need to clean the data to remove any rows with blank columns

In [6]:
def fix_blanks_NaN(x):
    if x == "":
        return np.nan
    elif x == " ":
        return np.nan
    else:
        return x

In [7]:
#Change blank fields to Nan
fields = results.columns
for col in fields:
    results[col] = results[col].apply(fix_blanks_NaN)
#Next, delete any rows with NaN and disply counts
results.dropna(inplace=True, how='any') #drop all rows with NaN
clean_len=len(results) #how many rows to I have left?
difference = raw_len - clean_len #how many rows have I dropped?
fraction = float(difference)/float(raw_len) #what fraction of rows have I dropped?
print "The number of rows I have dropped is " + str(difference) 
print "The number of rows remaining in the database is " + str(clean_len)
print "The precentage of rows that I have dropped is: " + str('{:.3%}'.format(fraction))

The number of rows I have dropped is 0
The number of rows remaining in the database is 22459
The precentage of rows that I have dropped is: 0.000%


The data is clean.  No rows have been dropped

Now we create a now column county_pct that combines county and precinct and strip out any blanks in this new column.  

In [8]:
results['county_pct'] = results['county']+results['precinct'] # create new column
results['county_pct'] = results['county_pct'].map(lambda x: x.replace(" ", "")) # strip out the blanks
print(results.head(n=15))


          candidate  contest_name election_date     precinct  total_votes  \
0   Donald J. Trump  US PRESIDENT    2016-11-08           18          120   
1   Donald J. Trump  US PRESIDENT    2016-11-08     ONE STOP         2110   
2   Donald J. Trump  US PRESIDENT    2016-11-08           13          152   
3   Donald J. Trump  US PRESIDENT    2016-11-08           02          550   
4   Donald J. Trump  US PRESIDENT    2016-11-08           16          361   
5   Donald J. Trump  US PRESIDENT    2016-11-08           14          241   
6   Donald J. Trump  US PRESIDENT    2016-11-08           19          148   
7   Donald J. Trump  US PRESIDENT    2016-11-08           08          145   
8   Donald J. Trump  US PRESIDENT    2016-11-08           05          116   
9   Donald J. Trump  US PRESIDENT    2016-11-08           11          127   
10  Donald J. Trump  US PRESIDENT    2016-11-08           12          127   
11  Donald J. Trump  US PRESIDENT    2016-11-08           17          240   

Now we create a pivot table to summarize the results

In [9]:
results_pivot = pd.pivot_table(results, index=['county_pct'], values=['total_votes'],  
                                 aggfunc=np.sum,  columns = ['contest_name','candidate'], margins = False) 
results_pivot.head()

Unnamed: 0_level_0,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes
contest_name,NC ATTORNEY GENERAL,NC ATTORNEY GENERAL,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT
candidate,Buck Newton,Josh Stein,Donald J. Trump,Gary Johnson,Hillary Clinton,JIll Stein (Write-In),Jill Stein (Write-In),Write-In (Miscellaneous)
county_pct,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
ALAMANCE01,1784.0,519.0,1865.0,50.0,411.0,,10.0,15.0
ALAMANCE02,1909.0,497.0,2004.0,40.0,403.0,,5.0,19.0
ALAMANCE035,1442.0,1042.0,1485.0,77.0,995.0,,14.0,13.0
ALAMANCE03C,1094.0,720.0,1059.0,64.0,679.0,,5.0,28.0
ALAMANCE03N,1198.0,1008.0,1160.0,66.0,1036.0,,12.0,23.0


We now need to fill all the NaNs with 0  

In [10]:
results_pivot.fillna(value=0, method=None, axis=None, inplace=True, limit=None, downcast=None)
results_pivot.head()

Unnamed: 0_level_0,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes
contest_name,NC ATTORNEY GENERAL,NC ATTORNEY GENERAL,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT
candidate,Buck Newton,Josh Stein,Donald J. Trump,Gary Johnson,Hillary Clinton,JIll Stein (Write-In),Jill Stein (Write-In),Write-In (Miscellaneous)
county_pct,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
ALAMANCE01,1784.0,519.0,1865.0,50.0,411.0,0.0,10.0,15.0
ALAMANCE02,1909.0,497.0,2004.0,40.0,403.0,0.0,5.0,19.0
ALAMANCE035,1442.0,1042.0,1485.0,77.0,995.0,0.0,14.0,13.0
ALAMANCE03C,1094.0,720.0,1059.0,64.0,679.0,0.0,5.0,28.0
ALAMANCE03N,1198.0,1008.0,1160.0,66.0,1036.0,0.0,12.0,23.0


Now we add new columns that sum across contest_name

In [11]:
results_pivot[['contest_total_ag', 'contest_total_president']] = results_pivot['total_votes'].sum(axis=1, level=0)
results_pivot.head()

Unnamed: 0_level_0,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,contest_total_ag,contest_total_president
contest_name,NC ATTORNEY GENERAL,NC ATTORNEY GENERAL,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,Unnamed: 9_level_1,Unnamed: 10_level_1
candidate,Buck Newton,Josh Stein,Donald J. Trump,Gary Johnson,Hillary Clinton,JIll Stein (Write-In),Jill Stein (Write-In),Write-In (Miscellaneous),Unnamed: 9_level_2,Unnamed: 10_level_2
county_pct,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
ALAMANCE01,1784.0,519.0,1865.0,50.0,411.0,0.0,10.0,15.0,2303.0,2351.0
ALAMANCE02,1909.0,497.0,2004.0,40.0,403.0,0.0,5.0,19.0,2406.0,2471.0
ALAMANCE035,1442.0,1042.0,1485.0,77.0,995.0,0.0,14.0,13.0,2484.0,2584.0
ALAMANCE03C,1094.0,720.0,1059.0,64.0,679.0,0.0,5.0,28.0,1814.0,1835.0
ALAMANCE03N,1198.0,1008.0,1160.0,66.0,1036.0,0.0,12.0,23.0,2206.0,2297.0


In [12]:
results_pivot['fraction_newton'] = results_pivot['total_votes', 'NC ATTORNEY GENERAL','Buck Newton']/results_pivot['contest_total_ag']
results_pivot['fraction_trump'] = results_pivot['total_votes', 'US PRESIDENT','Donald J. Trump']/results_pivot['contest_total_president']

Subset the dataframe to keep only the needed columns

In [13]:
results_pivot.head()

Unnamed: 0_level_0,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,total_votes,contest_total_ag,contest_total_president,fraction_newton,fraction_trump
contest_name,NC ATTORNEY GENERAL,NC ATTORNEY GENERAL,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,US PRESIDENT,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
candidate,Buck Newton,Josh Stein,Donald J. Trump,Gary Johnson,Hillary Clinton,JIll Stein (Write-In),Jill Stein (Write-In),Write-In (Miscellaneous),Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
county_pct,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
ALAMANCE01,1784.0,519.0,1865.0,50.0,411.0,0.0,10.0,15.0,2303.0,2351.0,0.774642,0.793279
ALAMANCE02,1909.0,497.0,2004.0,40.0,403.0,0.0,5.0,19.0,2406.0,2471.0,0.793433,0.811008
ALAMANCE035,1442.0,1042.0,1485.0,77.0,995.0,0.0,14.0,13.0,2484.0,2584.0,0.580515,0.57469
ALAMANCE03C,1094.0,720.0,1059.0,64.0,679.0,0.0,5.0,28.0,1814.0,1835.0,0.603087,0.577112
ALAMANCE03N,1198.0,1008.0,1160.0,66.0,1036.0,0.0,12.0,23.0,2206.0,2297.0,0.543064,0.505007


Create clean dataframe to convert to csv

In [14]:
results_pivot['county_pct'] = results_pivot.index
results_df = results_pivot[['county_pct','fraction_newton','fraction_trump']]
results_df.to_csv('results.csv')


In [15]:
print results_df.head()

               county_pct fraction_newton fraction_trump
contest_name                                            
candidate                                               
county_pct                                              
ALAMANCE01     ALAMANCE01        0.774642       0.793279
ALAMANCE02     ALAMANCE02        0.793433       0.811008
ALAMANCE035   ALAMANCE035        0.580515       0.574690
ALAMANCE03C   ALAMANCE03C        0.603087       0.577112
ALAMANCE03N   ALAMANCE03N        0.543064       0.505007
