In [1]:
import numpy as np
import pandas as pd
from collections import Counter

import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Preprocessing

In [65]:
# Load Data
cf = pd.read_csv('./data/county_facts.csv')
cfd = pd.read_csv('./data/county_facts_dictionary.csv')
#pr = pd.read_csv('./data/primary_results.csv')
pr = pd.read_csv('./data/scraped_primary_results4.csv')

In [26]:
#cf['primary_date'] =pd.to_datetime(raw_data['Mycol'], format='%d%b%Y:%H:%M:%S.%f')
#Series(np.zeros(sLength), index=df1.index)
#cf[(cf['state_abbreviation'] == 'KS') | 
#   (cf['state_abbreviation'] == 'KY') | 
#   (cf['state_abbreviation'] == 'LA') | 
#   (cf['state_abbreviation'] == 'NE')]

Unnamed: 0,fips,area_name,state_abbreviation,PST045214,PST040210,PST120214,POP010210,AGE135214,AGE295214,AGE775214,...,SBO415207,SBO015207,MAN450207,WTN220207,RTN130207,RTN131207,AFN120207,BPS030214,LND110210,POP060210
906,20001,Allen County,KS,12909,13371,-3.5,13371,6.0,23.3,19.0,...,0.0,28.4,434704,37176,121694,9077,12409,13,500.30,26.7
907,20003,Anderson County,KS,7883,8102,-2.7,8102,6.2,25.2,20.6,...,0.0,15.3,0,53525,87242,11102,5600,11,579.65,14.0
908,20005,Atchison County,KS,16513,16924,-2.4,16924,6.4,23.4,16.3,...,0.0,38.3,0,0,130130,7912,14931,2,431.17,39.3
909,20007,Barber County,KS,4897,4861,0.7,4861,6.9,22.5,20.3,...,3.3,37.2,0,35570,70317,14910,2727,0,1134.07,4.3
910,20009,Barton County,KS,27385,27674,-1.0,27674,6.9,24.5,17.3,...,0.0,18.8,561411,0,339355,12254,35379,58,895.40,30.9
911,20011,Bourbon County,KS,14772,15173,-2.6,15173,6.8,25.0,18.5,...,0.0,24.4,178967,554890,120585,8136,16285,2,635.47,23.9
912,20013,Brown County,KS,9815,9984,-1.7,9984,7.0,25.4,19.0,...,0.0,24.1,91416,87482,67859,6771,7611,2,570.87,17.5
913,20015,Butler County,KS,66227,65880,0.5,65880,6.0,26.0,14.0,...,0.0,24.3,0,130113,567483,9024,57370,143,1429.86,46.1
914,20017,Chase County,KS,2692,2790,-3.5,2790,4.5,21.1,23.1,...,0.0,0.0,0,0,10868,3815,2283,0,773.06,3.6
915,20019,Chautauqua County,KS,3481,3669,-5.1,3669,5.5,20.5,25.8,...,0.0,0.0,0,0,15764,4176,0,0,638.88,5.7


In [3]:
# Feature/Label vector builders
def get_x(cf, fips):
    x = []
    for countyid in fips:
        x.append(cf[cf['fips'] == countyid].iloc[0].to_dict())
    return x

def get_y(pr, x_array, party):
    new_x = []
    y = []
    for county in x_array:
        results = pr[(pr['fips'] == county['fips']) & (pr['party'] == party)]
        if len(results) > 0:
            y.append(results['candidate'].loc[results['votes'].argmax()])
            new_x.append(county)
    return pd.Series(y), new_x

# Metrics and helpers
def get_winner(predictions):
    return Counter(predictions)

def get_scaled_winner(predictions, x):
    count = Counter()
    for (prediction, data) in zip(predictions, x):
        population = data['PST045214']
        count[prediction] += population
    return count

def report_metrics(y_true, y_pred):
    print "Accuracy: " + str(accuracy_score(y_true, y_pred))
    '''
    print "F1: " + str(f1_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    print "Precision: " + str(precision_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    print "Recall: " + str(recall_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    '''

# ML Pipeline

In [4]:
class DensifyTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        tX = map(lambda x: x.toarray()[0], X)
        return tX
    
    def fit(self, X, y=None, **fit_params):
        return self

def get_pipeline():
    return Pipeline([
        ('vectorize', DictVectorizer()),
        #('densify', DensifyTransformer()),
        #('PCA', PCA(n_components=25)),
        #('svm', SVC())
        #('linear_svm', LinearSVC())
        #('kneighbors', KNeighborsClassifier(5))
        ('rf', RandomForestClassifier(n_estimators=250))
        #('ab', AdaBoostClassifier(n_estimators=250))
        #('regression', LogisticRegression())
    ])

In [95]:
x = get_x(cf, cf['fips'].unique())
y, x = get_y(pr, x, "Republican")

In [98]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
pipeline = get_pipeline()
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

print get_scaled_winner(y_pred,x)
print report_metrics(y_test, y_pred)

Counter({'Donald Trump': 16271009, 'Ted Cruz': 12555340, 'Marco Rubio': 33389})
Accuracy: 0.803278688525
None


# Baseline Comparisons

## Random Vector

In [99]:
def get_random_x(dimensions, length):
    x = []
    for i in range(length):
        rand = np.random.rand(1, dimensions)[0]
        feature_dict = {}
        for j in range(dimensions):
            feature_dict[j] = rand[j]
        x.append(feature_dict)
    return pd.Series(x)

randx_train = get_random_x(len(x_train[0]), len(x_train))
randx_test = get_random_x(len(x_test[0]), len(x_test))

pipe = get_pipeline()
pipe.fit(randx_train, y_train)
y_pred = pipe.predict(randx_test)

report_metrics(y_test, y_pred)

Accuracy: 0.567213114754


## Predicting Mode

In [100]:
mode = y_train.max()
mode = get_scaled_winner(y_train, x_train).most_common(1)[0][0]
print mode
y_pred = [mode for i in range(len(x_test))]

report_metrics(y_test, y_pred)

Donald Trump
Accuracy: 0.580327868852


# State Level Predictions

In [20]:
def get_state_winner(pipeline, state, cf):
    statex = get_x(cf, cf[cf['state_abbreviation'] == state]['fips'])
    state_pred = pipe.predict(statex)
    return get_winner(state_pred)
    #return get_scaled_winner(state_pred, statex)

In [101]:
pipe = get_pipeline()
pipe.fit(x, y)
get_state_winner(pipe, 'OH', cf)

Counter({'Donald Trump': 84, 'Ted Cruz': 3, 'John Kasich': 1})