In [255]:
import numpy as np
import pandas as pd
from collections import Counter

import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Preprocessing

In [80]:
# Load Data
cf = pd.read_csv('./data/county_facts.csv')
cfd = pd.read_csv('./data/county_facts_dictionary.csv')
#pr = pd.read_csv('./data/primary_results.csv')
pr = pd.read_csv('./data/scraped_primary_results.csv')

In [52]:
# Feature/Label vector builders
def get_x(cf, fips):
    x = []
    for countyid in fips:
        x.append(cf[cf['fips'] == countyid].iloc[0].to_dict())
    return x

def get_y(pr, x_array, party):
    new_x = []
    y = []
    for county in x_array:
        results = pr[(pr['fips'] == county['fips']) & (pr['party'] == party)]
        if len(results) > 0:
            y.append(results['candidate'].loc[results['votes'].argmax()])
            new_x.append(county)
    return pd.Series(y), new_x

# Metrics and helpers
def get_winner(predictions):
    return pd.Series(predictions).max()

def get_scaled_winner(predictions, x):
    count = Counter()
    for (prediction, data) in zip(predictions, x):
        population = data['PST045214']
        count[prediction] += population
    return count

def report_metrics(y_true, y_pred):
    print "Accuracy: " + str(accuracy_score(y_true, y_pred))
    '''
    print "F1: " + str(f1_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    print "Precision: " + str(precision_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    print "Recall: " + str(recall_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    '''

# ML Pipeline

In [278]:
class DensifyTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        tX = map(lambda x: x.toarray()[0], X)
        return tX
    
    def fit(self, X, y=None, **fit_params):
        return self

def get_pipeline():
    return Pipeline([
        ('vectorize', DictVectorizer()),
        #('densify', DensifyTransformer()),
        #('PCA', PCA(n_components=25)),
        #('svm', SVC())
        #('linear_svm', LinearSVC())
        #('kneighbors', KNeighborsClassifier(100))
        ('rf', RandomForestClassifier(n_estimators=1000))
        #('ab', AdaBoostClassifier(n_estimators=250))
        #('regression', LogisticRegression())
    ])

In [296]:
x = get_x(cf, cf[cf['state_abbreviation'] != 'OK']['fips'].unique())
y, x = get_y(pr, x, "Republican")

In [297]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
pipeline = get_pipeline()
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

print get_scaled_winner(y_pred,x)
print report_metrics(y_test, y_pred)

Counter({'Donald Trump': 18547823, 'Ted Cruz': 7683806, 'Marco Rubio': 13170})
Accuracy: 0.790983606557
None


# Baseline Comparisons

## Random Vector

In [288]:
def get_random_x(dimensions, length):
    x = []
    for i in range(length):
        rand = np.random.rand(1, dimensions)[0]
        feature_dict = {}
        for j in range(dimensions):
            feature_dict[j] = rand[j]
        x.append(feature_dict)
    return pd.Series(x)

randx_train = get_random_x(len(x_train[0]), len(x_train))
randx_test = get_random_x(len(x_test[0]), len(x_test))

pipe = get_pipeline()
pipe.fit(randx_train, y_train)
y_pred = pipe.predict(randx_test)

report_metrics(y_test, y_pred)

Accuracy: 0.567901234568


## Predicting Mode

In [287]:
mode = y_train.max()
mode = get_scaled_winner(y_train, x_train).most_common(1)[0][0]
print mode
y_pred = [mode for i in range(len(x_test))]

report_metrics(y_test, y_pred)

Donald Trump
Accuracy: 0.592592592593


# State Level Predictions

In [108]:
def get_state_winner(pipeline, state, cf):
    statex = get_x(cf, cf[cf['state_abbreviation'] == state]['fips'])
    state_pred = pipe.predict(statex)
    #return get_winner(state_pred), state_pred
    return get_scaled_winner(state_pred, statex)

In [253]:
pipe = get_pipeline()
pipe.fit(x, y)
get_state_winner(pipe, 'MN', cf)

Counter({'Donald Trump': 5067636, 'Ted Cruz': 389537})