In [470]:
import numpy as np
import pandas as pd

import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.base import TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Preprocessing

In [164]:
# Load Data
cf = pd.read_csv('./data/county_facts.csv')
cfd = pd.read_csv('./data/county_facts_dictionary.csv')
pr = pd.read_csv('./data/primary_results.csv')

In [243]:
# Feature/Label vector builders
def get_x(cf, fips):
    x = []
    for countyid in fips:
        x.append(cf[cf['fips'] == countyid].iloc[0].to_dict())
    return x

def get_y(pr, x_array, party):
    y = []
    for county in x_array:
        results = pr[(pr['fips'] == county['fips']) & (pr['party'] == party)]
        y.append(results['candidate'].loc[results['votes'].argmax()])
    return pd.Series(y)

# Metrics and helpers
def get_winner(predictions):
    return pd.Series(predictions).max()

def report_metrics(y_true, y_pred):
    print "Accuracy: " + str(accuracy_score(y_true, y_pred))
    '''
    print "F1: " + str(f1_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    print "Precision: " + str(precision_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    print "Recall: " + str(recall_score(y_true, y_pred, pos_label=pd.Series(y_true).max()))
    '''

# ML Pipeline

In [420]:
class DensifyTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        tX = map(lambda x: x.toarray()[0], X)
        return tX
    
    def fit(self, X, y=None, **fit_params):
        return self

def get_pipeline():
    return Pipeline([
        ('vectorize', DictVectorizer()),
        ('densify', DensifyTransformer()),
        ('PCA', PCA(n_components=10)),
        #('svm', SVC())
        #('kneighbors', KNeighborsClassifier(15))
        ('rf', RandomForestClassifier(n_estimators=100))
        #('ab', AdaBoostClassifier(n_estimators=200))
        #('regression', LogisticRegression())
    ])

In [291]:
x = get_x(cf, pr['fips'].unique())
y = get_y(pr, x, "Republican")

In [468]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
pipeline = get_pipeline()
pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)

print get_winner(y_pred)
print report_metrics(y_test, y_pred)

Ted Cruz
Accuracy: 0.736842105263
None


# Baseline Comparisons

## Random Vector

In [469]:
def get_random_x(dimensions, length):
    x = []
    for i in range(length):
        rand = np.random.rand(1, dimensions)[0]
        feature_dict = {}
        for j in range(dimensions):
            feature_dict[j] = rand[j]
        x.append(feature_dict)
    return pd.Series(x)

randx_train = get_random_x(len(x_train[0]), len(x_train))
randx_test = get_random_x(len(x_test[0]), len(x_test))

pipe = get_pipeline()
pipe.fit(randx_train, y_train)
y_pred = pipe.predict(randx_test)

report_metrics(y_test, y_pred)

Accuracy: 0.543859649123


## Predicting Mode

In [277]:
mode = y_train.max()
print mode
y_pred = [mode for i in range(len(x_test))]

report_metrics(y_test, y_pred)

Ted Cruz
Accuracy: 0.385964912281


# State Level Predictions

In [167]:
def get_state_winner(pipeline, state, cf):
    statex = get_x(cf, cf[cf['state_abbreviation'] == state]['fips'])
    state_pred = pipe.predict(statex)
    return get_winner(state_pred)

In [286]:
pipe = get_pipeline()
pipe.fit(x, y)
get_state_winner(pipe, 'AR', cf)

'Ted Cruz'