In [2]:
#Packages Used
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
from matplotlib import pyplot as plt
%matplotlib inline

## Part 5 - Creating a Model to Predict Election Outcome

#### Input: X = Numeric Data Features , y =  State Election Outcomes for 2008 and 2012
#### Output: Election outcome prediction for each state 

define X:

In [38]:
#make a copy of all numeric variabels from the data.csv dataframe
df_log = df_data_clean.copy()

#Declare independent variable X and mean normalize before passing X
#into the model
X = df_log[df_log.columns[
    2:len(df_log.columns)]].groupby(df['State']).mean().apply(
    lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))
X.head()

Unnamed: 0_level_0,Precincts,Less Than High School Diploma,At Least High School Diploma,At Least Bachelors's Degree,Graduate Degree,School Enrollment,Median Earnings 2010,Native American Population,Other Race or Races,Latino Population,...,White,Black,Asian,SIRE_homogeneity,median_age,Children.in.single.parent.households,Adult.obesity,Diabetes,Uninsured,Unemployment
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,-0.077755,0.464248,-0.148869,-0.264974,-0.1812,-0.012754,-0.144352,-0.092312,-0.036711,-0.115357,...,-0.166794,0.380332,-0.045341,-0.137524,-0.091301,0.207347,0.368148,0.503946,0.046533,0.088773
Arizona,0.065748,0.176586,-0.043689,-0.147305,-0.090189,-0.039478,-0.082358,0.871648,-0.016742,0.48015,...,-0.369304,-0.130844,-0.026932,-0.33281,-0.155123,0.135144,-0.086555,-0.009799,0.192575,0.415613
Arkansas,-0.087043,0.342769,-0.104452,-0.33013,-0.290216,-0.016924,-0.243802,-0.092933,-0.027579,-0.077936,...,-0.00584,0.138276,-0.044498,-0.027126,-0.006518,0.119173,0.277318,0.295051,0.217124,0.051962
California,0.85238,0.140121,-0.030356,0.142438,0.091229,0.03524,0.137561,-0.008604,0.030581,0.441915,...,-0.293821,-0.101922,0.146049,-0.337213,-0.169899,0.005002,-0.310781,-0.266848,0.118666,0.512985
Colorado,-0.052534,-0.175835,0.08517,0.305835,0.179064,-0.000126,-2.4e-05,-0.068742,-0.022754,0.244062,...,-0.019341,-0.136373,-0.029546,-0.06444,0.043765,-0.138398,-0.530811,-0.496054,0.160312,-0.020453


Define y:

In [60]:
#Join votes dataframe to features data frame
df_p5 = df_data_clean.join(df_votes_clean,how='inner').groupby(df['State']).sum()

#Create a Boolean column with Republican wins or losses each year
df_p5['Winners 08'] = df_p5['Republicans 08 (Votes)'] > df_p5['Democrats 08 (Votes)']
df_p5['Winners 12'] = df_p5['Republicans 12 (Votes)'] > df_p5['Democrats 12 (Votes)']
df_p5['Winners 16'] = df_p5['Republicans 16 (Votes)'] > df_p5['Democrats 16 (Votes)']
df_p5.head(5)

In [40]:
#Declare our dependent and independent variables to be fed into our
#Logistic Regression Model
y_08 = df_p5['Winners 08']
y_12 = df_p5['Winners 12']
y_16 = df_p5['Winners 16']

## Logistical Regression Model

In [41]:
from sklearn import linear_model

In [42]:
#Call logistic regression function to train our model on our election
#features and results for the 08' and 12' elections.

logreg = linear_model.LogisticRegression()
model_08 = logreg.fit(X, y_08)
model_12 = logreg.fit(X,y_12)

#See how well the model predicted the 08' and 12' elections
print('08 Model Accuracy:',model_08.score(X,y_16))
print('12 Model Accuracy:',model_12.score(X,y_16))

08 Model Accuracy: 0.82
12 Model Accuracy: 0.82


In [43]:
#The 08' and 12' had very similair results so it is appropriate that they 
#yield similair results

In [44]:
#Get state election outcome predictions from our models
predictions_08 = model_08.predict(X)
predictions_12 = model_12.predict(X)

In [45]:
#Add predictions to our Dataframe
df_p5['Predicted Winners 08'] = predictions_08
df_p5['Predicted Winners 12'] = predictions_12

In [46]:
#Set EC_Votes index to 'State' for our join
try:
    df_ec.set_index('State', inplace=True)
except:
    print('Index already Set to State')

In [47]:
#Join dataframe to ec_votes date frame 
#on state to get electoral votes in each state
df_results = df_p5.join(df_ec,how='inner')
df_results['Elector Votes (Republican)'] = df_results['Winners 16']*df_results['EC_votes']

#Calculate predicted electoral votes by multiplying Predicted
#Repulican Win column by EC_Votes Column

df_results['Predicted Elector Votes (Republican) 08'] = \
df_results['Predicted Winners 08']*df_results['EC_votes']

df_results['Predicted Elector Votes (Republican) 12'] = \
df_results['Predicted Winners 12']*df_results['EC_votes']

In [48]:
#Calculate the predicted number of seats won by republicans in our model
seat_share_08 = np.sum(df_results['Predicted Elector Votes (Republican) 08'])\
/np.sum(df_results['EC_votes'])

seat_share_12 = np.sum(df_results['Predicted Elector Votes (Republican) 12'])\
/np.sum(df_results['EC_votes'])

#Calculate the actual number of seats won by republicans 
seat_share_actual = np.sum(df_results['Elector Votes (Republican)'])\
/np.sum(df_results['EC_votes'])

In [49]:
print('Predicted Republican Electoral Vote Share (Using 2008 Election Results):')
print('%',seat_share_08)
print('')
print('Predicted Republican Electoral Vote Share (Using 2012 Election Results):')
print('%',seat_share_12)
print('')
print('Actual Republican Electoral Vote Share:')
print('%',seat_share_actual)

Predicted Republican Electoral Vote Share (Using 2008 Election Results):
% 0.3719626168224299

Predicted Republican Electoral Vote Share (Using 2012 Election Results):
% 0.3719626168224299

Actual Republican Electoral Vote Share:
% 0.5644859813084112


Our model predicted a democratic win. Both models had the same exact predictions. This is likely because our X matrix is the same for all years. The model is limited in that the independent variable (voter information) we are trying to use to predict our dependent variable (votes) is the same across each election. In practice voter information and votes from a particular year would be used to fit a model then that model would be used on a data set from a future year to predict an election result.

## Linear Regression Model (for comparison)

In [50]:
#Assign dependent variable for linear model
y_lin_08 = (df_p5['Republicans 08 (Votes)']-df_p5['Democrats 08 (Votes)'])

In [51]:
#feed dependent and independent variables into our model
est = sm.OLS(y_lin_08, X).fit()
lin_predicts_08 = est.fittedvalues
#est_ST12 = sm.OLS(y_08, X_ST).fit()

In [52]:
#Create new data fram for results:
df_model_ST = df_ec[['EC_votes']]
df_model_ST = df_model_ST.copy()

#Add predictions and results to our dataframe
df_model_ST['Predicted Vote Count 16 (Republicans)'] = lin_predicts_08
df_model_ST['Republican Win (Prediction)'] = lin_predicts_08 > 0
df_model_ST['Republican Win (Actual)'] = (df_p5['Republicans 16 (Votes)']\
- df_p5['Democrats 16 (Votes)']) > 0
df_model_ST['Democrat Win (Actual)'] = (df_p5['Republicans 16 (Votes)']\
- df_p5['Democrats 16 (Votes)']) < 0

#Logic returns Nan Values instead of False values. 
#Use Fillna to eliminate.
df_model_ST = df_model_ST.fillna(False)

#Calculate the predicted and actual number of seats won by Republicans.
#in 2016
df_model_ST['Predicted Elector Votes (Republican) 08'] = \
df_model_ST['Republican Win (Prediction)']*df_model_ST['EC_votes']

df_model_ST['Actual Electoral Votes (Republican)'] = \
df_model_ST['Republican Win (Actual)']*df_model_ST['EC_votes']

In [53]:
vote_share_08 = np.sum(df_model_ST['Predicted Elector Votes (Republican) 08'])\
/np.sum(df_results['EC_votes'])

vote_share_actual = np.sum(df_model_ST['Actual Electoral Votes (Republican)'])\
/np.sum(df_model_ST['EC_votes'])
print('Predicted Republican Electoral Vote Share:')
print('%',vote_share_08)
print('')
print('Actual Republican Electoral Vote Share:')
print('%',vote_share_actual)
print('')
print('R-Squared:',est.rsquared)

Predicted Republican Electoral Vote Share:
% 0.43177570093457945

Actual Republican Electoral Vote Share:
% 0.5613382899628253

R-Squared: 0.8297022745428937


Both models predicted a Democrat win. The Logistical regression predicted a %37.2 electoral vote share for the Republicans and a linear regression model predicted a %43.2 electoral vote share for Republicans