In [86]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import matplotlib.pyplot as plt

In [63]:
df_historical = pd.read_csv('Camp_Election.Oracle\\Historical Election Results\\MIT_Lab_Historical_Election_Results\\1976-2020-president.csv')
df_historical = df_historical[['year', 'state', 'candidatevotes', 'totalvotes','party_simplified']]
clean_tfc = pd.read_csv('clean_TFC_data.csv')
clean_tfc.head(5)

Unnamed: 0,year,state,pct_2p_vote,term2,real_gdp_pct_growth,net_approval
0,1976,ALABAMA,43.333276,0,5.4,5
1,1976,ALASKA,61.891829,0,5.4,5
2,1976,ARIZONA,58.613303,0,5.4,5
3,1976,ARKANSAS,34.951149,0,5.4,5
4,1976,CALIFORNIA,50.917827,0,5.4,5


In [64]:
df_historical.head(5)

Unnamed: 0,year,state,candidatevotes,totalvotes,party_simplified
0,1976,ALABAMA,659170,1182850,DEMOCRAT
1,1976,ALABAMA,504070,1182850,REPUBLICAN
2,1976,ALABAMA,9198,1182850,OTHER
3,1976,ALABAMA,6669,1182850,OTHER
4,1976,ALABAMA,1954,1182850,OTHER


In [65]:
df = pd.merge(df_historical, clean_tfc, on = ['year', 'state'])

In [66]:
display(df)

Unnamed: 0,year,state,candidatevotes,totalvotes,party_simplified,pct_2p_vote,term2,real_gdp_pct_growth,net_approval
0,1976,ALABAMA,659170,1182850,DEMOCRAT,43.333276,0,5.4,5
1,1976,ALABAMA,504070,1182850,REPUBLICAN,43.333276,0,5.4,5
2,1976,ALABAMA,9198,1182850,OTHER,43.333276,0,5.4,5
3,1976,ALABAMA,6669,1182850,OTHER,43.333276,0,5.4,5
4,1976,ALABAMA,1954,1182850,OTHER,43.333276,0,5.4,5
...,...,...,...,...,...,...,...,...,...
4301,2020,WYOMING,5768,278503,LIBERTARIAN,72.480434,1,-2.2,-6
4302,2020,WYOMING,2208,278503,OTHER,72.480434,1,-2.2,-6
4303,2020,WYOMING,1739,278503,OTHER,72.480434,1,-2.2,-6
4304,2020,WYOMING,279,278503,OTHER,72.480434,1,-2.2,-6


In [67]:
enc = OneHotEncoder()
scaler = StandardScaler()

In [84]:
y = df['candidatevotes']

state_labels = np.unique(df['state'])
state_matrix = enc.fit_transform(df['state'].values.reshape(-1,1)).toarray()
state_matrix = pd.DataFrame(state_matrix, columns=state_labels)

party_label = np.unique(df['party_simplified'])
party_matrix = enc.fit_transform(df['party_simplified'].values.reshape(-1,1)).toarray()
party_matrix = pd.DataFrame(party_matrix, columns = party_label )

numerical_matrix = scaler.fit_transform(df[['pct_2p_vote','term2','real_gdp_pct_growth', 'net_approval']])
numerical_matrix = pd.DataFrame(numerical_matrix, columns = ['pct_2p_vote','term2','real_gdp_pct_growth', 'net_approval'])

X = pd.concat([state_matrix, party_matrix, numerical_matrix], axis = 1)

display(X)

Unnamed: 0,ALABAMA,ALASKA,ARIZONA,ARKANSAS,CALIFORNIA,COLORADO,CONNECTICUT,DELAWARE,DISTRICT OF COLUMBIA,FLORIDA,...,WISCONSIN,WYOMING,DEMOCRAT,LIBERTARIAN,OTHER,REPUBLICAN,pct_2p_vote,term2,real_gdp_pct_growth,net_approval
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.568311,-1.038840,1.086187,0.266512
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,-0.568311,-1.038840,1.086187,0.266512
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.568311,-1.038840,1.086187,0.266512
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.568311,-1.038840,1.086187,0.266512
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.568311,-1.038840,1.086187,0.266512
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.979956,0.962612,-1.775451,-0.305857
4302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.979956,0.962612,-1.775451,-0.305857
4303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.979956,0.962612,-1.775451,-0.305857
4304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.979956,0.962612,-1.775451,-0.305857


In [85]:
X = sm.add_constant(X)
model = sm.OLS(y,X).fit()
model.summary()

0,1,2,3
Dep. Variable:,candidatevotes,R-squared:,0.528
Model:,OLS,Adj. R-squared:,0.522
Method:,Least Squares,F-statistic:,83.39
Date:,"Thu, 08 Feb 2024",Prob (F-statistic):,0.0
Time:,15:54:48,Log-Likelihood:,-62823.0
No. Observations:,4306,AIC:,125800.0
Df Residuals:,4248,BIC:,126100.0
Df Model:,57,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.117e+05,7763.765,53.032,0.000,3.97e+05,4.27e+05
ALABAMA,-4.724e+04,6.38e+04,-0.740,0.459,-1.72e+05,7.79e+04
ALASKA,-2.69e+05,5.92e+04,-4.542,0.000,-3.85e+05,-1.53e+05
ARIZONA,-5.621e+04,5.57e+04,-1.009,0.313,-1.65e+05,5.3e+04
ARKANSAS,-1.373e+05,5.53e+04,-2.483,0.013,-2.46e+05,-2.89e+04
CALIFORNIA,1.348e+06,5.68e+04,23.734,0.000,1.24e+06,1.46e+06
COLORADO,-5701.9067,4.34e+04,-0.131,0.895,-9.07e+04,7.93e+04
CONNECTICUT,-7.833e+04,6e+04,-1.305,0.192,-1.96e+05,3.93e+04
DELAWARE,-2.486e+05,5.75e+04,-4.325,0.000,-3.61e+05,-1.36e+05

0,1,2,3
Omnibus:,3672.026,Durbin-Watson:,1.154
Prob(Omnibus):,0.0,Jarque-Bera (JB):,261046.249
Skew:,3.659,Prob(JB):,0.0
Kurtosis:,40.436,Cond. No.,4920000000000000.0
