In [147]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import statsmodels.formula.api as smf

from patsy import dmatrices, dmatrix, build_design_matrices

import math
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

# This sets some nicer defaults for plotting.
# This must be run in a separate cell from importing matplotlib due to a bug.
params = {'legend.fontsize': 'large',
          'figure.figsize': (11.0, 11.0),
          'axes.labelsize': 'x-large',
          'axes.titlesize':'xx-large',
          'xtick.labelsize':'large',
          'ytick.labelsize':'large'}
mpl.rcParams.update(params)

# This makes it so that the pandas dataframes don't get truncated horizontally.
pd.options.display.max_columns = 200

In [148]:
def summarize_dataframe(df):
    """Summarize a dataframe, and report missing values."""
    missing_values = pd.DataFrame({'Variable Name': df.columns,
                                   'Data Type': df.dtypes,
                                   'Missing Values': df.isnull().sum(),
                                   'Unique Values': [df[name].nunique() for name in df.columns]}
                                 ).set_index('Variable Name')
    with pd.option_context("display.max_rows", 1000):
        display(pd.concat([missing_values, df.describe(include='all').transpose()], axis=1).fillna(""))

In [159]:
def accuracy(y_true, y_pred):
    """Function that returns a table showing RMSE and MAE."""
    acc_df = pd.DataFrame(data = {"RMSE": [mean_squared_error(y_true, y_pred)**(1/2)],
                                  "MAE": [mean_absolute_error(y_true, y_pred)],
                                  "R^2": [r2_score(y_true, y_pred)]})
    display(acc_df.style.hide_index())

In [149]:
# Read Data, fix Date type
elect_df = pd.read_csv("/home/jovyan/Data Analytics I/S08 and S11 - Clinton-Obama/Obama.csv")
elect_df["ElectionDate"] = pd.to_datetime(elect_df["ElectionDate"])


In [150]:
# Split into observed data, and forecast data
elect_df_train = elect_df[elect_df["ElectionDate"] < pd.to_datetime("2/19/2008")].copy()
elect_df_forecast= elect_df[elect_df["ElectionDate"] >= pd.to_datetime("2/19/2008")].copy()

# Create Obama Win Margins
elect_df_train["Obama_margin"] = elect_df_train["Obama"] - elect_df_train["Clinton"]
elect_df_train["Obama_margin_percent"] = elect_df_train["Obama_margin"] / elect_df_train["TotalVote"]
elect_df_train["Obama_wins"] = elect_df_train["Obama_margin"] > 0

# Create a Copy (going to try different impute methods)
elect_df_train2=elect_df_train.copy()

#Create Imputer

In [151]:
imputer = IterativeImputer(missing_values=np.nan, max_iter=10,random_state=0,verbose=0, initial_strategy='mean')

# 2 methods to impute - code / iterative_imputer

In [152]:
elect_df_train.loc[:, "MalesPer100Females":"FarmArea"] = imputer.fit_transform(elect_df_train.loc[:, "MalesPer100Females":"FarmArea"])

In [153]:
# Replace all null values in elect_df with the state average
    # Ignores columns in 'ignore' array
ignore = ['FIPS' ,'ElectionDate','ElectionType','Obama','Clinton','TotalVote','County','State','Region']
for col in elect_df_train2.columns:
    if (col not in ignore):
#         print('Replacing ' + str(elect_df[col].isna().sum()) + ' values in ' + col+ ' with state average')
        elect_df_train2[col] = elect_df_train2.groupby("State").transform(lambda x: x.fillna(x.mean()))[col] 


In [154]:
# Make sure this is working... Plot black  voter % by state
# elect_df.groupby('State').Asian.mean().plot(kind = 'bar')

# Created 2 training datasets.. 
elect_df_train
elect_df_train2
Now split them both into testing/training parts.

In [157]:
# Split Train Dataset into Train/Test for tuning model
    #Use folding?
elect_df_smaller_train1, elect_df_validation1 = train_test_split(elect_df_train, test_size=.25, random_state=201)
elect_df_smaller_train2, elect_df_validation2 = train_test_split(elect_df_train2, test_size=.25, random_state=201)

# Linear Regression

In [186]:
lm_1 = smf.ols(formula='Obama_margin_percent ~ Region + Black + Asian + AmericanIndian+ Hispanic+ HighSchool + Bachelors + Poverty +MedianIncome+ PopDensity ', data=elect_df_smaller_train1).fit()
lm_1_pred = lm_1.predict(elect_df_validation1)
accuracy(elect_df_validation["Obama_margin_percent"], lm_1_pred)
# lm_1.summary()
# lm_2 = smf.ols(formula='Obama_margin_percent ~ Region + Black + HighSchool + Poverty + PopDensity + LandArea', data=elect_df_smaller_train2).fit()

RMSE,MAE,R^2
0.180158,0.136433,0.662076


# Lasso Regularization

In [202]:
variables = [variable for variable in elect_df_smaller_train.columns if variable not in ['FIPS', 'County', 'State', 'ElectionDate', 'TotalVote', 'Obama_margin', 'Obama_margin_percent', 'Obama_wins', 'Clinton', 'Obama']]

In [189]:
formula = 'Obama_margin_percent ~ ' + " + ".join(variables[:2]) +  " + standardize(" + ") + standardize(".join(variables[2:]) + ")"

In [190]:
y_linear, X_linear = dmatrices(formula, elect_df_smaller_train, return_type="dataframe")

In [193]:
lm_lasso = Lasso(alpha = 1, max_iter=1000000)
lm_lasso.fit(X_linear, y_linear)
X_validation_linear = build_design_matrices([X_linear.design_info], elect_df_validation1, return_type="dataframe")[0]

In [195]:
lm_lass_pred = lm_lasso.predict(X_validation_linear)
accuracy(lm_lass_pred, elect_df_validation["Obama_margin_percent"])

RMSE,MAE,R^2
0.311966,0.254121,-8.085241931389027e+33


In [197]:
alphas = np.logspace(-5,0, 20)
gs_lasso = GridSearchCV(Lasso(max_iter=1000000), {"alpha": alphas}, scoring='neg_mean_squared_error', cv=4)
gs_lasso.fit(X_linear, y_linear)

GridSearchCV(cv=4, estimator=Lasso(max_iter=1000000),
             param_grid={'alpha': array([1.00000000e-05, 1.83298071e-05, 3.35981829e-05, 6.15848211e-05,
       1.12883789e-04, 2.06913808e-04, 3.79269019e-04, 6.95192796e-04,
       1.27427499e-03, 2.33572147e-03, 4.28133240e-03, 7.84759970e-03,
       1.43844989e-02, 2.63665090e-02, 4.83293024e-02, 8.85866790e-02,
       1.62377674e-01, 2.97635144e-01, 5.45559478e-01, 1.00000000e+00])},
             scoring='neg_mean_squared_error')

In [199]:
lm_lasso_optimized = Lasso(alpha = gs_lasso.best_params_['alpha'], max_iter=1000000)
lm_lasso_optimized.fit(X_linear,y_linear)
lm_lasso_optimized_pred = lm_lasso_optimized.predict(X_validation_linear)

In [200]:
accuracy(lm_lasso_optimized_pred, elect_df_validation["Obama_margin_percent"])

RMSE,MAE,R^2
0.167787,0.125647,0.584926
