# Imports and Functions

In [None]:
import io
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from numpy import nan
from matplotlib import pyplot as plt
import matplotlib.pylab as pylab


In [None]:
def toDem(row):
    """Flag a row whose lead went from Republican in 2016 to Democratic in 2020.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys '16_R', '16_D', '20_R' and '20_D'
        (for example a DataFrame row or a dict).

    Returns
    -------
    int
        1 when '16_R' is strictly greater than '16_D' and '20_R' is strictly
        less than '20_D'; 0 otherwise (ties never count as a flip).
    """
    flipped_to_dem = row['16_R'] > row['16_D'] and row['20_R'] < row['20_D']
    return 1 if flipped_to_dem else 0

def toRep(row):
    """Flag a row whose lead went from Democratic in 2016 to Republican in 2020.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys '16_R', '16_D', '20_R' and '20_D'
        (for example a DataFrame row or a dict).

    Returns
    -------
    int
        1 when '16_R' is strictly less than '16_D' and '20_R' is strictly
        greater than '20_D'; 0 otherwise (ties never count as a flip).
    """
    flipped_to_rep = row['16_R'] < row['16_D'] and row['20_R'] > row['20_D']
    return 1 if flipped_to_rep else 0
def flip(row):
    """Flag a row that flipped in either direction.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys 'ToRep' and 'ToDem'.

    Returns
    -------
    int
        1 if 'ToRep' equals 1 or 'ToDem' equals 1, otherwise 0.
    """
    # 'ToRep' is checked first; 'ToDem' is only read when 'ToRep' is not 1.
    for direction in ('ToRep', 'ToDem'):
        if row[direction] == 1:
            return 1
    return 0

# Data Pre-Processing

In [None]:
# Load the county table, discard the identifying text columns, shorten the
# vote-percentage column names to '<year>_<party>' and discard the raw
# per-candidate vote counts.
df = (
    pd.read_csv('../input/2020-general-election-polls/county_statistics.csv', index_col=0)
    .drop(columns=['county', 'state'])
    .rename(columns={
        "percentage16_Donald_Trump": "16_R",
        "percentage16_Hillary_Clinton": "16_D",
        "percentage20_Donald_Trump": "20_R",
        "percentage20_Joe_Biden": "20_D",
    })
    .drop(columns=[
        "votes16_Donald_Trump",
        "votes16_Hillary_Clinton",
        "votes20_Donald_Trump",
        "votes20_Joe_Biden",
    ])
)

In [None]:
# Label every row with the three targets. The helpers read the '16_*' / '20_*'
# percentage columns, so this must run before those columns are dropped below.
df['ToRep'] = df.apply(toRep, axis=1)
df['ToDem'] = df.apply(toDem, axis=1)
df['Flip'] = df.apply(flip, axis=1)

# Raw count columns that are re-expressed as a percentage of 'TotalPop'.
# The order is kept so the resulting '<name>_p' feature columns appear in the
# same order as before.
count_columns = ['Men', 'Women', 'total_votes16', 'total_votes20',
                 'VotingAgeCitizen', 'cases', 'deaths', 'Employed']

# Vectorized column arithmetic instead of one row-wise apply per feature.
# NOTE(review): rows with TotalPop == 0 produce inf/NaN here; only NaN is
# replaced later by fillna(0) — confirm no zero-population rows exist.
for count_col in count_columns:
    df[count_col + '_p'] = 100 * (df[count_col] / df['TotalPop'])

# The vote percentages would leak the targets, and the raw counts are now
# redundant with their '_p' versions.
df = df.drop(columns=['16_R', '16_D', '20_R', '20_D'] + count_columns)

In [None]:
# Separate the three label columns from the feature matrix.
df_toRep, df_toDem, df_Flip = (df[label] for label in ('ToRep', 'ToDem', 'Flip'))

# What remains in df is features only; missing values are replaced with 0.
df = df.drop(columns=['ToRep', 'ToDem', 'Flip']).fillna(0)

In [None]:
# Second feature matrix without the two COVID-derived columns; `drop` returns
# a new frame, so `df` itself is left untouched.
df_nocovid = df.drop(columns=['cases_p', 'deaths_p'])

# Base Tuning

In [None]:
# Base estimator shared by all three randomized searches.
# NOTE(review): scale_pos_weight=173 is presumably the negative/positive class
# ratio of one target; the same value is reused for ToRep, ToDem and Flip —
# confirm it is appropriate for each.
gbm = xgb.XGBClassifier(scale_pos_weight=173, tree_method="gpu_hist")

folds = 5        # number of stratified CV folds
param_comb = 5   # parameter settings sampled from the 2 x 2 x 2 grid below

params = {
        'n_estimators': [100,500],
        'learning_rate': [0.1,.01],
        'max_depth': [4,10],
        }

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 42)

# Pass the splitter itself rather than `skf.split(df, df_toRep)`: that call
# returns a one-shot generator, so fitting the search a second time (e.g. on
# re-running the fit cell) would find it exhausted. With the splitter, folds
# are built from the X / y handed to `fit`; the fixed random_state keeps them
# identical to the previous ones for the ToRep target.
random_search_rep = RandomizedSearchCV(gbm, param_distributions=params,
                                       n_iter=param_comb, scoring='roc_auc',
                                       cv=skf, verbose=3,
                                       random_state=42)
# To tune on the full feature set (COVID columns included) instead, fit this
# search on `df` rather than `df_nocovid`.

In [None]:
# Randomized search for the ToDem target.
# The folds used to be pre-computed with `skf.split(df, df_toRep)`, i.e.
# stratified on the *ToRep* labels. Passing the splitter instead makes
# stratification follow whatever `y` is given to `fit` (df_toDem), so every
# fold keeps the ToDem class ratio. It also avoids the one-shot generator,
# which breaks on a second fit.
random_search_dem = RandomizedSearchCV(gbm, param_distributions=params,
                                       n_iter=param_comb, scoring='roc_auc',
                                       cv=skf, verbose=3,
                                       random_state=42)
# To tune on the full feature set (COVID columns included) instead, fit this
# search on `df` rather than `df_nocovid`.

In [None]:
# Randomized search for the Flip (either direction) target.
# The folds used to be pre-computed with `skf.split(df, df_toRep)`, i.e.
# stratified on the *ToRep* labels. Passing the splitter instead makes
# stratification follow whatever `y` is given to `fit` (df_Flip), so every
# fold keeps the Flip class ratio. It also avoids the one-shot generator,
# which breaks on a second fit.
random_search_flip = RandomizedSearchCV(gbm, param_distributions=params,
                                        n_iter=param_comb, scoring='roc_auc',
                                        cv=skf, verbose=3,
                                        random_state=42)
# To tune on the full feature set (COVID columns included) instead, fit this
# search on `df` rather than `df_nocovid`.

In [None]:
# Tune the ToRep model on the features without COVID columns and display the
# best mean cross-validated ROC-AUC (`fit` returns the search object itself).
random_search_rep.fit(X=df_nocovid, y=df_toRep).best_score_



In [None]:
# Tune the ToDem model on the features without COVID columns and display the
# best mean cross-validated ROC-AUC (`fit` returns the search object itself).
random_search_dem.fit(X=df_nocovid, y=df_toDem).best_score_

In [None]:
# Tune the Flip model on the features without COVID columns and display the
# best mean cross-validated ROC-AUC (`fit` returns the search object itself).
random_search_flip.fit(X=df_nocovid, y=df_Flip).best_score_

# Ideal Classifier Plots

In [None]:
# ToRep classifier trained on the full feature set (COVID columns included),
# with every hyperparameter written out explicitly.
rep_covid_xgb = xgb.XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    gpu_id=0,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    missing=nan,
    # All zeros: no feature is constrained to be monotone.
    monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=173,
    subsample=1,
    tree_method='gpu_hist',
    validate_parameters=1,
    verbosity=None,
)
# Last expression of the cell, so the fitted estimator is displayed.
rep_covid_xgb.fit(X=df, y=df_toRep)

In [None]:
# Show the five highest-ranked features of the ToRep / with-COVID model.
# NOTE(review): plot_importance ranks by its own `importance_type` argument
# (default 'weight' per the XGBoost docs), not by the estimator's
# importance_type='gain' — confirm which ranking is intended.
xgb.plot_importance(booster=rep_covid_xgb, max_num_features=5)
plt.show()

In [None]:
# ToDem classifier trained on the full feature set (COVID columns included),
# with every hyperparameter written out explicitly.
dem_covid_xgb = xgb.XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    gpu_id=0,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=10,
    min_child_weight=1,
    missing=nan,
    # All zeros: no feature is constrained to be monotone.
    monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=173,
    subsample=1,
    tree_method='gpu_hist',
    validate_parameters=1,
    verbosity=None,
)
dem_covid_xgb.fit(X=df, y=df_toDem)

# Five highest-ranked features of the model fitted above.
xgb.plot_importance(booster=dem_covid_xgb, max_num_features=5)
plt.show()

In [None]:
# Flip classifier trained on the full feature set (COVID columns included),
# with every hyperparameter written out explicitly.
flip_covid_xgb = xgb.XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    gpu_id=0,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=10,
    min_child_weight=1,
    missing=nan,
    # All zeros: no feature is constrained to be monotone.
    monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
    n_estimators=100,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=173,
    subsample=1,
    tree_method='gpu_hist',
    validate_parameters=1,
    verbosity=None,
)
flip_covid_xgb.fit(X=df, y=df_Flip)

# Five highest-ranked features of the model fitted above.
xgb.plot_importance(booster=flip_covid_xgb, max_num_features=5)
plt.show()

In [None]:
# ToRep classifier trained on the feature set WITHOUT the COVID columns.
# monotone_constraints has two fewer entries than the with-COVID models,
# matching the two columns removed from df_nocovid.
rep_nocovid_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=173, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)
rep_nocovid_xgb.fit(df_nocovid, df_toRep)

# Plot the model trained in this cell. The cell previously plotted
# rep_covid_xgb, so the "no COVID" figure simply repeated the with-COVID one.
xgb.plot_importance(rep_nocovid_xgb, max_num_features=5)
plt.show()

In [None]:
# ToDem classifier trained on the feature set WITHOUT the COVID columns.
#
# The constraint string was copied from the with-COVID model and had 40
# entries, two more than df_nocovid has columns (the ToRep no-COVID model
# uses 38). Build it from the frame so it always has exactly one "0"
# (unconstrained) entry per feature; XGBoost may reject a constraint list
# longer than the feature count.
nocovid_constraints = '(' + ','.join(['0'] * df_nocovid.shape[1]) + ')'

dem_nocovid_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan,
              monotone_constraints=nocovid_constraints,
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=173, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)
dem_nocovid_xgb.fit(df_nocovid, df_toDem)

# Five highest-ranked features of the model fitted above.
xgb.plot_importance(dem_nocovid_xgb, max_num_features=5)
plt.show()

In [None]:
# Flip classifier trained on the feature set WITHOUT the COVID columns.
#
# The constraint string was copied from the with-COVID model and had 40
# entries, two more than df_nocovid has columns (the ToRep no-COVID model
# uses 38). Build it from the frame so it always has exactly one "0"
# (unconstrained) entry per feature; XGBoost may reject a constraint list
# longer than the feature count.
nocovid_constraints = '(' + ','.join(['0'] * df_nocovid.shape[1]) + ')'

flip_nocovid_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan,
              monotone_constraints=nocovid_constraints,
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=173, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)
flip_nocovid_xgb.fit(df_nocovid, df_Flip)

# Five highest-ranked features of the model fitted above.
xgb.plot_importance(flip_nocovid_xgb, max_num_features=5)
plt.show()

In [None]:
# Preview the first five rows of the feature matrix without COVID columns.
df_nocovid.head(n=5)