In [None]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from scipy.stats import skew
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, GradientBoostingRegressor
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, validation_curve, cross_val_score, train_test_split, KFold
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, average_precision_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from tqdm import tqdm
# from google.colab import files
import warnings
warnings.filterwarnings('ignore')

# !wandb login  # authenticate via the WANDB_API_KEY environment variable; never commit a key (rotate the one previously pasted here)
# import wandb
# from wandb.xgboost import wandb_callback
# wandb.init(project="dslab-kaggle")

%matplotlib inline

# County Predictions for 2020 Election (Covid)

### Load Data and explore

In [None]:
# Load the per-county statistics table; the CSV's first column becomes the index.
df = pd.read_csv('data/county_statistics.csv', index_col=0)
# Preview the first rows.
df.head()

In [None]:
# Summary statistics of the loaded county table.
df.describe()

In [None]:
# Correlate every column from position 15 onward with the raw 2016/2020 vote
# counts, then keep only the four vote columns so each heatmap row shows how
# one variable correlates with each candidate's vote total.
vote_cols = ["votes20_Donald_Trump", "votes20_Joe_Biden", "votes16_Donald_Trump", "votes16_Hillary_Clinton"]
heatmap_cols = list(df.columns[15:]) + vote_cols
corrs = df[heatmap_cols].corr()[vote_cols]

plt.figure(figsize=(12, 8))
hm = sns.heatmap(corrs, annot=True)
plt.title('Relations between votes in 2020 and 2016 to non voting data')
plt.show()

In [None]:
# County names grouped by which major-party candidate received more votes.
# Ties go to Trump (>=). A row for which both comparisons are False
# (e.g. NaN vote counts) ends up in neither list for that year.
trump_ahead_16 = df['votes16_Donald_Trump'] >= df['votes16_Hillary_Clinton']
clinton_ahead_16 = df['votes16_Donald_Trump'] < df['votes16_Hillary_Clinton']
trump_ahead_20 = df['votes20_Donald_Trump'] >= df['votes20_Joe_Biden']
biden_ahead_20 = df['votes20_Donald_Trump'] < df['votes20_Joe_Biden']

trump_win_counties_16 = df.loc[trump_ahead_16, 'county']
trump_win_counties_20 = df.loc[trump_ahead_20, 'county']
hillary_win_counties_16 = df.loc[clinton_ahead_16, 'county']
biden_win_counties_20 = df.loc[biden_ahead_20, 'county']

### 2020 Democratic Votes

In [None]:
# Counties carried by Clinton in 2016, split by who carried them in 2020.
# Strict inequalities are used, so exact ties (and rows where a comparison
# is False on both sides, e.g. NaN counts) fall into neither group.
clinton_16 = df['votes16_Donald_Trump'] < df['votes16_Hillary_Clinton']
hc_jb = df.loc[clinton_16 & (df['votes20_Donald_Trump'] < df['votes20_Joe_Biden'])]
hc_dt = df.loc[clinton_16 & (df['votes20_Donald_Trump'] > df['votes20_Joe_Biden'])]

plt.bar(["JB", "DT"], [len(hc_jb), len(hc_dt)], color=["blue", "red"])
plt.title("Candidate voted by counties that voted HC in 2016")
plt.xlabel("Candidate")
plt.ylabel("Number of counties")
plt.show()

print("Number of counties that shifted from HC to DT: {}".format(len(hc_dt)))
# The share is taken over every county in `df`, not only the 2016 Clinton counties.
print("Percentage of counties that shifted from HC to DT: {:%}".format(len(hc_dt) / len(df)))

In [None]:
# Preview counties that Clinton carried in 2016 and Trump carried in 2020.
hc_dt.head()

### 2020 Republican Votes

In [None]:
# Counties carried by Trump in 2016, split by who carried them in 2020.
# Strict inequalities are used, so exact ties (and rows where a comparison
# is False on both sides, e.g. NaN counts) fall into neither group.
trump_16 = df['votes16_Donald_Trump'] > df['votes16_Hillary_Clinton']
dt_jb = df.loc[trump_16 & (df['votes20_Donald_Trump'] < df['votes20_Joe_Biden'])]
dt_dt = df.loc[trump_16 & (df['votes20_Donald_Trump'] > df['votes20_Joe_Biden'])]

plt.bar(["JB", "DT"], [len(dt_jb), len(dt_dt)], color=["blue", "red"])
plt.title("Candidate voted by counties that voted DT in 2016")
plt.xlabel("Candidate")
plt.ylabel("Number of counties")
plt.show()

print("Number of counties that shifted from DT to JB: {}".format(len(dt_jb)))
# The share is taken over every county in `df`, not only the 2016 Trump counties.
print("Percentage of counties that shifted from DT to JB: {:%}".format(len(dt_jb) / len(df)))

In [None]:
# Preview counties that Trump carried in 2016 and Biden carried in 2020.
dt_jb.head()

### Adding Binary Columns

In [None]:
# Flag counties whose winner changed party between 2016 and 2020
# (1 = flipped, 0 = otherwise). Strict comparisons: ties never count as a flip.
trump_won_20 = df['votes20_Donald_Trump'] > df['votes20_Joe_Biden']
biden_won_20 = df['votes20_Donald_Trump'] < df['votes20_Joe_Biden']
trump_won_16 = df['votes16_Donald_Trump'] > df['votes16_Hillary_Clinton']
clinton_won_16 = df['votes16_Donald_Trump'] < df['votes16_Hillary_Clinton']

df['flipped_rep'] = np.where(trump_won_20 & clinton_won_16, 1, 0)
df['flipped_dem'] = np.where(biden_won_20 & trump_won_16, 1, 0)

# Keep only the first 3111 rows (positional slice).
# NOTE(review): the reason for the 3111 cut-off is not visible here — confirm.
df = df.iloc[:3111]
df

### Looking at wider range of flip values

In [None]:
# Counties where a party's 2020 percentage column is at least `flip_threshold`
# above the same party's 2016 percentage column.
flip_threshold = 0.1
rep_gain_mask = df['percentage20_Donald_Trump'] >= df['percentage16_Donald_Trump'] + flip_threshold
dem_gain_mask = df['percentage20_Joe_Biden'] >= df['percentage16_Hillary_Clinton'] + flip_threshold
rep_shift = df.loc[rep_gain_mask]
dem_shift = df.loc[dem_gain_mask]
dem_shift.head()

In [None]:
# For each threshold t produced by linspace (35 evenly spaced values starting
# at 0.35 and stopping before 0), count the counties whose 2016 -> 2020 gain
# falls in the half-open bin [t, t + 0.01).
threshold_values = np.linspace(0.35, 0, 35, endpoint=False)
rep_shift_threshold, dem_shift_threshold = [], []

trump_16, trump_20 = df['percentage16_Donald_Trump'], df['percentage20_Donald_Trump']
clinton_16, biden_20 = df['percentage16_Hillary_Clinton'], df['percentage20_Joe_Biden']

for flip_threshold in threshold_values:
    rep_in_bin = (trump_20 >= trump_16 + flip_threshold) & (trump_20 < trump_16 + flip_threshold + 0.01)
    dem_in_bin = (biden_20 >= clinton_16 + flip_threshold) & (biden_20 < clinton_16 + flip_threshold + 0.01)
    rep_shift = df.loc[rep_in_bin]
    dem_shift = df.loc[dem_in_bin]
    rep_shift_threshold.append(len(rep_shift))
    dem_shift_threshold.append(len(dem_shift))

In [None]:
# Plot the per-bin county counts against the size of the shift, in percent.
# The x-axis is derived into its own name rather than written back into
# `threshold_values`: overwriting it made the cell non-idempotent, because a
# second run would flip and multiply by 100 again, leaving the x values out
# of step with the (always freshly flipped) counts.
shift_pct = np.flip(threshold_values) * 100

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(shift_pct, np.flip(rep_shift_threshold), label="Shifted Rep", color="red")
ax.plot(shift_pct, np.flip(dem_shift_threshold), label="Shifted Dem", color="blue")
ax.set_xlabel('Increment of shift')
ax.set_ylabel('# Counties')
ax.set_title('Number of Counties by Voting Shift %')
ax.legend()
# Render x ticks as whole-number percentages, e.g. "10%".
ax.xaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'))
plt.show()
fig.savefig('counties_shift.jpg')

### Try percentile bucket -> Petersen Index

In [None]:
# Display the county table in its current state.
df

In [None]:
# Reset the thresholds and start from empty frames that will hold, per
# threshold bin, the counties falling into that bin.
threshold_values = np.linspace(0.35, 0, 35, endpoint=False)
rep_shift_threshold, dem_shift_threshold = pd.DataFrame(), pd.DataFrame()
rep_shift_threshold

In [None]:
# For each threshold bin [t, t + 0.01), record WHICH counties (by index label
# of `df`) fall into it, one column per bin named "<t>_rep" / "<t>_dem".
#
# Bins contain different numbers of counties. Assigning such unequal-length
# columns one by one to an existing DataFrame raises a length-mismatch
# ValueError, so each bin is collected as its own Series and the frames are
# built in a single step; pandas aligns the Series and pads the shorter
# columns with NaN (integer labels may therefore be shown as floats).
rep_columns = {}
dem_columns = {}
for flip_threshold in threshold_values:
    rep_shift = df.loc[(df['percentage20_Donald_Trump'] >= df['percentage16_Donald_Trump'] + flip_threshold) & (df['percentage20_Donald_Trump'] < df['percentage16_Donald_Trump'] + flip_threshold + 0.01)]
    dem_shift = df.loc[(df['percentage20_Joe_Biden'] >= df['percentage16_Hillary_Clinton'] + flip_threshold) & (df['percentage20_Joe_Biden'] < df['percentage16_Hillary_Clinton'] + flip_threshold + 0.01)]
    # Round so the column name reads "0.35" rather than a long float repr.
    label = str(round(flip_threshold, 2))
    rep_columns[label + "_rep"] = pd.Series(rep_shift.index)
    dem_columns[label + "_dem"] = pd.Series(dem_shift.index)

rep_shift_threshold = pd.DataFrame(rep_columns)
dem_shift_threshold = pd.DataFrame(dem_columns)

# Historical Scores - Volatility

In [None]:
# Load the historical county presidential returns; the CSV's first column
# becomes the index.
# NOTE(review): this rebinds `df`, replacing the county-statistics frame loaded
# earlier in the notebook. Any later cell that expects the county-statistics
# columns must load that file again.
df = pd.read_csv('data/countypres_2000-2016.csv', index_col=0)
# One row per distinct FIPS value, with the volatility score initialised to 0.
county_scores = pd.DataFrame({'volatility_score': 0}, index=df.FIPS.unique())

df.head()

In [None]:
# general cleaning: keep only the two major parties and drop the columns
# that the scoring below does not use
major_parties = ['democrat', 'republican']
unused_cols = ['state', 'office', 'candidate', 'version']
df = df.loc[df['party'].isin(major_parties)].drop(columns=unused_cols)
df.head()

In [None]:
# transform votes into percentages for better scoring
# Each row carries one party's share; the other party's column is 0 on that
# row, so summing per (year, FIPS) yields one row holding both shares.
vote_share = df['candidatevotes'] / df['totalvotes']
df['percentage_dem'] = np.where(df['party'] == 'democrat', vote_share, 0)
df['percentage_rep'] = np.where(df['party'] == 'republican', vote_share, 0)
df = df.drop(columns=['candidatevotes', 'totalvotes', 'party'])

per_county_year = {
    'county': 'first',
    'state_po': 'first',
    'percentage_dem': 'sum',
    'percentage_rep': 'sum',
}
df = df.groupby(['year', 'FIPS']).agg(per_county_year)
df

In [None]:
# Volatility score per county: the mean absolute change in vote share between
# consecutive rows of that county, averaged over the two parties.
# Assumes the rows of each county are in chronological order (a groupby on
# (year, FIPS) returns them sorted that way).
# Variance of the shares would be an alternative measure.

# The scores are fractional, so make the target column float before writing
# into it instead of relying on an implicit int -> float upcast.
county_scores['volatility_score'] = county_scores['volatility_score'].astype(float)

# Level 1 of the index is FIPS, so each group holds one county's rows.
for fips, fips_df in tqdm(df.groupby(level=1)):
    # A county with a single row has no consecutive pair; the mean of the
    # empty difference array is NaN, which is kept as the score.
    rep_var = np.abs(np.diff(fips_df['percentage_rep'].to_numpy())).mean()
    dem_var = np.abs(np.diff(fips_df['percentage_dem'].to_numpy())).mean()
    # `.at` is a scalar accessor and needs both the row and the column label.
    county_scores.at[fips, 'volatility_score'] = (rep_var + dem_var) / 2
    

In [None]:
# Display the volatility score table (one row per FIPS value).
county_scores

### Model Training

#### Clean Data

In [None]:
# In a top-to-bottom run, `df` at this point holds the historical returns
# grouped by (year, FIPS) from the volatility section, which has none of the
# columns used below. Dropping them would raise a KeyError, so start again from
# the county statistics file.
# NOTE(review): the flipped_rep / flipped_dem flags and the 3111-row trim from
# the exploration section are not re-applied. The flags are computed from the
# 2020 vote counts, so using them as features would leak the 2020 target —
# confirm this is the intended feature set.
df = pd.read_csv('data/county_statistics.csv', index_col=0)

# Remove the 2016 results and the free-text county name from the features.
df = df.drop(columns=[
    'percentage16_Donald_Trump',
    'percentage16_Hillary_Clinton',
    'total_votes16',
    'votes16_Donald_Trump',
    'votes16_Hillary_Clinton',
    'county',
])
# Keep only rows that have both a 2020 Trump percentage and a latitude.
df = df.dropna(how='any', subset=['percentage20_Donald_Trump', 'lat'])

In [None]:
# Regression target: the 2020 Trump percentage column.
trump_target = df['percentage20_Donald_Trump']

# Remove every 2020 outcome column from the features and replace the
# categorical `state` column with one indicator column per state.
outcome_cols = ['percentage20_Donald_Trump', 'percentage20_Joe_Biden', 'votes20_Donald_Trump', 'votes20_Joe_Biden']
one_hot = pd.get_dummies(df['state'])
df = df.drop(columns=outcome_cols + ['state']).join(one_hot)

In [None]:
# Preview the first rows of the frame that is passed to the model as features.
df.head()

#### Train Model

In [None]:
# Hold out 33% of the rows for evaluation and fit a gradient-boosting
# regressor on the rest. Fixed seeds make the split and the fitted model
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(df, trump_target, test_size=0.33, random_state=42)
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report the mean squared error.
preds = gb.predict(X_test)
mean_squared_error(y_test,preds)

In [None]:
# Per-capita COVID features.
df["cases_per_pop"] = df['cases'] / df['TotalPop']
df['death_per_pop'] = df['deaths'] / df['TotalPop']

# Cross-validate on the features without cases_per_pop (death_per_pop stays in).
subset_data = df.drop(columns=['cases_per_pop'])
scores = cross_val_score(gb, subset_data, trump_target, cv=10, scoring="neg_mean_absolute_error")

# The scorer returns the NEGATED mean absolute error (higher is better), so
# flip the sign and label it as an error rather than as an accuracy.
print("MAE: %0.5f (+/- %0.5f)" % (-scores.mean(), scores.std()))