This notebook is focused on trying Dr. Dimakis' suggestion of trying different subsets to see how COVID stats interacts with other features.

In [28]:
from google.colab import files
import io
import pandas as pd
import numpy as np
from matplotlib import pyplot
import missingno as msno
import seaborn as sns 

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

## File Upload and Data Read

In [13]:
uploaded = files.upload()
data = pd.read_csv(io.BytesIO(uploaded['county_statistics.csv']))

Saving county_statistics.csv to county_statistics (1).csv


## Data Preprocessing

In [19]:
## Prepare data by getting rid of missing values and features we don't want to use.

# Drop useless entries (no covid data, 2016 data, some missing labels)
data = data[:3111]

# Drop 2016 results, too powerful to use.
data = data.drop(['percentage16_Donald_Trump', 'percentage16_Hillary_Clinton', 'total_votes16', 'votes16_Donald_Trump', 'votes16_Hillary_Clinton'],  axis=1)

# Drop ID column
data = data.drop(data.columns[0], axis=1)

# Drop county name, this doesn't really matter for now.
data = data.drop('county', axis=1)

# Drop rows with any NaN in the label column. Now have about ~3k useable counties left.
data = data.dropna(how='any', subset=['percentage20_Donald_Trump'])

# Drop rows with any NaN in the lat, long, cases, deaths columns. There are about 40 of these.
data = data.dropna(how='any', subset=['lat'])

KeyError: ignored

In [15]:
# Split label
trump = data['percentage20_Donald_Trump']

# Remove all other 2020 voting info
data = data.drop(['percentage20_Donald_Trump', 'percentage20_Joe_Biden', 'votes20_Donald_Trump', 'votes20_Joe_Biden'],  axis=1)

In [16]:
# Create new features to capture COVID data per capita
data["cases_per_pop"] = data['cases'] / data['TotalPop'] 
data['death_per_pop'] = data['deaths'] / data['TotalPop']

In [17]:
# Drop old population related stats and flat covid
data = data.drop(['TotalPop', 'total_votes20', 'cases', 'deaths'], axis=1)

In [23]:
# One hot encode the State column
data = pd.get_dummies(data)

In [24]:
data.head()

Unnamed: 0,lat,long,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,cases_per_pop,death_per_pop,state_AL,state_AR,state_AZ,...,state_HI,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_MT,state_NC,state_ND,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_RI,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,34.223334,-82.461707,12044.0,12744.0,1.3,68.9,27.6,0.1,0.3,0.0,19452.0,35254.0,2259.0,19234.0,799.0,22.7,32.1,27.2,20.7,20.8,10.6,20.7,78.3,11.1,0.5,1.8,1.8,6.5,25.8,9505.0,78.8,13.3,7.8,0.1,9.4,0.032475,0.000686,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,30.295065,-92.414197,30433.0,32174.0,2.4,77.5,17.6,0.1,0.1,0.0,45197.0,40492.0,2544.0,21591.0,1002.0,21.5,27.6,27.6,16.9,25.7,15.0,14.8,83.2,10.3,0.2,1.6,2.2,2.5,27.6,24982.0,80.0,12.1,7.6,0.3,8.9,0.050825,0.001629,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,37.767072,-75.632346,16079.0,16761.0,8.8,60.3,28.3,0.3,0.7,0.0,24408.0,42260.0,2253.0,24266.0,1564.0,19.8,31.8,31.1,17.7,18.8,15.1,17.3,80.0,10.6,0.5,2.6,1.8,4.5,22.0,13837.0,74.6,18.1,7.1,0.2,5.4,0.037363,0.000579,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,43.452658,-116.241552,217999.0,217118.0,7.9,85.2,1.2,0.4,2.6,0.1,316189.0,60151.0,1294.0,31642.0,725.0,11.8,13.1,43.0,16.6,25.0,6.9,8.4,80.7,7.7,0.5,1.5,2.8,6.9,20.4,214984.0,78.3,15.0,6.6,0.1,4.3,0.040106,0.000416,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,41.330756,-94.471059,3552.0,3640.0,1.7,96.6,0.3,0.0,0.4,0.0,5572.0,49477.0,2633.0,28861.0,2055.0,9.5,12.1,28.2,16.9,20.0,17.3,17.6,77.9,12.4,0.3,2.8,0.4,6.2,22.3,3680.0,73.8,15.3,10.4,0.5,3.0,0.030868,0.000139,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Subset Creation
Here we are selecting a subset of features to work with.

In [78]:
# subset_data = data[['cases_per_pop', 'death_per_pop', 'VotingAgeCitizen']].copy()
subset_data = data.drop(['cases_per_pop'], axis=1)

## Model Creation

Using an unfitted Gradient Boosting here. Not interested in predicting Trump percentage as accurately as possible here but rather how COVID stats interact with other features.

**Questions:**

1. Are there features that render the COVID stats useless?
2. Are there subsets of features for which the COVID stats help? Help a lot? What are they and what are the possible explanations?

In [32]:
gb = GradientBoostingRegressor(random_state=0)
# gb.fit(data, trump)

In [79]:
scores = cross_val_score(gb, subset_data, trump, cv=10, scoring="neg_mean_absolute_error")

print("Accuracy: %0.5f (+/- %0.5f)" % (scores.mean(), scores.std()))

Accuracy: -0.04498 (+/- 0.00285)


# Results

All results below are using (negative) MAE unless otherwise stated.

---

Using all features : Accuracy: **-0.04494** (+/- 0.00317)

Using all features BUT COVID cols: Accuracy: **-0.04497** (+/- 0.00298) --> Just barley worse!!

Using all features BUT DEATH col: Accuracy: **-0.04482** (+/- 0.00307) --> Actually better?

Accuracy: -0.04498 (+/- 0.00285)

Using JUST covid_per_capita, deaths_per_capita: Accuracy: **-0.12159** (+/- 0.00586)

---
### Race Columns

Just Asian: Accuracy: -0.09933 (+/- 0.00408)

Asian w/ COVID cols: Accuracy: -0.09538 (+/- 0.00305)

Just White: Accuracy: -0.10411 (+/- 0.00529)

White w/ COVID cols: Accuracy: -0.09720 (+/- 0.00473)

---
### Other

Just VotingAgeCitizen: Accuracy: -0.09968 (+/- 0.00399)

VotingAgeCitizen w/ COVID cols: Accuracy: -0.09609 (+/- 0.00359)

Unemployed: Accuracy: -0.12076 (+/- 0.00610)

Unemployed w/ COVID cols: Accuracy: -0.11553 (+/- 0.00654)




## Recursive Feature Elimination

"Recursive Feature elimination: Recursive feature elimination performs a greedy search to find the best performing feature subset. It iteratively creates models and determines the best or the worst performing feature at each iteration. It constructs the subsequent models with the left features until all the features are explored. It then ranks the features based on the order of their elimination. In the worst case, if a dataset contains N number of features RFE will do a greedy search for 2N combinations of features."

In [47]:
from sklearn.feature_selection import RFE

selector = RFE(gb, n_features_to_select=10, step=1)
selector.fit(data, trump)

RFE(estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                        criterion='friedman_mse', init=None,
                                        learning_rate=0.1, loss='ls',
                                        max_depth=3, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_iter_no_change=None,
                                        presort='deprecated', random_state=0,
                                        subsample=1.0, tol=0.0001,
                                        validation_fraction=0.1, verbose=0,
                                        warm_start=False),
    n_feat

In [48]:
selector.support_

array([ True,  True, False, False, False,  True,  True, False,  True,
       False,  True, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False])

(3046, 87)

In [49]:
selector.ranking_

array([ 1,  1, 16,  9, 26,  1,  1, 36,  1, 53,  1, 12, 25,  3, 32, 21, 19,
       10,  4, 11,  1,  1,  2, 38,  1, 14, 13,  6, 24,  1, 35, 31, 18, 34,
        7, 15, 33, 67, 69, 71, 23,  8, 37, 78, 45, 43, 57, 59, 22, 68, 65,
       74, 41, 76, 73, 62, 72, 63, 52, 44, 39, 46, 77, 27, 70, 30, 64, 66,
       40, 60, 20, 75, 42, 55, 56, 61, 48, 49, 47,  5, 50, 51, 28, 54, 17,
       58, 29])

In [63]:
for i,rank in enumerate(list(selector.ranking_)):
  if rank == 1: 
    print(data.columns[i])

print('\n')

print(f'Rank of death_per_pop: {list(selector.ranking_)[list(data.columns).index("death_per_pop")]}')
print(f'Rank of cases_per_pop: {list(selector.ranking_)[list(data.columns).index("cases_per_pop")]}')

lat
long
White
Black
Asian
VotingAgeCitizen
Construction
Production
Transit
Employed


Rank of death_per_pop: 33
Rank of cases_per_pop: 15
