# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [60]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

# Import Data

In [3]:
df = pd.read_csv('../cleaned_data/merged.csv')
df.head()

Unnamed: 0,fips_code,state,estimated_hesitant,estimated_hesitant_or_unsure,estimated_strongly_hesitant,social_vulnerability_index_(svi),svi_category,cvac_level_of_concern_for_vaccination_rollout,cvac_level_of_concern,percent_adults_fully_vaccinated_against_covid-19_(as_of_6/10/21),...,series_complete_65pluspop_pct_svi,metro_status,series_complete_pop_pct_ur_equity,series_complete_12pluspop_pct_ur_equity,series_complete_18pluspop_pct_ur_equity,series_complete_65pluspop_pct_ur_equity,administered_dose1_recip_5plus,administered_dose1_recip_5pluspop_pct,series_complete_5plus,series_complete_5pluspop_pct
0,1123,ALABAMA,0.1806,0.24,0.1383,0.89,Very High Vulnerability,0.64,High Concern,0.305,...,15.0,Non-metro,6.0,7.0,7.0,7.0,17618.0,46.1,14892.0,39.0
1,1121,ALABAMA,0.1783,0.235,0.1368,0.87,Very High Vulnerability,0.84,Very High Concern,0.265,...,15.0,Non-metro,6.0,6.0,7.0,7.0,33819.0,44.7,27653.0,36.5
2,1131,ALABAMA,0.1735,0.2357,0.1337,0.93,Very High Vulnerability,0.94,Very High Concern,0.394,...,16.0,Non-metro,7.0,8.0,8.0,8.0,6538.0,67.3,5151.0,53.0
3,1129,ALABAMA,0.1735,0.2357,0.1337,0.73,High Vulnerability,0.82,Very High Concern,0.308,...,12.0,Non-metro,7.0,8.0,8.0,8.0,8636.0,56.0,7122.0,46.2
4,1133,ALABAMA,0.1805,0.2313,0.1379,0.7,High Vulnerability,0.8,High Concern,0.163,...,10.0,Non-metro,5.0,5.0,5.0,6.0,5938.0,26.6,4913.0,22.0


In [4]:
df.columns

Index(['fips_code', 'state', 'estimated_hesitant',
       'estimated_hesitant_or_unsure', 'estimated_strongly_hesitant',
       'social_vulnerability_index_(svi)', 'svi_category',
       'cvac_level_of_concern_for_vaccination_rollout',
       'cvac_level_of_concern',
       'percent_adults_fully_vaccinated_against_covid-19_(as_of_6/10/21)',
       'percent_hispanic',
       'percent_non-hispanic_american_indian/alaska_native',
       'percent_non-hispanic_asian', 'percent_non-hispanic_black',
       'percent_non-hispanic_native_hawaiian/pacific_islander',
       'percent_non-hispanic_white', 'recip_county', 'recip_state',
       'series_complete_pop_pct', 'series_complete_yes',
       'series_complete_12plus', 'series_complete_12pluspop_pct',
       'series_complete_18plus', 'series_complete_18pluspop_pct',
       'series_complete_65plus', 'series_complete_65pluspop_pct',
       'completeness_pct', 'administered_dose1_recip',
       'administered_dose1_pop_pct', 'administered_dose1_rec

In [5]:
pruned = df[['state','social_vulnerability_index_(svi)','estimated_hesitant_or_unsure',
           'percent_hispanic','percent_non-hispanic_american_indian/alaska_native',
           'percent_non-hispanic_asian','percent_non-hispanic_black','percent_non-hispanic_native_hawaiian/pacific_islander',
           'percent_non-hispanic_white']]

In [6]:
X = pruned.drop(columns=['estimated_hesitant_or_unsure'])
y = pruned['estimated_hesitant_or_unsure']

In [7]:
X = pd.get_dummies(data=X, columns=['state'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                   random_state=42)

# Linear Regression
(0.0023815017692421525, 0.0025046182379516026)

In [9]:
linreg=LinearRegression()
linreg.fit(X_train, y_train)
mean_squared_error(y_train, linreg.predict(X_train)),mean_squared_error(y_test, linreg.predict(X_test))

(0.00018690041483654678, 0.00022874974127649097)

In [10]:
coefs = pd.DataFrame(list(zip(X.columns,linreg.coef_)), columns=['variable','coefficient'])

In [11]:
coefs.sort_values(by='coefficient',ascending=False)

Unnamed: 0,variable,coefficient
5,percent_non-hispanic_native_hawaiian/pacific_i...,0.173841
32,state_MONTANA,0.1175782
2,percent_non-hispanic_american_indian/alaska_na...,0.1110447
56,state_WYOMING,0.1091615
30,state_MISSISSIPPI,0.09441496
6,percent_non-hispanic_white,0.08884735
8,state_ALASKA,0.07976118
18,state_IDAHO,0.0755779
4,percent_non-hispanic_black,0.07516749
24,state_LOUISIANA,0.07360075


# Random Forests
(4.793140984201448e-05, 0.0003138931016051236)

In [12]:
RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [73]:
gs = GridSearchCV(estimator=RandomForestRegressor(),
                 param_grid={
                     'n_estimators':[500],
                     'max_depth':np.linspace(10,50,5),
                     'warm_start':[True,False],
                 }, cv=5, n_jobs=-1,
                 scoring='neg_mean_squared_error')

In [74]:
%%time

gs.fit(X_train,y_train)

Wall time: 1min 15s


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': array([10., 20., 30., 40., 50.]),
                         'n_estimators': [500], 'warm_start': [True, False]},
             scoring='neg_mean_squared_error')

In [75]:
mean_squared_error(y_train, gs.predict(X_train)),mean_squared_error(y_test, gs.predict(X_test))

(4.793140984201448e-05, 0.0003138931016051236)

In [76]:
gs.best_params_

{'max_depth': 40.0, 'n_estimators': 500, 'warm_start': False}

# AdaBoost
(0.001259134303402612, 0.00137422633741578)

In [33]:
AdaBoostRegressor().get_params()

{'base_estimator': None,
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': None}

In [77]:
ada_gs = GridSearchCV(estimator=AdaBoostRegressor(),
                     param_grid={
                         'learning_rate': np.linspace(.1,2,5),
                         'loss': ['linear','square','exponential'],
                         'n_estimators': [500],
                         'random_state': [42]
                     },cv=5, n_jobs=-1,
                     scoring='neg_mean_squared_error')

In [78]:
%%time

ada_gs.fit(X_train, y_train)

Wall time: 34.5 s


GridSearchCV(cv=5, estimator=AdaBoostRegressor(), n_jobs=-1,
             param_grid={'learning_rate': array([0.1  , 0.575, 1.05 , 1.525, 2.   ]),
                         'loss': ['linear', 'square', 'exponential'],
                         'n_estimators': [500], 'random_state': [42]},
             scoring='neg_mean_squared_error')

In [79]:
ada_gs.best_params_

{'learning_rate': 1.525,
 'loss': 'square',
 'n_estimators': 500,
 'random_state': 42}

In [80]:
mean_squared_error(y_train, ada_gs.predict(X_train)),mean_squared_error(y_test, ada_gs.predict(X_test))

(0.001259134303402612, 0.00137422633741578)

# XGBoost
(1.7443373963809892e-05, 0.0002301318735520251)

In [63]:
XGBRegressor().get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [81]:
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [82]:
mean_squared_error(y_train, xgb.predict(X_train)),mean_squared_error(y_test, xgb.predict(X_test))

(3.855291674682709e-05, 0.00023472043586748872)

In [98]:
xgb_gs = GridSearchCV(estimator=XGBRegressor(),
                     param_grid={
                         'max_depth':[10,11,12,13],
                         'learning_rate':np.linspace(.01,.1,3),
                         'n_estimators':[200],
                         'random_state':[42]
                     },
                     scoring='neg_mean_squared_error',
                     n_jobs=-1,
                     cv=5)

In [99]:
xgb_gs.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                            

In [100]:
xgb_gs.best_params_

{'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 200,
 'random_state': 42}

In [101]:
mean_squared_error(y_train, xgb_gs.predict(X_train)),mean_squared_error(y_test, xgb_gs.predict(X_test))

(1.7443373963809892e-05, 0.0002301318735520251)