In [1]:
import build_master_df
import pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Build the master dataframe via the project's build_master_df module.
# Per the progress log it prints, the call reads/merges Google mobility, NAICS,
# ACS census, CDC, Kaggle, NOAA weather and intervention data.
raw = build_master_df.build_df()

Reading Google mobility data...
Reading / merging NAICS business pattern data...
Reading / merging ACS census data...
Reading / merging CDC health data...
    Interpolating missing CDC county data with state data...
    Dropping extra CDC columns...
Reading / merging CDC cases and death data...
Reading / merging Kaggle data...
Reading /merging NOAA weather data...
	Interpolating missing weather data
	Interpolating TMAX...
	Interpolating TMIN...
	Interpolating PRCP...
	Interpolating TMIN_3d_avg...
	Interpolating TMIN_5d_avg...
	Interpolating TMIN_7d_avg...
	Interpolating TMIN_10d_avg...
	Interpolating TMAX_3d_avg...
	Interpolating TMAX_5d_avg...
	Interpolating TMAX_7d_avg...
	Interpolating TMAX_10d_avg...
	Creating precipiation dummy...
reading interventions...
	Transforming intervention columns...
	Transforming int_date_public schools...
	Transforming int_date_restaurant dine-in...
	Transforming int_date_federal guidelines...
	Transforming int_date_foreign travel ban...


In [3]:
# Identify variables
# Columns that identify a row rather than describe it.
index_vars   = ['StateName','CountyName','fips','date']
# Every "*change_from_baseline" column is treated as a candidate target.
target_vars  = [col for col in raw.columns if col.endswith('change_from_baseline')]
# The single target modelled in the cells below.
main_target  = 'retail_and_recreation_percent_change_from_baseline'
# Everything that is neither an identifier nor a target.
features     = [col for col in raw.columns if (col not in index_vars) and (col not in target_vars)]

# Columns excluded from the modelling frame.
# NOTE(review): the *_x / *_y names look like merge leftovers -- confirm upstream.
unused_cols = ['state_x','state_y','precip_dummy_x','precip_dummy_y','county',
               'stay_at_home_announced','stay_at_home_effective']

# Get full dataset for use
# Keep only rows where the main target is observed, select features + target + date,
# then drop the unused columns. The drop is chained (returns a new frame) rather than
# done with inplace=True, so a frame derived from `raw` is never mutated in place.
df = (
    raw.dropna(subset=[main_target])[features + [main_target] + ['date']]
       .drop(columns=unused_cols)
)

In [4]:
# Split train test
# 80% train / 20% test; time_series=True is forwarded to the project helper
# (presumably a chronological split on 'date' -- TODO confirm in pipeline).
train_full, test_full = pipeline.get_train_test(df, train_size=0.8, time_series=True)

# Separate the target series from the feature columns for each partition.
train_features, train_target = train_full.drop(columns=[main_target]), train_full[main_target]
test_features, test_target = test_full.drop(columns=[main_target]), test_full[main_target]

# Impute and normalize
# Median imputation first, then normalization; both helpers take and return the
# (train, test) pair together.
imputed_pair = pipeline.impute_missing(train_features, test_features, how='median')
train_features, test_features = pipeline.normalize_vars(*imputed_pair)

In [5]:
# Identify number of components that explain 95% of variance
VARIANCE_TO_KEEP = 0.95

# With a fractional n_components and svd_solver='full', scikit-learn itself keeps the
# smallest number of components whose cumulative explained variance exceeds the
# threshold. This replaces the earlier two-pass approach (fit 75 components, search the
# cumulative sum, refit), which failed when fewer than 75 features were available or
# when 75 components did not reach the threshold, and which could use the unseeded
# randomized solver.
pca = PCA(n_components=VARIANCE_TO_KEEP, svd_solver='full')
pca.fit(train_features)

# Cumulative explained-variance ratio of the retained components and their count.
cumsums = np.cumsum(pca.explained_variance_ratio_)
num_components = pca.n_components_

# Project both partitions with the PCA fitted on the training data only.
train_pca_features = pca.transform(train_features)
test_pca_features  = pca.transform(test_features)

In [23]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor

# Config: Dictionaries of models and hyperparameters
# MODELS maps a display name to an unconfigured estimator instance.
MODELS = {
    'LinearRegression': LinearRegression(), 
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'LinearSVR': LinearSVR(), 
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor':AdaBoostRegressor(),
    'KNeighborsRegressor':KNeighborsRegressor()
}


# GRID maps the same names to a list of parameter dictionaries, one per configuration
# to train. Every stochastic estimator is seeded with random_state=0 so that a
# Restart & Run All reproduces the same results.
GRID = {
    'LinearRegression': [{}],
    'Lasso': [{'alpha':x, 'random_state':0, 'max_iter':10000} for x in [0.01,0.05,0.1,0.5,1,5,10,50,100,500,1000]],
    'Ridge': [{'alpha':x, 'random_state':0, 'max_iter':10000} for x in [0.01,0.05,0.1,0.5,1,5,10,50,100,500,1000]],
    'LinearSVR': [{'C': x, 'epsilon':y, 'random_state': 0, 'max_iter':10000} \
                  for x in [0.01,0.05,0.1,0.5,1,5]
                  for y in [0.01,0.1,1]],
    # max_features=1.0 (use all features) is what 'auto' meant for the regressor;
    # the string 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
    'RandomForestRegressor': [{'n_estimators':x, 'max_features':y,
                               'n_jobs':-1, 'random_state':0} \
                               for y in [1.0,'log2','sqrt']
                               for x in [100,500,1000]],
    'AdaBoostRegressor': [{'n_estimators':y, 'random_state':0} for y in [50, 100, 150]],
    'KNeighborsRegressor': [{'n_neighbors':x} for x in np.arange(5,20)]
}

# Train every (model, parameter set) combination on the PCA-transformed training data;
# the test partition is passed along as well (presumably for evaluation -- see
# pipeline.build_regressors).
model_results = pipeline.build_regressors(MODELS, GRID,
                                          train_pca_features, train_target,
                                          test_pca_features, test_target)

	Training: LinearRegression | {}
	Time elapsed to train:  0:00:00.285666 

	Training: Lasso | {'alpha': 0.01, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.232981 

	Training: Lasso | {'alpha': 0.05, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.243482 

	Training: Lasso | {'alpha': 0.1, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.239682 

	Training: Lasso | {'alpha': 0.5, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.232989 

	Training: Lasso | {'alpha': 1, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.269148 

	Training: Lasso | {'alpha': 5, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.267060 

	Training: Lasso | {'alpha': 10, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.242777 

	Training: Lasso | {'alpha': 50, 'random_state': 0, 'max_iter': 10000}
	Time elapsed to train:  0:00:00.216727 

	Training: Lasso 