# A basic template to apply any models to train on the collected data

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import zscore
from sklearn import preprocessing
pd.set_option('display.max_columns', None)
import graphviz
plt.rcParams['figure.figsize'] = [18, 10]

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split



In [2]:
trainDF = pd.read_csv('../outputs/trainXY.csv',parse_dates=['date_implement'])
trainDF = trainDF.sample(frac=1)

In [3]:
trainDF.head()
trainDF = trainDF.dropna()

# Selecting a unique policy

In [5]:
policies = trainDF['policy'].unique()
policy_selected = policies[12]
print(f'Selected Policy: {policy_selected}')
trainDF = trainDF[trainDF['policy']==policy_selected]

Selected Policy: C6_Stay at home requirements


In [6]:
# Finalizing Indexes so we can drop state names 
trainDF = trainDF.reset_index(drop=True)

# Get a list of columns with features 
columns = trainDF.columns.to_list()

In [7]:
Y_columns = [x for x in columns if 'Y_' in x]

In [8]:
X_columns = [x for x in columns if 'Y_' not in x and 'Unnamed' not in x]

X_columns.remove('date_implement')
# X_columns.remove('policy')
X_columns.remove('state_x')
X_columns.remove('policy_type')
X_columns.remove('stateName')
X_columns.remove('CEN_stateCode')
X_columns.remove('submission_date')
X_columns.remove('state_y')

X_columns.remove('caseInterpolate_gauss3')
X_columns.remove('caseInterpolate_MA7')
X_columns.remove('caseInterpolate_savitzky31_3')
X_columns.remove('caseInterpolate_gauss8')
X_columns.remove('new_case_zscore')
X_columns.remove('new_case')
X_columns.remove('policy')



In [9]:
X = trainDF[X_columns]
y = trainDF[Y_columns]


In [10]:
y

Unnamed: 0,Y_TREND_caseInterpolate_MA7_14,Y_TRENDQUANT_caseInterpolate_MA7_14,Y_SLOPE_caseInterpolate_MA7_14,Y_TREND_caseInterpolate_MA7_21,Y_TRENDQUANT_caseInterpolate_MA7_21,Y_SLOPE_caseInterpolate_MA7_21,Y_TREND_caseInterpolate_MA7_28,Y_TRENDQUANT_caseInterpolate_MA7_28,Y_SLOPE_caseInterpolate_MA7_28,Y_TREND_caseInterpolate_gauss8_14,Y_TRENDQUANT_caseInterpolate_gauss8_14,Y_SLOPE_caseInterpolate_gauss8_14,Y_TREND_caseInterpolate_gauss8_21,Y_TRENDQUANT_caseInterpolate_gauss8_21,Y_SLOPE_caseInterpolate_gauss8_21,Y_TREND_caseInterpolate_gauss8_28,Y_TRENDQUANT_caseInterpolate_gauss8_28,Y_SLOPE_caseInterpolate_gauss8_28,Y_TREND_caseInterpolate_gauss3_14,Y_TRENDQUANT_caseInterpolate_gauss3_14,Y_SLOPE_caseInterpolate_gauss3_14,Y_TREND_caseInterpolate_gauss3_21,Y_TRENDQUANT_caseInterpolate_gauss3_21,Y_SLOPE_caseInterpolate_gauss3_21,Y_TREND_caseInterpolate_gauss3_28,Y_TRENDQUANT_caseInterpolate_gauss3_28,Y_SLOPE_caseInterpolate_gauss3_28
0,1.0,0.530612,0.221538,1.0,0.353741,0.221538,1.0,0.265306,0.221538,1.0,1.571429,0.906923,1.0,1.142857,0.906923,1.0,0.857143,0.906923,1.0,1.000000,0.452308,1.0,0.714286,0.452308,1.0,0.535714,0.452308
1,-1.0,-5.693878,1.839451,1.0,3.598639,1.839451,-1.0,-13.321429,1.839451,-1.0,-2.500000,-2.403077,-1.0,-2.523810,-2.403077,-1.0,-2.464286,-2.403077,-1.0,-3.071429,0.220769,1.0,1.142857,0.220769,-1.0,-5.428571,0.220769
2,-1.0,-6.591837,-0.555714,-1.0,-1.904762,-0.555714,1.0,1.943878,-0.555714,-1.0,-4.500000,-1.363846,-1.0,-2.095238,-1.363846,1.0,0.678571,-1.363846,-1.0,-7.071429,-2.706154,-1.0,-2.619048,-2.706154,-1.0,-0.428571,-2.706154
3,1.0,0.316327,-1.925934,-1.0,-0.523810,-1.925934,1.0,0.076531,-1.925934,-1.0,-2.142857,-1.816154,-1.0,-1.952381,-1.816154,-1.0,-1.678571,-1.816154,-1.0,-0.571429,-1.645385,-1.0,-1.380952,-1.645385,-1.0,-0.571429,-1.645385
4,-1.0,-4.673469,-5.547912,-1.0,-5.353741,-5.547912,-1.0,-4.831633,-5.547912,-1.0,-3.571429,-4.757692,-1.0,-4.380952,-4.757692,-1.0,-5.357143,-4.757692,-1.0,-3.785714,-5.088462,-1.0,-4.761905,-5.088462,-1.0,-4.535714,-5.088462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,-1.0,-2.704082,-1.063297,-1.0,-0.945578,-1.063297,1.0,0.290816,-1.063297,-1.0,-1.571429,-0.910769,-1.0,-1.047619,-0.910769,-1.0,-0.392857,-0.910769,-1.0,-1.928571,-2.084615,-1.0,-2.000000,-2.084615,1.0,0.250000,-2.084615
143,1.0,0.255102,0.117802,1.0,0.170068,0.117802,1.0,0.127551,0.117802,1.0,4.357143,2.092308,1.0,3.000000,2.092308,1.0,2.250000,2.092308,1.0,1.071429,0.359231,1.0,0.714286,0.359231,1.0,0.535714,0.359231
144,-1.0,-78.285714,-90.662967,-1.0,-83.102041,-90.662967,-1.0,-79.255102,-90.662967,-1.0,-61.071429,-75.640769,-1.0,-70.857143,-75.640769,-1.0,-77.750000,-75.640769,-1.0,-57.000000,-85.017692,-1.0,-81.190476,-85.017692,-1.0,-70.321429,-85.017692
145,-1.0,-4.346939,-3.046813,-1.0,-2.319728,-3.046813,-1.0,-12.647959,-3.046813,-1.0,-5.642857,-7.318462,-1.0,-6.857143,-7.318462,-1.0,-10.678571,-7.318462,-1.0,-8.785714,-3.233846,-1.0,-3.238095,-3.233846,-1.0,-8.571429,-3.233846


# Encoding Policy Names 

In [11]:
# X['policy'].unique()

In [12]:
# le = preprocessing.LabelEncoder()
# le.fit(X['policy'])
# print(le.classes_)

In [13]:
# X['policy'] = pd.Series(le.transform(X['policy']))
# original_policies_decoded = pd.Series(le.inverse_transform(X['policy']))

In [14]:
# for i in range(len(le.classes_)):
#     print(f'{i}: {le.classes_[i]}')

In [15]:
# unique_policy = X['policy'].unique()

------

-------

# Train Models Here

In [16]:
y

Unnamed: 0,Y_TREND_caseInterpolate_MA7_14,Y_TRENDQUANT_caseInterpolate_MA7_14,Y_SLOPE_caseInterpolate_MA7_14,Y_TREND_caseInterpolate_MA7_21,Y_TRENDQUANT_caseInterpolate_MA7_21,Y_SLOPE_caseInterpolate_MA7_21,Y_TREND_caseInterpolate_MA7_28,Y_TRENDQUANT_caseInterpolate_MA7_28,Y_SLOPE_caseInterpolate_MA7_28,Y_TREND_caseInterpolate_gauss8_14,Y_TRENDQUANT_caseInterpolate_gauss8_14,Y_SLOPE_caseInterpolate_gauss8_14,Y_TREND_caseInterpolate_gauss8_21,Y_TRENDQUANT_caseInterpolate_gauss8_21,Y_SLOPE_caseInterpolate_gauss8_21,Y_TREND_caseInterpolate_gauss8_28,Y_TRENDQUANT_caseInterpolate_gauss8_28,Y_SLOPE_caseInterpolate_gauss8_28,Y_TREND_caseInterpolate_gauss3_14,Y_TRENDQUANT_caseInterpolate_gauss3_14,Y_SLOPE_caseInterpolate_gauss3_14,Y_TREND_caseInterpolate_gauss3_21,Y_TRENDQUANT_caseInterpolate_gauss3_21,Y_SLOPE_caseInterpolate_gauss3_21,Y_TREND_caseInterpolate_gauss3_28,Y_TRENDQUANT_caseInterpolate_gauss3_28,Y_SLOPE_caseInterpolate_gauss3_28
0,1.0,0.530612,0.221538,1.0,0.353741,0.221538,1.0,0.265306,0.221538,1.0,1.571429,0.906923,1.0,1.142857,0.906923,1.0,0.857143,0.906923,1.0,1.000000,0.452308,1.0,0.714286,0.452308,1.0,0.535714,0.452308
1,-1.0,-5.693878,1.839451,1.0,3.598639,1.839451,-1.0,-13.321429,1.839451,-1.0,-2.500000,-2.403077,-1.0,-2.523810,-2.403077,-1.0,-2.464286,-2.403077,-1.0,-3.071429,0.220769,1.0,1.142857,0.220769,-1.0,-5.428571,0.220769
2,-1.0,-6.591837,-0.555714,-1.0,-1.904762,-0.555714,1.0,1.943878,-0.555714,-1.0,-4.500000,-1.363846,-1.0,-2.095238,-1.363846,1.0,0.678571,-1.363846,-1.0,-7.071429,-2.706154,-1.0,-2.619048,-2.706154,-1.0,-0.428571,-2.706154
3,1.0,0.316327,-1.925934,-1.0,-0.523810,-1.925934,1.0,0.076531,-1.925934,-1.0,-2.142857,-1.816154,-1.0,-1.952381,-1.816154,-1.0,-1.678571,-1.816154,-1.0,-0.571429,-1.645385,-1.0,-1.380952,-1.645385,-1.0,-0.571429,-1.645385
4,-1.0,-4.673469,-5.547912,-1.0,-5.353741,-5.547912,-1.0,-4.831633,-5.547912,-1.0,-3.571429,-4.757692,-1.0,-4.380952,-4.757692,-1.0,-5.357143,-4.757692,-1.0,-3.785714,-5.088462,-1.0,-4.761905,-5.088462,-1.0,-4.535714,-5.088462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,-1.0,-2.704082,-1.063297,-1.0,-0.945578,-1.063297,1.0,0.290816,-1.063297,-1.0,-1.571429,-0.910769,-1.0,-1.047619,-0.910769,-1.0,-0.392857,-0.910769,-1.0,-1.928571,-2.084615,-1.0,-2.000000,-2.084615,1.0,0.250000,-2.084615
143,1.0,0.255102,0.117802,1.0,0.170068,0.117802,1.0,0.127551,0.117802,1.0,4.357143,2.092308,1.0,3.000000,2.092308,1.0,2.250000,2.092308,1.0,1.071429,0.359231,1.0,0.714286,0.359231,1.0,0.535714,0.359231
144,-1.0,-78.285714,-90.662967,-1.0,-83.102041,-90.662967,-1.0,-79.255102,-90.662967,-1.0,-61.071429,-75.640769,-1.0,-70.857143,-75.640769,-1.0,-77.750000,-75.640769,-1.0,-57.000000,-85.017692,-1.0,-81.190476,-85.017692,-1.0,-70.321429,-85.017692
145,-1.0,-4.346939,-3.046813,-1.0,-2.319728,-3.046813,-1.0,-12.647959,-3.046813,-1.0,-5.642857,-7.318462,-1.0,-6.857143,-7.318462,-1.0,-10.678571,-7.318462,-1.0,-8.785714,-3.233846,-1.0,-3.238095,-3.233846,-1.0,-8.571429,-3.233846


In [17]:
X = X.apply(zscore)
y = y.apply(zscore)

In [18]:
from sklearn.datasets import load_iris
from sklearn import tree

In [19]:
clf = tree.DecisionTreeRegressor()

In [20]:
y_selected = y[['Y_SLOPE_caseInterpolate_MA7_21']]

In [21]:
clf = clf.fit(X, y_selected)

In [22]:
# dot_data = tree.export_graphviz(clf, out_file=None) 
# graph = graphviz.Source(dot_data) 
# graph.render("tree") 

In [23]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                     feature_names=X.columns.to_list(), 
                     filled=True, rounded=True,  
                     special_characters=True)  
graph = graphviz.Source(dot_data)  
graph.render(f'../outputs/TREEOP_{policy_selected}')

'../outputs/TREEOP_C6_Stay at home requirements.pdf'

# Most Important Features 

In [24]:
# Random Forest 

regr = RandomForestRegressor(max_depth=100, random_state=0)
regr.fit(X, y_selected)

  regr.fit(X, y_selected)


RandomForestRegressor(max_depth=100, random_state=0)

In [25]:
def imp_df(column_names, importances):
    data = {
        'Feature': column_names,
        'Importance': importances,
    }
    df = pd.DataFrame(data) \
        .set_index('Feature') \
        .sort_values('Importance', ascending=False)

    return df

In [26]:
base_imp = imp_df(X.columns, regr.feature_importances_)
print(base_imp)

                                                 Importance
Feature                                                    
metric_change                                      0.321274
S1_people_fully_vaccinated                         0.090440
FD_moreThan10kCitizens                             0.073605
S1_people_fully_vaccinated_per_hundred             0.060951
CEN_Without Health Care Coverage                   0.042862
S2_people_vaccinated                               0.037793
S4_people_vaccinated                               0.034381
CEN_Hispanic or Latino (of any race)               0.034286
S4_people_fully_vaccinated                         0.032027
CEN_Total Population                               0.025160
S3_people_vaccinated                               0.021419
CEN_Total Employer Establishments                  0.020184
S3_people_fully_vaccinated                         0.017044
CEN_Total Households                               0.015767
FD_adminAndFireResponse                 