# Chicago COVID mobility reduction

## Set up

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Load features

In [3]:
# Load the processed census features, derive the join key `geo_12` from the
# last 12 characters of GEO_ID, then drop GEO_ID itself.
df_features_census = (
    pd.read_csv('./data/census_processed.csv', dtype={'geo_12': 'str'})
    .assign(geo_12=lambda census: census["GEO_ID"].map(lambda geo_id: str(geo_id)[-12:]))
    .drop(columns=["GEO_ID"])
)

In [5]:
# Place counts keyed by `geo_12`; the key is read as a string so it can be
# joined against the string key built for the census features.
df_features_places = pd.read_csv(
    './data/places_count_by_census_block.csv',
    dtype={'geo_12': 'str'},
)
# Summarise column names, dtypes and non-null counts.
df_features_places.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2194 entries, 0 to 2193
Data columns (total 11 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   geo_12                                               2194 non-null   object 
 1   automotive_repair_and_maintenance                    2194 non-null   float64
 2   child_day_care_services                              2194 non-null   float64
 3   elementary_and_secondary_schools                     2194 non-null   float64
 4   grocery_stores                                       2194 non-null   float64
 5   health_and_personal_care_stores                      2194 non-null   float64
 6   museums,_historical_sites,_and_similar_institutions  2194 non-null   float64
 7   offices_of_physicians                                2194 non-null   float64
 8   other_amusement_and_recreation_industries            2194 non-null  

In [6]:
# Join place counts and census features on `geo_12` (pandas' default inner
# join: only keys present in both tables are kept).
df_features = pd.merge(df_features_places, df_features_census, on='geo_12')

## Targets - differenced data 
### Load targets

In [7]:
# Adjusted mobility targets, keyed by `geo_12` (read as a string for the join).
df_targets_diff = pd.read_csv(
    './data/COVID_mobility_targets_adjusted.csv',
    dtype={'geo_12': 'str'},
)

### Create dataframe

In [8]:
# Attach the targets to the features on `geo_12` (default inner join).
df_diff = pd.merge(df_features, df_targets_diff, on='geo_12')

In [9]:
# Preview the first rows of the merged feature/target table.
df_diff.head()

Unnamed: 0,geo_12,automotive_repair_and_maintenance,child_day_care_services,elementary_and_secondary_schools,grocery_stores,health_and_personal_care_stores,"museums,_historical_sites,_and_similar_institutions",offices_of_physicians,other_amusement_and_recreation_industries,religious_organizations,...,Percent_Received_SNAP,Percent_Men_Usually_Fulltime_Employed,Percent_Women_Usually_Fulltime_Employed,Percent_No_Internet_Access,Percent_Computing_Device,Week,fraction_of_devices_home_adj,fraction_of_devices_work_adj,avg_time_away_all_adj,avg_time_away_leave_adj
0,170310101001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.18696,0.885906,0.382653,0.217742,0.782258,13,-0.086209,0.032714,-3.271241,-6.105604
1,170310101001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.18696,0.885906,0.382653,0.217742,0.782258,13,-0.090202,-0.035234,-2.828382,-5.560152
2,170310101001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.18696,0.885906,0.382653,0.217742,0.782258,13,-0.15146,0.077526,-2.280366,-5.380484
3,170310101001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.18696,0.885906,0.382653,0.217742,0.782258,14,-0.252164,0.048961,0.766927,-1.502261
4,170310101001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.18696,0.885906,0.382653,0.217742,0.782258,14,-0.256158,-0.018986,1.209787,-0.956809


## Data analysis

### Look at correlations between features and targets to get an idea about which features might be important

In [10]:
# Correlate numeric columns only: `geo_12` is a string identifier, and recent
# pandas versions raise on non-numeric columns instead of silently dropping
# them. `select_dtypes` works on old and new pandas alike.
df_diff.select_dtypes(include='number').corr()

Unnamed: 0,automotive_repair_and_maintenance,child_day_care_services,elementary_and_secondary_schools,grocery_stores,health_and_personal_care_stores,"museums,_historical_sites,_and_similar_institutions",offices_of_physicians,other_amusement_and_recreation_industries,religious_organizations,restaurants_and_other_eating_places,...,Percent_Received_SNAP,Percent_Men_Usually_Fulltime_Employed,Percent_Women_Usually_Fulltime_Employed,Percent_No_Internet_Access,Percent_Computing_Device,Week,fraction_of_devices_home_adj,fraction_of_devices_work_adj,avg_time_away_all_adj,avg_time_away_leave_adj
automotive_repair_and_maintenance,1.0,0.089269,0.043694,0.108628,0.07183,0.030565,0.014515,0.080897,-0.005203,0.055921,...,-0.029214,0.05315894,-0.010843,0.01662934,-0.01114432,2.871369e-05,-0.040189,0.061055,0.038112,0.032708
child_day_care_services,0.089269,1.0,0.166491,0.120443,0.159918,0.20162,0.053632,0.142352,0.154398,0.192279,...,0.046579,-0.04779458,-0.00149,0.02017015,-0.02424201,3.437736e-05,0.045434,0.011793,-0.071588,-0.054158
elementary_and_secondary_schools,0.043694,0.166491,1.0,0.034569,0.032199,0.071838,0.059372,0.035574,0.190001,0.008739,...,0.075722,-0.08904497,-0.068436,0.02584205,-0.0123293,3.99561e-05,0.052699,-0.001437,-0.032609,-0.003877
grocery_stores,0.108628,0.120443,0.034569,1.0,0.355782,0.258065,0.026786,0.305776,-0.017846,0.481659,...,-0.018014,0.05612708,0.005824,-0.02143545,0.02017757,3.070792e-05,0.03474,0.032632,-0.07426,-0.063371
health_and_personal_care_stores,0.07183,0.159918,0.032199,0.355782,1.0,0.402375,0.314788,0.507243,-0.034102,0.701092,...,-0.196417,0.1437923,0.141808,-0.1585893,0.1441897,-4.133537e-05,0.006746,0.065487,-0.125267,-0.144787
"museums,_historical_sites,_and_similar_institutions",0.030565,0.20162,0.071838,0.258065,0.402375,1.0,0.121199,0.425576,-0.021382,0.517765,...,-0.139215,0.08443492,0.093338,-0.1330151,0.1154594,-0.0004234932,0.027146,0.035899,-0.120698,-0.128087
offices_of_physicians,0.014515,0.053632,0.059372,0.026786,0.314788,0.121199,1.0,0.14668,0.007566,0.174453,...,-0.112917,0.07606267,0.07017,-0.08121441,0.06991075,-0.0004950518,0.022417,0.02811,-0.048665,-0.045551
other_amusement_and_recreation_industries,0.080897,0.142352,0.035574,0.305776,0.507243,0.425576,0.14668,1.0,-0.043343,0.55165,...,-0.272249,0.2380633,0.256193,-0.2423743,0.2176391,-1.714266e-05,-0.054263,0.125001,-0.126782,-0.199451
religious_organizations,-0.005203,0.154398,0.190001,-0.017846,-0.034102,-0.021382,0.007566,-0.043343,1.0,-0.050089,...,0.294093,-0.274231,-0.137452,0.2057544,-0.1575955,4.170567e-05,0.165167,-0.101711,-0.043848,0.042001
restaurants_and_other_eating_places,0.055921,0.192279,0.008739,0.481659,0.701092,0.517765,0.174453,0.55165,-0.050089,1.0,...,-0.197764,0.1685919,0.147591,-0.1808712,0.1650446,1.833493e-05,0.021623,0.066961,-0.161741,-0.180185


Most feature–target correlations are weak (|r| is at most about 0.2 in the rows shown), so no single feature stands out as a strong linear predictor.

### Split into training and testing

In [11]:
from sklearn.model_selection import train_test_split

# `random_state` (not `random_seed`) is the keyword that makes the split
# reproducible; scikit-learn rejects unknown keywords.
# NOTE(review): the preview of df_diff shows several rows (weeks) for the same
# geo_12, so a row-level random split can place the same block in both train
# and test. Consider a group-wise split on geo_12 -- confirm before changing.
train, test = train_test_split(df_diff, test_size=0.2, random_state=23)

# Work on explicit copies so the imputation/cleaning steps below modify these
# frames directly instead of relying on the removed `is_copy` attribute.
train = train.copy()
test = test.copy()

### Impute missing

In [12]:
# Impute missing Median_Income with the median of the *training* data only, so
# no information from the test rows is used. Assigning the result back avoids
# chained in-place modification, which emits SettingWithCopyWarning and is a
# silent no-op under pandas copy-on-write.
median_income_train = train['Median_Income'].median()
train['Median_Income'] = train['Median_Income'].fillna(median_income_train)
test['Median_Income'] = test['Median_Income'].fillna(median_income_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [13]:
# Drop rows that still contain missing values after imputation. Rebinding the
# names (instead of inplace=True) avoids the SettingWithCopyWarning and makes
# the cell safe to re-run.
train = train.dropna()
test = test.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


## Regression model 

In [66]:
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
# Model inputs, in the column order used to build the design matrices.
features = [
    # Place-count columns
    'automotive_repair_and_maintenance',
    'child_day_care_services',
    'elementary_and_secondary_schools',
    'grocery_stores',
    'health_and_personal_care_stores',
    'museums,_historical_sites,_and_similar_institutions',
    'offices_of_physicians',
    'other_amusement_and_recreation_industries',
    'religious_organizations',
    'restaurants_and_other_eating_places',
    # Demographic / socio-economic columns
    'Median_Income',
    'Median_Age',
    'Percent_NonCitizen',
    'Percent_SpeakEngl_Poorly',
    'Percent_less_than_HS',
    'Percent_HS',
    'Percent_SomeCollege',
    'Percent_Bach',
    'Percent_Grad',
    'Percent_No_vehicals',
    'Percent_Received_SNAP',
    'Percent_Men_Usually_Fulltime_Employed',
    'Percent_Women_Usually_Fulltime_Employed',
    'Percent_No_Internet_Access',
    'Percent_Computing_Device',
]

# Target columns modelled in this notebook.
targets = [
    'fraction_of_devices_work_adj',
    'avg_time_away_all_adj',
]

def calc_lasso(a, xtrain, xtest, ytrain, ytest):
    """Fit a Lasso with penalty ``a`` and evaluate it on the held-out data.

    Parameters
    ----------
    a : float
        Regularisation strength passed to ``Lasso(alpha=a)``.
    xtrain, ytrain
        Training design matrix and target used for fitting.
    xtest, ytest
        Held-out design matrix and target used for evaluation.

    Returns
    -------
    tuple
        ``(test_score, mse, r2, n_coeff)`` where ``test_score`` is
        ``Lasso.score`` on the held-out data, ``mse`` and ``r2`` are computed
        from the held-out predictions, and ``n_coeff`` is the number of
        non-zero fitted coefficients.
    """
    model = Lasso(alpha=a).fit(xtrain, ytrain)
    predictions = model.predict(xtest)
    # Count the coefficients the L1 penalty did not shrink to exactly zero.
    nonzero_count = sum(1 for weight in model.coef_ if weight != 0)
    return (
        model.score(xtest, ytest),
        mean_squared_error(ytest, predictions),
        r2_score(ytest, predictions),
        nonzero_count,
    )


def run_linear_regression(train, test, target, verbose=False,
                          alphas=(0.0001, 0.001, 0.005, 0.01, 0.1, 0.5, 0.7)):
    """Sweep Lasso penalties for one target and report held-out metrics.

    Features (the module-level ``features`` list) are standardised with a
    scaler fitted on the training rows only, then a Lasso is fitted and
    evaluated for every value in ``alphas``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the ``features`` columns and ``target``.
    target : str
        Name of the target column.
    verbose : bool, default False
        When True, print the metrics for every alpha.
    alphas : iterable of float
        Penalty values to try; defaults to the grid originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per alpha with columns ``alpha``, ``test_score``, ``mse``,
        ``r2`` and ``n_coeff`` (previously nothing was returned, although the
        caller bound the result).

    Notes
    -----
    NOTE(review): the metrics are computed on ``test``, so choosing alpha from
    this table tunes on the test set. Consider a validation split or
    cross-validation on ``train`` instead.
    """
    X_train, X_test = train[features].values, test[features].values
    y_train, y_test = train[target].values, test[target].values

    # Fit the scaler on training data only, then apply it to the test data.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    rows = []
    for num in alphas:
        print("alpha = ", num)
        test_score, MSE, R2, n_coeff = calc_lasso(num, X_train_scaled, X_test_scaled, y_train, y_test)

        if verbose:
            print("\nRegression with {} target".format(target))
            print("MSE: ", MSE, "Lasso Test Score: ", test_score, "N Coefficients: ", n_coeff)

        rows.append({'alpha': num, 'test_score': test_score, 'mse': MSE,
                     'r2': R2, 'n_coeff': n_coeff})

    return pd.DataFrame(rows, columns=['alpha', 'test_score', 'mse', 'r2', 'n_coeff'])


def run_final_model(train, test, target, alpha):
    """Fit the final Lasso for ``target`` and print a plain-OLS baseline MSE.

    The module-level ``features`` columns are standardised with a scaler
    fitted on ``train``. An unpenalised ``LinearRegression`` is fitted first
    and its held-out MSE printed as "MSE norm"; the Lasso with the given
    ``alpha`` is then fitted on the same scaled training data and returned.

    Returns
    -------
    sklearn.linear_model.Lasso
        The fitted Lasso model (its coefficients refer to scaled features).
    """
    X_train = train[features].values
    X_test = test[features].values
    y_train = train[target].values
    y_test = test[target].values

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Baseline: ordinary least squares on the same scaled features.
    baseline = LinearRegression().fit(X_train_scaled, y_train)
    baseline_preds = baseline.predict(X_test_scaled)
    mse_norm = mean_squared_error(baseline_preds, y_test)
    print("MSE norm:", mse_norm)

    # Final penalised model.
    return Lasso(alpha=alpha).fit(X_train_scaled, y_train)
            
def feature_importance(regr_model, df):
    """Tabulate a fitted linear model's coefficients, largest first.

    ``regr_model.coef_`` is labelled with the ``features`` columns of ``df``
    and returned as a one-column DataFrame named ``importance``, sorted in
    descending order (so negative coefficients appear at the bottom).
    """
    feature_names = df[features].columns
    coef_table = pd.DataFrame(
        regr_model.coef_,
        index=feature_names,
        columns=['importance'],
    )
    return coef_table.sort_values('importance', ascending=False)

In [67]:
# Sweep the Lasso penalty for every target and print the held-out metrics.
# The result is not bound to `regr`: the sweep does not yield a fitted model,
# so the old commented-out feature_importance(regr, ...) call could not work.
for target in targets:
    run_linear_regression(train, test, target, verbose=True)
    print('\n')

alpha =  0.0001

Regression with fraction_of_devices_home_adj target
MSE:  0.007627133806599738 Lasso Test Score:  0.26894056950224443 N Coefficients:  23
alpha =  0.001

Regression with fraction_of_devices_home_adj target
MSE:  0.007657633263265166 Lasso Test Score:  0.2660172019587217 N Coefficients:  19
alpha =  0.005

Regression with fraction_of_devices_home_adj target
MSE:  0.007923908003933128 Lasso Test Score:  0.24049481501695638 N Coefficients:  9
alpha =  0.01

Regression with fraction_of_devices_home_adj target
MSE:  0.008243785739905453 Lasso Test Score:  0.20983459042688968 N Coefficients:  5
alpha =  0.1

Regression with fraction_of_devices_home_adj target
MSE:  0.010436116439027534 Lasso Test Score:  -0.0002999205182443454 N Coefficients:  0
alpha =  0.5

Regression with fraction_of_devices_home_adj target
MSE:  0.010436116439027534 Lasso Test Score:  -0.0002999205182443454 N Coefficients:  0
alpha =  0.7

Regression with fraction_of_devices_home_adj target
MSE:  0.01043

  positive)



Regression with avg_time_away_all_adj target
MSE:  1.5174938943394685 Lasso Test Score:  0.16627671535067468 N Coefficients:  25
alpha =  0.001

Regression with avg_time_away_all_adj target
MSE:  1.5175463286339672 Lasso Test Score:  0.1662479075298322 N Coefficients:  23
alpha =  0.005

Regression with avg_time_away_all_adj target
MSE:  1.5180287180519667 Lasso Test Score:  0.16598287892473873 N Coefficients:  21
alpha =  0.01

Regression with avg_time_away_all_adj target
MSE:  1.5197261057538298 Lasso Test Score:  0.16505032054318647 N Coefficients:  20
alpha =  0.1

Regression with avg_time_away_all_adj target
MSE:  1.5887119065450686 Lasso Test Score:  0.1271489697408028 N Coefficients:  7
alpha =  0.5

Regression with avg_time_away_all_adj target
MSE:  1.8208903708348791 Lasso Test Score:  -0.00041173583738673974 N Coefficients:  0
alpha =  0.7

Regression with avg_time_away_all_adj target
MSE:  1.8208903708348791 Lasso Test Score:  -0.00041173583738673974 N Coefficients:  0


al

  positive)



Regression with avg_time_away_leave_adj target
MSE:  2.5836175955268916 Lasso Test Score:  0.2013894444347003 N Coefficients:  25
alpha =  0.001

Regression with avg_time_away_leave_adj target
MSE:  2.5835606050402355 Lasso Test Score:  0.2014070605108136 N Coefficients:  23
alpha =  0.005

Regression with avg_time_away_leave_adj target
MSE:  2.584359640670388 Lasso Test Score:  0.20116007415740844 N Coefficients:  23
alpha =  0.01

Regression with avg_time_away_leave_adj target
MSE:  2.587088853222517 Lasso Test Score:  0.20031645939170728 N Coefficients:  20
alpha =  0.1

Regression with avg_time_away_leave_adj target
MSE:  2.6641119086256317 Lasso Test Score:  0.17650820496066633 N Coefficients:  6
alpha =  0.5

Regression with avg_time_away_leave_adj target
MSE:  3.0229800416294186 Lasso Test Score:  0.06558007087107454 N Coefficients:  1
alpha =  0.7

Regression with avg_time_away_leave_adj target
MSE:  3.238594249255946 Lasso Test Score:  -0.001067478843138181 N Coefficients:  0

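Alpha values of 0.01 and below give the lowest test MSE for every target; at alpha = 0.1 and above the Lasso shrinks most or all coefficients to zero and the test score collapses.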

In [68]:
# Fit the final Lasso (alpha = 0.001) per target and list its coefficients,
# sorted from most positive to most negative.
for target in targets:
    regr = run_final_model(train, test, target, alpha=0.001)
    print("Target: ", target)
    importance_table = feature_importance(regr, train)
    print(importance_table)

MSE norm: 0.007626261949475098
Target:  fraction_of_devices_home_adj
                                                    importance
Percent_No_vehicals                                   0.010659
Percent_Received_SNAP                                 0.009133
Percent_less_than_HS                                  0.008594
restaurants_and_other_eating_places                   0.007613
Percent_NonCitizen                                    0.007289
offices_of_physicians                                 0.003437
grocery_stores                                        0.002135
Percent_HS                                            0.001513
elementary_and_secondary_schools                      0.000869
religious_organizations                               0.000763
Percent_No_Internet_Access                            0.000429
museums,_historical_sites,_and_similar_institut...    0.000048
Percent_Grad                                         -0.000000
Percent_SomeCollege                              

## Random Forest Model

### Create target and features

In [51]:
# Every merged feature column except the join key is a model input.
features_cols = list(df_features.columns)
features_cols.remove('geo_12')

features_train = train[features_cols]
features_test = test[features_cols]

# Targets. `fodw_*` (fraction of devices leaving for work) is what Model 1
# below trains on; it was previously never defined, so those cells raised
# NameError. `fodh_*` (home fraction) is kept for backward compatibility.
fodw_target_train, fodw_target_test = train['fraction_of_devices_work_adj'], test['fraction_of_devices_work_adj']
fodh_target_train, fodh_target_test = train['fraction_of_devices_home_adj'], test['fraction_of_devices_home_adj']
ftaa_target_train, ftaa_target_test = train['avg_time_away_all_adj'], test['avg_time_away_all_adj']


### Hyperparameter tuning set up

In [31]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# Candidate forest sizes: 5 evenly spaced values from 10 to 400, truncated to int.
n_estimators = np.linspace(start=10, stop=400, num=5).astype(int).tolist()
# Candidate tree depths: 5 evenly spaced values from 10 to 200, plus None
# (no depth limit).
max_depth = np.linspace(10, 200, num=5).astype(int).tolist() + [None]
# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5]
# Minimum number of samples required at a leaf.
min_samples_leaf = [1, 2]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]


In [32]:
# Search space sampled by the randomized hyperparameter search.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [33]:
# Show the search space that the randomized search will sample from.
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 57, 105, 152, 200, None],
 'min_samples_leaf': [1, 2],
 'min_samples_split': [2, 5],
 'n_estimators': [10, 107, 205, 302, 400]}


In [34]:
def random_grid_search(train_X, train_y):
    """Run a randomized hyperparameter search for a random forest regressor.

    Samples 100 parameter combinations from the module-level ``random_grid``,
    scores each with 3-fold cross-validation on all available cores, and
    prints the best combination. Nothing is returned.
    """
    search = RandomizedSearchCV(
        estimator=RandomForestRegressor(),
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=True,
        random_state=42,  # fixes which combinations are sampled
        n_jobs=-1,
    )
    search.fit(train_X, train_y)
    print(search.best_params_)

### Model 1 - Fraction of devices work

In [42]:
# Randomized search for the Model 1 target; prints the best parameter
# combination. Expensive: the recorded run took about 25 minutes for 300 fits.
random_grid_search(features_train, fodw_target_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 24.7min finished


{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 152, 'bootstrap': True}


In [44]:
# Refine around the best randomized-search result with an exhaustive grid:
# only min_samples_split and max_depth vary, the other parameters are fixed.
param_grid = dict(
    bootstrap=[True],
    min_samples_leaf=[2],
    min_samples_split=[3, 5, 7],
    max_depth=[120, 140, 160],
    n_estimators=[400],
)

# Base estimator and the 3-fold grid search wrapped around it.
rf_fodw = RandomForestRegressor()
grid_search_fodw = GridSearchCV(
    estimator=rf_fodw,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [45]:
grid_search_fodw.fit(features_train, fodw_target_train)

# Only the last expression of a cell is displayed, so the best parameters are
# printed explicitly; the bare expression used before was silently discarded.
print(grid_search_fodw.best_params_)

grid_search_fodw.best_estimator_

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  4.2min finished


RandomForestRegressor(max_depth=120, min_samples_leaf=2, min_samples_split=7,
                      n_estimators=400)

In [47]:
# Refit the forest with the hyperparameters selected by the grid search above.
best_params_fodw = dict(
    n_estimators=400,
    min_samples_split=7,
    min_samples_leaf=2,
    max_depth=120,
    bootstrap=True,
)
rf_best_fodw = RandomForestRegressor(n_jobs=-1, **best_params_fodw)
model = rf_best_fodw.fit(features_train, fodw_target_train)

In [48]:
# Evaluate on the held-out rows. The RMSE line was missing a closing
# parenthesis (SyntaxError); both metrics are now kept in variables so the
# plot title below can use `fodw_rmse`.
fodw_y_pred = rf_best_fodw.predict(features_test)
fodw_mse = mean_squared_error(fodw_target_test, fodw_y_pred)
fodw_rmse = np.sqrt(fodw_mse)
print("MSE: ", fodw_mse)
print("RMSE: ", fodw_rmse)

0.0009360726041542685

#### One to one plot

In [None]:
# RMSE for the title is computed here so this cell does not depend on a name
# that may not have been defined earlier.
fodw_rmse = np.sqrt(mean_squared_error(fodw_target_test, fodw_y_pred))

# Predicted (x) against true (y) values on the held-out rows.
plt.plot(fodw_y_pred, fodw_target_test, '.', alpha = 0.2)
plt.ylabel('True Reduction in Fraction of Devices \n Leaving for Work', fontsize=14)
plt.xlabel('Predicted Reduction in Fraction of Devices \n Leaving for Work', fontsize=14)
plt.title('One to One Plot - Reduction in Fraction of Devices \n Leaving for Work \n RMSE = {}'.format(fodw_rmse), 
          fontsize=16)

# Dashed y = x reference line: points on it are perfect predictions.
line_x, line_y = [-0.2,0.3], [-0.2,0.3]
plt.plot(line_x,line_y,'k--')
plt.show()

#### Feature importance

In [58]:
# Impurity-based importances of the fitted forest, labelled with the columns
# it was trained on (`features_train`; the previously referenced
# `fodw_features_train` is not defined anywhere in the notebook).
feature_importances_fodw_rf = pd.DataFrame(rf_best_fodw.feature_importances_,
                                           index = features_train.columns,
                                           columns=['importance']).sort_values('importance', ascending=False)
feature_importances_fodw_rf

Unnamed: 0,importance
Percent_Received_SNAP,0.207904
Median_Income,0.195785
Percent_No_Internet_Access,0.164025
Percent_No_vehicals,0.116773
Percent_SomeCollege,0.082106
Percent_less_than_HS,0.070494
Percent_NonCitizen,0.068384
Percent_SpeakEngl_Poorly,0.064341
other_amusement_and_recreation_industries,0.030188


#### Residual Plot

In [None]:
# Residuals (predicted minus true) plotted against the predictions.
residuals_fodw = fodw_y_pred - fodw_target_test
plt.plot(
    fodw_y_pred,
    residuals_fodw,
    '.',
    alpha=0.3,
)
plt.xlabel(
    'Predicted Reduction in Fraction of Devices \n Leaving for Work',
    fontsize=14,
)
plt.ylabel(
    'Model Residual (Predicted - True)',
    fontsize=14,
)

Interpretation: 
- Areas with greater median income and a greater fraction of residents with some college see a larger reduction in devices exhibiting leaving-for-work behavior.
- Areas with a greater fraction of residents receiving SNAP benefits, and areas with a greater fraction of residents with no internet access or no vehicle, see a smaller reduction in devices exhibiting leaving-for-work behavior.

### Model 2 - Average time away (all)

In [52]:
# Randomized search for the Model 2 target; prints the best parameter
# combination. Expensive: the recorded run took about 26 minutes for 300 fits.
random_grid_search(features_train, ftaa_target_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 25.8min finished


{'n_estimators': 107, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}


In [54]:
# Refine around the best randomized-search result with an exhaustive grid.
# min_samples_split must be an integer >= 2 (or a fraction), so the invalid
# candidate 1 is replaced: those fits could only fail and waste a third of the
# search. The grid now brackets the randomized-search optimum (2) with valid
# values and still contains 9 candidates.
param_grid = {'bootstrap': [True],
             'min_samples_leaf': [2],
             'min_samples_split': [2, 3, 4],
             'n_estimators': [100, 120, 140]}


rf_ftaa = RandomForestRegressor()# Instantiate the grid search model
grid_search_ftaa = GridSearchCV(estimator = rf_ftaa, param_grid = param_grid, 
                         cv = 3, n_jobs = -1, verbose = 2)

In [55]:
grid_search_ftaa.fit(features_train, ftaa_target_train)

# Only the last expression of a cell is displayed, so the best parameters are
# printed explicitly; the bare expression used before was silently discarded.
print(grid_search_ftaa.best_params_)

grid_search_ftaa.best_estimator_

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   55.6s finished


RandomForestRegressor(min_samples_leaf=2, min_samples_split=4, n_estimators=120)

In [56]:
# Refit the forest with the hyperparameters selected by the grid search above.
best_params_ftaa = dict(
    n_estimators=120,
    min_samples_split=4,
    min_samples_leaf=2,
    bootstrap=True,
)
rf_best_ftaa = RandomForestRegressor(n_jobs=-1, **best_params_ftaa)
model = rf_best_ftaa.fit(features_train, ftaa_target_train)

In [57]:
# Evaluate on the held-out rows. The RMSE line was missing a closing
# parenthesis (SyntaxError); both metrics are now kept in variables so the
# plot title below can use `ftaa_rmse`.
ftaa_y_pred = rf_best_ftaa.predict(features_test)
ftaa_mse = mean_squared_error(ftaa_target_test, ftaa_y_pred)
ftaa_rmse = np.sqrt(ftaa_mse)
print("MSE: ", ftaa_mse)
print("RMSE: ", ftaa_rmse)

0.5269904643338871

#### One to one plot

In [None]:
# The `ataa_*` names used here before are not defined anywhere; the Model 2
# predictions and targets are `ftaa_y_pred` / `ftaa_target_test`. RMSE for the
# title is computed here so the cell does not depend on an earlier definition.
ftaa_rmse = np.sqrt(mean_squared_error(ftaa_target_test, ftaa_y_pred))

# Predicted (x) against true (y) values on the held-out rows.
plt.plot(ftaa_y_pred, ftaa_target_test, '.', alpha=0.3)
plt.ylabel('True Reduction in \n Average Time Away (hours)', fontsize=14)
plt.xlabel('Predicted Reduction in \n Average Time Away (hours)', fontsize=14)
plt.title('One to One Plot - Reduction in \n Average Time Away (hours) \n RMSE = {}'.format(ftaa_rmse), 
          fontsize=16)

# Dashed y = x reference line: points on it are perfect predictions.
line_x, line_y = [-10,5], [-10,5]
plt.plot(line_x,line_y,'k--')
plt.show()

#### Residual Plot

In [None]:
# Residuals (predicted minus true) plotted against the predictions. Uses the
# Model 2 names `ftaa_*`; the `ataa_*` names referenced before are undefined.
residuals_ftaa = ftaa_y_pred - ftaa_target_test
plt.plot(ftaa_y_pred, residuals_ftaa, '.', alpha=0.3)
plt.xlabel('Predicted Reduction in \n Average Time Away (hours)', fontsize=14)
plt.ylabel('Model Residual (Predicted - True)', fontsize=14)

#### Feature importance

In [71]:
# Impurity-based importances of the fitted forest, labelled with the columns
# it was trained on (`features_train`; the previously referenced
# `ftaa_features_train` is not defined anywhere in the notebook).
feature_importances_ftaa_rf = pd.DataFrame(rf_best_ftaa.feature_importances_,
                                           index = features_train.columns,
                                           columns=['importance']).sort_values('importance', ascending=False)
feature_importances_ftaa_rf

Unnamed: 0,importance
Median_Age,0.207964
Percent_No_vehicals,0.147463
Percent_Grad,0.111845
Median_Income,0.089264
Percent_Women_Usually_Fulltime_Employed,0.080575
Percent_Men_Usually_Fulltime_Employed,0.066761
Percent_Received_SNAP,0.051095
Percent_Bach,0.049437
Percent_NonCitizen,0.044985
Percent_less_than_HS,0.041004


Interpretation: 
- Areas with higher median age, higher median income, and a higher fraction of residents with full-time employment see a greater reduction in the average time spent away from home during the stay-at-home order.
- Areas with a higher percentage of people without a vehicle and a higher percentage with a graduate degree see a smaller reduction in the average time spent away from home during the stay-at-home order.