# Chicago COVID mobility data

In [16]:
import pandas as pd
import numpy as np

## Set up

### Load features

In [2]:
df_features_census = pd.read_csv('./data/census_processed.csv', dtype={'geo_12': 'str'})
df_features_census["geo_12"] = df_features_census["GEO_ID"].map(lambda x: str(x)[-12:])
df_features_census.drop(["GEO_ID"], axis=1, inplace=True)
df_features_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3993 entries, 0 to 3992
Data columns (total 16 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Median_Income                            3836 non-null   float64
 1   Median_Age                               3993 non-null   float64
 2   Percent_NonCitizen                       3989 non-null   float64
 3   Percent_SpeakEngl_Poorly                 3989 non-null   float64
 4   Percent_less_than_HS                     3989 non-null   float64
 5   Percent_HS                               3989 non-null   float64
 6   Percent_SomeCollege                      3989 non-null   float64
 7   Percent_Bach                             3989 non-null   float64
 8   Percent_Grad                             3989 non-null   float64
 9   Percent_No_vehicals                      3989 non-null   float64
 10  Percent_Received_SNAP                    3989 no

In [3]:
df_features_places = pd.read_csv('./data/places_count_by_census_block.csv', dtype={'geo_12': 'str'})
df_features_places.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2194 entries, 0 to 2193
Data columns (total 11 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   geo_12                                               2194 non-null   object 
 1   automotive_repair_and_maintenance                    2194 non-null   float64
 2   child_day_care_services                              2194 non-null   float64
 3   elementary_and_secondary_schools                     2194 non-null   float64
 4   grocery_stores                                       2194 non-null   float64
 5   health_and_personal_care_stores                      2194 non-null   float64
 6   museums,_historical_sites,_and_similar_institutions  2194 non-null   float64
 7   offices_of_physicians                                2194 non-null   float64
 8   other_amusement_and_recreation_industries            2194 non-null  

In [4]:
df_features = df_features_places.merge(df_features_census, on='geo_12')
df_features.head()

Unnamed: 0,geo_12,automotive_repair_and_maintenance,child_day_care_services,elementary_and_secondary_schools,grocery_stores,health_and_personal_care_stores,"museums,_historical_sites,_and_similar_institutions",offices_of_physicians,other_amusement_and_recreation_industries,religious_organizations,...,Percent_HS,Percent_SomeCollege,Percent_Bach,Percent_Grad,Percent_No_vehicals,Percent_Received_SNAP,Percent_Men_Usually_Fulltime_Employed,Percent_Women_Usually_Fulltime_Employed,Percent_No_Internet_Access,Percent_Computing_Device
0,170310101001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.168511,0.255138,0.289598,0.17926,0.344589,0.18696,0.885906,0.382653,0.217742,0.782258
1,170310101002,0.0,1.0,1.0,0.0,0.0,3.0,1.0,2.0,1.0,...,0.168511,0.255138,0.289598,0.17926,0.344589,0.18696,0.507064,0.452071,0.330517,0.829989
2,170310101003,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,1.0,...,0.168511,0.255138,0.289598,0.17926,0.344589,0.18696,0.765318,0.663338,0.074041,0.950045
3,170310102011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.24611,0.246314,0.193898,0.114251,0.140014,0.316592,0.559184,0.437107,0.10503,0.921598
4,170310102012,1.0,4.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.24611,0.246314,0.193898,0.114251,0.140014,0.316592,0.495495,0.3208,0.202247,0.822868


## Targets - diff data 2019-2020
### Load targets

In [5]:
df_targets_diff = pd.read_csv('./data/COVID_mobility_targets_adjusted.csv', dtype={'geo_12': 'str'})
df_targets_diff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58062 entries, 0 to 58061
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   geo_12                        58062 non-null  object 
 1   Week                          58062 non-null  int64  
 2   fraction_of_devices_home_adj  58062 non-null  float64
 3   fraction_of_devices_work_adj  58062 non-null  float64
 4   fraction_time_away_all_adj    58062 non-null  float64
 5   fraction_time_away_leave_adj  58062 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 2.7+ MB


### Create dataframe

In [6]:
df_diff = df_features.merge(df_targets_diff, on='geo_12')
df_diff.columns

Index(['geo_12', 'automotive_repair_and_maintenance',
       'child_day_care_services', 'elementary_and_secondary_schools',
       'grocery_stores', 'health_and_personal_care_stores',
       'museums,_historical_sites,_and_similar_institutions',
       'offices_of_physicians', 'other_amusement_and_recreation_industries',
       'religious_organizations', 'restaurants_and_other_eating_places',
       'Median_Income', 'Median_Age', 'Percent_NonCitizen',
       'Percent_SpeakEngl_Poorly', 'Percent_less_than_HS', 'Percent_HS',
       'Percent_SomeCollege', 'Percent_Bach', 'Percent_Grad',
       'Percent_No_vehicals', 'Percent_Received_SNAP',
       'Percent_Men_Usually_Fulltime_Employed',
       'Percent_Women_Usually_Fulltime_Employed', 'Percent_No_Internet_Access',
       'Percent_Computing_Device', 'Week', 'fraction_of_devices_home_adj',
       'fraction_of_devices_work_adj', 'fraction_time_away_all_adj',
       'fraction_time_away_leave_adj'],
      dtype='object')

### Impute missing

In [7]:
df_diff['Median_Income'] = df_diff['Median_Income'].fillna(df_diff.median())

In [8]:
# Need to change this
df_diff.dropna(inplace=True)

## Feature & Target selection

### Look at correlations between features and targets to get an idea about which features might be important

In [10]:
df_diff.corr()

Unnamed: 0,automotive_repair_and_maintenance,child_day_care_services,elementary_and_secondary_schools,grocery_stores,health_and_personal_care_stores,"museums,_historical_sites,_and_similar_institutions",offices_of_physicians,other_amusement_and_recreation_industries,religious_organizations,restaurants_and_other_eating_places,...,Percent_Received_SNAP,Percent_Men_Usually_Fulltime_Employed,Percent_Women_Usually_Fulltime_Employed,Percent_No_Internet_Access,Percent_Computing_Device,Week,fraction_of_devices_home_adj,fraction_of_devices_work_adj,fraction_time_away_all_adj,fraction_time_away_leave_adj
automotive_repair_and_maintenance,1.0,0.085083,0.041788,0.1114921,0.07421685,0.03194654,0.01788637,0.08078375,-0.001812855,0.05433176,...,-0.02070637,0.04338496,-0.01811937,0.02233448,-0.01752393,-1.9061779999999999e-19,-0.007709,0.045037,0.043838,0.050714
child_day_care_services,0.08508338,1.0,0.165675,0.1218192,0.1550656,0.2091628,0.04963597,0.1446378,0.1491348,0.1941497,...,0.04816019,-0.04994664,0.002548483,0.0172797,-0.02132096,0.0,0.03798,0.018632,-0.06657,-0.051946
elementary_and_secondary_schools,0.04178819,0.165675,1.0,0.03564841,0.01977635,0.07081538,0.02864988,0.03069125,0.1891763,0.004115814,...,0.08649266,-0.09056854,-0.06486161,0.02665707,-0.02000581,0.0,0.075254,-0.013228,-0.033194,0.006749
grocery_stores,0.1114921,0.121819,0.035648,1.0,0.3593401,0.2626586,0.03040448,0.3075581,-0.0149481,0.4852629,...,-0.0099632,0.04916961,0.003103735,-0.01451538,0.008186056,2.773528e-18,0.071074,0.022872,-0.068531,-0.040114
health_and_personal_care_stores,0.07421685,0.155066,0.019776,0.3593401,1.0,0.4148705,0.3068574,0.5150677,-0.04021551,0.711737,...,-0.2019488,0.1518174,0.1494416,-0.170369,0.1508599,-6.678883999999999e-19,-0.011429,0.065824,-0.119091,-0.140351
"museums,_historical_sites,_and_similar_institutions",0.03194654,0.209163,0.070815,0.2626586,0.4148705,1.0,0.1144857,0.4276164,-0.01874119,0.5236997,...,-0.1318635,0.08638951,0.09498099,-0.1306107,0.1095283,-9.462982999999999e-20,0.019817,0.053915,-0.123989,-0.128406
offices_of_physicians,0.01788637,0.049636,0.02865,0.03040448,0.3068574,0.1144857,1.0,0.1479818,-0.005639528,0.1721578,...,-0.1094401,0.09244798,0.09341107,-0.08470125,0.07232299,-2.240719e-19,-0.010344,0.034043,-0.039264,-0.047452
other_amusement_and_recreation_industries,0.08078375,0.144638,0.030691,0.3075581,0.5150677,0.4276164,0.1479818,1.0,-0.04148249,0.5513455,...,-0.2721899,0.2411442,0.2609516,-0.2446288,0.2184308,-9.893523999999999e-20,-0.054387,0.120943,-0.133197,-0.198668
religious_organizations,-0.001812855,0.149135,0.189176,-0.0149481,-0.04021551,-0.01874119,-0.005639528,-0.04148249,1.0,-0.05004075,...,0.2984285,-0.2740515,-0.1405288,0.2080053,-0.1615053,-8.081592999999999e-20,0.14505,-0.080043,-0.033196,0.044045
restaurants_and_other_eating_places,0.05433176,0.19415,0.004116,0.4852629,0.711737,0.5236997,0.1721578,0.5513455,-0.05004075,1.0,...,-0.1968999,0.1687893,0.1490767,-0.1830599,0.1664604,1.411563e-18,0.017919,0.065385,-0.167488,-0.180921


Looks like a lot of correlations that are not very strong.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

features = ['automotive_repair_and_maintenance',
            'child_day_care_services', 'elementary_and_secondary_schools',
            'grocery_stores', 'health_and_personal_care_stores',
            'museums,_historical_sites,_and_similar_institutions',
            'offices_of_physicians', 'other_amusement_and_recreation_industries',
            'religious_organizations', 'restaurants_and_other_eating_places',
            'Median_Income', 'Median_Age', 'Percent_NonCitizen',
            'Percent_SpeakEngl_Poorly', 'Percent_less_than_HS', 'Percent_HS',
            'Percent_SomeCollege', 'Percent_Bach', 'Percent_Grad',
            'Percent_No_vehicals', 'Percent_Received_SNAP',
            'Percent_Men_Usually_Fulltime_Employed',
            'Percent_Women_Usually_Fulltime_Employed', 'Percent_No_Internet_Access',
            'Percent_Computing_Device']

targets = ['fraction_of_devices_home_adj','fraction_of_devices_work_adj',
           'fraction_time_away_all_adj','fraction_time_away_leave_adj']


def split_data(df):
    return train_test_split(df, test_size=0.2)
    
def run_linear_regression(target, verbose=False):
    X_train, X_test = train[features].values, test[features].values
    y_train, y_test = train[target].values, test[target].values
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test) 
    regr = linear_model.LinearRegression()

    regr.fit(X_train_scaled, y_train)
    y_pred = regr.predict(X_test_scaled)

    MSE = mean_squared_error(y_pred, y_test)
    R2 = regr.score(X_train_scaled, y_train)
    if verbose:
        print("\nRegression with {} target".format(target))
        print("MSE: ", MSE, "R2: ", R2)
    return regr


def feature_importance(regr_model, df):
    return pd.DataFrame(regr_model.coef_, index = df[features].columns,
            columns=['importance']).sort_values('importance', ascending=False)

In [12]:
train, test = split_data(df_diff)
for target in targets:
    regr = run_linear_regression(target, True)
    print(feature_importance(regr, df_diff))
    print('\n')


Regression with fraction_of_devices_home_adj target
MSE:  0.00787210149186464 R2:  0.2240033216204319
                                                    importance
Percent_NonCitizen                                    0.025113
Percent_less_than_HS                                  0.021127
Percent_No_vehicals                                   0.008946
grocery_stores                                        0.004799
restaurants_and_other_eating_places                   0.003965
Percent_HS                                            0.003593
elementary_and_secondary_schools                      0.003466
offices_of_physicians                                 0.003066
religious_organizations                               0.001942
Percent_Computing_Device                              0.000930
museums,_historical_sites,_and_similar_institut...    0.000892
Percent_No_Internet_Access                            0.000804
other_amusement_and_recreation_industries             0.000474
Percent_Men_Usu

So it looks like the differenced data explains more of the variance in the models than the raw data.

Targets: diff data

It seems as though abs(0.002) is a sensible cut off for feature selection 

### Feature selection

In [67]:
def create_selected_dfs(target):
    regr_diff = run_linear_regression(target)
    selected_features = list(pd.Series(features)[list(abs(regr_diff.coef_) > 0.002)])
    return train[selected_features], train[target], test[selected_features], test[target]

In [71]:
fodh_features_train, fodh_target_train, fodh_features_test, fodh_target_test = create_selected_dfs('fraction_of_devices_home_adj')

fodw_features_train, fodw_target_train, fodw_features_test, fodw_target_test = create_selected_dfs('fraction_of_devices_work_adj')

ftaa_features_train, ftaa_target_train, ftaa_features_test, ftaa_features_test = create_selected_dfs('fraction_time_away_all_adj')

ftal_features_train, ftal_target_train, ftal_features_test, ftal_target_test = create_selected_dfs('fraction_time_away_leave_adj')

### Random Forest Model

In [26]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 400, num = 5)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 200, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
bootstrap = [True, False]


In [27]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [28]:
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 57, 105, 152, 200, None],
 'min_samples_leaf': [1, 2],
 'min_samples_split': [2, 5],
 'n_estimators': [10, 107, 205, 302, 400]}


In [31]:
def random_grid_search(train_X, train_y):
    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=True, random_state=42, n_jobs = -1)

    # Fit the random search model
    rf_random.fit(train_X, train_y)

    #Print best parameters
    print(rf_random.best_params_)

### Fraction of devices home

In [35]:
# Best parameters from random grid search
random_grid_search(fodh_features_train, fodh_target_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.0min finished


{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 152, 'bootstrap': True}


In [39]:
from sklearn.model_selection import GridSearchCV
# run grid search
param_grid = {'bootstrap': [True],
              'min_samples_leaf': [1],
              'min_samples_split': [3,5,7],
              'n_estimators': [300,400,500]}


rf = RandomForestRegressor()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(fodh_features_train, fodh_target_train)

grid_search.best_params_

grid_search.best_estimator_

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [74]:
rf = RandomForestRegressor(n_jobs=-1, 
                           n_estimators=500,
                           min_samples_split=7,
                           min_samples_leaf=1,
                           bootstrap=True
                            )
model = rf.fit(fodh_features_train,fodh_target_train)

In [75]:
y_pred=rf.predict(fodh_features_test)
mean_squared_error(y_pred, fodh_target_test)

0.003803479615596143

In [76]:
# #Show how much each feature contributes to the model prediction
# feature_importances = pd.DataFrame(rf.feature_importances_,
#                                    index = df[features].columns,
#                                     columns=['importance']).sort_values('importance', ascending=False)
# print(feature_importances)

NameError: name 'df' is not defined

In [None]:
# pd.DataFrame.from_dict(rf_random.cv_results_)

In [None]:
# best_random = rf_random.best_estimator_