# Chicago COVID mobility data

In [1]:
import pandas as pd

## Set up

### Load features

In [13]:
df_features_census = pd.read_csv('./data/census_processed.csv', dtype={'geo_12': 'str'})
df_features_census["geo_12"] = df_features_census["GEO_ID"].map(lambda x: str(x)[-12:])
df_features_census.drop(["GEO_ID"], axis=1, inplace=True)
df_features_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3993 entries, 0 to 3992
Data columns (total 16 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Median_Income                            3836 non-null   float64
 1   Median_Age                               3993 non-null   float64
 2   Percent_NonCitizen                       3989 non-null   float64
 3   Percent_SpeakEngl_Poorly                 3989 non-null   float64
 4   Percent_less_than_HS                     3989 non-null   float64
 5   Percent_HS                               3989 non-null   float64
 6   Percent_SomeCollege                      3989 non-null   float64
 7   Percent_Bach                             3989 non-null   float64
 8   Percent_Grad                             3989 non-null   float64
 9   Percent_No_vehicals                      3989 non-null   float64
 10  Percent_Received_SNAP                    3989 no

In [14]:
df_features_places = pd.read_csv('./data/places_count_by_census_block.csv', dtype={'geo_12': 'str'})
df_features_places.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2194 entries, 0 to 2193
Data columns (total 11 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   geo_12                                               2194 non-null   object 
 1   automotive_repair_and_maintenance                    2194 non-null   float64
 2   child_day_care_services                              2194 non-null   float64
 3   elementary_and_secondary_schools                     2194 non-null   float64
 4   grocery_stores                                       2194 non-null   float64
 5   health_and_personal_care_stores                      2194 non-null   float64
 6   museums,_historical_sites,_and_similar_institutions  2194 non-null   float64
 7   offices_of_physicians                                2194 non-null   float64
 8   other_amusement_and_recreation_industries            2194 non-null  

In [20]:
df_features = df_features_places.merge(df_features_census, on='geo_12')
df_features.head()

Unnamed: 0,geo_12,automotive_repair_and_maintenance,child_day_care_services,elementary_and_secondary_schools,grocery_stores,health_and_personal_care_stores,"museums,_historical_sites,_and_similar_institutions",offices_of_physicians,other_amusement_and_recreation_industries,religious_organizations,...,Percent_HS,Percent_SomeCollege,Percent_Bach,Percent_Grad,Percent_No_vehicals,Percent_Received_SNAP,Percent_Men_Usually_Fulltime_Employed,Percent_Women_Usually_Fulltime_Employed,Percent_No_Internet_Access,Percent_Computing_Device
0,170310101001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.168511,0.255138,0.289598,0.17926,0.344589,0.18696,0.885906,0.382653,0.217742,0.782258
1,170310101002,0.0,1.0,1.0,0.0,0.0,3.0,1.0,2.0,1.0,...,0.168511,0.255138,0.289598,0.17926,0.344589,0.18696,0.507064,0.452071,0.330517,0.829989
2,170310101003,0.0,1.0,0.0,0.0,0.0,3.0,0.0,1.0,1.0,...,0.168511,0.255138,0.289598,0.17926,0.344589,0.18696,0.765318,0.663338,0.074041,0.950045
3,170310102011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.24611,0.246314,0.193898,0.114251,0.140014,0.316592,0.559184,0.437107,0.10503,0.921598
4,170310102012,1.0,4.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.24611,0.246314,0.193898,0.114251,0.140014,0.316592,0.495495,0.3208,0.202247,0.822868


## Targets - diff data 2019-2020
### Load targets

In [67]:
df_targets_diff = pd.read_csv('./data/COVID_mobility_targets_adjusted.csv', dtype={'geo_12': 'str'})
df_targets_diff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58062 entries, 0 to 58061
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   geo_12                        58062 non-null  object 
 1   Week                          58062 non-null  int64  
 2   fraction_of_devices_home_adj  58062 non-null  float64
 3   fraction_of_devices_work_adj  58062 non-null  float64
 4   fraction_time_away_all_adj    58062 non-null  float64
 5   fraction_time_away_leave_adj  58062 non-null  float64
dtypes: float64(4), int64(1), object(1)
memory usage: 2.7+ MB


### Create dataframe

In [69]:
df_diff = df_features.merge(df_targets_diff, on='geo_12')
df_diff.columns

Index(['geo_12', 'automotive_repair_and_maintenance',
       'child_day_care_services', 'elementary_and_secondary_schools',
       'grocery_stores', 'health_and_personal_care_stores',
       'museums,_historical_sites,_and_similar_institutions',
       'offices_of_physicians', 'other_amusement_and_recreation_industries',
       'religious_organizations', 'restaurants_and_other_eating_places',
       'Median_Income', 'Median_Age', 'Percent_NonCitizen',
       'Percent_SpeakEngl_Poorly', 'Percent_less_than_HS', 'Percent_HS',
       'Percent_SomeCollege', 'Percent_Bach', 'Percent_Grad',
       'Percent_No_vehicals', 'Percent_Received_SNAP',
       'Percent_Men_Usually_Fulltime_Employed',
       'Percent_Women_Usually_Fulltime_Employed', 'Percent_No_Internet_Access',
       'Percent_Computing_Device', 'Week', 'fraction_of_devices_home_adj',
       'fraction_of_devices_work_adj', 'fraction_time_away_all_adj',
       'fraction_time_away_leave_adj'],
      dtype='object')

### Impute missing

In [77]:
df_diff['Median_Income'] = df_diff['Median_Income'].fillna(df_diff.median())

In [92]:
# Need to change this
df_diff.dropna(inplace=True)

## Feature & Target selection

### Look at correlations between features and targets to get an idea about which features might be important

In [28]:
feature_selection_df.corr()

Unnamed: 0.1,automotive_repair_and_maintenance,child_day_care_services,elementary_and_secondary_schools,grocery_stores,health_and_personal_care_stores,"museums,_historical_sites,_and_similar_institutions",offices_of_physicians,other_amusement_and_recreation_industries,religious_organizations,restaurants_and_other_eating_places,...,Percent_Men_Usually_Fulltime_Employed,Percent_Women_Usually_Fulltime_Employed,Percent_No_Internet_Access,Percent_Computing_Device,Unnamed: 0,Week,fraction_of_devices_home,fraction_of_devices_work,fraction_time_away_all,fraction_time_away_leave
automotive_repair_and_maintenance,1.0,0.08508338,0.04178819,0.1114921,0.07421685,0.03194654,0.01788637,0.08078375,-0.001812855,0.05433176,...,0.04338496,-0.01811937,0.02233448,-0.01752393,0.099512,-6.451187e-18,0.024724,-0.074272,-0.050777,-0.051757
child_day_care_services,0.08508338,1.0,0.1656755,0.1218192,0.1550656,0.2091628,0.04963597,0.1446378,0.1491348,0.1941497,...,-0.04994664,0.002548483,0.0172797,-0.02132096,0.091724,1.5528599999999998e-19,-0.006681,-0.071834,0.04825,0.042734
elementary_and_secondary_schools,0.04178819,0.1656755,1.0,0.03564841,0.01977635,0.07081538,0.02864988,0.03069125,0.1891763,0.004115814,...,-0.09056854,-0.06486161,0.02665707,-0.02000581,0.130508,5.815748e-18,-0.060474,-0.053034,0.025429,-0.0068
grocery_stores,0.1114921,0.1218192,0.03564841,1.0,0.3593401,0.2626586,0.03040448,0.3075581,-0.0149481,0.4852629,...,0.04916961,0.003103735,-0.01451538,0.008186056,0.127582,4.6858659999999995e-20,-0.024275,-0.110666,0.042684,0.032682
health_and_personal_care_stores,0.07421685,0.1550656,0.01977635,0.3593401,1.0,0.4148705,0.3068574,0.5150677,-0.04021551,0.711737,...,0.1518174,0.1494416,-0.170369,0.1508599,0.056651,-6.617296e-18,-0.029041,-0.048431,0.132063,0.131205
"museums,_historical_sites,_and_similar_institutions",0.03194654,0.2091628,0.07081538,0.2626586,0.4148705,1.0,0.1144857,0.4276164,-0.01874119,0.5236997,...,0.08638951,0.09498099,-0.1306107,0.1095283,0.05581,1.9194800000000002e-18,-0.037228,-0.061094,0.125156,0.11834
offices_of_physicians,0.01788637,0.04963597,0.02864988,0.03040448,0.3068574,0.1144857,1.0,0.1479818,-0.005639528,0.1721578,...,0.09244798,0.09341107,-0.08470125,0.07232299,-0.028994,-2.7480539999999996e-19,-0.033411,-0.010927,0.070965,0.059843
other_amusement_and_recreation_industries,0.08078375,0.1446378,0.03069125,0.3075581,0.5150677,0.4276164,0.1479818,1.0,-0.04148249,0.5513455,...,0.2411442,0.2609516,-0.2446288,0.2184308,0.037566,1.303337e-18,0.013513,-0.069047,0.142307,0.179066
religious_organizations,-0.001812855,0.1491348,0.1891763,-0.0149481,-0.04021551,-0.01874119,-0.005639528,-0.04148249,1.0,-0.05004075,...,-0.2740515,-0.1405288,0.2080053,-0.1615053,0.118986,-1.2285680000000001e-17,-0.098672,-0.039705,0.020463,-0.029318
restaurants_and_other_eating_places,0.05433176,0.1941497,0.004115814,0.4852629,0.711737,0.5236997,0.1721578,0.5513455,-0.05004075,1.0,...,0.1687893,0.1490767,-0.1830599,0.1664604,0.046572,6.431886e-18,-0.04114,-0.070977,0.166825,0.163488


In [None]:
feature_selection_df_diff.corr()

Looks like a lot of correlations that are not very strong.

In [109]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

features = ['automotive_repair_and_maintenance',
            'child_day_care_services', 'elementary_and_secondary_schools',
            'grocery_stores', 'health_and_personal_care_stores',
            'museums,_historical_sites,_and_similar_institutions',
            'offices_of_physicians', 'other_amusement_and_recreation_industries',
            'religious_organizations', 'restaurants_and_other_eating_places',
            'Median_Income', 'Median_Age', 'Percent_NonCitizen',
            'Percent_SpeakEngl_Poorly', 'Percent_less_than_HS', 'Percent_HS',
            'Percent_SomeCollege', 'Percent_Bach', 'Percent_Grad',
            'Percent_No_vehicals', 'Percent_Received_SNAP',
            'Percent_Men_Usually_Fulltime_Employed',
            'Percent_Women_Usually_Fulltime_Employed', 'Percent_No_Internet_Access',
            'Percent_Computing_Device']

targets = ['fraction_of_devices_home_adj','fraction_of_devices_work_adj',
           'fraction_time_away_all_adj','fraction_time_away_leave_adj']


def split_data(df):
    return train_test_split(df, test_size=0.2)
    
def run_linear_regression(target, verbose=False):
    X_train, X_test = train[features].values, test[features].values
    y_train, y_test = train[target].values, test[target].values
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test) 
    regr = linear_model.LinearRegression()

    regr.fit(X_train_scaled, y_train)
    y_pred = regr.predict(X_test_scaled)

    MSE = mean_squared_error(y_pred, y_test)
    R2 = regr.score(X_train_scaled, y_train)
    if verbose:
        print("\nRegression with {} target".format(target))
        print("MSE: ", MSE, "R2: ", R2)
    return regr


def feature_importance(regr_model, df):
    return pd.DataFrame(regr_model.coef_, index = df[features].columns,
            columns=['importance']).sort_values('importance', ascending=False)

In [106]:
train, test = split_data(df_diff)
for target in targets:
    regr = run_linear_regression(target, True)
    print(feature_importance(regr, df_diff))
    print('\n')


Regression with fraction_of_devices_home_adj target
MSE:  0.007769916395069526 R2:  0.22131191191268706
                                                    importance
Percent_NonCitizen                                    0.023572
Percent_less_than_HS                                  0.020284
Percent_No_vehicals                                   0.009121
grocery_stores                                        0.005667
restaurants_and_other_eating_places                   0.005601
elementary_and_secondary_schools                      0.004852
Percent_HS                                            0.004497
offices_of_physicians                                 0.002568
religious_organizations                               0.001731
Percent_No_Internet_Access                            0.001383
other_amusement_and_recreation_industries             0.001320
museums,_historical_sites,_and_similar_institut...    0.001047
Percent_Computing_Device                              0.000061
child_day_car

So it looks like the differenced data explains more of the variance in the models than the raw data.

Targets: diff data

It seems as though abs(0.002) is a sensible cut off for feature selection 

### Feature selection

In [107]:
def create_selected_dfs(df, target):
    regr_diff = run_linear_regression(target)
    selected_features = list(pd.Series(features)[list(abs(regr_diff.coef_) > 0.002)])
    return df[selected_features], df[target]

In [110]:
fodh_features_train, fodh_target_train = create_selected_dfs(train, 'fraction_of_devices_home_adj')
fodh_features_test, fodh_target_test = create_selected_dfs(test, 'fraction_of_devices_home_adj')

fodw_features_train, fodw_target_train = create_selected_dfs(train, 'fraction_of_devices_work_adj')
fodh_features_test, fodh_target_test = create_selected_dfs(test, 'fraction_of_devices_work_adj')

ftaa_features_train, ftaa_target_train = create_selected_dfs(train, 'fraction_time_away_all_adj')
ftaa_features_test, ftaa_features_test = create_selected_dfs(test, 'fraction_time_away_all_adj')

ftal_features_train, ftal_target_train = create_selected_dfs(train, 'fraction_time_away_leave_adj')
ftal_features_test, ftal_target_test = create_selected_dfs(test, 'fraction_time_away_leave_adj')

For target: fraction_time_away_leave
Percent_Grad 	0.012660
Percent_No_vehicals 	0.011007
Percent_NonCitizen 	0.008608
restaurants_and_other_eating_places 	0.003511
other_amusement_and_recreation_industries 	0.002147
child_day_care_services 	0.001538
offices_of_physicians 	0.001052
grocery_stores 	-0.002207
Percent_less_than_HS 	-0.002730
Percent_HS 	-0.003446
Percent_Received_SNAP 	-0.004576
Median_Age 	-0.005178
Percent_Men_Usually_Fulltime_Employed 	-0.006382
Percent_Women_Usually_Fulltime_Employed 	-0.006666
Percent_SomeCollege 	-0.007579
Percent_SpeakEngl_Poorly 	-0.008516


For target: fraction_of_devices_home
Percent_SpeakEngl_Poorly 	0.042864
Percent_Bach 	0.012126
Percent_Received_SNAP 	0.011084
Median_Age 	0.008225
Percent_Women_Usually_Fulltime_Employed 	0.004302
Percent_Grad 	0.004125
child_day_care_services 	0.002590
Percent_Men_Usually_Fulltime_Employed 	0.002115
grocery_stores 	-0.001893
Percent_No_Internet_Access 	-0.002190
Median_Income 	-0.002456
elementary_and_secondary_schools 	-0.003602
Percent_No_vehicals 	-0.004472
offices_of_physicians 	-0.005166
restaurants_and_other_eating_places 	-0.006488
Percent_HS 	-0.006724
Percent_NonCitizen 	-0.012125
Percent_less_than_HS 	-0.013243

For target: fraction_of_devices_work
Percent_less_than_HS 	0.004021
Percent_No_vehicals 	0.003656
Median_Income 	0.002395
Median_Age 	0.001979
Percent_Men_Usually_Fulltime_Employed 	0.001618
automotive_repair_and_maintenance 	-0.001159
religious_organizations 	-0.001167
child_day_care_services 	-0.001319
Percent_Bach 	-0.001738
other_amusement_and_recreation_industries 	-0.002410
grocery_stores 	-0.002565
Percent_SomeCollege 	-0.003431
Percent_SpeakEngl_Poorly 	-0.005126

For target: fraction_time_away_all
Percent_NonCitizen 	0.008536
Percent_No_vehicals 	0.007544
Percent_Grad 	0.005820
restaurants_and_other_eating_places 	0.004979
Median_Income 	0.002161
offices_of_physicians 	0.002006
Percent_less_than_HS 	0.001936
elementary_and_secondary_schools 	0.001215
grocery_stores 	-0.001344
Percent_Bach 	-0.003378
Percent_SomeCollege 	-0.004304
Median_Age 	-0.005621
Percent_Men_Usually_Fulltime_Employed 	-0.006261
Percent_Women_Usually_Fulltime_Employed 	-0.006540
Percent_Received_SNAP 	-0.007669
Percent_SpeakEngl_Poorly 	-0.015919

For target: fraction_of_devices_home_adj
Percent_NonCitizen 	0.023994
Percent_less_than_HS 	0.020374
Percent_No_vehicals 	0.008571
grocery_stores 	0.004792
restaurants_and_other_eating_places 	0.004613
elementary_and_secondary_schools 	0.003403
Percent_HS 	0.003297
offices_of_physicians 	0.002395
Percent_Received_SNAP 	-0.002058
Percent_SomeCollege 	-0.006043
Percent_Grad 	-0.006297
Percent_Bach 	-0.009700
Median_Age 	-0.010470
Median_Income 	-0.018861
Percent_SpeakEngl_Poorly 	-0.052438

For target: fraction_of_devices_work_adj
Percent_SpeakEngl_Poorly 	0.011330
Median_Income 	0.006510
Percent_SomeCollege 	0.004885
Percent_Bach 	0.003057
other_amusement_and_recreation_industries 	0.002628
Percent_Computing_Device 	-0.002266
Percent_No_Internet_Access 	-0.003070
Percent_Received_SNAP 	-0.003886
Percent_NonCitizen 	-0.005314
Percent_less_than_HS 	-0.007342
Percent_No_vehicals 	-0.007825


For target: fraction_time_away_all_adj
Percent_SpeakEngl_Poorly 	0.019809
Median_Age 	0.007751
Percent_SomeCollege 	0.007102
Percent_Received_SNAP 	0.007081
Median_Income 	0.005984
Percent_Women_Usually_Fulltime_Employed 	0.005387
Percent_Men_Usually_Fulltime_Employed 	0.005327
Percent_Bach 	0.004115
Percent_less_than_HS 	-0.004690
restaurants_and_other_eating_places 	-0.005138
Percent_Grad 	-0.007203
Percent_No_vehicals 	-0.010223
Percent_NonCitizen 	-0.013027


For target: fraction_time_away_leave_adj
Percent_SpeakEngl_Poorly 	0.011855
Percent_SomeCollege 	0.009853
Percent_Received_SNAP 	0.008280
Median_Age 	0.007132
Percent_Men_Usually_Fulltime_Employed 	0.006372
Percent_Women_Usually_Fulltime_Employed 	0.006190
Percent_HS 	0.005026
grocery_stores 	0.002894
Median_Income 	0.002238
Percent_No_Internet_Access 	0.002024
child_day_care_services 	-0.002385
restaurants_and_other_eating_places 	-0.003690
other_amusement_and_recreation_industries 	-0.004755
Percent_NonCitizen 	-0.012492
Percent_No_vehicals 	-0.013934
Percent_Grad 	-0.014604

### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs=-1, 
                            n_estimators=100,
                           min_samples_leaf=1
                           )
model = rf.fit(X_train,y_train)

In [None]:
y_pred=rf.predict(X_test)

In [None]:
mean_squared_error(y_pred, y_test)

This is lower than the MSE for the linear regression model.

In [None]:
#Show how much each feature contributes to the model prediction
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = df[features].columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)