In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import pmdarima as pm
from pmdarima.utils import diff
from sklearn.metrics import mean_squared_error
from pmdarima.metrics import smape
import pickle
import sklearn

In [2]:
pd.set_option('display.max_rows', 500)

# Importing and preparing zri and acs data for merging

In [3]:
zri = pd.read_csv('./../data/zri_multifamily_v2.csv')

In [4]:
zri['zip'] = zri['zip'].map(lambda x: str(x))

In [5]:
zri['zip'].map(lambda x: len(x)).value_counts()

5    79272
4    14472
Name: zip, dtype: int64

In [6]:
zri['zip'] = zri['zip'].map(lambda x: '0' + x if len(x)<5 else x)

In [7]:
zri['zip'].map(lambda x: len(x)).value_counts()

5    93744
Name: zip, dtype: int64

In [8]:
zri['year']=zri['year-month'].map(lambda x: int(x[0:4]))

In [9]:
zri['month']=zri['year-month'].map(lambda x: int(x[5:]))

In [10]:
years = range(2014,2020)

In [11]:
months = range(1, 13)

In [12]:
datetime_col = [datetime(year=i, month=j, day=1) for i in years for j in months]

In [13]:
datetime_col = datetime_col*1302

In [14]:
zri['datetime'] = datetime_col

In [15]:
zri.drop(columns = 'year-month', inplace = True)

In [16]:
zri

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,datetime
0,01013,Chicopee,MA,Springfield,Hampden County,928.0,2014,1,2014-01-01
1,01013,Chicopee,MA,Springfield,Hampden County,931.0,2014,2,2014-02-01
2,01013,Chicopee,MA,Springfield,Hampden County,934.0,2014,3,2014-03-01
3,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,4,2014-04-01
4,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,5,2014-05-01
...,...,...,...,...,...,...,...,...,...
93739,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1277.0,2019,8,2019-08-01
93740,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1271.0,2019,9,2019-09-01
93741,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1299.0,2019,10,2019-10-01
93742,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1261.5,2019,11,2019-11-01


In [17]:
acs = pd.read_csv('./../data/acs_engineered_features.csv')

In [18]:
acs['zip'] = acs['zip'].map(lambda x: str(x))

In [19]:
acs['zip'].map(lambda x: len(x)).value_counts()

5    83482
4     9170
Name: zip, dtype: int64

In [20]:
acs['zip'] = acs['zip'].map(lambda x: '0' + x if len(x)<5 else x)

In [21]:
acs['zip'].map(lambda x: len(x)).value_counts()

5    92652
Name: zip, dtype: int64

In [22]:
acs.drop(columns = 'census_period', axis = 1, inplace = True)

In [23]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,median_building_age,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen
0,01013,2013,0.729579,0.020723,0.013756,0.218793,0.000000,0.005002,0.230494,0.311375,...,62.0,20433.0,0.190522,22391,0.104113,0.008853,36.4,0.510250,0.4210,0.055960
1,01013,2014,0.714417,0.023463,0.013473,0.231235,0.000000,0.004491,0.224682,0.312437,...,62.0,20940.0,0.201543,21822,0.104320,0.014770,36.6,0.501650,0.4179,0.057190
2,01013,2015,0.720119,0.027857,0.016777,0.217971,0.001357,0.005065,0.217248,0.335730,...,66.0,20889.0,0.173591,22113,0.113411,0.012946,35.5,0.506851,0.4110,0.056483
3,01013,2016,0.713799,0.017445,0.016638,0.237499,0.000538,0.002197,0.221131,0.328625,...,72.0,21371.0,0.155696,22299,0.115569,0.013701,35.8,0.512669,0.4061,0.050182
4,01013,2017,0.703894,0.022238,0.018442,0.241627,0.001116,0.001652,0.222336,0.338573,...,73.0,21477.0,0.169816,22394,0.114075,0.020243,34.9,0.514200,0.4117,0.050013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92647,99901,2015,0.653872,0.002271,0.063146,0.044026,0.142407,0.002710,0.233756,0.279833,...,38.0,31563.0,0.101846,13651,0.086316,0.021073,38.7,0.478353,0.3981,0.025493
92648,99901,2016,0.654374,0.004016,0.061487,0.046444,0.143932,0.000000,0.232584,0.279977,...,38.0,31487.0,0.114447,13694,0.085095,0.022509,38.8,0.481233,0.4057,0.023952
92649,99901,2017,0.649705,0.004228,0.073631,0.048334,0.135379,0.001604,0.230079,0.285121,...,40.0,32010.0,0.121194,13717,0.078951,0.023050,38.7,0.485602,0.4110,0.025953
92650,99901,2018,0.645762,0.004947,0.076319,0.046126,0.129138,0.005384,0.225609,0.284103,...,40.0,32671.0,0.108124,13745,0.075428,0.026582,39.2,0.485922,0.4158,0.035504


# Imputing ACS monthly data

In [24]:
acs_zips = acs['zip'].unique()

In [25]:
years = acs['year_usable'].unique()

In [26]:
zri_zips = zri['zip'].unique()

In [27]:
zri_zips

array(['01013', '01020', '01040', ..., '99504', '99508', '99654'],
      dtype=object)

In [28]:
acs_zips_in_zri = []

In [29]:
for item in acs_zips:
    if item in zri_zips:
        acs_zips_in_zri.append(item)

In [30]:
acs = acs.set_index('zip')

In [31]:
acs = acs.loc[acs_zips_in_zri,:]

In [32]:
acs = acs.reset_index(drop=False)

In [33]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,median_building_age,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen
0,01013,2013,0.729579,0.020723,0.013756,0.218793,0.000000,0.005002,0.230494,0.311375,...,62.0,20433.0,0.190522,22391,0.104113,0.008853,36.4,0.510250,0.4210,0.055960
1,01013,2014,0.714417,0.023463,0.013473,0.231235,0.000000,0.004491,0.224682,0.312437,...,62.0,20940.0,0.201543,21822,0.104320,0.014770,36.6,0.501650,0.4179,0.057190
2,01013,2015,0.720119,0.027857,0.016777,0.217971,0.001357,0.005065,0.217248,0.335730,...,66.0,20889.0,0.173591,22113,0.113411,0.012946,35.5,0.506851,0.4110,0.056483
3,01013,2016,0.713799,0.017445,0.016638,0.237499,0.000538,0.002197,0.221131,0.328625,...,72.0,21371.0,0.155696,22299,0.115569,0.013701,35.8,0.512669,0.4061,0.050182
4,01013,2017,0.703894,0.022238,0.018442,0.241627,0.001116,0.001652,0.222336,0.338573,...,73.0,21477.0,0.169816,22394,0.114075,0.020243,34.9,0.514200,0.4117,0.050013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9102,99654,2015,0.807711,0.008025,0.016069,0.045364,0.054007,0.000842,0.297085,0.296824,...,21.0,29022.0,0.098645,53456,0.104884,0.047280,33.0,0.480245,0.3941,0.017716
9103,99654,2016,0.810662,0.006580,0.015781,0.044054,0.055171,0.001699,0.292142,0.298324,...,21.0,29855.0,0.099912,55319,0.105642,0.039896,33.2,0.476292,0.3925,0.020933
9104,99654,2017,0.806640,0.008930,0.017115,0.044119,0.053475,0.002468,0.286711,0.305601,...,21.0,29865.0,0.097616,56325,0.099910,0.042248,33.2,0.474887,0.3889,0.022725
9105,99654,2018,0.798655,0.009730,0.015504,0.048770,0.051062,0.003108,0.282481,0.306271,...,21.0,30155.0,0.093663,58889,0.102903,0.047433,33.5,0.473110,0.3974,0.016234


In [34]:
acs = acs.loc[acs.index.repeat(12)].reset_index(drop=True)

In [35]:
acs['month'] = 0

In [36]:
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] * len(acs_zips_in_zri) * len(years)

In [37]:
acs['month'] = months

In [38]:
acs.loc[acs['month']!=12, acs.columns[2:-1]] = np.nan

In [39]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen,month
0,01013,2013,,,,,,,,,...,,,,,,,,,,1
1,01013,2013,,,,,,,,,...,,,,,,,,,,2
2,01013,2013,,,,,,,,,...,,,,,,,,,,3
3,01013,2013,,,,,,,,,...,,,,,,,,,,4
4,01013,2013,,,,,,,,,...,,,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109279,99654,2019,,,,,,,,,...,,,,,,,,,,8
109280,99654,2019,,,,,,,,,...,,,,,,,,,,9
109281,99654,2019,,,,,,,,,...,,,,,,,,,,10
109282,99654,2019,,,,,,,,,...,,,,,,,,,,11


In [40]:
for item in acs_zips_in_zri:
    temp = acs.loc[acs['zip']==item,:]
    for col in temp.columns[2:-1]:
        temp[col] = temp[col].interpolate(method='linear')
    acs.loc[acs['zip']==item,:] = temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[col] = temp[col].interpolate(method='linear')


In [41]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen,month
0,01013,2013,,,,,,,,,...,,,,,,,,,,1
1,01013,2013,,,,,,,,,...,,,,,,,,,,2
2,01013,2013,,,,,,,,,...,,,,,,,,,,3
3,01013,2013,,,,,,,,,...,,,,,,,,,,4
4,01013,2013,,,,,,,,,...,,,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109279,99654,2019,0.796036,0.008872,0.014941,0.049974,0.049727,0.003070,0.282075,0.308002,...,30251.666667,0.094625,60057.0,0.102326,0.049199,33.566667,0.475244,0.39940,0.018076,8
109280,99654,2019,0.795709,0.008765,0.014871,0.050125,0.049560,0.003065,0.282024,0.308218,...,30263.750000,0.094745,60203.0,0.102254,0.049419,33.575000,0.475511,0.39965,0.018306,9
109281,99654,2019,0.795381,0.008658,0.014801,0.050275,0.049393,0.003060,0.281973,0.308434,...,30275.833333,0.094865,60349.0,0.102181,0.049640,33.583333,0.475778,0.39990,0.018537,10
109282,99654,2019,0.795054,0.008550,0.014730,0.050426,0.049226,0.003055,0.281923,0.308651,...,30287.916667,0.094986,60495.0,0.102109,0.049861,33.591667,0.476045,0.40015,0.018767,11


In [42]:
zri.drop(columns = ['City', 'State', 'Metro', 'CountyName'], inplace = True)

In [51]:
merged = pd.merge(zri, acs, left_on = ['zip','year', 'month'], right_on = ['zip','year_usable', 'month'], how = 'left')

In [52]:
null_check = {}
for col in merged.columns:
    if merged[col].isnull().any():
        null_check[col] = merged[col].isnull().sum()
null_check

{'year_usable': 72,
 'percent_white': 72,
 'percent_black': 72,
 'percent_asian': 72,
 'percent_hispanic': 72,
 'percent_native_am': 72,
 'percent_other_race': 72,
 'percent_0_17': 72,
 'percent_18_39': 72,
 'percent_40_64': 72,
 'percent_65+': 72,
 'percent_rental_units_vacant': 72,
 'percent_rental_units_occupied': 72,
 'percent_graduate_deg': 72,
 'percent_bachelors': 72,
 'percent_associates': 72,
 'percent_highschool': 72,
 'percent_less_highschool': 72,
 'percent_commute_public_transport': 72,
 'percent_commute_less_30': 72,
 'percent_commute_30_to_59': 72,
 'percent_commute_60_to_89': 72,
 'percent_commute_90_more': 72,
 'percent_new_city': 72,
 'percent_new_unit': 72,
 'percent_units_owner_occupied': 72,
 'median_building_age': 72,
 'income_per_capita': 72,
 'poverty_rate': 72,
 'total_pop': 72,
 'percent_workforce_unemployed': 72,
 'percent_work_from_home': 72,
 'median_age': 72,
 'percent_female': 72,
 'gini_index': 72,
 'percent_not_us_citizen': 72}

In [53]:
merged= merged[merged['zip']!= '11249']

In [54]:
null_check = {}
for col in merged.columns:
    if merged[col].isnull().any():
        null_check[col] = merged[col].isnull().sum()
null_check

{}

In [55]:
merged['zip'].nunique()

1301

In [56]:
df = merged.drop(columns = ['year_usable'])

In [57]:
df

Unnamed: 0,zip,zri,year,month,datetime,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,...,median_building_age,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen
0,01013,928.0,2014,1,2014-01-01,0.728315,0.020951,0.013732,0.219830,0.000000,...,62.000000,20475.250000,0.191440,22343.583333,0.104130,0.009346,36.416667,0.509533,0.420742,0.056062
1,01013,931.0,2014,2,2014-02-01,0.727052,0.021179,0.013708,0.220867,0.000000,...,62.000000,20517.500000,0.192359,22296.166667,0.104147,0.009839,36.433333,0.508816,0.420483,0.056165
2,01013,934.0,2014,3,2014-03-01,0.725788,0.021408,0.013685,0.221904,0.000000,...,62.000000,20559.750000,0.193277,22248.750000,0.104165,0.010332,36.450000,0.508100,0.420225,0.056267
3,01013,929.0,2014,4,2014-04-01,0.724525,0.021636,0.013661,0.222940,0.000000,...,62.000000,20602.000000,0.194196,22201.333333,0.104182,0.010825,36.466667,0.507383,0.419967,0.056370
4,01013,929.0,2014,5,2014-05-01,0.723261,0.021864,0.013638,0.223977,0.000000,...,62.000000,20644.250000,0.195114,22153.916667,0.104199,0.011318,36.483333,0.506666,0.419708,0.056472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93739,99654,1277.0,2019,8,2019-08-01,0.796036,0.008872,0.014941,0.049974,0.049727,...,22.333333,30251.666667,0.094625,60057.000000,0.102326,0.049199,33.566667,0.475244,0.399400,0.018076
93740,99654,1271.0,2019,9,2019-09-01,0.795709,0.008765,0.014871,0.050125,0.049560,...,22.500000,30263.750000,0.094745,60203.000000,0.102254,0.049419,33.575000,0.475511,0.399650,0.018306
93741,99654,1299.0,2019,10,2019-10-01,0.795381,0.008658,0.014801,0.050275,0.049393,...,22.666667,30275.833333,0.094865,60349.000000,0.102181,0.049640,33.583333,0.475778,0.399900,0.018537
93742,99654,1261.5,2019,11,2019-11-01,0.795054,0.008550,0.014730,0.050426,0.049226,...,22.833333,30287.916667,0.094986,60495.000000,0.102109,0.049861,33.591667,0.476045,0.400150,0.018767


In [60]:
bikeshare = pd.read_csv('./../data/bikeshare_engineered_features.csv', index_col = 0)

In [65]:
bikeshare['month'] = bikeshare['year-month'].map(lambda x: int(x[5:7]))

In [67]:
bikeshare.drop(columns = ['City', 'State', 'Metro', 'CountyName', 'year-month', 'zri'], inplace = True)

In [73]:
bikeshare['zip'] = bikeshare['zip'].map(lambda x: str(x))

In [75]:
bikeshare['zip'] = bikeshare['zip'].map(lambda x: '0' +x if len(x)<5 else x)

In [76]:
df = pd.merge(df, bikeshare, on = ('zip', 'year', 'month'), how = 'left')

In [77]:
df

Unnamed: 0,zip,zri,year,month,datetime,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,...,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen,bs_total_stations,bs_total_systems,has_bike_sharing
0,01013,928.0,2014,1,2014-01-01,0.728315,0.020951,0.013732,0.219830,0.000000,...,22343.583333,0.104130,0.009346,36.416667,0.509533,0.420742,0.056062,0.0,0.0,False
1,01013,931.0,2014,2,2014-02-01,0.727052,0.021179,0.013708,0.220867,0.000000,...,22296.166667,0.104147,0.009839,36.433333,0.508816,0.420483,0.056165,0.0,0.0,False
2,01013,934.0,2014,3,2014-03-01,0.725788,0.021408,0.013685,0.221904,0.000000,...,22248.750000,0.104165,0.010332,36.450000,0.508100,0.420225,0.056267,0.0,0.0,False
3,01013,929.0,2014,4,2014-04-01,0.724525,0.021636,0.013661,0.222940,0.000000,...,22201.333333,0.104182,0.010825,36.466667,0.507383,0.419967,0.056370,0.0,0.0,False
4,01013,929.0,2014,5,2014-05-01,0.723261,0.021864,0.013638,0.223977,0.000000,...,22153.916667,0.104199,0.011318,36.483333,0.506666,0.419708,0.056472,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93667,99654,1277.0,2019,8,2019-08-01,0.796036,0.008872,0.014941,0.049974,0.049727,...,60057.000000,0.102326,0.049199,33.566667,0.475244,0.399400,0.018076,0.0,0.0,False
93668,99654,1271.0,2019,9,2019-09-01,0.795709,0.008765,0.014871,0.050125,0.049560,...,60203.000000,0.102254,0.049419,33.575000,0.475511,0.399650,0.018306,0.0,0.0,False
93669,99654,1299.0,2019,10,2019-10-01,0.795381,0.008658,0.014801,0.050275,0.049393,...,60349.000000,0.102181,0.049640,33.583333,0.475778,0.399900,0.018537,0.0,0.0,False
93670,99654,1261.5,2019,11,2019-11-01,0.795054,0.008550,0.014730,0.050426,0.049226,...,60495.000000,0.102109,0.049861,33.591667,0.476045,0.400150,0.018767,0.0,0.0,False


In [79]:
df['has_bike_sharing'] = df['has_bike_sharing'].map(lambda x: 1 if x==True else 0)

In [81]:
# df.to_csv('./../data/zri_acs_bikeshare_merged.csv', index = 0)

In [82]:
zips = list(df['zip'].unique())

# Merging sarima residuals

In [83]:
resid = pd.read_csv('./../data/sarima_residuals.csv', index_col = 0)

In [84]:
resid.index = resid.index.map(lambda x: str(x))

In [85]:
resid.index = resid.index.map(lambda x: '0' + x if len(x) <5 else x)

In [86]:
resid.reset_index(drop=False, inplace = True)

In [87]:
resid.rename(columns = {'index':'zip'}, inplace = True)

In [88]:
resid = resid.melt(id_vars = 'zip', value_vars = resid.columns[1:], var_name = 'month', value_name = 'resid')

In [89]:
resid['year'] = 2019

In [90]:
resid['month'] = resid['month'].map(lambda x: int(x)+1)

In [92]:
df2 = pd.merge(df, resid, on = ['zip', 'year', 'month'], how = 'right')

In [93]:
df2 = df2.sort_values(['zip', 'year', 'month'])

# Random forest for feature importance in predicting sarima error

In [127]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [97]:
x = df2.drop(columns = ['zip', 'zri', 'year', 'month', 'resid', 'datetime'])

In [128]:
x_norm = ss.fit_transform(x)

In [130]:
y = df2[['resid']]

In [134]:
y_norm = ss.fit_transform(y)

In [168]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LinearRegression

In [137]:
rf = RandomForestRegressor(random_state = 0)

In [138]:
params_rf = {
    
}

In [139]:
rf = rf.fit(x, y['resid'])

In [140]:
feat_imp_rf = pd.DataFrame(rf.feature_importances_, index = x.columns).rename(columns = {0:'feat_imp'}).sort_values('feat_imp', ascending = False)

In [141]:
feat_imp_rf

Unnamed: 0,feat_imp
percent_white,0.055312
percent_highschool,0.046684
percent_associates,0.042249
percent_other_race,0.035813
percent_asian,0.033907
percent_black,0.033814
percent_new_unit,0.033681
percent_female,0.032311
total_pop,0.031019
percent_work_from_home,0.030858


In [159]:
rf.score(x, y)

0.9916963901150792

In [169]:
lm = LinearRegression()

In [175]:
lm.fit(x, y)

LinearRegression()

In [195]:
lm.score(x, y)

0.09834568777672781

In [197]:
feat_imp_lm = pd.DataFrame(lm.coef_)
feat_imp_lm = feat_imp_lm.T.rename(columns = {0:'feat_imp'}).sort_values('feat_imp', ascending = False)
feat_imp_lm.index = x.columns
feat_imp_lm

Unnamed: 0,feat_imp
percent_white,73.746762
percent_black,67.327663
percent_asian,64.634292
percent_hispanic,32.799209
percent_native_am,32.322566
percent_other_race,26.780697
percent_0_17,25.565928
percent_18_39,23.446538
percent_40_64,21.800598
percent_65+,17.580254


In [142]:
lasso = Lasso(random_state = 0)

In [150]:
lasso_params = {
    'alpha' : [1e-2, 0.1, 1, 10, 100]
}

In [151]:
tuner = GridSearchCV(lasso, lasso_params, cv=3)

In [152]:
tuner = tuner.fit(x_norm, y_norm)

In [153]:
tuner.best_estimator_

Lasso(alpha=0.1, random_state=0)

In [154]:
tuner.best_estimator_.coef_

array([-0.        ,  0.        ,  0.        ,  0.02992732, -0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.0357008 , -0.        , -0.        ,  0.        ,
        0.        ,  0.        , -0.        , -0.        ,  0.        ,
       -0.        , -0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.        ,  0.        ,
       -0.        , -0.02684243, -0.        ])

In [155]:
tuner.best_params_

{'alpha': 0.1}

In [156]:
tuner.best_score_

-0.03042379646551689

In [157]:
feat_imp_lasso = pd.DataFrame(tuner.best_estimator_.coef_, index = x.columns).rename(columns = {0:'feat_imp'}).sort_values('feat_imp', ascending = False)

In [158]:
feat_imp_lasso

Unnamed: 0,feat_imp
percent_less_highschool,0.035701
percent_hispanic,0.029927
percent_white,-0.0
total_pop,0.0
percent_new_city,-0.0
percent_new_unit,-0.0
percent_units_owner_occupied,0.0
median_building_age,-0.0
income_per_capita,-0.0
poverty_rate,-0.0


# Trying same with residuals from simple (1,1,0) run on all zip codes

In [259]:
resid_avg = pd.read_csv('./../data/sarima_avg_residuals.csv', index_col = 0)

In [260]:
resid_avg.index = resid_avg.index.map(lambda x: str(x))

In [261]:
resid_avg.index = resid_avg.index.map(lambda x: '0' + x if len(x) <5 else x)

In [262]:
resid_avg.reset_index(drop=False, inplace = True)

In [263]:
resid_avg.rename(columns = {'index':'zip'}, inplace = True)

In [264]:
resid_avg = resid_avg.melt(id_vars = 'zip', value_vars = resid_avg.columns[1:], var_name = 'month', value_name = 'resid')

In [265]:
resid_avg['year'] = 2019

In [266]:
resid_avg['month'] = resid_avg['month'].map(lambda x: int(x))

In [267]:
df3 = pd.merge(df, resid_avg, on = ['zip', 'year', 'month'], how = 'right')

In [268]:
df3 = df3.sort_values(['zip', 'year', 'month'])

In [269]:
df3

Unnamed: 0,zip,zri,year,month,datetime,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,...,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen,bs_total_stations,bs_total_systems,has_bike_sharing,resid
0,01013,1099.0,2019,1,2019-01-01,0.684293,0.030216,0.022684,0.251817,0.001256,...,0.094917,0.029128,35.000000,0.516298,0.416825,0.044844,0.0,0.0,0,7.257261
1301,01013,1098.0,2019,2,2019-02-01,0.683671,0.029834,0.022529,0.252881,0.001292,...,0.094235,0.028976,35.000000,0.516403,0.417950,0.044751,0.0,0.0,0,7.567699
2602,01013,1106.0,2019,3,2019-03-01,0.683049,0.029452,0.022373,0.253946,0.001328,...,0.093552,0.028823,35.000000,0.516509,0.419075,0.044658,0.0,0.0,0,0.172512
3903,01013,1114.0,2019,4,2019-04-01,0.682427,0.029070,0.022218,0.255011,0.001364,...,0.092870,0.028671,35.000000,0.516614,0.420200,0.044565,0.0,0.0,0,-6.406482
5204,01013,1116.0,2019,5,2019-05-01,0.681805,0.028688,0.022063,0.256075,0.001400,...,0.092188,0.028518,35.000000,0.516720,0.421325,0.044472,0.0,0.0,0,-6.470685
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10407,99654,1277.0,2019,8,2019-08-01,0.796036,0.008872,0.014941,0.049974,0.049727,...,0.102326,0.049199,33.566667,0.475244,0.399400,0.018076,0.0,0.0,0,77.011988
11708,99654,1271.0,2019,9,2019-09-01,0.795709,0.008765,0.014871,0.050125,0.049560,...,0.102254,0.049419,33.575000,0.475511,0.399650,0.018306,0.0,0.0,0,83.009265
13009,99654,1299.0,2019,10,2019-10-01,0.795381,0.008658,0.014801,0.050275,0.049393,...,0.102181,0.049640,33.583333,0.475778,0.399900,0.018537,0.0,0.0,0,54.522188
14310,99654,1261.5,2019,11,2019-11-01,0.795054,0.008550,0.014730,0.050426,0.049226,...,0.102109,0.049861,33.591667,0.476045,0.400150,0.018767,0.0,0.0,0,92.266909


In [270]:
x = df3.drop(columns = ['zip', 'zri', 'year', 'month', 'resid', 'datetime'])

In [271]:
x_norm = ss.fit_transform(x)

In [272]:
y = df3[['resid']]

In [273]:
y_norm = ss.fit_transform(y)

In [274]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, LinearRegression

In [275]:
rf = RandomForestRegressor(random_state = 0)

In [276]:
params_rf = {
    
}

In [277]:
rf = rf.fit(x, y['resid'])

In [278]:
feat_imp_rf = pd.DataFrame(rf.feature_importances_, index = x.columns).rename(columns = {0:'feat_imp'}).sort_values('feat_imp', ascending = False)

In [279]:
feat_imp_rf

Unnamed: 0,feat_imp
percent_white,0.055865
percent_highschool,0.046035
percent_associates,0.041864
percent_other_race,0.036239
percent_new_unit,0.033901
percent_black,0.033752
percent_asian,0.033616
percent_female,0.032657
percent_commute_public_transport,0.031414
total_pop,0.031129


In [280]:
rf.score(x, y)

0.9916888701027813

In [281]:
lm = LinearRegression()

In [282]:
lm.fit(x, y)

LinearRegression()

In [283]:
lm.score(x, y)

0.09825338694085894

In [284]:
feat_imp_lm = pd.DataFrame(lm.coef_)
feat_imp_lm = feat_imp_lm.T.rename(columns = {0:'feat_imp'}).sort_values('feat_imp', ascending = False)
feat_imp_lm.index = x.columns
feat_imp_lm

Unnamed: 0,feat_imp
percent_white,244.770496
percent_black,223.43771
percent_asian,214.575428
percent_hispanic,108.612703
percent_native_am,107.647863
percent_other_race,89.760982
percent_0_17,85.206848
percent_18_39,78.957409
percent_40_64,72.7152
percent_65+,58.535297


In [285]:
lasso = Lasso(random_state = 0)

In [286]:
lasso_params = {
    'alpha' : [1e-2, 0.1, 1, 10, 100]
}

In [287]:
tuner = GridSearchCV(lasso, lasso_params, cv=3)

In [288]:
tuner = tuner.fit(x_norm, y_norm)

In [289]:
tuner.best_estimator_

Lasso(alpha=0.1, random_state=0)

In [290]:
tuner.best_estimator_.coef_

array([-0.        ,  0.        ,  0.        ,  0.02968437, -0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.03581016, -0.        , -0.        ,  0.        ,
        0.        ,  0.        , -0.        , -0.        ,  0.        ,
       -0.        , -0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.        ,  0.        ,
       -0.        , -0.02718255, -0.        ])

In [291]:
tuner.best_params_

{'alpha': 0.1}

In [292]:
tuner.best_score_

-0.030204863794668208

In [293]:
feat_imp_lasso = pd.DataFrame(tuner.best_estimator_.coef_, index = x.columns).rename(columns = {0:'feat_imp'}).sort_values('feat_imp', ascending = False)

In [294]:
feat_imp_lasso

Unnamed: 0,feat_imp
percent_less_highschool,0.03581
percent_hispanic,0.029684
percent_white,-0.0
total_pop,0.0
percent_new_city,-0.0
percent_new_unit,-0.0
percent_units_owner_occupied,0.0
median_building_age,-0.0
income_per_capita,-0.0
poverty_rate,-0.0
