In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import pmdarima as pm
from pmdarima.utils import diff
from sklearn.metrics import mean_squared_error
from pmdarima.metrics import smape
import pickle
import sklearn

In [2]:
pd.set_option('display.max_rows', 500)

# Importing and preparing zri and acs data for merging

In [3]:
zri = pd.read_csv('./../data/zri_multifamily_v2.csv')

In [4]:
zri['zip'] = zri['zip'].map(lambda x: str(x))

In [5]:
zri['zip'].map(lambda x: len(x)).value_counts()

5    79272
4    14472
Name: zip, dtype: int64

In [6]:
zri['zip'] = zri['zip'].map(lambda x: '0' + x if len(x)<5 else x)

In [7]:
zri['zip'].map(lambda x: len(x)).value_counts()

5    93744
Name: zip, dtype: int64

In [8]:
zri['year']=zri['year-month'].map(lambda x: int(x[0:4]))

In [9]:
zri['month']=zri['year-month'].map(lambda x: int(x[5:]))

In [10]:
years = range(2014,2020)

In [11]:
months = range(1, 13)

In [12]:
datetime_col = [datetime(year=i, month=j, day=1) for i in years for j in months]

In [13]:
datetime_col = datetime_col*1302

In [14]:
zri['datetime'] = datetime_col

In [15]:
zri.drop(columns = 'year-month', inplace = True)

In [16]:
zri

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,datetime
0,01013,Chicopee,MA,Springfield,Hampden County,928.0,2014,1,2014-01-01
1,01013,Chicopee,MA,Springfield,Hampden County,931.0,2014,2,2014-02-01
2,01013,Chicopee,MA,Springfield,Hampden County,934.0,2014,3,2014-03-01
3,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,4,2014-04-01
4,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,5,2014-05-01
...,...,...,...,...,...,...,...,...,...
93739,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1277.0,2019,8,2019-08-01
93740,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1271.0,2019,9,2019-09-01
93741,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1299.0,2019,10,2019-10-01
93742,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1261.5,2019,11,2019-11-01


In [17]:
acs = pd.read_csv('./../data/acs_engineered_features.csv')

In [18]:
acs['zip'] = acs['zip'].map(lambda x: str(x))

In [19]:
acs['zip'].map(lambda x: len(x)).value_counts()

5    83482
4     9170
Name: zip, dtype: int64

In [20]:
acs['zip'] = acs['zip'].map(lambda x: '0' + x if len(x)<5 else x)

In [21]:
acs['zip'].map(lambda x: len(x)).value_counts()

5    92652
Name: zip, dtype: int64

In [22]:
acs.drop(columns = 'census_period', axis = 1, inplace = True)

In [23]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,median_building_age,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen
0,01013,2013,0.729579,0.020723,0.013756,0.218793,0.000000,0.005002,0.230494,0.311375,...,62.0,20433.0,0.190522,22391,0.104113,0.008853,36.4,0.510250,0.4210,0.055960
1,01013,2014,0.714417,0.023463,0.013473,0.231235,0.000000,0.004491,0.224682,0.312437,...,62.0,20940.0,0.201543,21822,0.104320,0.014770,36.6,0.501650,0.4179,0.057190
2,01013,2015,0.720119,0.027857,0.016777,0.217971,0.001357,0.005065,0.217248,0.335730,...,66.0,20889.0,0.173591,22113,0.113411,0.012946,35.5,0.506851,0.4110,0.056483
3,01013,2016,0.713799,0.017445,0.016638,0.237499,0.000538,0.002197,0.221131,0.328625,...,72.0,21371.0,0.155696,22299,0.115569,0.013701,35.8,0.512669,0.4061,0.050182
4,01013,2017,0.703894,0.022238,0.018442,0.241627,0.001116,0.001652,0.222336,0.338573,...,73.0,21477.0,0.169816,22394,0.114075,0.020243,34.9,0.514200,0.4117,0.050013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92647,99901,2015,0.653872,0.002271,0.063146,0.044026,0.142407,0.002710,0.233756,0.279833,...,38.0,31563.0,0.101846,13651,0.086316,0.021073,38.7,0.478353,0.3981,0.025493
92648,99901,2016,0.654374,0.004016,0.061487,0.046444,0.143932,0.000000,0.232584,0.279977,...,38.0,31487.0,0.114447,13694,0.085095,0.022509,38.8,0.481233,0.4057,0.023952
92649,99901,2017,0.649705,0.004228,0.073631,0.048334,0.135379,0.001604,0.230079,0.285121,...,40.0,32010.0,0.121194,13717,0.078951,0.023050,38.7,0.485602,0.4110,0.025953
92650,99901,2018,0.645762,0.004947,0.076319,0.046126,0.129138,0.005384,0.225609,0.284103,...,40.0,32671.0,0.108124,13745,0.075428,0.026582,39.2,0.485922,0.4158,0.035504


# Imputing ACS monthly data

In [24]:
acs_zips = acs['zip'].unique()

In [25]:
years = acs['year_usable'].unique()

In [26]:
zri_zips = zri['zip'].unique()

In [27]:
zri_zips

array(['01013', '01020', '01040', ..., '99504', '99508', '99654'],
      dtype=object)

In [28]:
acs_zips_in_zri = []

In [29]:
for item in acs_zips:
    if item in zri_zips:
        acs_zips_in_zri.append(item)

In [30]:
acs = acs.set_index('zip')

In [31]:
acs = acs.loc[acs_zips_in_zri,:]

In [32]:
acs = acs.reset_index(drop=False)

In [33]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,median_building_age,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen
0,01013,2013,0.729579,0.020723,0.013756,0.218793,0.000000,0.005002,0.230494,0.311375,...,62.0,20433.0,0.190522,22391,0.104113,0.008853,36.4,0.510250,0.4210,0.055960
1,01013,2014,0.714417,0.023463,0.013473,0.231235,0.000000,0.004491,0.224682,0.312437,...,62.0,20940.0,0.201543,21822,0.104320,0.014770,36.6,0.501650,0.4179,0.057190
2,01013,2015,0.720119,0.027857,0.016777,0.217971,0.001357,0.005065,0.217248,0.335730,...,66.0,20889.0,0.173591,22113,0.113411,0.012946,35.5,0.506851,0.4110,0.056483
3,01013,2016,0.713799,0.017445,0.016638,0.237499,0.000538,0.002197,0.221131,0.328625,...,72.0,21371.0,0.155696,22299,0.115569,0.013701,35.8,0.512669,0.4061,0.050182
4,01013,2017,0.703894,0.022238,0.018442,0.241627,0.001116,0.001652,0.222336,0.338573,...,73.0,21477.0,0.169816,22394,0.114075,0.020243,34.9,0.514200,0.4117,0.050013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9102,99654,2015,0.807711,0.008025,0.016069,0.045364,0.054007,0.000842,0.297085,0.296824,...,21.0,29022.0,0.098645,53456,0.104884,0.047280,33.0,0.480245,0.3941,0.017716
9103,99654,2016,0.810662,0.006580,0.015781,0.044054,0.055171,0.001699,0.292142,0.298324,...,21.0,29855.0,0.099912,55319,0.105642,0.039896,33.2,0.476292,0.3925,0.020933
9104,99654,2017,0.806640,0.008930,0.017115,0.044119,0.053475,0.002468,0.286711,0.305601,...,21.0,29865.0,0.097616,56325,0.099910,0.042248,33.2,0.474887,0.3889,0.022725
9105,99654,2018,0.798655,0.009730,0.015504,0.048770,0.051062,0.003108,0.282481,0.306271,...,21.0,30155.0,0.093663,58889,0.102903,0.047433,33.5,0.473110,0.3974,0.016234


In [34]:
acs = acs.loc[acs.index.repeat(12)].reset_index(drop=True)

In [35]:
acs['month'] = 0

In [36]:
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] * len(acs_zips_in_zri) * len(years)

In [37]:
acs['month'] = months

In [38]:
acs.loc[acs['month']!=12, acs.columns[2:-1]] = np.nan

In [39]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen,month
0,01013,2013,,,,,,,,,...,,,,,,,,,,1
1,01013,2013,,,,,,,,,...,,,,,,,,,,2
2,01013,2013,,,,,,,,,...,,,,,,,,,,3
3,01013,2013,,,,,,,,,...,,,,,,,,,,4
4,01013,2013,,,,,,,,,...,,,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109279,99654,2019,,,,,,,,,...,,,,,,,,,,8
109280,99654,2019,,,,,,,,,...,,,,,,,,,,9
109281,99654,2019,,,,,,,,,...,,,,,,,,,,10
109282,99654,2019,,,,,,,,,...,,,,,,,,,,11


In [40]:
for item in acs_zips_in_zri:
    temp = acs.loc[acs['zip']==item,:]
    for col in temp.columns[2:-1]:
        temp[col] = temp[col].interpolate(method='linear')
    acs.loc[acs['zip']==item,:] = temp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp[col] = temp[col].interpolate(method='linear')


In [41]:
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen,month
0,01013,2013,,,,,,,,,...,,,,,,,,,,1
1,01013,2013,,,,,,,,,...,,,,,,,,,,2
2,01013,2013,,,,,,,,,...,,,,,,,,,,3
3,01013,2013,,,,,,,,,...,,,,,,,,,,4
4,01013,2013,,,,,,,,,...,,,,,,,,,,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109279,99654,2019,0.796036,0.008872,0.014941,0.049974,0.049727,0.003070,0.282075,0.308002,...,30251.666667,0.094625,60057.0,0.102326,0.049199,33.566667,0.475244,0.39940,0.018076,8
109280,99654,2019,0.795709,0.008765,0.014871,0.050125,0.049560,0.003065,0.282024,0.308218,...,30263.750000,0.094745,60203.0,0.102254,0.049419,33.575000,0.475511,0.39965,0.018306,9
109281,99654,2019,0.795381,0.008658,0.014801,0.050275,0.049393,0.003060,0.281973,0.308434,...,30275.833333,0.094865,60349.0,0.102181,0.049640,33.583333,0.475778,0.39990,0.018537,10
109282,99654,2019,0.795054,0.008550,0.014730,0.050426,0.049226,0.003055,0.281923,0.308651,...,30287.916667,0.094986,60495.0,0.102109,0.049861,33.591667,0.476045,0.40015,0.018767,11


In [42]:
merged = pd.merge(zri, acs, left_on = ['zip','year', 'month'], right_on = ['zip','year_usable', 'month'], how = 'left')

In [45]:
null_check = {}
for col in merged.columns:
    if merged[col].isnull().any():
        null_check[col] = merged[col].isnull().sum()
null_check

{'Metro': 72,
 'year_usable': 72,
 'percent_white': 72,
 'percent_black': 72,
 'percent_asian': 72,
 'percent_hispanic': 72,
 'percent_native_am': 72,
 'percent_other_race': 72,
 'percent_0_17': 72,
 'percent_18_39': 72,
 'percent_40_64': 72,
 'percent_65+': 72,
 'percent_rental_units_vacant': 72,
 'percent_rental_units_occupied': 72,
 'percent_graduate_deg': 72,
 'percent_bachelors': 72,
 'percent_associates': 72,
 'percent_highschool': 72,
 'percent_less_highschool': 72,
 'percent_commute_public_transport': 72,
 'percent_commute_less_30': 72,
 'percent_buildings_less_10_units': 72,
 'percent_buildings_10_19_units': 72,
 'percent_buildings_20_49_units': 72,
 'percent_buildings_50+_units': 72,
 'percent_commute_30_to_59': 72,
 'percent_commute_60_to_89': 72,
 'percent_commute_90_more': 72,
 'percent_new_city': 72,
 'percent_new_unit': 72,
 'percent_units_owner_occupied': 72,
 'median_building_age': 72,
 'income_per_capita': 72,
 'poverty_rate': 72,
 'total_pop': 72,
 'percent_workforce

In [46]:
merged= merged[merged['zip']!= '11249']

In [47]:
null_check = {}
for col in merged.columns:
    if merged[col].isnull().any():
        null_check[col] = merged[col].isnull().sum()
null_check

{'Metro': 72}

In [48]:
merged['zip'].nunique()

1301

In [49]:
df = merged.drop(columns = ['year_usable'])

In [50]:
df

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,datetime,percent_white,...,median_building_age,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen
0,01013,Chicopee,MA,Springfield,Hampden County,928.0,2014,1,2014-01-01,0.728315,...,62.000000,20475.250000,0.191440,22343.583333,0.104130,0.009346,36.416667,0.509533,0.420742,0.056062
1,01013,Chicopee,MA,Springfield,Hampden County,931.0,2014,2,2014-02-01,0.727052,...,62.000000,20517.500000,0.192359,22296.166667,0.104147,0.009839,36.433333,0.508816,0.420483,0.056165
2,01013,Chicopee,MA,Springfield,Hampden County,934.0,2014,3,2014-03-01,0.725788,...,62.000000,20559.750000,0.193277,22248.750000,0.104165,0.010332,36.450000,0.508100,0.420225,0.056267
3,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,4,2014-04-01,0.724525,...,62.000000,20602.000000,0.194196,22201.333333,0.104182,0.010825,36.466667,0.507383,0.419967,0.056370
4,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,5,2014-05-01,0.723261,...,62.000000,20644.250000,0.195114,22153.916667,0.104199,0.011318,36.483333,0.506666,0.419708,0.056472
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93739,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1277.0,2019,8,2019-08-01,0.796036,...,22.333333,30251.666667,0.094625,60057.000000,0.102326,0.049199,33.566667,0.475244,0.399400,0.018076
93740,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1271.0,2019,9,2019-09-01,0.795709,...,22.500000,30263.750000,0.094745,60203.000000,0.102254,0.049419,33.575000,0.475511,0.399650,0.018306
93741,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1299.0,2019,10,2019-10-01,0.795381,...,22.666667,30275.833333,0.094865,60349.000000,0.102181,0.049640,33.583333,0.475778,0.399900,0.018537
93742,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1261.5,2019,11,2019-11-01,0.795054,...,22.833333,30287.916667,0.094986,60495.000000,0.102109,0.049861,33.591667,0.476045,0.400150,0.018767


In [51]:
bikeshare = pd.read_csv('./../data/bikeshare_engineered_features.csv', index_col = 0)

In [52]:
bikeshare['month'] = bikeshare['year-month'].map(lambda x: int(x[5:7]))

In [53]:
bikeshare.drop(columns = ['City', 'State', 'Metro', 'CountyName', 'year-month', 'zri'], inplace = True)

In [54]:
bikeshare['zip'] = bikeshare['zip'].map(lambda x: str(x))

In [55]:
bikeshare['zip'] = bikeshare['zip'].map(lambda x: '0' +x if len(x)<5 else x)

In [56]:
df = pd.merge(df, bikeshare, on = ('zip', 'year', 'month'), how = 'left')

In [57]:
df

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,datetime,percent_white,...,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen,bs_total_stations,bs_total_systems,has_bike_sharing
0,01013,Chicopee,MA,Springfield,Hampden County,928.0,2014,1,2014-01-01,0.728315,...,22343.583333,0.104130,0.009346,36.416667,0.509533,0.420742,0.056062,0.0,0.0,False
1,01013,Chicopee,MA,Springfield,Hampden County,931.0,2014,2,2014-02-01,0.727052,...,22296.166667,0.104147,0.009839,36.433333,0.508816,0.420483,0.056165,0.0,0.0,False
2,01013,Chicopee,MA,Springfield,Hampden County,934.0,2014,3,2014-03-01,0.725788,...,22248.750000,0.104165,0.010332,36.450000,0.508100,0.420225,0.056267,0.0,0.0,False
3,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,4,2014-04-01,0.724525,...,22201.333333,0.104182,0.010825,36.466667,0.507383,0.419967,0.056370,0.0,0.0,False
4,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,5,2014-05-01,0.723261,...,22153.916667,0.104199,0.011318,36.483333,0.506666,0.419708,0.056472,0.0,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93667,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1277.0,2019,8,2019-08-01,0.796036,...,60057.000000,0.102326,0.049199,33.566667,0.475244,0.399400,0.018076,0.0,0.0,False
93668,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1271.0,2019,9,2019-09-01,0.795709,...,60203.000000,0.102254,0.049419,33.575000,0.475511,0.399650,0.018306,0.0,0.0,False
93669,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1299.0,2019,10,2019-10-01,0.795381,...,60349.000000,0.102181,0.049640,33.583333,0.475778,0.399900,0.018537,0.0,0.0,False
93670,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1261.5,2019,11,2019-11-01,0.795054,...,60495.000000,0.102109,0.049861,33.591667,0.476045,0.400150,0.018767,0.0,0.0,False


In [58]:
df['has_bike_sharing'] = df['has_bike_sharing'].map(lambda x: 1 if x==True else 0)

In [59]:
# df.to_csv('./../data/zri_acs_bikeshare_merged.csv', index = 0)