In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import pmdarima as pm
from pmdarima.utils import diff
from sklearn.metrics import mean_squared_error
from pmdarima.metrics import smape
import pickle
import sklearn

# Importing and preparing zri and acs data for merging

In [2]:
# Load the ZRI multifamily CSV from the data directory one level up.
zri = pd.read_csv(filepath_or_buffer='./../data/zri_multifamily_v2.csv')

In [3]:
# Convert every zip value to its string form so it can be padded below.
zri['zip'] = zri['zip'].map(str)

In [4]:
# Tally how many zip strings there are of each length.
zri['zip'].map(len).value_counts()

5    79272
4    14472
Name: zip, dtype: int64

In [5]:
# Left-pad every zip to five characters. zfill restores *all* dropped leading
# zeros, whereas prepending a single '0' would leave a zip shorter than four
# characters still too short.
zri['zip'] = zri['zip'].str.zfill(5)

In [6]:
# Re-count zip string lengths after padding.
zri['zip'].map(len).value_counts()

5    93744
Name: zip, dtype: int64

In [7]:
# The first four characters of 'year-month' are the year.
zri['year'] = zri['year-month'].str[:4].map(int)

In [8]:
# Everything from the sixth character onward is parsed as the month number.
zri['month'] = zri['year-month'].str[5:].map(int)

In [9]:
years = range(2014, 2020)  # 2014 through 2019 inclusive

In [10]:
months = range(1, 13)  # calendar months 1 through 12

In [11]:
# One first-of-month timestamp per (year, month) pair; years vary slowest.
datetime_col = [datetime(y, m, 1) for y in years for m in months]

In [12]:
# Tile the month sequence so it is as long as the ZRI frame. The repeat factor
# is derived from the data instead of a hard-coded count, and it evaluates to 1
# once the list already matches len(zri), so re-running this cell is harmless.
assert len(zri) % len(datetime_col) == 0, "zri row count is not a whole number of month sequences"
datetime_col = datetime_col * (len(zri) // len(datetime_col))

In [13]:
# Build each row's first-of-month timestamp from that row's own 'year' and
# 'month' values, so correctness does not depend on the file's row order or on
# every zip having a complete run of months.
zri['datetime'] = pd.to_datetime(zri[['year', 'month']].assign(day=1))

In [14]:
# 'year-month' has been split into 'year' and 'month'; drop the original.
# Rebinding (no inplace) with errors='ignore' keeps the cell safe to re-run.
zri = zri.drop(columns='year-month', errors='ignore')

In [15]:
# Preview the prepared ZRI frame (the rendered table is truncated to its first and last rows).
zri

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,datetime
0,01013,Chicopee,MA,Springfield,Hampden County,928.0,2014,1,2014-01-01
1,01013,Chicopee,MA,Springfield,Hampden County,931.0,2014,2,2014-02-01
2,01013,Chicopee,MA,Springfield,Hampden County,934.0,2014,3,2014-03-01
3,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,4,2014-04-01
4,01013,Chicopee,MA,Springfield,Hampden County,929.0,2014,5,2014-05-01
...,...,...,...,...,...,...,...,...,...
93739,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1277.0,2019,8,2019-08-01
93740,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1271.0,2019,9,2019-09-01
93741,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1299.0,2019,10,2019-10-01
93742,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,1261.5,2019,11,2019-11-01


In [16]:
# Load the engineered ACS feature table from the data directory one level up.
acs = pd.read_csv(filepath_or_buffer='./../data/acs_engineered_features.csv')

In [17]:
# Convert every zip value to its string form so it can be padded below.
acs['zip'] = acs['zip'].map(str)

In [18]:
# Tally how many zip strings there are of each length.
acs['zip'].map(len).value_counts()

5    83482
4     9170
Name: zip, dtype: int64

In [19]:
# Left-pad every zip to five characters. zfill restores *all* dropped leading
# zeros, whereas prepending a single '0' would leave a zip shorter than four
# characters still too short.
acs['zip'] = acs['zip'].str.zfill(5)

In [20]:
# Re-count zip string lengths after padding.
acs['zip'].map(len).value_counts()

5    92652
Name: zip, dtype: int64

In [21]:
# Remove the 'census_period' column. `columns=` already selects the column
# axis, so the redundant axis=1 is dropped; rebinding (no inplace) with
# errors='ignore' keeps the cell safe to re-run.
acs = acs.drop(columns='census_period', errors='ignore')

In [22]:
# Preview the prepared ACS frame (the rendered table is truncated in both rows and columns).
acs

Unnamed: 0,zip,year_usable,percent_white,percent_black,percent_asian,percent_hispanic,percent_native_am,percent_other_race,percent_0_17,percent_18_39,...,median_building_age,income_per_capita,poverty_rate,total_pop,percent_workforce_unemployed,percent_work_from_home,median_age,percent_female,gini_index,percent_not_us_citizen
0,01013,2013,0.729579,0.020723,0.013756,0.218793,0.000000,0.005002,0.230494,0.311375,...,62.0,20433.0,0.190522,22391,0.104113,0.008853,36.4,0.510250,0.4210,0.055960
1,01013,2014,0.714417,0.023463,0.013473,0.231235,0.000000,0.004491,0.224682,0.312437,...,62.0,20940.0,0.201543,21822,0.104320,0.014770,36.6,0.501650,0.4179,0.057190
2,01013,2015,0.720119,0.027857,0.016777,0.217971,0.001357,0.005065,0.217248,0.335730,...,66.0,20889.0,0.173591,22113,0.113411,0.012946,35.5,0.506851,0.4110,0.056483
3,01013,2016,0.713799,0.017445,0.016638,0.237499,0.000538,0.002197,0.221131,0.328625,...,72.0,21371.0,0.155696,22299,0.115569,0.013701,35.8,0.512669,0.4061,0.050182
4,01013,2017,0.703894,0.022238,0.018442,0.241627,0.001116,0.001652,0.222336,0.338573,...,73.0,21477.0,0.169816,22394,0.114075,0.020243,34.9,0.514200,0.4117,0.050013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92647,99901,2015,0.653872,0.002271,0.063146,0.044026,0.142407,0.002710,0.233756,0.279833,...,38.0,31563.0,0.101846,13651,0.086316,0.021073,38.7,0.478353,0.3981,0.025493
92648,99901,2016,0.654374,0.004016,0.061487,0.046444,0.143932,0.000000,0.232584,0.279977,...,38.0,31487.0,0.114447,13694,0.085095,0.022509,38.8,0.481233,0.4057,0.023952
92649,99901,2017,0.649705,0.004228,0.073631,0.048334,0.135379,0.001604,0.230079,0.285121,...,40.0,32010.0,0.121194,13717,0.078951,0.023050,38.7,0.485602,0.4110,0.025953
92650,99901,2018,0.645762,0.004947,0.076319,0.046126,0.129138,0.005384,0.225609,0.284103,...,40.0,32671.0,0.108124,13745,0.075428,0.026582,39.2,0.485922,0.4158,0.035504


In [23]:
# Remove the descriptive location columns before merging with the ACS data.
# Rebinding (no inplace) with errors='ignore' keeps the cell safe to re-run.
zri = zri.drop(columns=['City', 'State', 'Metro', 'CountyName'], errors='ignore')

In [24]:
# Left-join the ACS features onto every ZRI row, matching zip and calendar year
# against the ACS 'year_usable' column.
merged = zri.merge(
    acs,
    how='left',
    left_on=['zip', 'year'],
    right_on=['zip', 'year_usable'],
)

In [25]:
# Map each column of `merged` that contains at least one null to its null count.
null_check = {
    name: merged[name].isnull().sum()
    for name in merged.columns
    if merged[name].isnull().any()
}
null_check

{'year_usable': 72,
 'percent_white': 72,
 'percent_black': 72,
 'percent_asian': 72,
 'percent_hispanic': 72,
 'percent_native_am': 72,
 'percent_other_race': 72,
 'percent_0_17': 72,
 'percent_18_39': 72,
 'percent_40_64': 72,
 'percent_65+': 72,
 'percent_rental_units_vacant': 72,
 'percent_rental_units_occupied': 72,
 'percent_graduate_deg': 72,
 'percent_bachelors': 72,
 'percent_associates': 72,
 'percent_highschool': 72,
 'percent_less_highschool': 72,
 'percent_commute_public_transport': 72,
 'percent_commute_less_30': 72,
 'percent_commute_30_to_59': 72,
 'percent_commute_60_to_89': 72,
 'percent_commute_90_more': 72,
 'percent_new_city': 72,
 'percent_new_unit': 72,
 'percent_units_owner_occupied': 72,
 'median_building_age': 72,
 'income_per_capita': 72,
 'poverty_rate': 72,
 'total_pop': 72,
 'percent_workforce_unemployed': 72,
 'percent_work_from_home': 72,
 'median_age': 72,
 'percent_female': 72,
 'gini_index': 72,
 'percent_not_us_citizen': 72}

In [26]:
# Exclude zip 11249 from the merged frame; the null checks on either side of
# this cell show 72 missing values per ACS column before and none after.
merged = merged.loc[merged['zip'].ne('11249')]

In [27]:
# Recompute the per-column null counts on the filtered `merged` frame.
null_check = {
    name: merged[name].isnull().sum()
    for name in merged.columns
    if merged[name].isnull().any()
}
null_check

{}

In [28]:
# Number of distinct zip codes remaining after the filter.
merged['zip'].nunique(dropna=True)

1301

In [29]:
# 'year_usable' was only the ACS-side join key; leave it out of the modelling frame.
df = merged.drop('year_usable', axis=1)

# Train-test splitting and modeling

In [41]:
# NOTE(review): no cell visible above creates a 'zri_adj' column, and this
# cell's execution count jumps to 41 — presumably it was added by cells not
# shown here. Confirm before relying on Restart & Run All.

# Distinct zip codes in the modelling frame.
zips = [*df['zip'].unique()]

# Time-based split: years before 2019 train, 2019 onward is held out.
# 'year' and 'month' are dropped from both halves.
train = df.loc[df['year'].lt(2019)].drop(columns=['year', 'month'])
test = df.loc[df['year'].ge(2019)].drop(columns=['year', 'month'])

# Targets keep 'zip' alongside the two ZRI columns.
y_train = train.loc[:, ['zip', 'zri', 'zri_adj']]
y_test = test.loc[:, ['zip', 'zri', 'zri_adj']]

# Features are everything except the two ZRI columns.
x_train = train.drop(['zri', 'zri_adj'], axis=1)
x_test = test.drop(['zri', 'zri_adj'], axis=1)