In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import pmdarima as pm
from pmdarima.utils import diff
from sklearn.metrics import mean_squared_error
import pickle
import sklearn
import statsmodels.tsa.statespace as sm
from sklearn.preprocessing import StandardScaler

In [25]:
# Load 'arima_resids_no_exog.csv' — presumably the residuals of the ARIMA fit
# without exogenous regressors, judging by the file name (TODO confirm).
# NOTE(review): the name `none` is easy to misread as the built-in `None`.
none = pd.read_csv('arima_resids_no_exog.csv')

In [143]:
# Distinct values of the 'zip' column of `none`, as a plain Python list.
# NOTE(review): `zips` is assigned again further down from `df['zip']`, so its
# content depends on which cell ran last.
zips = list(none['zip'].unique())

In [27]:
def rmse_by_zip(df, zip_codes=None):
    """Return the average, over zip codes, of the root-mean-square of ``df['zri']``.

    For every zip code the values in the ``zri`` column are squared, summed,
    divided by the number of rows for that zip and square-rooted; the
    per-zip results are then averaged with equal weight per zip.

    The zip codes are taken from ``df`` itself rather than from a
    module-level list, so the result no longer depends on notebook execution
    order and a zip code missing from ``df`` cannot turn the score into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``zip`` column and a ``zri`` column. The ``zri``
        column is presumably a residual (actual - predicted) when this is
        used as an RMSE — verify against the code that writes the CSVs.
    zip_codes : iterable, optional
        If given, only rows whose ``zip`` is in this collection are scored.
        Zip codes listed here but absent from ``df`` are ignored.

    Returns
    -------
    float
        Mean of the per-zip values; NaN when no rows are selected.
    """
    if zip_codes is not None:
        df = df[df['zip'].isin(list(zip_codes))]

    squared_by_zip = (df['zri'] ** 2).groupby(df['zip'])
    # sum / row-count mirrors the original per-zip formula exactly.
    per_zip_rmse = np.sqrt(squared_by_zip.sum() / squared_by_zip.size())
    return per_zip_rmse.mean()

In [28]:
# Score the `none` frame; the value is shown as the cell output.
rmse_by_zip(none)

49.00701344825756

In [31]:
# Load 'sarimax_resids_google.csv' — presumably SARIMAX residuals with the
# Google-search regressors, judging by the file name (TODO confirm).
google = pd.read_csv('sarimax_resids_google.csv')

In [32]:
# Score the `google` frame; the value is shown as the cell output.
rmse_by_zip(google)

67.18954062470219

In [33]:
# Load 'sarimax_resids_acs.csv' — presumably SARIMAX residuals with the ACS
# regressors, judging by the file name (TODO confirm).
acs = pd.read_csv('sarimax_resids_acs.csv')

In [34]:
# Score the `acs` frame; the value is shown as the cell output.
# NOTE(review): the recorded output is several times the `none` score —
# worth checking the fit that produced this file (scaling / convergence).
rmse_by_zip(acs)

232.45934296664882

In [35]:
# Load 'sarimax_resids_bike.csv' — presumably SARIMAX residuals with the
# bike-share regressors, judging by the file name (TODO confirm).
bike = pd.read_csv('sarimax_resids_bike.csv')

In [36]:
# Score the `bike` frame; the value is shown as the cell output.
rmse_by_zip(bike)

49.14345182801593

In [37]:
# Load 'sarimax_resids_econ.csv' — presumably SARIMAX residuals with the
# economic regressors, judging by the file name (TODO confirm).
econ = pd.read_csv('sarimax_resids_econ.csv')

In [38]:
# Score the `econ` frame; the value is shown as the cell output.
rmse_by_zip(econ)

49.475138061307284

In [145]:
# Load 'sarimax_resids_all.csv' — presumably SARIMAX residuals with every
# regressor group combined, judging by the file name (TODO confirm).
# The trailing underscore avoids shadowing the built-in `all`.
all_ = pd.read_csv('sarimax_resids_all.csv')

In [146]:
# Score the `all_` frame; the value is shown as the cell output.
# NOTE(review): the recorded output is orders of magnitude above the `none`
# score — worth checking the fit that produced this file.
rmse_by_zip(all_)

2081.0377683354004

In [39]:
# Load the unscaled panel and normalise its key columns.
df = pd.read_csv('./../data/full_dataset_unscaled.csv')

# Zip codes as 5-character strings. zfill pads with as many zeros as needed;
# the previous single-'0' prefix left 3-digit codes only 4 characters long.
df['zip'] = df['zip'].astype(str).str.zfill(5)

# Vectorised, strict parse of the ISO dates, then use them as the index.
df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%d')
df = df.set_index('datetime')

In [40]:
# from Mo/Niki
# Derived columns: net job rate plus three row-wise sums of search-term columns.

df['net_job_rate'] = df['job_creation_rate'].sub(df['job_destruction_rate'])

df['apartment_for_rent_searches'] = df.loc[:, [
    'apartment for rent',
    'studio for rent',
    '1 bedroom for rent',
    '3 bedroom for rent',
]].sum(axis='columns')

# NOTE(review): 'townhomes for rent', 'townhouses for rent' and 'home for rent'
# are not part of this sum — confirm that is intended.
df['multifamily_for_rent_searches'] = df.loc[:, [
    'townhome for rent',
    'townhouse for rent',
    'house for rent',
    'duplex apartments for rent',
    'condos for rent',
]].sum(axis='columns')

df['gun_searches'] = df.loc[:, [
    'gun range',
    'gun control',
    'gun violence',
]].sum(axis='columns')


# ACS (census) column names, the subset to discard, and the subset to keep.
acs_cols = [
    'percent_white',
    'percent_black',
    'percent_asian',
    'percent_hispanic',
    'percent_native_am',
    'percent_other_race',
    'percent_0_17',
    'percent_18_39',
    'percent_40_64',
    'percent_65+',
    'percent_rental_units_vacant',
    'percent_rental_units_occupied',
    'percent_graduate_deg',
    'percent_bachelors',
    'percent_associates',
    'percent_highschool',
    'percent_less_highschool',
    'percent_commute_public_transport',
    'percent_commute_less_30',
    'percent_buildings_less_10_units',
    'percent_buildings_10_19_units',
    'percent_buildings_20_49_units',
    'percent_buildings_50+_units',
    'percent_commute_30_to_59',
    'percent_commute_60_to_89',
    'percent_commute_90_more',
    'percent_new_city',
    'percent_new_unit',
    'percent_units_owner_occupied',
    'median_building_age',
    'income_per_capita',
    'poverty_rate',
    'total_pop',
    'percent_workforce_unemployed',
    'percent_work_from_home',
    'median_age',
    'percent_female',
    'gini_index',
    'percent_not_us_citizen',
]
acs_cols_remove = [
    'percent_other_race', 'percent_40_64', 'percent_0_17', 'percent_18_39',
    'percent_65+', 'percent_rental_units_vacant', 'percent_not_us_citizen',
    'percent_less_highschool', 'percent_buildings_less_10_units',
    'percent_commute_30_to_59',
    'percent_commute_60_to_89',
    'percent_commute_90_more', 'percent_commute_less_30', 'percent_graduate_deg',
    'percent_female', 'gini_index', 'percent_hispanic', 'percent_black',
    'percent_bachelors',
    'percent_asian', 'percent_new_city', 'percent_new_unit',
]
# Order-preserving difference: a set difference would return the names in an
# order that changes between kernel sessions, making the exogenous column
# order (and the order of fitted coefficients) non-reproducible.
acs_cols_keep = [col for col in acs_cols if col not in acs_cols_remove]
# Bike-share column names, the ones to discard, and what remains.
bikeshare_cols = ['bs_total_stations', 'bs_total_systems', 'has_bike_sharing']
bikeshare_cols_remove = ['has_bike_sharing', 'bs_total_systems']
bikeshare_cols_keep = [col for col in bikeshare_cols if col not in bikeshare_cols_remove]
# Search-term column names (plus the three aggregated columns), the subset to
# discard, and the subset to keep.
trends_cols = [
    'gun range',
    'gun control',
    'gun violence',
    'job opportunities',
    'unemployment',
    'retirement',
    'layoff',
    'lgbt',
    'same sex marriage',
    'they',
    'pronouns',
    'black lives matter',
    'political correctness',
    'make america great again',
    'euthanasia',
    'getaway',
    'places to go',
    'flight tickets',
    'twitter',
    'hashtag',
    'fake news',
    'hurricane',
    'wildfire',
    'flood',
    'fire',
    "trader joe's",
    'whole foods',
    'lululemon',
    'thrift',
    'condos for rent',
    'duplex apartments for rent',
    'townhomes for rent',
    'townhouses for rent',
    'home for rent',
    'house for rent',
    'townhome for rent',
    'townhouse for rent',
    'apartment for rent',
    'studio for rent',
    '1 bedroom for rent',
    '3 bedroom for rent',
    'starbucks',
    'apartment_for_rent_searches',
    'multifamily_for_rent_searches',
    'gun_searches',
]
trends_cols_remove = [
    'they', 'apartment for rent', 'studio for rent', '1 bedroom for rent',
    '3 bedroom for rent', 'townhome for rent', 'townhouse for rent',
    'house for rent', 'duplex apartments for rent', 'condos for rent',
    'gun range', 'gun control', 'gun violence',
]
# Order-preserving difference: a set difference would return the names in an
# order that changes between kernel sessions, making the exogenous column
# order (and the order of fitted coefficients) non-reproducible.
trends_cols_keep = [col for col in trends_cols if col not in trends_cols_remove]
# Economic column names, the subset to discard, and the subset to keep.
economic_cols = [
    'total_firms',
    'job_creation_rate',
    'job_destruction_rate',
    'startup_firms', 'state_local_perc', 'net_job_rate',
]
economic_cols_remove = ['total_firms', 'job_creation_rate', 'job_destruction_rate']
# Order-preserving difference: a set difference would return the names in an
# order that changes between kernel sessions.
economic_cols_keep = [col for col in economic_cols if col not in economic_cols_remove]

In [41]:
# Keep only rows whose 'year' is 2015 or later.
df = df.loc[df['year'] >= 2015]

In [56]:
def plot_arima(zip_code, pred_dict, conf_int, train_size=48):
    """Plot actual vs. predicted ZRI for one zip code.

    The body mirrors the ad-hoc plotting cell at the end of this notebook so
    the same figure can be produced for any zip code.

    Parameters
    ----------
    zip_code : str
        Value matched against ``y['zip']`` and key into ``pred_dict``.
    pred_dict : dict
        Maps a zip code to its predictions; the value must expose ``.index``
        (used for the x axis), e.g. the series stored by the fitting cell.
    conf_int : pandas.DataFrame or None
        When not None, a frame with ``mean_ci_lower`` and ``mean_ci_upper``
        columns (the layout returned by ``get_forecast(...).summary_frame()``);
        it is drawn as a filled band. Pass None to skip the band.
    train_size : int, default 48
        Number of leading observations used for fitting; a dashed vertical
        line is drawn at the last training observation. The line is skipped
        if ``train_size`` does not fit the available data.

    Notes
    -----
    Reads the notebook-level frame ``y`` (columns ``zip`` and ``zri``), which
    must be defined before the function is called.
    """
    actual = y[y['zip'] == zip_code]
    predicted = pred_dict[zip_code]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=actual.index, y=actual['zri'], mode='lines', name='Actual'))
    fig.add_trace(go.Scatter(x=predicted.index, y=predicted, mode='lines', name='Predicted'))

    if conf_int is not None:
        # Upper bound first so that 'tonexty' on the lower bound fills the band.
        fig.add_trace(go.Scatter(x=conf_int.index, y=conf_int['mean_ci_upper'], mode='lines',
                                 line=dict(color="#ffe476"), showlegend=False))
        fig.add_trace(go.Scatter(x=conf_int.index, y=conf_int['mean_ci_lower'], mode='lines',
                                 fill='tonexty', line=dict(color="#ffe476"),
                                 name='Confidence interval'))

    if 0 < train_size <= len(actual):
        fig.add_vline(x=actual.index[train_size - 1], line_width=3, line_dash="dash", line_color="green")

    fig.update_yaxes(title='ZRI')
    fig.update_xaxes(title='Time')
    fig.show()

In [43]:
# Distinct zip codes in the (year-filtered) panel.
zips = df['zip'].unique().tolist()

# Target frame: zip key plus the 'zri' column.
y = df.drop(
    columns=['year', 'month', 'City', 'State', 'Metro', 'CountyName']
)[['zip', 'zri']]

# Regressor frame: everything except the calendar/location columns and 'zri'.
x = df.drop(
    columns=['zri', 'year', 'month', 'City', 'State', 'Metro', 'CountyName']
)

In [120]:
# Regressors restricted to the zip key plus the kept columns of every group
# (ACS, economic, search trends, bike share — in that order).
x_all = x[[
    'zip',
    *acs_cols_keep,
    *economic_cols_keep,
    *trends_cols_keep,
    *bikeshare_cols_keep,
]]

In [50]:
# Per-zip result stores, keyed by zip code: predictions and fitted parameters.
pred_dict, coef_dict = dict(), dict()

In [121]:
# Zip code fitted and plotted in the cells below (kept as a string).
curr_zip = "10305"

In [123]:
# Fit an ARIMA(1,1,0) with exogenous regressors for `curr_zip` on the first
# `n_train` observations and predict/forecast over the remaining hold-out.
# NOTE(review): the regressors come from `x` (every remaining column) rather
# than the curated `x_all` defined above — confirm this is intended; the
# recorded convergence warning may be related.
n_train = 48  # leading rows used for fitting; everything after is hold-out

curr_y = y[y['zip'] == curr_zip]
curr_x = x[x['zip'] == curr_zip].drop(columns=['zip', 'state_local_perc'])
curr_y_train = curr_y.iloc[:n_train, :]
curr_x_train = curr_x.iloc[:n_train, :]
curr_x_test = curr_x.iloc[n_train:, :]

# Horizon derived from the data instead of hard-coded 59 / 47 / 12, so the
# exog shape always matches the number of out-of-sample steps requested.
n_test = len(curr_x_test)

curr_model = sm.sarimax.SARIMAX(
    curr_y_train['zri'],
    exog=curr_x_train,
    order=(1, 1, 0),
    seasonal_order=(0, 0, 0, 0),
).fit()

# In-sample one-step predictions from observation 1, continuing through the
# hold-out; `dynamic` is an offset relative to `start`.
pred_dict[curr_zip] = curr_model.predict(
    start=1,
    end=n_train + n_test - 1,
    exog=curr_x_test,
    dynamic=n_train - 1,
)
coef_dict[curr_zip] = curr_model.params
fcast = curr_model.get_forecast(steps=n_test, exog=curr_x_test).summary_frame()


No frequency information was provided, so inferred frequency MS will be used.


No frequency information was provided, so inferred frequency MS will be used.


Maximum Likelihood optimization failed to converge. Check mle_retvals



In [125]:
# Actual vs. predicted ZRI for `curr_zip`, with a dashed line at the last
# training observation (position 47, i.e. the 48th row).
curr_actual = y[y['zip'] == curr_zip]  # filter once instead of per trace

fig = go.Figure()
# go.Scatter(mode='lines') replaces the deprecated go.Line trace constructor.
fig.add_trace(go.Scatter(x=curr_actual.index, y=curr_actual['zri'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=pred_dict[curr_zip].index, y=pred_dict[curr_zip], mode='lines', name='Predicted'))
fig.add_vline(x=curr_actual.index[47], line_width=3, line_dash="dash", line_color="green")
# fig.add_trace(go.Scatter(x = fcast.index, y = fcast['mean_ci_upper'], mode = 'lines', line=dict(color="#ffe476")))
# fig.add_trace(go.Scatter(x = fcast.index, y = fcast['mean_ci_lower'], mode = 'lines', fill = 'tonexty', line=dict(color="#ffe476"), name = 'Confidence interval'))
fig.update_yaxes(title='ZRI')
fig.update_xaxes(title='Time')
fig.show()