In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [2]:
from data_setup import ZRI_format

In [8]:
#Load Data
# ZRI_MF is the pickled object handed to ZRI_format below; ACS is the pickled
# frame that is later joined on its geo_id / year-derived columns.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that were produced by this project.
ZRI_MF = pd.read_pickle('./pickles/ZRI_filtered.p')
ACS = pd.read_pickle('./acs_data/ACS.p')

In [47]:
%%time
# Build the target table with ZRI_format (imported from data_setup).
# window_size = 0 and future_time = 0 are passed here; the displayed result has
# Target_ZRI, Month, Year, ZipCode, Predict_Year and Predict_Month columns.
# NOTE(review): exact semantics of window_size / future_time live in data_setup -- confirm there.
ZRI_new = ZRI_format(ZRI_MF, time_unit = 'Month', window_size = 0, future_time = 0)

Wall time: 848 ms


In [48]:
# Display the frame returned by ZRI_format for a visual sanity check.
ZRI_new

Unnamed: 0,Target_index,Missing_Months,Target_ZRI,Month,Year,ZipCode,Predict_Year,Predict_Month
0,01013M10Y2014,0,946.0,10,2014,01013,2014,10
1,01013M10Y2015,0,974.0,10,2015,01013,2015,10
2,01013M10Y2016,0,1030.0,10,2016,01013,2016,10
3,01013M10Y2017,0,1091.0,10,2017,01013,2017,10
4,01013M10Y2018,0,1124.0,10,2018,01013,2018,10
...,...,...,...,...,...,...,...,...
90946,99654M9Y2015,2,1188.0,9,2015,99654,2015,9
90947,99654M9Y2016,2,1298.0,9,2016,99654,2016,9
90948,99654M9Y2017,2,1326.0,9,2017,99654,2017,9
90949,99654M9Y2018,2,1337.0,9,2018,99654,2018,9


In [13]:
# NOTE(review): acs_lag is not referenced anywhere in the visible notebook; the
# offset actually applied below is the literal 2 -- confirm which lag is intended.
acs_lag = 1
# year_avail = ACS year + 2, stored as a string. It is the right-hand join key
# matched against Predict_Year in the merge cell.
ACS = ACS.assign(year_avail = (ACS.year.astype(int) + 2).astype(str))

In [34]:
# Left join: keep every ZRI_new row and attach the ACS row whose geo_id equals
# ZipCode and whose year_avail equals Predict_Year (unmatched rows get NaN).
# NOTE(review): year_avail is a string; the join assumes Predict_Year and
# ZipCode are strings too -- confirm the dtypes produced by ZRI_format.
data_4_model = pd.merge(
    ZRI_new,
    ACS,
    how='left',
    left_on=['ZipCode', 'Predict_Year'],
    right_on=['geo_id', 'year_avail'],
)

In [16]:
#Columns to use in the final analysis
# Written as whitespace-separated names and split into a list; order is preserved.
zip_columns = """
    geo_id unemployed_pop white_pop vacant_housing_units total_pop worked_at_home
    poverty percent_income_spent_on_rent occupied_housing_units
    median_year_structure_built median_age married_households masters_degree
    male_pop female_pop income_per_capita housing_units employed_pop black_pop
    asian_pop amerindian_pop graduate_professional_degree
""".split()

In [35]:
#Convert columns to percentage
# Count columns that get divided by total population.
pop_columns = """
    unemployed_pop white_pop masters_degree
    graduate_professional_degree employed_pop black_pop
    asian_pop amerindian_pop poverty worked_at_home
""".split()

# Count columns that get divided by total housing units.
house_columns = "vacant_housing_units occupied_housing_units".split()

#Division
# Turn raw counts into shares of total population / total housing units.
# Zero denominators are mapped to NaN first so the quotient is NaN instead of
# +/-inf: the dropna() in the train/test split removes NaN rows, but inf would
# pass through it and reach the regression.
# Whole columns are reassigned (rather than written through .loc[:, cols]) so
# the result dtype does not depend on the dtype already stored in the frame.
# NOTE(review): this cell is not idempotent -- re-running it without re-running
# the merge divides the already-converted shares a second time.
total_pop_nonzero = data_4_model['total_pop'].replace(0, np.nan)
housing_units_nonzero = data_4_model['housing_units'].replace(0, np.nan)
data_4_model[pop_columns] = data_4_model[pop_columns].div(total_pop_nonzero, axis = 0)
data_4_model[house_columns] = data_4_model[house_columns].div(housing_units_nonzero, axis = 0)

In [21]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [None]:
#Imputation strategy
#No imputation yet: rows with missing values are simply dropped (dropna) for now.


In [36]:
#Find feature columns
# Order: every column whose name contains 'minus' (the ZRI_format feature
# columns, presumably lagged ZRI values), then the population shares, the
# housing shares, and three ACS columns that are used as-is.
full_feature_columns = [
    *(name for name in data_4_model.columns if 'minus' in name),
    *pop_columns,
    *house_columns,
    'income_per_capita',
    'percent_income_spent_on_rent',
    'median_age',
]

In [37]:
# Subset of features coming from the ZRI table only: columns named with 'minus'.
ZRI_feature_columns = list(filter(lambda name: 'minus' in name, data_4_model.columns))

In [38]:
#Train test split: rows with Year >= test_year are held out, earlier years train
test_year = 2019
# Keep only the modelling columns and drop any row with a missing value.
data_4_model = data_4_model.loc[:, full_feature_columns + ['Target_ZRI', 'Year']].dropna()
training_data = data_4_model.loc[data_4_model['Year'] < test_year]
test_data = data_4_model.loc[data_4_model['Year'] >= test_year]

In [39]:
# Training design matrix and target for the full-feature model.
X_train_full = training_data[full_feature_columns]
y_train_full = training_data['Target_ZRI']

In [40]:
# Held-out design matrix and target for the full-feature model.
X_test_full = test_data[full_feature_columns]
y_test_full = test_data['Target_ZRI']

In [41]:
# Two ordinary least-squares models: lr_zri is used further down with
# ZRI_feature_columns, lr_full with full_feature_columns.
lr_zri = LinearRegression()
lr_full = LinearRegression()

In [42]:
# Fit the full-feature model on the training rows (Year < test_year).
lr_full.fit(X_train_full,y_train_full)

LinearRegression()

In [43]:
# R^2 as a (held-out, training) pair -- note the test score comes first.
lr_full.score(X_test_full,y_test_full), lr_full.score(X_train_full, y_train_full)

(0.878755300978112, 0.9441226344482254)

In [None]:
# Residuals (actual - predicted) of the full-feature model on the held-out years.
# The X_test_full / y_test_full split is used here: final_test_data is only
# created by the window-size loop further down, and it is sliced from ZRI_new,
# which never receives the merged ACS columns in full_feature_columns.
prediction_error_full = y_test_full - lr_full.predict(X_test_full)

In [None]:
# ZRI-only baseline: train lr_zri on the 'minus' columns of the same training
# rows as the full model (it is not fitted anywhere else), then take residuals
# (actual - predicted) on the same held-out rows so both models are comparable.
lr_zri.fit(training_data[ZRI_feature_columns], y_train_full)
prediction_error_ZRI = y_test_full - lr_zri.predict(test_data[ZRI_feature_columns])

In [None]:
# Summary statistics of the residuals: full-feature model first, ZRI-only second.
prediction_error_full.describe(), prediction_error_ZRI.describe()

In [None]:
# Residuals (actual - predicted) of `lr` on final_test_data.
# NOTE(review): lr, feature_columns and final_test_data are only assigned inside
# the window-size loop further down, so on a fresh kernel this cell cannot run
# before that loop -- confirm the intended cell order.
prediction_error = final_test_data['Target_ZRI'] - lr.predict(final_test_data[feature_columns])

In [None]:
# Summary statistics of the residuals computed in the previous cell.
prediction_error.describe()

In [None]:
# Coefficients plus the selected regularisation strength, when there is one.
# alpha_ only exists on cross-validated estimators such as RidgeCV; the `lr`
# built in this notebook is a plain LinearRegression, which has no alpha_, so
# fall back to None instead of raising AttributeError.
lr.coef_, getattr(lr, 'alpha_', None)

In [None]:
# Sweep over window_size (1..12) with a fixed future_time of 24 months and,
# for each setting, fit an OLS model on the ZRI 'minus' columns only.
# Results are keyed by window_size:
#   num_obs      -> rows left after dropna()
#   scores       -> (R^2 on the random 10% validation split, R^2 on the fit split)
#   coefficients -> {feature column: fitted coefficient}
#   errors       -> residual on the Year >= test_year rows, divided by the
#                   most_recent_feature column (a relative error)
# NOTE(review): the loop rebinds ZRI_new and training_data, which earlier cells
# also use; re-running those cells afterwards picks up the last iteration's values.
window_sizes = list(range(1,13))
future_time = 24
time_unit = 'Month'
test_year = 2019
# Fixed seed so the validation split, and therefore `scores`, is reproducible.
split_seed = 0
# Loop-invariant: name of the column used to normalise the errors.
# NOTE(review): presumably the newest ZRI value available at prediction time --
# confirm against the column naming in ZRI_format.
most_recent_feature = f'ZRI_minus_{future_time}{time_unit[0]}'
num_obs = {}
errors = {}
scores = {}
coefficients = {}

for window_size in window_sizes:
    ZRI_new = ZRI_format(ZRI_MF, time_unit = time_unit,
                         window_size = window_size,
                         future_time = future_time)
    ZRI_new = ZRI_new.dropna()
    num_obs[window_size] = ZRI_new.shape[0]
    feature_columns = [x for x in ZRI_new.columns if 'minus' in x]
    training_data = ZRI_new[ZRI_new.Year < test_year]
    final_test_data = ZRI_new[ZRI_new.Year >= test_year]
    X_train, X_test, y_train, y_test = train_test_split(training_data[feature_columns],
                                                        training_data['Target_ZRI'],
                                                        test_size = .1,
                                                        random_state = split_seed)
    lr = LinearRegression()
    lr.fit(X_train,y_train)
    # Record the fitted coefficients per feature (previously left empty).
    coefficients[window_size] = dict(zip(feature_columns, lr.coef_))
    scores[window_size] = (lr.score(X_test,y_test), lr.score(X_train, y_train))
    errors[window_size] = (final_test_data['Target_ZRI'] -
                           lr.predict(final_test_data[feature_columns])).div(final_test_data[most_recent_feature])
    

In [None]:
# Coefficients of the most recently fitted `lr`; after the sweep above that is
# the model from the last loop iteration (window_size 12).
lr.coef_

In [None]:
# One box per window_size, drawn from the relative errors stored in `errors`.
plt.boxplot(errors.values())

In [None]:
# Same box plot on a log10 scale after shifting every error by 1250.
# np.log10 is applied to each Series as a whole (vectorised) instead of calling
# a Python lambda once per element; the plotted values are the same.
# NOTE(review): `errors` holds residuals divided by a ZRI level, i.e. ratios, so
# a shift of 1250 looks like a leftover from absolute-dollar errors -- confirm.
plt.boxplot([np.log10(err + 1250) for err in errors.values()])

In [None]:
# Summary statistics of the relative errors, one column per window_size.
pd.DataFrame(errors).describe()