In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
from data_setup import ZRI_format

In [None]:
# Load the pre-built inputs from their pickle files (same order as the names on the left).
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
ZRI_MF, ACS, crime, dominant_county, weather, cbp = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './pickles/ZRI_filtered.p',
        './acs_data/ACS.p',
        './pickles/crime.p',
        './pickles/dominant_county_zip.p',
        './pickles/weather.p',
        './pickles/cbp_zip.p',
    )
)

In [None]:
%%time
ZRI = ZRI_format(ZRI_MF, time_unit = 'Month', window_size = 9, future_time = 12)

In [None]:
# List the ZRI columns whose names contain 'minus'.
[col for col in ZRI.columns if 'minus' in col]

In [None]:
# Shift each ACS year forward by a fixed lag and store the result, as a string,
# in `year_avail` (the column later matched against ZRI's Predict_Year).
acs_lag = 2
ACS = ACS.assign(
    year_avail=ACS['year'].astype(int).add(acs_lag).astype(str)
)

In [None]:
# Shift each CBP year forward by a fixed lag and store the result, as a string,
# in `year_avail` (the column later matched against ZRI's Predict_Year).
cbp_lag = 2
cbp = cbp.assign(
    year_avail=cbp['Year'].astype(int).add(cbp_lag).astype(str)
)

In [None]:
# ACS: left-join the ACS columns onto ZRI, matching on ZipCode and
# Predict_Year (ZRI side) == year_avail (ACS side).
# NOTE: re-running this cell merges ACS a second time.
ZRI = ZRI.merge(
    ACS,
    how='left',
    left_on=['ZipCode', 'Predict_Year'],
    right_on=['ZipCode', 'year_avail'],
)

In [None]:
# Crime: look up each row's ZipCode in `dominant_county`, then left-join the
# county crime rate on that county and drop the crime table's join key.
ZRI = ZRI.assign(
    dominant_county=ZRI['ZipCode'].apply(lambda zip_code: dominant_county[zip_code])
)
ZRI = (
    ZRI
    .merge(
        crime[['crime_rate_per_100000', 'county_fips_code']],
        how='left',
        left_on='dominant_county',
        right_on='county_fips_code',
    )
    .drop(columns='county_fips_code')
)

In [None]:
# Weather: left-join the weather columns on ZipCode and Month.
# NOTE: will need to edit for quarterly/monthly data. Need to aggregate first
ZRI = ZRI.merge(
    weather,
    on=['ZipCode', 'Month'],
    how='left',
)

In [None]:
# CBP: left-join business count, employee count and payroll, matching
# ZipCode == ZIP and Predict_Year == year_avail.
ZRI = ZRI.merge(
    cbp[['ZIP', 'num_businesses', 'num_employees', 'total_payroll', 'year_avail']],
    how='left',
    left_on=['ZipCode', 'Predict_Year'],
    right_on=['ZIP', 'year_avail'],
)

In [None]:
# Feature Engineering: derived ratio / spread features.
#   gender_ratio = male_pop / female_pop
#   temp_diff    = High_Temp - Low_Temp
#   vacant_ratio = vacant_housing_units / occupied_housing_units
# A zero denominator makes .div() return +/-inf. Infinity is not removed by the
# later .dropna() and is rejected by scikit-learn estimators at fit time, so the
# two ratios map +/-inf to NaN here.
ZRI = ZRI.assign(
    gender_ratio=(
        ZRI['male_pop']
        .div(ZRI['female_pop'])
        .replace([np.inf, -np.inf], np.nan)
    ),
    temp_diff=ZRI['High_Temp'] - ZRI['Low_Temp'],
    vacant_ratio=(
        ZRI['vacant_housing_units']
        .div(ZRI['occupied_housing_units'])
        .replace([np.inf, -np.inf], np.nan)
    ),
)

In [None]:
# Non-ZRI columns used as model inputs in the final analysis (order matters:
# it fixes the column order of the design matrices built from this list).
feature_columns = [
    'ZipCode',
    'unemployed_pop',
    'white_pop',
    'total_pop',
    'worked_at_home',
    'poverty',
    'percent_income_spent_on_rent',
    'median_year_structure_built',
    'median_age',
    'married_households',
    'masters_degree',
    'income_per_capita',
    'housing_units',
    'employed_pop',
    'black_pop',
    'asian_pop',
    'amerindian_pop',
    'crime_rate_per_100000',
    'num_businesses',
    'num_employees',
    'total_payroll',
    'Rain_Fall',
    'gender_ratio',
    'Low_Temp',
    'temp_diff',
    'vacant_ratio',
]

In [None]:
# Convert raw population counts into shares of total_pop.
# NOTE: this cell is not idempotent -- running it twice divides the columns by
# total_pop twice. Re-run the notebook from the merges if it must be repeated.

# Columns to divide by total population
pop_columns = ['unemployed_pop', 'white_pop', 'masters_degree',
               'employed_pop', 'black_pop',
               'asian_pop', 'amerindian_pop', 'poverty', 'worked_at_home']

# Division. A total_pop of 0 yields +/-inf, which the later .dropna() does not
# remove and which scikit-learn rejects at fit time, so map it to NaN.
ZRI.loc[:, pop_columns] = (
    ZRI[pop_columns]
    .div(ZRI['total_pop'], axis=0)
    .replace([np.inf, -np.inf], np.nan)
)


In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from collections import defaultdict

In [None]:
# Full feature list: every ZRI column whose name contains 'minus', followed by
# the hand-picked feature_columns.
full_feature_columns = [
    *(col for col in ZRI.columns if 'minus' in col),
    *feature_columns,
]

In [None]:
# The full feature list with every 'minus' column filtered out.
without_zri_columns = list(
    filter(lambda name: 'minus' not in name, full_feature_columns)
)

In [None]:
# Train/test split by year: rows with Year >= test_year are held out for testing,
# earlier rows are used for training. Rows with any missing value among the
# features, the target or Year are dropped first.
test_year = 2019
data_4_model = ZRI[[*full_feature_columns, 'Target_ZRI', 'Year']].dropna()
training_data = data_4_model.loc[data_4_model['Year'] < test_year]
test_data = data_4_model.loc[data_4_model['Year'] >= test_year]

In [None]:
# Training inputs (all features) and training target.
X_train_full = training_data[full_feature_columns]
y_train_full = training_data['Target_ZRI']

In [None]:
# Training inputs without the 'minus' columns, and the same training target.
X_train_no_zri = training_data[without_zri_columns]
y_train_no_zri = training_data['Target_ZRI']

In [None]:
# Held-out inputs (all features) and held-out target.
X_test_full = test_data[full_feature_columns]
y_test_full = test_data['Target_ZRI']

In [None]:
# Held-out inputs without the 'minus' columns, and the same held-out target.
X_test_no_zri = test_data[without_zri_columns]
y_test_no_zri = test_data['Target_ZRI']

In [None]:
# --- Lasso -----------------------------------------------------------------
# alpha is chosen by grid search; the pipeline standardises the inputs first and
# then hands them to the grid search (pipeline step name: 'gridsearchcv').
lasso_params = {'alpha': [0.1, 1, 2, 3, 4, 5, 10, 20]}
lasso_grid = GridSearchCV(Lasso(), param_grid=lasso_params)
lasso_model = make_pipeline(StandardScaler(), lasso_grid)

# --- Random forest -----------------------------------------------------------
# Only max_depth is actually searched; the other entries are single values.
# max_features=1.0 (consider every feature at each split) replaces 'auto', which
# meant the same thing for regressors but was deprecated in scikit-learn 1.1 and
# removed in 1.3, where it raises an error.
rf_params = {'max_depth': [10, None],
             'max_features': [1.0],
             'min_samples_leaf': [4],
             'min_samples_split': [10],
             'n_estimators': [200]}

# A fixed random_state makes the fitted forests, their scores and their feature
# importances reproducible from run to run.
rf_model = GridSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42),
                        param_grid=rf_params)
rf_model_no_zri = GridSearchCV(RandomForestRegressor(n_jobs=-1, random_state=42),
                               param_grid=rf_params)

In [None]:
%%time
rf_model.fit(X_train_full,y_train_full)

In [None]:
# Grid-search and fit the random forest on the features without 'minus' columns.
rf_model_no_zri.fit(X=X_train_no_zri, y=y_train_no_zri)

In [None]:
# (test score, train score) for the random forest without 'minus' columns.
(
    rf_model_no_zri.score(X_test_no_zri, y_test_no_zri),
    rf_model_no_zri.score(X_train_no_zri, y_train_no_zri),
)

In [None]:
# Feature importances of the best no-ZRI forest, keyed by column name and sorted
# ascending, then drawn as a bar chart.
rf_no_zri_coef_importance = pd.Series(
    {
        feature: importance
        for feature, importance in zip(
            X_train_no_zri.columns,
            rf_model_no_zri.best_estimator_.feature_importances_,
        )
    }
).sort_values()
rf_no_zri_coef_importance.plot.bar(title='1 Year RF Feature Importance No ZRI')

In [None]:
# (test score, train score) for the random forest on the full feature set.
(
    rf_model.score(X_test_full, y_test_full),
    rf_model.score(X_train_full, y_train_full),
)

In [None]:
# Feature importances of the best full-feature forest, keyed by column name and
# sorted ascending.
rf_coef_importance = pd.Series(
    {
        feature: importance
        for feature, importance in zip(
            X_train_full.columns,
            rf_model.best_estimator_.feature_importances_,
        )
    }
).sort_values()

In [None]:
# Same importances restricted to the non-'minus' features (ZipCode excluded),
# re-sorted ascending and drawn as a bar chart.
rf_coef_importance.loc[
    [col for col in X_train_full.columns if 'minus' not in col and col != 'ZipCode']
].sort_values().plot.bar(title='1 Year RF Feature Importance')

In [None]:
# Bar chart of every feature importance from the full-feature forest, ascending.
rf_coef_importance.sort_values().plot.bar(title='1 Year RF Feature Importance')

In [None]:
# Standardise the full feature set and grid-search the Lasso alpha on it.
lasso_model.fit(X=X_train_full, y=y_train_full)

In [None]:
# Coefficients of the best Lasso found by the grid search, keyed by column name
# and sorted ascending. The inputs were standardised by the pipeline, so the
# coefficients are on the standardised scale.
coefficients = pd.Series(
    {
        feature: weight
        for feature, weight in zip(
            X_train_full.columns,
            lasso_model.named_steps['gridsearchcv'].best_estimator_.coef_,
        )
    }
).sort_values()

In [None]:
# Bar chart of the fitted Lasso coefficients (already sorted ascending).
# Title corrected from "3 Year" to "1 Year": the target is built with
# future_time = 12 and time_unit = 'Month', and the random-forest plots of the
# same target are titled "1 Year".
coefficients.plot(kind='bar', title='Lasso 1 Year Forecast')

In [None]:
# Lasso coefficients restricted to the non-'minus' features (ZipCode excluded),
# re-sorted ascending and drawn as a bar chart.
coefficients.loc[
    [col for col in X_train_full.columns if 'minus' not in col and col != 'ZipCode']
].sort_values().plot.bar()

In [None]:
# (test score, train score) for the Lasso pipeline on the full feature set.
(
    lasso_model.score(X_test_full, y_test_full),
    lasso_model.score(X_train_full, y_train_full),
)

In [None]:
# Reduced feature set: three hand-named non-ZRI features (kept in the order they
# appear in without_zri_columns) followed by every 'minus' column of ZRI.
# NOTE(review): presumably these three were picked from the Lasso results -- confirm.
lasso_columns = (
    [name for name in without_zri_columns
     if name in ('vacant_ratio', 'white_pop', 'asian_pop')]
    + [name for name in ZRI.columns if 'minus' in name]
)

In [None]:
# Ridge pipeline: standardise the inputs, then RidgeCV with its default settings.
# Bound to ridge_model only. The previous chained assignment
# (ridge_model = lasso_model = ...) also rebound lasso_model, replacing the
# fitted Lasso pipeline with this unfitted Ridge one, so any later use or re-run
# of a lasso_model cell would silently operate on the wrong model.
ridge_model = make_pipeline(StandardScaler(), RidgeCV())

In [None]:
# Fit the ridge pipeline on the reduced (lasso_columns) feature set.
ridge_model.fit(X=X_train_full[lasso_columns], y=y_train_full)

In [None]:
# (test score, train score) for the ridge pipeline on the reduced feature set.
(
    ridge_model.score(X_test_full[lasso_columns], y_test_full),
    ridge_model.score(X_train_full[lasso_columns], y_train_full),
)