In [48]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, Normalizer, MinMaxScaler

import sklearn

from xgboost import XGBRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor


from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.express as px

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.feature_selection import RFECV


from datetime import datetime

pd.set_option('display.max_rows', 5000)

<H1>External feature integration

In [2]:
df=pd.read_csv('https://raw.githubusercontent.com/Kneck12/zillow-price-index-pred/main/data/zri_acs_bikeshare_merged.csv')

In [3]:
df['year-month']=df['datetime'].apply(lambda x: x[0:7])

In [4]:
# df2=df.groupby(['State','zip']).agg('count')
# df3=df2.groupby('State').agg('count').sort_values('zri')
# droplist=list(df3[df3['City']<10].index)
# df=df[~(df['State'].isin(droplist))]

In [5]:
#df=df[df['State']=='CA']

<H1>Standardization

In [6]:
#df=df.drop('zip', axis=1)

for zipcode in df['zip'].unique():
    globals()[f"minmax_{zipcode}"]=MinMaxScaler(copy=False)
    df_filtered=df[df['zip']==zipcode]
    df.loc[df['zip']==zipcode,'zri']=globals()[f"minmax_{zipcode}"].fit_transform(df_filtered[['zri']])

In [7]:
#columns to scale:
scale_columns = ['percent_white',
 'percent_black',
 'percent_asian',
 'percent_hispanic',
 'percent_native_am',
 'percent_other_race',
 'percent_0_17',
 'percent_18_39',
 'percent_40_64',
 'percent_65+',
 'percent_rental_units_vacant',
 'percent_rental_units_occupied',
 'percent_graduate_deg',
 'percent_bachelors',
 'percent_associates',
 'percent_highschool',
 'percent_less_highschool',
 'percent_commute_public_transport',
 'percent_commute_less_30',
 'percent_buildings_less_10_units',
 'percent_buildings_10_19_units',
 'percent_buildings_20_49_units',
 'percent_buildings_50+_units',
 'percent_commute_30_to_59',
 'percent_commute_60_to_89',
 'percent_commute_90_more',
 'percent_new_city',
 'percent_new_unit',
 'percent_units_owner_occupied',
 'median_building_age',
 'income_per_capita',
 'poverty_rate',
 'total_pop',
 'percent_workforce_unemployed',
 'percent_work_from_home',
 'median_age',
 'percent_female',
 'gini_index',
 'percent_not_us_citizen',
 'bs_total_stations',
 'bs_total_systems',
 'has_bike_sharing']

for zipcode in df['zip'].unique():
    globals()[f"minmax_features_{zipcode}"]=MinMaxScaler(copy=False)
    df_filtered=df[df['zip']==zipcode]
    df.loc[df['zip']==zipcode,scale_columns]=globals()[f"minmax_features_{zipcode}"].fit_transform(df_filtered[scale_columns])

<H1>Lag development

In [8]:
def vif_calc(df_VIF):
    viflist=[variance_inflation_factor(df_VIF.values, i) for i in range(len(df_VIF.columns))]
    list_of_tuples = list(zip(list(df_VIF.columns), viflist))
    df=pd.DataFrame(list_of_tuples,columns=['Feature','VIF'])
    df=df.sort_values('VIF',ascending=False)
    return df

In [9]:
df2=df.drop(['percent_white', 'percent_black',
       'percent_asian', 'percent_hispanic', 'percent_native_am',
       'percent_other_race','percent_0_17', 'percent_18_39', 'percent_40_64',
       'percent_65+','has_bike_sharing','percent_rental_units_occupied','percent_rental_units_vacant','bs_total_stations'],axis=1)

df2=df2.drop(['zip','year','month','datetime','City','State','Metro','CountyName','year-month','zri'],axis=1)

#vif_calc(df2)
df2.columns

Index(['percent_graduate_deg', 'percent_bachelors', 'percent_associates',
       'percent_highschool', 'percent_less_highschool',
       'percent_commute_public_transport', 'percent_commute_less_30',
       'percent_buildings_less_10_units', 'percent_buildings_10_19_units',
       'percent_buildings_20_49_units', 'percent_buildings_50+_units',
       'percent_commute_30_to_59', 'percent_commute_60_to_89',
       'percent_commute_90_more', 'percent_new_city', 'percent_new_unit',
       'percent_units_owner_occupied', 'median_building_age',
       'income_per_capita', 'poverty_rate', 'total_pop',
       'percent_workforce_unemployed', 'percent_work_from_home', 'median_age',
       'percent_female', 'gini_index', 'percent_not_us_citizen',
       'bs_total_systems'],
      dtype='object')

In [10]:
def laggenerator(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i]
    
    df.loc[:,f'{colname}_lag{i}']=df.loc[:,f'{colname}'].shift(i)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_lag{i}']=0
    return df

In [11]:
def laggenerator_diff(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+1]
    
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+1}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+1)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+1}']=0
    return df

In [12]:
def laggenerator_diff6(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+1]
    
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+6}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+6)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+6}']=0
    return df

In [13]:
def laggenerator_diff12(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+11]
    
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+11}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+11)
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+11}']=0
    return df

In [14]:
def laggenerator_MA(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+4]
    
    df.loc[:,f'{colname}_MA{i}']=df.loc[:,f'{colname}'].shift(i)-(df.loc[:,f'{colname}'].shift(i)+df.loc[:,f'{colname}'].shift(i+1)+df.loc[:,f'{colname}'].shift(i+2)+df.loc[:,f'{colname}'].shift(i+3)+df.loc[:,f'{colname}'].shift(i+4))/5
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_MA{i}']=0
    return df

In [15]:
def laggenerator_mean(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_mean{i}']=0
    for x in range(1,i):
        df.loc[:,f'{colname}_mean{i}']+=df.loc[:,f'{colname}'].shift(x)
    df.loc[:,f'{colname}_mean{i}']=df.loc[:,f'{colname}_mean{i}']/i
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_mean{i}']=0
    return df

In [16]:
def laggenerator_ewm(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_ewm{i}']=0
    weightsum=0
    for y in range(1,i):
        weight=i-y
        weightsum+=weight
    for x in range(1,i):
        weight=i-x
        df.loc[:,f'{colname}_ewm{i}']+=df.loc[:,f'{colname}'].shift(x)*weight/weightsum
    df.loc[:,f'{colname}_ewm{i}']=df.loc[:,f'{colname}_ewm{i}']
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_ewm{i}']=0
    return df

In [17]:
def laggenerator_min(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_min{i}']=999999
    for x in range(1,i):
        df.loc[:,f'{colname}_min{i}']=np.minimum(df.loc[:,f'{colname}'].shift(x),df.loc[:,f'{colname}_min{i}'])
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_min{i}']=0
    return df

In [18]:
def laggenerator_max(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i-1]
    df.loc[:,f'{colname}_max{i}']=0
    for x in range(1,i):
        df.loc[:,f'{colname}_max{i}']=np.maximum(df.loc[:,f'{colname}'].shift(x),df.loc[:,f'{colname}_max{i}'])
    
    df.loc[df['year-month'].isin(timelist),f'{colname}_max{i}']=0
    return df

In [19]:
def lag_generator_all(df, var):
    for i in range(1,6):   #14
        df=laggenerator(i, var,df)

    df=laggenerator_diff12(1, var,df)
    for i in range(1,12):  #11
        df=laggenerator_diff(i, var,df)

    # for i in range(1,2):
    #     df=laggenerator_diff6(i, var,df)

    #for i in range(1,11):
     #   df=laggenerator_MA(i, var,df)

#     for i in range(1,14):
#         df=laggenerator_mean(i, 'ZRI_scaled',df)

#     for i in range(1,2):
#         df=laggenerator_ewm(i, var,df)

#     for i in range(1,14):
#         df=laggenerator_min(i, var,df)

#     for i in range(1,14):
#         df=laggenerator_max(i, var,df)
    
    return df

def lag_generator_12M(df, var):
    df=laggenerator(12, var,df)
    return df

def lag_generator_dff_12M(df, var):
    df=laggenerator_diff12(1, var,df)
    return df

In [20]:
df.columns[5:43]

Index(['zri', 'year', 'month', 'datetime', 'percent_white', 'percent_black',
       'percent_asian', 'percent_hispanic', 'percent_native_am',
       'percent_other_race', 'percent_0_17', 'percent_18_39', 'percent_40_64',
       'percent_65+', 'percent_rental_units_vacant',
       'percent_rental_units_occupied', 'percent_graduate_deg',
       'percent_bachelors', 'percent_associates', 'percent_highschool',
       'percent_less_highschool', 'percent_commute_public_transport',
       'percent_commute_less_30', 'percent_buildings_less_10_units',
       'percent_buildings_10_19_units', 'percent_buildings_20_49_units',
       'percent_buildings_50+_units', 'percent_commute_30_to_59',
       'percent_commute_60_to_89', 'percent_commute_90_more',
       'percent_new_city', 'percent_new_unit', 'percent_units_owner_occupied',
       'median_building_age', 'income_per_capita', 'poverty_rate', 'total_pop',
       'percent_workforce_unemployed'],
      dtype='object')

In [21]:
# for var in df.columns[5:43]:
#     df=lag_generator_all(df, var)

df=lag_generator_all(df, 'zri')  


# for feat in [  'bs_total_systems','median_building_age',
#              'percent_graduate_deg',
#        'percent_bachelors', 'percent_associates', 'percent_highschool',
#        'percent_less_highschool', 'percent_commute_public_transport',
#        'percent_commute_less_30', 'percent_commute_30_to_59',
#        'percent_commute_60_to_89', 'percent_commute_90_more',
#        'percent_new_city', 'percent_new_unit', 'percent_units_owner_occupied',
#         'income_per_capita', 'poverty_rate', 'total_pop',
#        'percent_workforce_unemployed', 'percent_work_from_home', 'median_age',
#        'percent_female', 'gini_index', 'percent_not_us_citizen',   
#        'percent_buildings_less_10_units','percent_buildings_10_19_units','percent_buildings_20_49_units','percent_buildings_50+_units'
#        ]:
#     df=lag_generator_12M(df,feat)
#     df=lag_generator_dff_12M(df,feat)




# df=df.drop(['percent_white', 'percent_black',
#        'percent_asian', 'percent_hispanic', 'percent_native_am','bs_total_stations',
#        'percent_other_race','percent_0_17', 'percent_18_39', 'percent_40_64',
#        'percent_65+','has_bike_sharing','percent_rental_units_vacant',
#        'percent_rental_units_occupied'],axis=1)


df=df.drop(['percent_white', 'percent_black',
       'percent_asian', 'percent_hispanic', 'percent_native_am',
       'percent_other_race', 'percent_0_17', 'percent_18_39', 'percent_40_64',
       'percent_65+', 'percent_rental_units_vacant',
       'percent_rental_units_occupied', 'percent_graduate_deg',
       'percent_bachelors', 'percent_associates', 'percent_highschool',
       'percent_less_highschool', 'percent_commute_public_transport',
       'percent_commute_less_30', 'percent_commute_30_to_59',
        'median_building_age','bs_total_systems',
       'percent_commute_60_to_89', 'percent_commute_90_more',
       'percent_new_city', 'percent_new_unit', 'percent_units_owner_occupied',
        'income_per_capita', 'poverty_rate', 'total_pop',
       'percent_workforce_unemployed', 'percent_work_from_home', 'median_age',
       'percent_female', 'gini_index', 'percent_not_us_citizen', 'bs_total_stations',  'has_bike_sharing',
       'percent_buildings_less_10_units','percent_buildings_10_19_units','percent_buildings_20_49_units','percent_buildings_50+_units'],axis=1)

In [22]:
df.columns

Index(['zip', 'City', 'State', 'Metro', 'CountyName', 'zri', 'year', 'month',
       'datetime', 'year-month', 'zri_lag1', 'zri_lag2', 'zri_lag3',
       'zri_lag4', 'zri_lag5', 'zri_diff_lag1_lag12', 'zri_diff_lag1_lag2',
       'zri_diff_lag2_lag3', 'zri_diff_lag3_lag4', 'zri_diff_lag4_lag5',
       'zri_diff_lag5_lag6', 'zri_diff_lag6_lag7', 'zri_diff_lag7_lag8',
       'zri_diff_lag8_lag9', 'zri_diff_lag9_lag10', 'zri_diff_lag10_lag11',
       'zri_diff_lag11_lag12'],
      dtype='object')

In [23]:
train=df[(df['year-month']<='2018-12') & (df['year-month']>'2015-01')]

train=train.drop('year-month',axis=1)

Y_train=train['zri']
X_train=train.drop('zri',axis=1)


In [24]:
labelencoder = LabelEncoder()
labelencoder2 = LabelEncoder()
labelencoder3 = LabelEncoder()
labelencoder4 = LabelEncoder()
enc=OneHotEncoder()
enc2=OneHotEncoder()

# enc2_df = pd.DataFrame(enc2.fit_transform(X_train[['Metro']]).toarray())
# X_train = X_train.join(enc2_df, rsuffix='1')
# X_train=X_train.fillna(0)

X_train=X_train.drop(['zip','year','month','datetime','City','State','Metro','CountyName'],axis=1)
cols=X_train.columns

#X_train = X_train.join(norm_df, rsuffix='1')
#X_train=X_train.fillna(0)



In [25]:
# fold=TimeSeriesSplit(n_splits=5)

# linmod = XGBRegressor(n_estimators=1000, objective ='reg:squarederror')
# #linmod=RandomForestRegressor(n_estimators=500)


# params=({
#      'max_depth':[5], 
#         'eta':[0.03],
#     'subsample':[0.9],
#     'colsample_bytree':[0.8]
#  })

# grid=GridSearchCV(linmod, param_grid=params, cv=fold)
# grid.fit(X_train, Y_train)

# grid.best_params_

In [26]:
# fold=TimeSeriesSplit(n_splits=5)

# kr_model=XGBRegressor(n_estimators=1000, objective ='reg:squarederror', max_depth=5, eta=0.03, subsample=0.9, colsample_bytree=0.8)

# rfe_grid=RFECV(estimator=kr_model, cv=fold, min_features_to_select=1, scoring='r2')
# rfe_grid.fit(X_train, Y_train)

In [27]:
#kr_model=KernelRidge(kernel='chi2',gamma=0.0001)   #gamma=0.00000001

#{'degree': 3, 'gamma': 1.0000000230258512, 'kernel': 'linear'}
#kr_model=KernelRidge(kernel='linear',gamma=1, degree=3)   #gamma=0.00000001

#kr_model=KernelRidge(kernel='polynomial',gamma=1e-06,degree=3)

#kr_model=SVR(kernel='poly',gamma=0.000001,degree=3)
#kr_model=Lasso(alpha=100)   #alpha=10

kr_model=XGBRegressor(n_estimators=1000, objective ='reg:squarederror', max_depth=5, eta=0.03, subsample=0.9, colsample_bytree=0.8)

%time kr_model.fit(X_train, Y_train)

CPU times: user 1min 26s, sys: 154 ms, total: 1min 27s
Wall time: 1min 26s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.03, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=0.9, verbosity=1)

In [28]:
#RF: 2 features: 15 seconds

In [29]:
kr_model.score(X_train, Y_train)

0.9889135479277682

In [30]:
feature_importances=pd.DataFrame({'Feature':cols,'Importance':kr_model.feature_importances_}).sort_values('Importance', ascending=False)
feature_importances

Unnamed: 0,Feature,Importance
0,zri_lag1,0.41704
2,zri_lag3,0.274237
1,zri_lag2,0.250054
6,zri_diff_lag1_lag2,0.04665
7,zri_diff_lag2_lag3,0.001599
4,zri_lag5,0.001014
5,zri_diff_lag1_lag12,0.000987
3,zri_lag4,0.000964
11,zri_diff_lag6_lag7,0.000934
16,zri_diff_lag11_lag12,0.000902


In [31]:
#pd.DataFrame({'Feature':X_train.columns,'Coefs':kr_model.coef_}).sort_values('Coefs')

In [32]:
Y_test=df[(df['year-month']>'2018-12') & (df['year-month']<='2019-12')][['zri']]
predictor_table=df.copy()
X_test=predictor_table[predictor_table['year-month']>'2018-12']

#loop through all month in 2019
for month in list(X_test['year-month'].drop_duplicates()):
    
    #run prediction for one month
    X_test=X_test[X_test['year-month']==month]
    
    # enc2_df2 = pd.DataFrame(enc2.transform(X_test[['Metro']]).toarray())
    # X_test = X_test.join(enc2_df2, rsuffix='1')
    # X_test=X_test.fillna(0)    
    
    X_test=X_test.drop(['zip','year','month','datetime','City','State','Metro','CountyName','year-month','zri'],axis=1)
    #X_test = pd.DataFrame(normal.transform(X_test))
    
    val=kr_model.predict(X_test)

    #write current month prediction into predictor_table

    predictor_table.loc[predictor_table['year-month']==month,'zri']=val
        
    predictor_table=lag_generator_all(predictor_table, 'zri')
#     for var in predictor_table.columns[5:43]:
#         predictor_table=lag_generator_all(predictor_table, var)
        
    X_test=predictor_table[predictor_table['year-month']>'2018-12']



In [33]:
X_test

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,datetime,year-month,zri_lag1,zri_lag2,zri_lag3,zri_lag4,zri_lag5,zri_diff_lag1_lag12,zri_diff_lag1_lag2,zri_diff_lag2_lag3,zri_diff_lag3_lag4,zri_diff_lag4_lag5,zri_diff_lag5_lag6,zri_diff_lag6_lag7,zri_diff_lag7_lag8,zri_diff_lag8_lag9,zri_diff_lag9_lag10,zri_diff_lag10_lag11,zri_diff_lag11_lag12
60,1013,Chicopee,MA,Springfield,Hampden County,0.723925,2019,1,2019-01-01,2019-01,0.732794,0.757085,0.793522,0.850202,0.890688,-0.129555,-0.024291,-0.036437,-0.056680,-0.040486,-0.016194,-0.016194,0.020243,0.008097,0.012146,0.008097,0.012146
61,1013,Chicopee,MA,Springfield,Hampden County,0.728808,2019,2,2019-02-01,2019-02,0.723925,0.732794,0.757085,0.793522,0.850202,-0.150569,-0.008869,-0.024291,-0.036437,-0.056680,-0.040486,-0.016194,-0.016194,0.020243,0.008097,0.012146,0.008097
62,1013,Chicopee,MA,Springfield,Hampden County,0.730964,2019,3,2019-03-01,2019-03,0.728808,0.723925,0.732794,0.757085,0.793522,-0.153783,0.004883,-0.008869,-0.024291,-0.036437,-0.056680,-0.040486,-0.016194,-0.016194,0.020243,0.008097,0.012146
63,1013,Chicopee,MA,Springfield,Hampden County,0.731465,2019,4,2019-04-01,2019-04,0.730964,0.728808,0.723925,0.732794,0.757085,-0.163773,0.002156,0.004883,-0.008869,-0.024291,-0.036437,-0.056680,-0.040486,-0.016194,-0.016194,0.020243,0.008097
64,1013,Chicopee,MA,Springfield,Hampden County,0.729964,2019,5,2019-05-01,2019-05,0.731465,0.730964,0.728808,0.723925,0.732794,-0.171369,0.000501,0.002156,0.004883,-0.008869,-0.024291,-0.036437,-0.056680,-0.040486,-0.016194,-0.016194,0.020243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93379,99207,Spokane,WA,Spokane-Spokane Valley,Spokane County,0.772925,2019,8,2019-08-01,2019-08,0.770612,0.767563,0.766293,0.768857,0.786610,0.148990,0.003049,0.001270,-0.002564,-0.017753,-0.010581,0.009967,0.020640,0.044226,0.049140,0.036855,0.014742
93380,99207,Spokane,WA,Spokane-Spokane Valley,Spokane County,0.775015,2019,9,2019-09-01,2019-09,0.772925,0.770612,0.767563,0.766293,0.768857,0.136562,0.002314,0.003049,0.001270,-0.002564,-0.017753,-0.010581,0.009967,0.020640,0.044226,0.049140,0.036855
93381,99207,Spokane,WA,Spokane-Spokane Valley,Spokane County,0.785670,2019,10,2019-10-01,2019-10,0.775015,0.772925,0.770612,0.767563,0.766293,0.101797,0.002090,0.002314,0.003049,0.001270,-0.002564,-0.017753,-0.010581,0.009967,0.020640,0.044226,0.049140
93382,99207,Spokane,WA,Spokane-Spokane Valley,Spokane County,0.796334,2019,11,2019-11-01,2019-11,0.785670,0.775015,0.772925,0.770612,0.767563,0.063311,0.010655,0.002090,0.002314,0.003049,0.001270,-0.002564,-0.017753,-0.010581,0.009967,0.020640,0.044226


In [34]:
#Y_test['ZRI_Yest_scaled']=Y_test['ZRI_scaled']
rstable=X_test[['zip','year','month','datetime','City','State','Metro','CountyName','year-month','zri']].merge(Y_test[['zri']], left_index=True, right_index=True)


In [35]:
rstable

Unnamed: 0,zip,year,month,datetime,City,State,Metro,CountyName,year-month,zri_x,zri_y
60,1013,2019,1,2019-01-01,Chicopee,MA,Springfield,Hampden County,2019-01,0.723925,0.692308
61,1013,2019,2,2019-02-01,Chicopee,MA,Springfield,Hampden County,2019-02,0.728808,0.688259
62,1013,2019,3,2019-03-01,Chicopee,MA,Springfield,Hampden County,2019-03,0.730964,0.720648
63,1013,2019,4,2019-04-01,Chicopee,MA,Springfield,Hampden County,2019-04,0.731465,0.753036
64,1013,2019,5,2019-05-01,Chicopee,MA,Springfield,Hampden County,2019-05,0.729964,0.761134
...,...,...,...,...,...,...,...,...,...,...,...
93379,99207,2019,8,2019-08-01,Spokane,WA,Spokane-Spokane Valley,Spokane County,2019-08,0.772925,0.950860
93380,99207,2019,9,2019-09-01,Spokane,WA,Spokane-Spokane Valley,Spokane County,2019-09,0.775015,0.938575
93381,99207,2019,10,2019-10-01,Spokane,WA,Spokane-Spokane Valley,Spokane County,2019-10,0.785670,0.896806
93382,99207,2019,11,2019-11-01,Spokane,WA,Spokane-Spokane Valley,Spokane County,2019-11,0.796334,0.948403


In [36]:
for zipcode in rstable['zip'].unique():
    rstable_filtered=rstable[rstable['zip']==zipcode]
    rstable.loc[rstable['zip']==zipcode,'zri_x']=globals()[f"minmax_{zipcode}"].inverse_transform(rstable_filtered[['zri_x']])
    rstable.loc[rstable['zip']==zipcode,'zri_y']=globals()[f"minmax_{zipcode}"].inverse_transform(rstable_filtered[['zri_y']])   
    

In [37]:
#rstable=rstable[rstable['State']=='VA']
RSME=(sum((rstable['zri_x']-rstable['zri_y'])**2)/len(rstable))**0.5
RSME

49.38165444228701

In [38]:
#48.4417412956101 (autoregressive XGBoost)
#48.309152478932894 (autoregressive XGBoost with Metro encoding)
#51.17872757378319 (with racial features)
#52.1259987113339 (with bikeshare features)
#51.598222768311516 (all features)
#82.73230167286547 (non-auto, excl. race)
#81.93278640776168(non-auto, excl. race & age buckets)
#85.51475756187989 (every VIF below 8)

In [39]:
rstable['residual']=rstable['zri_x']-rstable['zri_y']

In [40]:
rstable['residual_squared']=rstable['residual'].map(lambda x: x**2)

In [41]:
rstable2=rstable.groupby('State').agg('mean').sort_values('residual_squared')
rstable2=rstable2.reset_index()

In [42]:
fig = px.choropleth(rstable2,
                    locations='State',
                    color='residual_squared',
                    color_continuous_scale='Viridis',
                    hover_name='State',
                    locationmode='USA-states',
                    scope='usa')

fig.show()

In [65]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

filename='XGB_Auto_All'
feature_importances.to_csv(f'/content/gdrive/MyDrive/zillow-price-index-pred/Moritz/models/features_{filename}.csv')
rstable.to_csv(f'/content/gdrive/MyDrive/zillow-price-index-pred/Moritz/models/residuals_{filename}.csv')


Mounted at /content/gdrive


In [51]:
#drive.flush_and_unmount()  

In [45]:
zips=rstable['zip'].unique()[10:40]

In [46]:
table=rstable[rstable['zip'].isin(zips)]
px.line(y=table['zri_y'],x=table['year-month'], color=table['zip'])
#fig.add_scatter(rstable[rstable['zip'].isin(zips)], y='zri_y',x='year-month', color='zip', mode='lines')
