In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Lasso
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [4]:
df_orig = pd.read_csv('../../../full_dataset_unscaled.csv')
df_orig = df_orig.rename(columns={"datetime":"year-month"})
df_orig['net_job_rate']=df_orig['job_creation_rate']-df_orig['job_destruction_rate']
df_orig['apartment_for_rent_searches'] = df_orig[['apartment for rent','studio for rent','1 bedroom for rent','3 bedroom for rent']].sum(axis=1)
df_orig['multifamily_for_rent_searches'] = df_orig[['townhomes for rent','townhouse for rent','house for rent','duplex apartments for rent','condos for rent']].sum(axis=1)
df_orig['gun_searches'] = df_orig[['gun range', 'gun control', 'gun violence']].sum(axis=1)
df_orig['zri'] = np.log(df_orig['zri'])

In [5]:
gen_cols = ['zip',
 'City',
 'State',
 'Metro',
 'CountyName',
 'zri',
 'year',
 'month',
 'year-month']


In [6]:
for zipcode in df_orig['zip'].unique():
    globals()[f"scaler_{zipcode}"]=StandardScaler(copy=False)
    df_filtered=df_orig[df_orig['zip']==zipcode]
    df_filtered_train = df_filtered[df_filtered['year']<2019]
    globals()[f"scaler_{zipcode}"].fit(df_filtered_train[['zri']])
    df_orig.loc[df_orig['zip']==zipcode,'zri']=globals()[f"scaler_{zipcode}"].transform(df_filtered[['zri']])

In [7]:
df_zri = df_orig[gen_cols]

In [8]:
def laggenerator(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i]
    df.loc[:,f'{colname}_lag{i}']=df.loc[:,f'{colname}'].shift(i)
    df.loc[df['year-month'].isin(timelist),f'{colname}_lag{i}']=0
    return df

def laggenerator_diff(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+1]
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+1}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+1)
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+1}']=0
    return df

def laggenerator_diff12(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+11]
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+11}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+11)
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+11}']=0
    return df

#ZRI 
def lag_gen(df):
    for i in range(1,12):
        df=laggenerator(i, 'zri', df)
    df=laggenerator_diff12(1, 'zri', df)
    return df

In [9]:
#ZRI
df_zri = lag_gen(df_zri)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [10]:
df = df_zri[df_zri['year-month']>='2015-04-01']
train = df[(df['year'] < 2019) & (df['year']>=2015)]
test = df[df['year']==2019]

train = train.sort_values(by='year-month',ascending=True)
test = test.sort_values(by='year-month',ascending=True)

save_train = train.copy()
save_test = test.copy()



In [11]:
train = train.drop(['zip','City','State','Metro','CountyName','year','month','year-month'],axis=1)
train_y = train['zri']
train_X = train.drop(['zri'],axis=1)

test_y = test['zri']
test_X = test

In [12]:
#Lasso Grid Search
fold=TimeSeriesSplit(n_splits=5)
lasso = Lasso()
grid = dict()
grid['alpha'] = [0.01,0.05,0.1,0.5,1,10]
lasso_grid = GridSearchCV(lasso, grid, cv=fold, n_jobs=-1).fit(train_X,train_y)
lasso_best = lasso_grid.best_estimator_

In [13]:
lasso_best

Lasso(alpha=0.01)

In [14]:
lasso_grid.best_score_

0.9148814537766908

In [15]:
#lasso_grid.cv_results_

In [16]:
#lasso_best = Lasso(alpha=0.01)

In [17]:
lasso_best.fit(train_X, train_y)

Lasso(alpha=0.01)

In [18]:
predictor_table=df.copy()
X_test = test_X.copy()

#loop through all month in 2019
for month in list(X_test['year-month'].drop_duplicates()):
    
    #run prediction for one month
    X_test=X_test[X_test['year-month']==month]
    X_test=X_test.drop(['zip','year','month','City','State','Metro','CountyName','year-month','zri'],axis=1)
    val=lasso_best.predict(X_test)

    #write current month prediction into predictor_table

    predictor_table.loc[predictor_table['year-month']==month,'zri']=val
        
    predictor_table=lag_gen(predictor_table)

    X_test=predictor_table[predictor_table['year']==2019]

scaled_predictions_y = X_test['zri']
temp = pd.concat([save_test['zip'],test_y],axis=1)
temp.reset_index(drop=True, inplace=True)
scaled_predictions_y.reset_index(drop=True, inplace=True)
rstable = pd.concat([temp,scaled_predictions_y],axis=1)
rstable.columns = ['zip','zri_test','zri_predicted']

for zipcode in rstable['zip'].unique():
    rstable_filtered=rstable[rstable['zip']==zipcode]
    rstable.loc[rstable['zip']==zipcode,'zri_test']=globals()[f"scaler_{zipcode}"].inverse_transform(rstable_filtered[['zri_test']])
    rstable.loc[rstable['zip']==zipcode,'zri_predicted']=globals()[f"scaler_{zipcode}"].inverse_transform(rstable_filtered[['zri_predicted']])  
rstable.loc[:,'zri_test'] = np.exp(rstable.loc[:,'zri_test'])
rstable.loc[:,'zri_predicted'] = np.exp(rstable.loc[:,'zri_predicted'])
rstable

Unnamed: 0,zip,zri_test,zri_predicted
0,1013,1099.0,1105.461280
1,14850,1545.0,1598.649859
2,6114,1139.0,1093.375614
3,90804,1957.0,1814.787809
4,59601,845.4,843.660391
...,...,...,...
15607,14217,901.0,925.071759
15608,60611,2138.0,2243.040226
15609,93307,883.0,901.379486
15610,10019,3619.0,3756.872326


In [19]:
#RESULTS 
r2 = r2_score(rstable['zri_test'],rstable['zri_predicted'])
rmse = sqrt(mean_squared_error(rstable['zri_test'],rstable['zri_predicted']))
print(f'R2: {r2*100}')
print(f'RMSE: {rmse}')

R2: 96.72281026197712
RMSE: 120.44119154879576


In [20]:
importance = np.abs(lasso_best.coef_)
coef = lasso_best.coef_
feature_names = train_X.columns
feature_importances= pd.DataFrame([feature_names,importance,coef]).T
feature_importances.columns = ['feature_names','coef_abs','coef']

feature_importances = feature_importances.sort_values(by='coef_abs',ascending=False)
feature_importances['Parent_feature']=feature_importances['feature_names'].apply(lambda x: x[0:x.find('_lag')] if x.find('_lag')!=-1 else x)
feature_importances['Parent_feature']=feature_importances['Parent_feature'].apply(lambda x: x[0:x.find('_diff')] if x.find('_diff')!=-1 else x)

In [21]:
feature_importances

Unnamed: 0,feature_names,coef_abs,coef,Parent_feature
0,zri_lag1,1.039991,1.039991,zri
2,zri_lag3,0.113836,-0.113836,zri
1,zri_lag2,0.0,-0.0,zri
3,zri_lag4,0.0,-0.0,zri
4,zri_lag5,0.0,-0.0,zri
5,zri_lag6,0.0,-0.0,zri
6,zri_lag7,0.0,0.0,zri
7,zri_lag8,0.0,0.0,zri
8,zri_lag9,0.0,0.0,zri
9,zri_lag10,0.0,0.0,zri


In [22]:
feats_agg = feature_importances[['coef_abs','Parent_feature']]
feats_agg['coef_abs'] = feats_agg['coef_abs'].astype('float')
feats_agg_abs = feats_agg.groupby('Parent_feature').agg('mean').sort_values('coef_abs',ascending=False)
feats_agg_abs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feats_agg['coef_abs'] = feats_agg['coef_abs'].astype('float')


Unnamed: 0_level_0,coef_abs
Parent_feature,Unnamed: 1_level_1
zri,0.096152


In [31]:
title = 'FINAL_1_Auto'
fi = title+'_FI'
fi2 = title+'_FI_Agg_sign'
# # rstable.to_csv(f'../../../zillow_orientation/Residuals/{title}.csv')
feature_importances.to_csv(f'../../../zillow_orientation/Residuals/{fi}.csv')
feats_agg_abs.to_csv(f'../../../zillow_orientation/Residuals/{fi2}.csv')

In [24]:
# R2: 96.78959391649265
# RMSE: 119.20767979158444
# alpha: 0.01

In [25]:
rstable

Unnamed: 0,zip,zri_test,zri_predicted
0,1013,1099.0,1105.461280
1,14850,1545.0,1598.649859
2,6114,1139.0,1093.375614
3,90804,1957.0,1814.787809
4,59601,845.4,843.660391
...,...,...,...
15607,14217,901.0,925.071759
15608,60611,2138.0,2243.040226
15609,93307,883.0,901.379486
15610,10019,3619.0,3756.872326


In [26]:
rstable['residual']=rstable['zri_predicted']-rstable['zri_test']
rstable['residual_squared']=rstable['residual'].map(lambda x: x**2)
rstable

Unnamed: 0,zip,zri_test,zri_predicted,residual,residual_squared
0,1013,1099.0,1105.461280,6.461280,41.748145
1,14850,1545.0,1598.649859,53.649859,2878.307398
2,6114,1139.0,1093.375614,-45.624386,2081.584566
3,90804,1957.0,1814.787809,-142.212191,20224.307163
4,59601,845.4,843.660391,-1.739609,3.026238
...,...,...,...,...,...
15607,14217,901.0,925.071759,24.071759,579.449566
15608,60611,2138.0,2243.040226,105.040226,11033.449084
15609,93307,883.0,901.379486,18.379486,337.805508
15610,10019,3619.0,3756.872326,137.872326,19008.778208


In [27]:
rmse_table=rstable.groupby('zip')[['residual_squared']].agg(['sum','count'])
rmse_table['residual_avgd']=rmse_table['residual_squared']['sum']/rmse_table['residual_squared']['count']
rmse_table['residual_avgd']=rmse_table['residual_avgd'].apply(lambda x: x**0.5)
rmse_table['residual_avgd'].mean()

98.68585922845232

In [28]:
rmse_table

Unnamed: 0_level_0,residual_squared,residual_squared,residual_avgd
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
zip,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1013,20110.918412,12,40.937878
1020,63305.698354,12,72.632464
1040,38440.647269,12,56.598474
1085,37235.378830,12,55.704113
1104,30941.251267,12,50.778318
...,...,...,...
99207,289786.021847,12,155.398955
99501,62877.969993,12,72.386676
99504,124179.206767,12,101.726433
99508,66482.564788,12,74.432612


In [29]:
#RESULTS 
r2 = r2_score(rstable['zri_test'],rstable['zri_predicted'])
rmse = sqrt(mean_squared_error(rstable['zri_test'],rstable['zri_predicted']))
print(f'R2: {r2*100}')
print(f'RMSE: {rmse}')

R2: 96.72281026197712
RMSE: 120.44119154879576


In [32]:
rstable.to_csv(f'../../../zillow_orientation/Residuals/{title}_rstable.csv')
rmse_table.to_csv(f'../../../zillow_orientation/Residuals/{title}_rmse.csv')