In [241]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [242]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [243]:
df = pd.read_csv('full_dataset_unscaled.csv')

In [244]:
df = df.rename(columns={"datetime":"year-month"})

In [245]:
df['net_job_rate']=df['job_creation_rate']-df['job_destruction_rate']
df['apartment_for_rent_searches'] = df[['apartment for rent','studio for rent','1 bedroom for rent','3 bedroom for rent']].sum(axis=1)
df['multifamily_for_rent_searches'] = df[['townhomes for rent','townhouse for rent','house for rent','duplex apartments for rent','condos for rent']].sum(axis=1)
df['gun_searches'] = df[['gun range', 'gun control', 'gun violence']].sum(axis=1)

In [246]:
df['zri'] = np.log(df['zri'])

In [247]:
gen_cols = ['zip',
 'City',
 'State',
 'Metro',
 'CountyName',
 'zri',
 'year',
 'month',
 'year-month']
acs_cols = ['percent_white',
 'percent_black',
 'percent_asian',
 'percent_hispanic',
 'percent_native_am',
 'percent_other_race',
 'percent_0_17',
 'percent_18_39',
 'percent_40_64',
 'percent_65+',
 'percent_rental_units_vacant',
 'percent_rental_units_occupied',
 'percent_graduate_deg',
 'percent_bachelors',
 'percent_associates',
 'percent_highschool',
 'percent_less_highschool',
 'percent_commute_public_transport',
 'percent_commute_less_30',
 'percent_buildings_less_10_units',
 'percent_buildings_10_19_units',
 'percent_buildings_20_49_units',
 'percent_buildings_50+_units',
 'percent_commute_30_to_59',
 'percent_commute_60_to_89',
 'percent_commute_90_more',
 'percent_new_city',
 'percent_new_unit',
 'percent_units_owner_occupied',
 'median_building_age',
 'income_per_capita',
 'poverty_rate',
 'total_pop',
 'percent_workforce_unemployed',
 'percent_work_from_home',
 'median_age',
 'percent_female',
 'gini_index',
 'percent_not_us_citizen']
acs_cols_remove=['percent_other_race','percent_40_64','percent_0_17','percent_18_39',
       'percent_65+','percent_rental_units_vacant','percent_not_us_citizen','percent_less_highschool', 'percent_buildings_less_10_units',
 'percent_commute_30_to_59',
 'percent_commute_60_to_89',
 'percent_commute_90_more', 'percent_commute_less_30','percent_graduate_deg',
                'percent_female','gini_index','percent_hispanic','percent_black','percent_bachelors',
                 'percent_asian','percent_new_city','percent_new_unit']
acs_cols_keep=list(set(acs_cols) - set(acs_cols_remove))
bikeshare_cols = ['bs_total_stations',
 'bs_total_systems',
 'has_bike_sharing']
bikeshare_cols_remove=['has_bike_sharing','bs_total_systems']
bikeshare_cols_keep=list(set(bikeshare_cols) - set(bikeshare_cols_remove))
trends_cols = ['gun range',
 'gun control',
 'gun violence',
 'job opportunities',
 'unemployment',
 'retirement',
 'layoff',
 'lgbt',
 'same sex marriage',
 'they',
 'pronouns',
 'black lives matter',
 'political correctness',
 'make america great again',
 'euthanasia',
 'getaway',
 'places to go',
 'flight tickets',
 'twitter',
 'hashtag',
 'fake news',
 'hurricane',
 'wildfire',
 'flood',
 'fire',
 "trader joe's",
 'whole foods',
 'lululemon',
 'thrift',
 'condos for rent',
 'duplex apartments for rent',
 'townhomes for rent',
 'townhouses for rent',
 'home for rent',
 'house for rent',
 'townhome for rent',
 'townhouse for rent',
 'apartment for rent',
 'studio for rent',
 '1 bedroom for rent',
 '3 bedroom for rent',
 'starbucks',
  'apartment_for_rent_searches',
  'multifamily_for_rent_searches',
  'gun_searches']
trends_cols_remove=['they','apartment for rent','studio for rent','1 bedroom for rent',
                    '3 bedroom for rent', 'townhome for rent','townhouse for rent','townhomes for rent',
                    'townhouses for rent','house for rent','home for rent','duplex apartments for rent','condos for rent',
                   'gun range', 'gun control', 'gun violence','homes for rent']
trends_cols_keep=list(set(trends_cols) - set(trends_cols_remove))
economic_cols = ['total_firms',
 'job_creation_rate',
 'job_destruction_rate',
 'startup_firms','state_local_perc', 'net_job_rate']
economic_cols_remove=['total_firms', 'job_creation_rate','job_destruction_rate',]
economic_cols_keep=list(set(economic_cols) - set(economic_cols_remove))

In [248]:
df_orig = df.copy()

In [249]:
for zipcode in df['zip'].unique():
    globals()[f"scaler_{zipcode}"]=StandardScaler(copy=False)
    df_filtered=df[df['zip']==zipcode]
    df_filtered_train = df_filtered[df_filtered['year']<2019]
    globals()[f"scaler_{zipcode}"].fit(df_filtered_train[['zri']])
    df.loc[df['zip']==zipcode,'zri']=globals()[f"scaler_{zipcode}"].transform(df_filtered[['zri']])

In [250]:
df = df[gen_cols + acs_cols_keep + bikeshare_cols_keep + economic_cols_keep + trends_cols_keep]

In [251]:
scale_columns = ['percent_buildings_50+_units', 'percent_associates',
       'percent_rental_units_occupied', 'percent_white', 'percent_highschool',
       'percent_work_from_home', 'percent_buildings_20_49_units',
       'median_building_age', 'median_age', 'percent_commute_public_transport',
       'percent_buildings_10_19_units', 'income_per_capita',
       'percent_native_am', 'percent_workforce_unemployed', 'poverty_rate',
       'percent_units_owner_occupied', 'total_pop', 'bs_total_stations',
       'startup_firms', 'state_local_perc', 'net_job_rate', 'gun_searches',
       'wildfire', 'fire', 'lgbt', 'political correctness', 'lululemon',
       'make america great again', 'same sex marriage', 'job opportunities',
       'retirement', 'black lives matter', 'flight tickets', 'pronouns',
       'trader joe\'s', 'fake news', 'hurricane', 'flood', 'whole foods',
       'twitter', 'thrift', 'hashtag', 'apartment_for_rent_searches', 'layoff',
       'starbucks', 'getaway', 'places to go', 'unemployment', 'euthanasia',
       'multifamily_for_rent_searches']

In [252]:
for zipcode in df['zip'].unique():
    globals()[f"scaler_features_{zipcode}"]=StandardScaler(copy=False)
    df_filtered=df[df['zip']==zipcode]
    df_filtered_train = df_filtered[df_filtered['year']<2019]
    globals()[f"scaler_features_{zipcode}"].fit(df_filtered_train[scale_columns])
    df.loc[df['zip']==zipcode,scale_columns]=globals()[f"scaler_features_{zipcode}"].transform(df_filtered[scale_columns])

In [253]:
df_scaled_no_lags = df.copy()

In [254]:
df_scaled_no_lags

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,year-month,percent_buildings_50+_units,percent_associates,percent_rental_units_occupied,percent_white,percent_highschool,percent_work_from_home,percent_buildings_20_49_units,median_building_age,median_age,percent_commute_public_transport,percent_buildings_10_19_units,income_per_capita,percent_native_am,percent_workforce_unemployed,poverty_rate,percent_units_owner_occupied,total_pop,bs_total_stations,startup_firms,state_local_perc,net_job_rate,gun_searches,wildfire,fire,lgbt,political correctness,lululemon,make america great again,same sex marriage,job opportunities,retirement,black lives matter,flight tickets,pronouns,trader joe's,fake news,hurricane,flood,whole foods,twitter,thrift,hashtag,apartment_for_rent_searches,layoff,starbucks,getaway,places to go,unemployment,euthanasia,multifamily_for_rent_searches
0,1013,Chicopee,MA,Springfield,Hampden County,-1.357844,2014,1,2014-01-01,-2.462342,0.528917,-2.023827,1.597214,-0.002505,-1.390903,0.991546,-1.378641,1.274909,-0.816386,-0.919095,-2.029337,-1.631316,-0.969092,0.946046,2.353991,0.294540,0.0,0.386305,1.779482,-0.387187,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
1,1013,Chicopee,MA,Springfield,Hampden County,-1.313356,2014,2,2014-02-01,-2.211482,0.521445,-1.913896,1.477034,-0.041369,-1.292497,1.039871,-1.378641,1.304083,-0.862803,-0.922701,-1.903530,-1.631316,-0.965910,1.011503,2.227519,0.119927,0.0,0.353815,1.728516,-0.275534,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
2,1013,Chicopee,MA,Springfield,Hampden County,-1.269012,2014,3,2014-03-01,-1.960621,0.513972,-1.803966,1.356854,-0.080233,-1.194090,1.088197,-1.378641,1.333257,-0.909219,-0.926308,-1.777723,-1.631316,-0.962727,1.076959,2.101046,-0.054685,0.0,0.321325,1.677550,-0.163880,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
3,1013,Chicopee,MA,Springfield,Hampden County,-1.342998,2014,4,2014-04-01,-1.709761,0.506500,-1.694035,1.236675,-0.119097,-1.095684,1.136522,-1.378641,1.362431,-0.955636,-0.929914,-1.651916,-1.631316,-0.959545,1.142416,1.974574,-0.229298,0.0,0.288835,1.626584,-0.052227,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
4,1013,Chicopee,MA,Springfield,Hampden County,-1.342998,2014,5,2014-05-01,-1.458900,0.499028,-1.584105,1.116495,-0.157962,-0.997278,1.184847,-1.378641,1.391605,-1.002052,-0.933520,-1.526109,-1.631316,-0.956363,1.207872,1.848102,-0.403910,0.0,0.256345,1.575618,0.059427,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93667,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,0.007888,2019,8,2019-08-01,-0.764931,1.361233,-3.152414,-3.923321,-1.621183,2.320791,-2.044123,2.500834,2.650424,-1.052928,1.333320,1.711605,-2.488448,-0.640881,-1.922102,-1.273137,2.570895,0.0,2.096633,-0.960072,-2.539234,-0.210512,0.288231,-0.026856,0.502895,0.0,-0.338743,-0.347441,-0.511872,0.0,-1.229718,-0.474408,0.191792,3.436105,-0.875945,-0.649002,0.808896,1.192079,1.178159,-1.264465,0.028606,-1.262634,0.520563,-0.558584,0.996066,1.916550,-0.204284,-1.225972,-0.590624,0.332005
93668,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,-0.102332,2019,9,2019-09-01,-0.761122,1.269573,-3.672763,-4.035499,-1.599931,2.421031,-2.053985,2.900967,2.698266,-1.256484,1.300455,1.737216,-2.611808,-0.676827,-1.843691,-1.271625,2.639424,0.0,2.096633,-1.307347,-2.539234,-1.544770,-0.480384,-0.671398,0.502895,0.0,-1.227943,-0.347441,-0.511872,0.0,-0.557052,0.208740,1.296514,-0.750013,0.465928,1.576147,2.393672,-0.397360,0.570077,0.032422,-0.932556,-1.262634,-0.172538,0.903891,2.371056,-1.040413,-0.204284,-0.451674,-0.590624,-0.997079
93669,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,0.407646,2019,10,2019-10-01,-0.757313,1.177914,-4.193113,-4.147678,-1.578679,2.521272,-2.063847,3.301101,2.746108,-1.460039,1.267590,1.762827,-2.735167,-0.712772,-1.765280,-1.270112,2.707952,0.0,2.096633,-1.654623,-2.539234,-2.093527,-0.480384,-0.913101,1.039317,0.0,-0.338743,0.887905,-0.511872,0.0,0.788281,-0.474408,-0.912930,0.087211,0.465928,-0.649002,0.511750,-1.986799,0.266036,-0.659251,0.989768,-0.475538,-1.212190,-0.558584,2.709389,0.602344,0.298568,-0.451674,0.354375,0.332005
93670,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,-0.277916,2019,11,2019-11-01,-0.753505,1.086254,-4.713462,-4.259856,-1.557428,2.621512,-2.073709,3.701234,2.793949,-1.663594,1.234725,1.788438,-2.858526,-0.748718,-1.686869,-1.268599,2.776481,0.0,2.096633,-2.001899,-2.539234,1.606467,0.288231,-0.268559,-0.033526,0.0,0.804514,-0.347441,-0.511872,0.0,-0.557052,-0.474408,0.191792,-0.750013,2.255093,0.092715,-0.280637,-0.397360,0.874118,-0.745710,-0.795247,-1.262634,-0.519089,0.416399,4.197635,-1.040413,-0.204284,-0.709773,2.716872,-0.534222


In [255]:
df_zri = df_scaled_no_lags[gen_cols]
df_zri_acs = df_scaled_no_lags[gen_cols+acs_cols_keep]
df_zri_all = df_scaled_no_lags 
df_all_ext = df_scaled_no_lags

In [256]:
def laggenerator(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i]
    df.loc[:,f'{colname}_lag{i}']=df.loc[:,f'{colname}'].shift(i)
    df.loc[df['year-month'].isin(timelist),f'{colname}_lag{i}']=0
    return df

def laggenerator_diff(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+1]
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+1}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+1)
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+1}']=0
    return df

def laggenerator_diff12(i,colname,df):
    timelist=list(df['year-month'].drop_duplicates().sort_values())[0:i+11]
    df.loc[:,f'{colname}_diff_lag{i}_lag{i+11}']=df.loc[:,f'{colname}'].shift(i)-df.loc[:,f'{colname}'].shift(i+11)
    df.loc[df['year-month'].isin(timelist),f'{colname}_diff_lag{i}_lag{i+11}']=0
    return df

#ZRI 
def lag_gen(df):
    for i in range(1,12):
        df=laggenerator(i, 'zri', df)
    df=laggenerator_diff12(1, 'zri', df)
    return df

In [148]:
df_zri = lag_gen(df_zri)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


# 1. AUTO (JUST ZRI) 

In [149]:
df = df_zri
train = df[(df['year'] < 2019) & (df['year']>=2015)]
test = df[df['year']==2019]
save_train = train.copy()
save_test = test.copy()

train = train.drop(['zip','City','State','Metro','CountyName','year','month','year-month'],axis=1)
train_y = train['zri']
train_X = train.drop(['zri'],axis=1)

test_y = test['zri']
test_X = test
#test_X = test.drop(['zri'],axis=1)

In [150]:
#Lasso Grid Search
lasso = Lasso()
grid = dict()
grid['alpha'] = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10]
lasso_grid = GridSearchCV(lasso, grid, cv=5, n_jobs=-1).fit(train_X,train_y)
lasso_best = lasso_grid.best_estimator_
print(f'lasso_best : {lasso_best}')

lasso_best.fit(train_X, train_y)


lasso_best : Lasso(alpha=1e-05)


Lasso(alpha=1e-05)

In [151]:
predictor_table=df.copy()
X_test = test_X.copy()

#loop through all month in 2019
for month in list(X_test['year-month'].drop_duplicates()):
    
    #run prediction for one month
    X_test=X_test[X_test['year-month']==month]
    X_test=X_test.drop(['zip','year','month','City','State','Metro','CountyName','year-month','zri'],axis=1)
    val=lasso_best.predict(X_test)

    #write current month prediction into predictor_table

    predictor_table.loc[predictor_table['year-month']==month,'zri']=val
        
    predictor_table=lag_gen(predictor_table)

    X_test=predictor_table[predictor_table['year']==2019]


In [152]:
scaled_predictions_y = X_test['zri']
temp = pd.concat([save_test['zip'],test_y],axis=1)
temp.reset_index(drop=True, inplace=True)
scaled_predictions_y.reset_index(drop=True, inplace=True)
rstable = pd.concat([temp,scaled_predictions_y],axis=1)
rstable.columns = ['zip','zri_test','zri_predicted']
rstable

Unnamed: 0,zip,zri_test,zri_predicted
0,1013,0.973309,1.062795
1,1013,0.960762,1.051512
2,1013,1.060825,1.056004
3,1013,1.160167,1.065848
4,1013,1.184891,1.073373
...,...,...,...
15607,99654,0.007888,1.066825
15608,99654,-0.102332,1.041338
15609,99654,0.407646,1.023838
15610,99654,-0.277916,1.014662


In [153]:
for zipcode in rstable['zip'].unique():
    rstable_filtered=rstable[rstable['zip']==zipcode]
    rstable.loc[rstable['zip']==zipcode,'zri_test']=globals()[f"scaler_{zipcode}"].inverse_transform(rstable_filtered[['zri_test']])
    rstable.loc[rstable['zip']==zipcode,'zri_predicted']=globals()[f"scaler_{zipcode}"].inverse_transform(rstable_filtered[['zri_predicted']])  
rstable.loc[:,'zri_test'] = np.exp(rstable.loc[:,'zri_test'])
rstable.loc[:,'zri_predicted'] = np.exp(rstable.loc[:,'zri_predicted'])
rstable.isna().sum()

zip              0
zri_test         0
zri_predicted    0
dtype: int64

In [154]:
#RESULTS 
r2 = r2_score(rstable['zri_test'],rstable['zri_predicted'])
rmse = sqrt(mean_squared_error(rstable['zri_test'],rstable['zri_predicted']))
print(f'R2: {r2*100}')
print(f'RMSE: {rmse}')

R2: 98.93213838217116
RMSE: 68.75143047884802


In [155]:
train_X.columns

Index(['zri_lag1', 'zri_lag2', 'zri_lag3', 'zri_lag4', 'zri_lag5', 'zri_lag6',
       'zri_lag7', 'zri_lag8', 'zri_lag9', 'zri_lag10', 'zri_lag11',
       'zri_diff_lag1_lag12'],
      dtype='object')

In [166]:
importance = np.abs(lasso_best.coef_)
coef = lasso_best.coef_
feature_names = train_X.columns
feature_importances= pd.DataFrame([feature_names,importance,coef]).T
feature_importances.columns = ['feature_names','coef_abs','coef']

feature_importances = feature_importances.sort_values(by='coef_abs',ascending=False)

In [167]:
feature_importances['Parent_feature']=feature_importances['feature_names'].apply(lambda x: x[0:x.find('_lag')] if x.find('_lag')!=-1 else x)
feature_importances['Parent_feature']=feature_importances['Parent_feature'].apply(lambda x: x[0:x.find('_diff')] if x.find('_diff')!=-1 else x)

In [168]:
feature_importances

Unnamed: 0,feature_names,coef_abs,coef,Parent_feature
0,zri_lag1,1.527279,1.527279,zri
1,zri_lag2,0.613722,-0.613722,zri
2,zri_lag3,0.059676,-0.059676,zri
11,zri_diff_lag1_lag12,0.038757,0.038757,zri
6,zri_lag7,0.032243,0.032243,zri
3,zri_lag4,0.031968,0.031968,zri
10,zri_lag11,0.026219,0.026219,zri
9,zri_lag10,0.014151,0.014151,zri
7,zri_lag8,0.012129,-0.012129,zri
8,zri_lag9,0.011088,0.011088,zri


In [174]:
feature_importances['feature_names']

0                zri_lag1
1                zri_lag2
2                zri_lag3
11    zri_diff_lag1_lag12
6                zri_lag7
3                zri_lag4
10              zri_lag11
9               zri_lag10
7                zri_lag8
8                zri_lag9
4                zri_lag5
5                zri_lag6
Name: feature_names, dtype: object

In [172]:
# title = 'Final_Auto'
# fi = title+'_FI'
# rstable.to_csv(f'../../../zillow_orientation/Residuals/{title}.csv')
# feature_importances.to_csv(f'../../../zillow_orientation/Residuals/{fi}.csv')

In [175]:
# BEST AUTOREGRESSIVE 
# 12 Features total 
# ['zri_lag1', 'zri_lag2', 'zri_lag3', 'zri_lag4', 'zri_lag5', 'zri_lag6',
#        'zri_lag7', 'zri_lag8', 'zri_lag9', 'zri_lag10', 'zri_lag11',
#        'zri_diff_lag1_lag12']
# Lasso(alpha=1e-05)
# R2: 98.93213838217116
# RMSE: 68.75143047884802
# top coefs: zri_lag1,zri_lag2,zri_lag3,zri_diff_lag1_lag12



# OLD TRIALS TO IGNORE
# ['zri_lag1', 'zri_lag6', 'zri_diff_lag1_lag12']
# R2: 98.85027227385696
# RMSE: 71.3381352101476
    
# ['zri_lag1', 'zri_lag2', 'zri_lag3', 'zri_lag4', 'zri_lag5', 'zri_lag6',
#        'zri_lag7', 'zri_lag8', 'zri_lag9', 'zri_lag10', 'zri_lag11',
#        'zri_lag12', 'zri_diff_lag1_lag12']   
# R2: 98.93199923119373
# RMSE: 68.7559097657487
    
# ['zri_lag1', 'zri_diff_lag1_lag2', 'zri_lag2', 'zri_diff_lag2_lag3',
#        'zri_lag3', 'zri_diff_lag3_lag4', 'zri_lag4', 'zri_diff_lag4_lag5',
#        'zri_lag5', 'zri_diff_lag5_lag6', 'zri_lag6', 'zri_diff_lag6_lag7',
#        'zri_lag7', 'zri_diff_lag7_lag8', 'zri_lag8', 'zri_diff_lag8_lag9',
#        'zri_lag9', 'zri_diff_lag9_lag10', 'zri_lag10', 'zri_diff_lag10_lag11',
#        'zri_lag11', 'zri_diff_lag11_lag12', 'zri_diff_lag1_lag12']
# R2: 98.93217535166379
# RMSE: 68.75024037731302

# 2. ZRI + ACS 

In [178]:
df_zri_acs = lag_gen(df_zri_acs)

In [184]:
for col in acs_cols_keep:
    df_zri_acs = laggenerator_diff12(1,col,df_zri_acs)
    df_zri_acs = laggenerator(12,col,df_zri_acs)

In [187]:
df = df_zri_acs
train = df[(df['year'] < 2019) & (df['year']>=2015)]
test = df[df['year']==2019]
save_train = train.copy()
save_test = test.copy()
train = train.drop(['zip','City','State','Metro','CountyName','year','month','year-month'],axis=1)
test = test.drop(['zip','City','State','Metro','CountyName','year','month','year-month'],axis=1)


train_y = train['zri']
train_X = train.drop(['zri'],axis=1)

test_y = test['zri']
test_X = test.drop(['zri'],axis=1)

In [193]:
#Lasso Grid Search
lasso = Lasso()
grid = dict()
grid['alpha'] = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10]
lasso_grid = GridSearchCV(lasso, grid, cv=5, n_jobs=-1).fit(train_X,train_y)
lasso_best = lasso_grid.best_estimator_
print(f'lasso_best : {lasso_best}')

lasso_best.fit(train_X, train_y)


  model = cd_fast.enet_coordinate_descent(


lasso_best : Lasso(alpha=1e-05)


  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=1e-05)

In [195]:
scaled_predictions_y = pd.Series(lasso_best.predict(test_X))
temp = pd.concat([save_test['zip'],test_y],axis=1)
temp.reset_index(drop=True, inplace=True)
scaled_predictions_y.reset_index(drop=True, inplace=True)
rstable = pd.concat([temp,scaled_predictions_y],axis=1)
rstable.columns = ['zip','zri_test','zri_predicted']
rstable

Unnamed: 0,zip,zri_test,zri_predicted
0,1013,0.973309,1.069284
1,1013,0.960762,0.919812
2,1013,1.060825,0.977987
3,1013,1.160167,1.143819
4,1013,1.184891,1.231771
...,...,...,...
15607,99654,0.007888,0.309574
15608,99654,-0.102332,-0.056500
15609,99654,0.407646,-0.064711
15610,99654,-0.277916,0.805898


In [196]:
for zipcode in rstable['zip'].unique():
    rstable_filtered=rstable[rstable['zip']==zipcode]
    rstable.loc[rstable['zip']==zipcode,'zri_test']=globals()[f"scaler_{zipcode}"].inverse_transform(rstable_filtered[['zri_test']])
    rstable.loc[rstable['zip']==zipcode,'zri_predicted']=globals()[f"scaler_{zipcode}"].inverse_transform(rstable_filtered[['zri_predicted']])  
rstable.loc[:,'zri_test'] = np.exp(rstable.loc[:,'zri_test'])
rstable.loc[:,'zri_predicted'] = np.exp(rstable.loc[:,'zri_predicted'])
rstable.isna().sum()

zip              0
zri_test         0
zri_predicted    0
dtype: int64

In [197]:
#RESULTS 
r2 = r2_score(rstable['zri_test'],rstable['zri_predicted'])
rmse = sqrt(mean_squared_error(rstable['zri_test'],rstable['zri_predicted']))
print(f'R2: {r2*100}')
print(f'RMSE: {rmse}')

R2: 99.93513413630906
RMSE: 16.94464025139288


In [198]:
train_X.columns

Index(['percent_buildings_50+_units', 'percent_associates',
       'percent_rental_units_occupied', 'percent_white', 'percent_highschool',
       'percent_work_from_home', 'percent_buildings_20_49_units',
       'median_building_age', 'median_age', 'percent_commute_public_transport',
       'percent_buildings_10_19_units', 'income_per_capita',
       'percent_native_am', 'percent_workforce_unemployed', 'poverty_rate',
       'percent_units_owner_occupied', 'total_pop', 'zri_lag1', 'zri_lag2',
       'zri_lag3', 'zri_lag4', 'zri_lag5', 'zri_lag6', 'zri_lag7', 'zri_lag8',
       'zri_lag9', 'zri_lag10', 'zri_lag11', 'zri_diff_lag1_lag12',
       'percent_buildings_50+_units_diff_lag1_lag12',
       'percent_buildings_50+_units_lag12',
       'percent_associates_diff_lag1_lag12', 'percent_associates_lag12',
       'percent_rental_units_occupied_diff_lag1_lag12',
       'percent_rental_units_occupied_lag12', 'percent_white_diff_lag1_lag12',
       'percent_white_lag12', 'percent_highschool

In [199]:
importance = np.abs(lasso_best.coef_)
coef = lasso_best.coef_
feature_names = train_X.columns
feature_importances= pd.DataFrame([feature_names,importance,coef]).T
feature_importances.columns = ['feature_names','coef_abs','coef']

feature_importances = feature_importances.sort_values(by='coef_abs',ascending=False)

In [201]:
feature_importances['Parent_feature']=feature_importances['feature_names'].apply(lambda x: x[0:x.find('_lag')] if x.find('_lag')!=-1 else x)
feature_importances['Parent_feature']=feature_importances['Parent_feature'].apply(lambda x: x[0:x.find('_diff')] if x.find('_diff')!=-1 else x)

In [204]:
#feature_importances=feature_importances.groupby('Parent_feature').agg('mean').sort_values('Importance',ascending=False)
feature_importances

Unnamed: 0,feature_names,coef_abs,coef,Parent_feature
17,zri_lag1,1.496469,1.496469,zri
18,zri_lag2,0.60671,-0.60671,zri
19,zri_lag3,0.060458,-0.060458,zri
28,zri_diff_lag1_lag12,0.057826,0.057826,zri
16,total_pop,0.042028,0.042028,total_pop
61,total_pop_diff_lag1_lag12,0.041741,-0.041741,total_pop
27,zri_lag11,0.037256,0.037256,zri
62,total_pop_lag12,0.036205,-0.036205,total_pop
23,zri_lag7,0.030679,0.030679,zri
20,zri_lag4,0.030401,0.030401,zri


In [211]:
feats_agg = feature_importances[['coef_abs','Parent_feature']]
feats_agg['coef_abs'] = feats_agg['coef_abs'].astype('float')
feats_agg = feats_agg.groupby('Parent_feature').agg('mean').sort_values('coef_abs',ascending=False)
feats_agg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feats_agg['coef_abs'] = feats_agg['coef_abs'].astype('float')


Unnamed: 0_level_0,coef_abs
Parent_feature,Unnamed: 1_level_1
zri,0.196969
total_pop,0.039992
income_per_capita,0.025751
percent_workforce_unemployed,0.021993
percent_units_owner_occupied,0.01184
percent_native_am,0.011044
percent_commute_public_transport,0.00791
percent_work_from_home,0.006206
percent_highschool,0.005662
median_building_age,0.005629


In [213]:
# title = '02_Final_Auto_ACS'
# fi = title+'_FI'
# fi2 = title+'_FI_Agg'
# rstable.to_csv(f'../../../zillow_orientation/Residuals/{title}.csv')
# feature_importances.to_csv(f'../../../zillow_orientation/Residuals/{fi}.csv')
# feats_agg.to_csv(f'../../../zillow_orientation/Residuals/{fi2}.csv')

In [215]:
# BEST AUTO + ACS 
# 63 features total 
# ['percent_buildings_50+_units', 'percent_associates',
#        'percent_rental_units_occupied', 'percent_white', 'percent_highschool',
#        'percent_work_from_home', 'percent_buildings_20_49_units',
#        'median_building_age', 'median_age', 'percent_commute_public_transport',
#        'percent_buildings_10_19_units', 'income_per_capita',
#        'percent_native_am', 'percent_workforce_unemployed', 'poverty_rate',
#        'percent_units_owner_occupied', 'total_pop', 'zri_lag1', 'zri_lag2',
#        'zri_lag3', 'zri_lag4', 'zri_lag5', 'zri_lag6', 'zri_lag7', 'zri_lag8',
#        'zri_lag9', 'zri_lag10', 'zri_lag11', 'zri_diff_lag1_lag12',
#        'percent_buildings_50+_units_diff_lag1_lag12',
#        'percent_buildings_50+_units_lag12',
#        'percent_associates_diff_lag1_lag12', 'percent_associates_lag12',
#        'percent_rental_units_occupied_diff_lag1_lag12',
#        'percent_rental_units_occupied_lag12', 'percent_white_diff_lag1_lag12',
#        'percent_white_lag12', 'percent_highschool_diff_lag1_lag12',
#        'percent_highschool_lag12', 'percent_work_from_home_diff_lag1_lag12',
#        'percent_work_from_home_lag12',
#        'percent_buildings_20_49_units_diff_lag1_lag12',
#        'percent_buildings_20_49_units_lag12',
#        'median_building_age_diff_lag1_lag12', 'median_building_age_lag12',
#        'median_age_diff_lag1_lag12', 'median_age_lag12',
#        'percent_commute_public_transport_diff_lag1_lag12',
#        'percent_commute_public_transport_lag12',
#        'percent_buildings_10_19_units_diff_lag1_lag12',
#        'percent_buildings_10_19_units_lag12',
#        'income_per_capita_diff_lag1_lag12', 'income_per_capita_lag12',
#        'percent_native_am_diff_lag1_lag12', 'percent_native_am_lag12',
#        'percent_workforce_unemployed_diff_lag1_lag12',
#        'percent_workforce_unemployed_lag12', 'poverty_rate_diff_lag1_lag12',
#        'poverty_rate_lag12', 'percent_units_owner_occupied_diff_lag1_lag12',
#        'percent_units_owner_occupied_lag12', 'total_pop_diff_lag1_lag12',
#        'total_pop_lag12']

# R2: 99.93513413630906
# RMSE: 16.94464025139288
# Lasso(alpha=1e-05)


# 3. ZRI + ALL 

In [None]:
# def lag_gen(df):
#     for i in range(1,12):
#         df=laggenerator(i, 'zri', df)
#     df=laggenerator_diff12(1, 'zri', df)
#     return df

In [257]:
df_zri_all

Unnamed: 0,zip,City,State,Metro,CountyName,zri,year,month,year-month,percent_buildings_50+_units,percent_associates,percent_rental_units_occupied,percent_white,percent_highschool,percent_work_from_home,percent_buildings_20_49_units,median_building_age,median_age,percent_commute_public_transport,percent_buildings_10_19_units,income_per_capita,percent_native_am,percent_workforce_unemployed,poverty_rate,percent_units_owner_occupied,total_pop,bs_total_stations,startup_firms,state_local_perc,net_job_rate,gun_searches,wildfire,fire,lgbt,political correctness,lululemon,make america great again,same sex marriage,job opportunities,retirement,black lives matter,flight tickets,pronouns,trader joe's,fake news,hurricane,flood,whole foods,twitter,thrift,hashtag,apartment_for_rent_searches,layoff,starbucks,getaway,places to go,unemployment,euthanasia,multifamily_for_rent_searches
0,1013,Chicopee,MA,Springfield,Hampden County,-1.357844,2014,1,2014-01-01,-2.462342,0.528917,-2.023827,1.597214,-0.002505,-1.390903,0.991546,-1.378641,1.274909,-0.816386,-0.919095,-2.029337,-1.631316,-0.969092,0.946046,2.353991,0.294540,0.0,0.386305,1.779482,-0.387187,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
1,1013,Chicopee,MA,Springfield,Hampden County,-1.313356,2014,2,2014-02-01,-2.211482,0.521445,-1.913896,1.477034,-0.041369,-1.292497,1.039871,-1.378641,1.304083,-0.862803,-0.922701,-1.903530,-1.631316,-0.965910,1.011503,2.227519,0.119927,0.0,0.353815,1.728516,-0.275534,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
2,1013,Chicopee,MA,Springfield,Hampden County,-1.269012,2014,3,2014-03-01,-1.960621,0.513972,-1.803966,1.356854,-0.080233,-1.194090,1.088197,-1.378641,1.333257,-0.909219,-0.926308,-1.777723,-1.631316,-0.962727,1.076959,2.101046,-0.054685,0.0,0.321325,1.677550,-0.163880,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
3,1013,Chicopee,MA,Springfield,Hampden County,-1.342998,2014,4,2014-04-01,-1.709761,0.506500,-1.694035,1.236675,-0.119097,-1.095684,1.136522,-1.378641,1.362431,-0.955636,-0.929914,-1.651916,-1.631316,-0.959545,1.142416,1.974574,-0.229298,0.0,0.288835,1.626584,-0.052227,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
4,1013,Chicopee,MA,Springfield,Hampden County,-1.342998,2014,5,2014-05-01,-1.458900,0.499028,-1.584105,1.116495,-0.157962,-0.997278,1.184847,-1.378641,1.391605,-1.002052,-0.933520,-1.526109,-1.631316,-0.956363,1.207872,1.848102,-0.403910,0.0,0.256345,1.575618,0.059427,0.035080,,,,,,,,,,,,,,,,,,,,,1.108220,,,,,,,0.545331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93667,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,0.007888,2019,8,2019-08-01,-0.764931,1.361233,-3.152414,-3.923321,-1.621183,2.320791,-2.044123,2.500834,2.650424,-1.052928,1.333320,1.711605,-2.488448,-0.640881,-1.922102,-1.273137,2.570895,0.0,2.096633,-0.960072,-2.539234,-0.210512,0.288231,-0.026856,0.502895,0.0,-0.338743,-0.347441,-0.511872,0.0,-1.229718,-0.474408,0.191792,3.436105,-0.875945,-0.649002,0.808896,1.192079,1.178159,-1.264465,0.028606,-1.262634,0.520563,-0.558584,0.996066,1.916550,-0.204284,-1.225972,-0.590624,0.332005
93668,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,-0.102332,2019,9,2019-09-01,-0.761122,1.269573,-3.672763,-4.035499,-1.599931,2.421031,-2.053985,2.900967,2.698266,-1.256484,1.300455,1.737216,-2.611808,-0.676827,-1.843691,-1.271625,2.639424,0.0,2.096633,-1.307347,-2.539234,-1.544770,-0.480384,-0.671398,0.502895,0.0,-1.227943,-0.347441,-0.511872,0.0,-0.557052,0.208740,1.296514,-0.750013,0.465928,1.576147,2.393672,-0.397360,0.570077,0.032422,-0.932556,-1.262634,-0.172538,0.903891,2.371056,-1.040413,-0.204284,-0.451674,-0.590624,-0.997079
93669,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,0.407646,2019,10,2019-10-01,-0.757313,1.177914,-4.193113,-4.147678,-1.578679,2.521272,-2.063847,3.301101,2.746108,-1.460039,1.267590,1.762827,-2.735167,-0.712772,-1.765280,-1.270112,2.707952,0.0,2.096633,-1.654623,-2.539234,-2.093527,-0.480384,-0.913101,1.039317,0.0,-0.338743,0.887905,-0.511872,0.0,0.788281,-0.474408,-0.912930,0.087211,0.465928,-0.649002,0.511750,-1.986799,0.266036,-0.659251,0.989768,-0.475538,-1.212190,-0.558584,2.709389,0.602344,0.298568,-0.451674,0.354375,0.332005
93670,99654,Wasilla,AK,Anchorage,Matanuska Susitna Borough,-0.277916,2019,11,2019-11-01,-0.753505,1.086254,-4.713462,-4.259856,-1.557428,2.621512,-2.073709,3.701234,2.793949,-1.663594,1.234725,1.788438,-2.858526,-0.748718,-1.686869,-1.268599,2.776481,0.0,2.096633,-2.001899,-2.539234,1.606467,0.288231,-0.268559,-0.033526,0.0,0.804514,-0.347441,-0.511872,0.0,-0.557052,-0.474408,0.191792,-0.750013,2.255093,0.092715,-0.280637,-0.397360,0.874118,-0.745710,-0.795247,-1.262634,-0.519089,0.416399,4.197635,-1.040413,-0.204284,-0.709773,2.716872,-0.534222


In [258]:
#ZRI
df_zri_all = lag_gen(df_zri_all)

In [259]:
#ACS
for col in acs_cols_keep:
    df_zri_all = laggenerator_diff12(1,col,df_zri_all)
    df_zri_all = laggenerator(12,col,df_zri_all)

In [260]:
#BIKESHARE AND ECONOMIC 
for col in bikeshare_cols_keep + economic_cols_keep:
    df_zri_all = laggenerator(1, col, df_zri_all)
    df_zri_all = laggenerator_diff12(1, col, df_zri_all)

In [261]:
#TRENDS 
for col in trends_cols_keep:
    for i in range(1,7):
        df_zri_all=laggenerator(i, col, df_zri_all)
    for i in range(1,7):
        df_zri_all=laggenerator_diff(i, col, df_zri_all)
    df_zri_all = laggenerator_diff12(1,col,df_zri_all)
    df_zri_all = laggenerator(12,col,df_zri_all)
    

In [263]:
df = df_zri_all
train = df[(df['year'] < 2019) & (df['year']>=2015)]
test = df[df['year']==2019]
save_train = train.copy()
save_test = test.copy()
train = train.drop(['zip','City','State','Metro','CountyName','year','month','year-month'],axis=1)
test = test.drop(['zip','City','State','Metro','CountyName','year','month','year-month'],axis=1)


train_y = train['zri']
train_X = train.drop(['zri'],axis=1)

test_y = test['zri']
test_X = test.drop(['zri'],axis=1)

In [264]:
#Lasso Grid Search
lasso = Lasso()
grid = dict()
grid['alpha'] = [1e-15, 1e-10, 1e-8, 1e-5,1e-4, 1e-3,1e-2, 1, 5, 10]
lasso_grid = GridSearchCV(lasso, grid, cv=5, n_jobs=-1).fit(train_X,train_y)
lasso_best = lasso_grid.best_estimator_
print(f'lasso_best : {lasso_best}')

lasso_best.fit(train_X, train_y)




ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
scaled_predictions_y = pd.Series(lasso_best.predict(test_X))
temp = pd.concat([save_test['zip'],test_y],axis=1)
temp.reset_index(drop=True, inplace=True)
scaled_predictions_y.reset_index(drop=True, inplace=True)
rstable = pd.concat([temp,scaled_predictions_y],axis=1)
rstable.columns = ['zip','zri_test','zri_predicted']
rstable