In [56]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [57]:
# Let pandas display up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [58]:
# Load the unscaled dataset, rename the period column and derive the
# aggregate features used later: net job rate, summed search-term groups,
# and the natural log of the ZRI target.
df_orig = (
    pd.read_csv('full_dataset_unscaled.csv')
    .rename(columns={"datetime":"year-month"})
    .assign(
        net_job_rate=lambda frame: frame['job_creation_rate'] - frame['job_destruction_rate'],
        apartment_for_rent_searches=lambda frame: frame[
            ['apartment for rent', 'studio for rent', '1 bedroom for rent', '3 bedroom for rent']
        ].sum(axis=1),
        multifamily_for_rent_searches=lambda frame: frame[
            ['townhomes for rent', 'townhouse for rent', 'house for rent',
             'duplex apartments for rent', 'condos for rent']
        ].sum(axis=1),
        gun_searches=lambda frame: frame[['gun range', 'gun control', 'gun violence']].sum(axis=1),
        # Replaces the raw zri column in place (same position) with its log.
        zri=lambda frame: np.log(frame['zri']),
    )
)

In [59]:
# Identifier / target columns that are carried along with every feature set.
gen_cols = [
    'zip', 'City', 'State', 'Metro', 'CountyName',
    'zri',
    'year', 'month', 'year-month',
]
# All ACS-derived columns available in the dataset.
acs_cols = [
    'percent_white',
    'percent_black',
    'percent_asian',
    'percent_hispanic',
    'percent_native_am',
    'percent_other_race',
    'percent_0_17',
    'percent_18_39',
    'percent_40_64',
    'percent_65+',
    'percent_rental_units_vacant',
    'percent_rental_units_occupied',
    'percent_graduate_deg',
    'percent_bachelors',
    'percent_associates',
    'percent_highschool',
    'percent_less_highschool',
    'percent_commute_public_transport',
    'percent_commute_less_30',
    'percent_buildings_less_10_units',
    'percent_buildings_10_19_units',
    'percent_buildings_20_49_units',
    'percent_buildings_50+_units',
    'percent_commute_30_to_59',
    'percent_commute_60_to_89',
    'percent_commute_90_more',
    'percent_new_city',
    'percent_new_unit',
    'percent_units_owner_occupied',
    'median_building_age',
    'income_per_capita',
    'poverty_rate',
    'total_pop',
    'percent_workforce_unemployed',
    'percent_work_from_home',
    'median_age',
    'percent_female',
    'gini_index',
    'percent_not_us_citizen',
]
# ACS columns excluded from modelling.
acs_cols_remove = [
    'percent_other_race', 'percent_40_64', 'percent_0_17', 'percent_18_39',
    'percent_65+', 'percent_rental_units_vacant', 'percent_not_us_citizen',
    'percent_less_highschool', 'percent_buildings_less_10_units',
    'percent_commute_30_to_59',
    'percent_commute_60_to_89',
    'percent_commute_90_more', 'percent_commute_less_30', 'percent_graduate_deg',
    'percent_female', 'gini_index', 'percent_hispanic', 'percent_black', 'percent_bachelors',
    'percent_asian', 'percent_new_city', 'percent_new_unit',
]
# Filter in declaration order. A set difference would yield an order that
# depends on per-process string hashing, making the feature column order
# (and everything derived from it) differ between kernel sessions.
acs_cols_keep = [col for col in acs_cols if col not in acs_cols_remove]
# Bike-share columns: only the station count is kept for modelling.
bikeshare_cols = ['bs_total_stations', 'bs_total_systems', 'has_bike_sharing']
bikeshare_cols_remove = ['has_bike_sharing', 'bs_total_systems']
bikeshare_cols_keep = [col for col in bikeshare_cols if col not in bikeshare_cols_remove]
# Search-trend columns, including the three aggregated search groups
# (apartment_for_rent_searches, multifamily_for_rent_searches, gun_searches).
trends_cols = [
    'gun range',
    'gun control',
    'gun violence',
    'job opportunities',
    'unemployment',
    'retirement',
    'layoff',
    'lgbt',
    'same sex marriage',
    'they',
    'pronouns',
    'black lives matter',
    'political correctness',
    'make america great again',
    'euthanasia',
    'getaway',
    'places to go',
    'flight tickets',
    'twitter',
    'hashtag',
    'fake news',
    'hurricane',
    'wildfire',
    'flood',
    'fire',
    "trader joe's",
    'whole foods',
    'lululemon',
    'thrift',
    'condos for rent',
    'duplex apartments for rent',
    'townhomes for rent',
    'townhouses for rent',
    'home for rent',
    'house for rent',
    'townhome for rent',
    'townhouse for rent',
    'apartment for rent',
    'studio for rent',
    '1 bedroom for rent',
    '3 bedroom for rent',
    'starbucks',
    'apartment_for_rent_searches',
    'multifamily_for_rent_searches',
    'gun_searches',
]
# Individual search terms excluded from modelling.
# NOTE(review): 'homes for rent' does not appear in trends_cols, so that
# entry currently has no effect.
trends_cols_remove = [
    'they', 'apartment for rent', 'studio for rent', '1 bedroom for rent',
    '3 bedroom for rent', 'townhome for rent', 'townhouse for rent', 'townhomes for rent',
    'townhouses for rent', 'house for rent', 'home for rent', 'duplex apartments for rent',
    'condos for rent',
    'gun range', 'gun control', 'gun violence', 'homes for rent',
]
# Filter in declaration order; a set difference would produce an order that
# varies with per-process string hashing and is not reproducible.
trends_cols_keep = [col for col in trends_cols if col not in trends_cols_remove]
# Economic columns; net_job_rate is kept in place of the separate
# job creation / destruction rates.
economic_cols = [
    'total_firms',
    'job_creation_rate',
    'job_destruction_rate',
    'startup_firms', 'state_local_perc', 'net_job_rate',
]
economic_cols_remove = ['total_firms', 'job_creation_rate', 'job_destruction_rate']
# Filter in declaration order; a set difference would produce an order that
# varies with per-process string hashing and is not reproducible.
economic_cols_keep = [col for col in economic_cols if col not in economic_cols_remove]

In [60]:
# Standardise the zri column separately for every zip code. Each scaler is
# fit only on that zip's rows with year < 2019, applied to all of the zip's
# rows, and registered as the global ``scaler_<zip>`` so it can be looked up
# by name later.
for zipcode in df_orig['zip'].unique():
    zip_mask = df_orig['zip'] == zipcode
    zip_rows = df_orig[zip_mask]
    zri_scaler = StandardScaler(copy=False)
    globals()[f"scaler_{zipcode}"] = zri_scaler
    zri_scaler.fit(zip_rows.loc[zip_rows['year'] < 2019, ['zri']])
    df_orig.loc[zip_mask, 'zri'] = zri_scaler.transform(zip_rows[['zri']])

In [61]:
# Keep the identifier columns followed by the retained columns of each feature group.
df_orig = df_orig.loc[:, gen_cols + acs_cols_keep + bikeshare_cols_keep + economic_cols_keep + trends_cols_keep]

In [62]:
# Feature columns that are standardised per zip code (order is significant
# for the fitted scalers, so it is kept exactly as listed).
scale_columns = [
    # ACS
    'percent_buildings_50+_units',
    'percent_associates',
    'percent_rental_units_occupied',
    'percent_white',
    'percent_highschool',
    'percent_work_from_home',
    'percent_buildings_20_49_units',
    'median_building_age',
    'median_age',
    'percent_commute_public_transport',
    'percent_buildings_10_19_units',
    'income_per_capita',
    'percent_native_am',
    'percent_workforce_unemployed',
    'poverty_rate',
    'percent_units_owner_occupied',
    'total_pop',
    # Bike share
    'bs_total_stations',
    # Economic
    'startup_firms',
    'state_local_perc',
    'net_job_rate',
    # Search trends
    'gun_searches',
    'wildfire',
    'fire',
    'lgbt',
    'political correctness',
    'lululemon',
    'make america great again',
    'same sex marriage',
    'job opportunities',
    'retirement',
    'black lives matter',
    'flight tickets',
    'pronouns',
    "trader joe's",
    'fake news',
    'hurricane',
    'flood',
    'whole foods',
    'twitter',
    'thrift',
    'hashtag',
    'apartment_for_rent_searches',
    'layoff',
    'starbucks',
    'getaway',
    'places to go',
    'unemployment',
    'euthanasia',
    'multifamily_for_rent_searches',
]

In [63]:
# Standardise every column in scale_columns separately for each zip code.
# Each scaler is fit only on that zip's rows with year < 2019, applied to all
# of the zip's rows, and registered as the global ``scaler_features_<zip>``.
for zipcode in df_orig['zip'].unique():
    zip_mask = df_orig['zip'] == zipcode
    zip_rows = df_orig[zip_mask]
    feature_scaler = StandardScaler(copy=False)
    globals()[f"scaler_features_{zipcode}"] = feature_scaler
    feature_scaler.fit(zip_rows.loc[zip_rows['year'] < 2019, scale_columns])
    df_orig.loc[zip_mask, scale_columns] = feature_scaler.transform(zip_rows[scale_columns])

In [64]:
# Independent snapshot of the scaled data, taken before any lag columns exist.
df_scaled_no_lags = df_orig.copy(deep=True)

In [65]:
# Work on a copy: the lag helpers add columns to the frame they receive in
# place, so a plain alias would also add those columns to df_scaled_no_lags.
df_all = df_scaled_no_lags.copy()

In [66]:
def laggenerator(i, colname, df, group_col=None):
    """Add ``{colname}_lag{i}``: the value of ``colname`` from ``i`` rows earlier.

    Parameters
    ----------
    i : int
        Number of rows to shift by.
    colname : str
        Name of the column to lag.
    df : pandas.DataFrame
        Frame containing ``colname`` and a ``'year-month'`` column. It is
        modified in place and also returned.
    group_col : str, optional
        When given, the shift is performed separately within each value of
        this column, so a lag never picks up a row from another group. With
        the default ``None`` the shift runs over the whole frame in its
        current row order (the caller is responsible for row ordering).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new column. Rows whose ``'year-month'`` is among the
        ``i`` smallest distinct values get 0 in the new column.
    """
    lag_col = f'{colname}_lag{i}'
    source = df[colname] if group_col is None else df.groupby(group_col)[colname]
    df.loc[:, lag_col] = source.shift(i)
    # The first i periods have no full history available; zero them out.
    earliest_periods = df['year-month'].drop_duplicates().sort_values().iloc[:i]
    df.loc[df['year-month'].isin(earliest_periods), lag_col] = 0
    return df

def laggenerator_diff(i, colname, df, group_col=None):
    """Add ``{colname}_diff_lag{i}_lag{i+1}``: lag ``i`` minus lag ``i + 1``.

    Parameters
    ----------
    i : int
        The nearer of the two row shifts; the farther one is ``i + 1``.
    colname : str
        Name of the column to difference.
    df : pandas.DataFrame
        Frame containing ``colname`` and a ``'year-month'`` column. It is
        modified in place and also returned.
    group_col : str, optional
        When given, both shifts are performed separately within each value of
        this column. With the default ``None`` they run over the whole frame
        in its current row order.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new column. Rows whose ``'year-month'`` is among the
        ``i + 1`` smallest distinct values get 0 in the new column.
    """
    diff_col = f'{colname}_diff_lag{i}_lag{i+1}'
    source = df[colname] if group_col is None else df.groupby(group_col)[colname]
    df.loc[:, diff_col] = source.shift(i) - source.shift(i + 1)
    # The first i + 1 periods cannot have both lags; zero them out.
    earliest_periods = df['year-month'].drop_duplicates().sort_values().iloc[:i + 1]
    df.loc[df['year-month'].isin(earliest_periods), diff_col] = 0
    return df

def laggenerator_diff12(i, colname, df, group_col=None):
    """Add ``{colname}_diff_lag{i}_lag{i+11}``: lag ``i`` minus lag ``i + 11``.

    Parameters
    ----------
    i : int
        The nearer of the two row shifts; the farther one is ``i + 11``.
    colname : str
        Name of the column to difference.
    df : pandas.DataFrame
        Frame containing ``colname`` and a ``'year-month'`` column. It is
        modified in place and also returned.
    group_col : str, optional
        When given, both shifts are performed separately within each value of
        this column. With the default ``None`` they run over the whole frame
        in its current row order.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new column. Rows whose ``'year-month'`` is among the
        ``i + 11`` smallest distinct values get 0 in the new column.
    """
    diff_col = f'{colname}_diff_lag{i}_lag{i+11}'
    source = df[colname] if group_col is None else df.groupby(group_col)[colname]
    df.loc[:, diff_col] = source.shift(i) - source.shift(i + 11)
    # The first i + 11 periods cannot have both lags; zero them out.
    earliest_periods = df['year-month'].drop_duplicates().sort_values().iloc[:i + 11]
    df.loc[df['year-month'].isin(earliest_periods), diff_col] = 0
    return df

#ZRI 
def lag_gen(df):
    """Add zri lag columns for shifts 1 through 11 plus the lag-1 minus lag-12 difference.

    Delegates to ``laggenerator`` and ``laggenerator_diff12`` and returns the
    frame they return.
    """
    for lag in range(1, 12):
        df = laggenerator(lag, 'zri', df)
    return laggenerator_diff12(1, 'zri', df)

In [67]:
# ZRI: target lags are not generated here; the call is left disabled.
#df_zri_all = lag_gen(df_zri_all)

# ACS: lag-1 minus lag-12 difference first, then level lags of 1, 6 and 12 rows.
for acs_col in acs_cols_keep:
    df_all = laggenerator_diff12(1, acs_col, df_all)
    for lag in (1, 6, 12):
        df_all = laggenerator(lag, acs_col, df_all)

# Bike share and economic: level lag of 1 row, then lag-1 minus lag-12 difference.
for slow_col in bikeshare_cols_keep + economic_cols_keep:
    df_all = laggenerator(1, slow_col, df_all)
    df_all = laggenerator_diff12(1, slow_col, df_all)

# Trends: level lags of 1 and 2 rows, then lag-1 minus lag-12 difference.
for trend_col in trends_cols_keep:
    for lag in (1, 2):
        df_all = laggenerator(lag, trend_col, df_all)
    df_all = laggenerator_diff12(1, trend_col, df_all)

In [68]:
# Replace every remaining missing value (e.g. from shifted lag columns) with 0.
df_all = df_all.fillna(value=0)

In [69]:
# Split by calendar year: train on 2015 <= year < 2019, test on year == 2019.
df = df_all
train = df[df['year'].ge(2015) & df['year'].lt(2019)]
test = df[df['year'].eq(2019)]

# Keep untouched copies (identifier columns included) for later look-ups.
save_train = train.copy()
save_test = test.copy()

# Identifier / calendar columns that must not be fed to the model.
non_feature_cols = ['zip', 'City', 'State', 'Metro', 'CountyName', 'year', 'month', 'year-month']

train = train.drop(columns=non_feature_cols)
train_y = train['zri']
train_X = train.drop(columns=['zri'])

test = test.drop(columns=non_feature_cols)
test_y = test['zri']
test_X = test.drop(columns=['zri'])

In [70]:
# Lasso grid search: pick the alpha with the best 5-fold cross-validated score.
lasso = Lasso()
grid = {'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1, 10]}
lasso_grid = GridSearchCV(estimator=lasso, param_grid=grid, cv=5, n_jobs=-1)
lasso_grid.fit(train_X, train_y)
lasso_best = lasso_grid.best_estimator_



In [71]:
# Fit the selected estimator on the full training set (the cell displays the fitted model).
# NOTE(review): GridSearchCV refits its best estimator by default, so this
# call is probably redundant — confirm before removing.
lasso_best.fit(X=train_X, y=train_y)

Lasso(alpha=0.001)

In [77]:
# Predict on the test rows and line the predictions up with each row's zip
# code and (still scaled) true target on a fresh 0..n-1 index.
scaled_predictions_y = pd.Series(lasso_best.predict(test_X))
temp = pd.concat([save_test['zip'], test_y], axis=1)
temp.reset_index(drop=True, inplace=True)
scaled_predictions_y.reset_index(drop=True, inplace=True)
rstable = pd.concat([temp, scaled_predictions_y], axis=1)
rstable.columns = ['zip', 'zri_test', 'zri_predicted']

# Undo the per-zip standardisation using the scaler registered for each zip
# under the global name ``scaler_<zip>``.
for zipcode in rstable['zip'].unique():
    zip_mask = rstable['zip'] == zipcode
    rstable_filtered = rstable[zip_mask]
    zip_scaler = globals()[f"scaler_{zipcode}"]
    rstable.loc[zip_mask, 'zri_test'] = zip_scaler.inverse_transform(rstable_filtered[['zri_test']])
    rstable.loc[zip_mask, 'zri_predicted'] = zip_scaler.inverse_transform(rstable_filtered[['zri_predicted']])

# Undo the log transform. Both columns are handled the same way in ordinary
# float precision; the previous cast of zri_test to 'float128' relied on a
# dtype that is not available on every platform.
# np.exp can overflow to inf (or underflow to 0) for extreme scaled
# predictions, which is what the RuntimeWarning from this cell signals.
rstable['zri_test'] = np.exp(rstable['zri_test'])
rstable['zri_predicted'] = np.exp(rstable['zri_predicted'])
rstable.isna().sum()

  result = getattr(ufunc, method)(*inputs, **kwargs)


zip              0
zri_test         0
zri_predicted    0
dtype: int64

In [78]:
 rstable.loc[rstable['zri_predicted'].gt(1e4), 'zip']  # zip codes whose back-transformed prediction exceeds 10,000

11613    78521
11625    78550
Name: zip, dtype: int64

In [79]:
# Show every result row for the two zip codes flagged by the > 1e4 check.
rstable[rstable['zip'].isin([78521, 78550])]

Unnamed: 0,zip,zri_test,zri_predicted
11604,78521,762.0,749.918952
11605,78521,765.0,756.702285
11606,78521,770.0,751.515834
11607,78521,777.0,755.655438
11608,78521,787.0,745.257625
11609,78521,793.0,743.243404
11610,78521,790.0,742.416388
11611,78521,782.0,0.0
11612,78521,780.0,0.0
11613,78521,806.5,inf


In [80]:
# Turn +/-inf into NaN, then set every missing value to 0.
# NOTE(review): rows whose prediction was non-finite are therefore scored
# with a prediction of 0 in any metric computed from rstable afterwards.
rstable = rstable.replace([np.inf, -np.inf], np.nan).fillna(0)

In [81]:
# Results: R^2 (printed as a percentage) and RMSE on the back-transformed values.
actual_zri = rstable['zri_test']
predicted_zri = rstable['zri_predicted']
r2 = r2_score(actual_zri, predicted_zri)
rmse = sqrt(mean_squared_error(actual_zri, predicted_zri))
print(f'R2: {r2*100}')
print(f'RMSE: {rmse}')

R2: 89.0179840686816
RMSE: 220.47801234312107


In [82]:
# One row per model feature with its signed and absolute Lasso coefficient,
# ordered from largest to smallest absolute coefficient.
coef = lasso_best.coef_
importance = np.abs(coef)
feature_names = train_X.columns
feature_importances = pd.DataFrame([feature_names, importance, coef]).T
feature_importances.columns = ['feature_names', 'coef_abs', 'coef']
feature_importances = feature_importances.sort_values(by='coef_abs', ascending=False)

# Parent feature = the name up to the first '_lag', then up to the first
# '_diff' of what remains (names without either suffix are kept as-is).
feature_importances['Parent_feature'] = feature_importances['feature_names'].map(
    lambda name: name.split('_lag', 1)[0].split('_diff', 1)[0]
)

In [83]:
# Mean absolute coefficient per parent feature, largest first.
# .copy() makes feats_agg an independent frame, so the dtype assignment below
# no longer triggers pandas' SettingWithCopyWarning.
feats_agg = feature_importances[['coef_abs', 'Parent_feature']].copy()
feats_agg['coef_abs'] = feats_agg['coef_abs'].astype('float')
feats_agg_abs = feats_agg.groupby('Parent_feature').agg('mean').sort_values('coef_abs', ascending=False)
feats_agg_abs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feats_agg['coef_abs'] = feats_agg['coef_abs'].astype('float')


Unnamed: 0_level_0,coef_abs
Parent_feature,Unnamed: 1_level_1
unemployment,0.059703
startup_firms,0.056099
net_job_rate,0.049752
state_local_perc,0.045676
bs_total_stations,0.031636
twitter,0.031635
fake news,0.027183
total_pop,0.017073
black lives matter,0.017035
whole foods,0.016266


In [84]:
# Mean signed coefficient per parent feature, largest first.
# .copy() makes feats_agg an independent frame, so the dtype assignment below
# no longer triggers pandas' SettingWithCopyWarning.
feats_agg = feature_importances[['coef', 'Parent_feature']].copy()
feats_agg['coef'] = feats_agg['coef'].astype('float')
feats_agg_sign = feats_agg.groupby('Parent_feature').agg('mean').sort_values('coef', ascending=False)
feats_agg_sign

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feats_agg['coef'] = feats_agg['coef'].astype('float')


Unnamed: 0_level_0,coef
Parent_feature,Unnamed: 1_level_1
multifamily_for_rent_searches,0.01601
starbucks,0.015903
net_job_rate,0.013938
fake news,0.011068
hurricane,0.010318
flood,0.009734
startup_firms,0.009331
black lives matter,0.008595
poverty_rate,0.008302
fire,0.007798


In [85]:
# Write the prediction table, the per-feature coefficients and the
# signed per-parent aggregation to the Residuals folder as CSV files.
title = '04_predicted_Final_All'
fi = title+'_FI'
fi2 = title+'_FI_Agg_sign'
for frame, stem in ((rstable, title), (feature_importances, fi), (feats_agg_sign, fi2)):
    frame.to_csv(f'../../../zillow_orientation/Residuals/{stem}.csv')