In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import helper
from statsmodels.stats.outliers_influence import variance_inflation_factor
import missingno as msno
from sklearn.metrics import mean_squared_error

## Load main frame

In [2]:
all_vars = pd.read_csv('all_vars2.csv', index_col=0, parse_dates=['Time'])
# all_vars.head()

In [3]:
## Set target here:
target_name = 'zori_ssa'

In [4]:
# ## View all of the info
all_vars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13054 entries, 0 to 6684
Columns: 273 entries, Time to male_75_to_79
dtypes: datetime64[ns](1), float64(267), int64(1), object(4)
memory usage: 27.3+ MB


In [5]:
## Get target, categorical and numerical features
target = all_vars[target_name]
categorical_features = all_vars.select_dtypes(exclude=['float64', 'int64'])
numerical_features = all_vars.select_dtypes(['datetime','float64', 'int64'])

## First Multilinear model - only numerical

In [7]:
# housing_df_standard_scale=pd.DataFrame(StandardScaler().fit_transform(housing_df))

In [8]:
numerical_features = numerical_features.drop(['Time', 'zip_code'], axis=1)

## Scale all variables
scaler = StandardScaler()
scaled_frame = pd.DataFrame(scaler.fit_transform(numerical_features), columns=list(numerical_features.columns))

In [9]:
## Separate target from dataframe
y = scaled_frame[target_name]
X = scaled_frame.drop(['zori_ssa'], axis=1)

In [24]:
# ## Linear Model
# train_score: 0.23589181089194566
# test_score: 0.21079698925917212
    
lm = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
lm.fit(X_train, y_train)
print(f'train_score: {lm.score(X_train, y_train)}')
print(f'test_score: {lm.score(X_test, y_test)}')
pred = lm.predict(X_test)

# ## getting RMSE doesn't work because need to inferse scale
# pred_reverse = scaler.inverse_transform(pred)
# y_test_reverse = scaler.inverse_transform(y_test)
rmse = (np.sqrt(mean_squared_error(y_test, pred)))
rmse
print(f'rmse (unscaled): {rmse}')

train_score: 0.8138681067627667
test_score: 0.7931264953585497
rmse (unscaled): 0.44601539021466324


In [11]:
# top50['Feature'].to_list()

In [12]:
## Print coefficient table

coef_table = pd.DataFrame({'Feature':X.columns, 'Coef':lm.coef_, 'AbsVal':np.abs(lm.coef_)})
coef_table = coef_table.sort_values('AbsVal', ascending=False)
top50 = coef_table.head(50)

Unnamed: 0,Feature,Coef,AbsVal
37,monthly_avg_gas_price,25.300655,25.300655
24,Retail Gasoline Price TX,-25.299787,25.299787
29,Nonfarm Employment Texas,-24.19043,24.19043
13,Nonfarm Employment TX,23.866746,23.866746
131,quintile_5_income_share,-11.769353,11.769353
87,pop_determined_poverty_status,-6.320282,6.320282
255,gini_coeficient,5.609941,5.609941
89,gini_index,5.609941,5.609941
189,total_population_in_occupied_units,4.096149,4.096149
184,total_population_in_owner_occupied_units,3.995384,3.995384


#### Putting all numerical features in, train_score is 23.5899% and test score is 21.0939%

## Testing multi-colinearity with VIF

In [13]:
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

In [14]:
# # initialize X with all variables
# X_reduced = X.copy()

# cnt = 1
# # Run vif with max 10
# vif_frame = calc_vif(X_reduced).sort_values('VIF', ascending=False).reset_index()
# while (vif_frame.loc[0,'VIF'] > 10):
#     print(f'run: {cnt}, shape: {X_reduced.shape}')
#     X_reduced = X_reduced.drop(vif_frame.loc[0,'variables'], axis=1)
#     vif_frame = calc_vif(X_reduced).sort_values('VIF', ascending=False).reset_index()
#     cnt += 1
# vif10_list = vif_frame['variables'].to_list()
# X_vif10 = X_reduced

In [15]:
# vif10_list

In [16]:
# # initialize X with all variables
# X_reduced = X.copy()

# cnt = 1
# # Run vif with max 10
# vif_frame = calc_vif(X_reduced).sort_values('VIF', ascending=False).reset_index()
# while (vif_frame.loc[0,'VIF'] > 5):
#     print(f'run: {cnt}, shape: {X_reduced.shape}')
#     X_reduced = X_reduced.drop(vif_frame.loc[0,'variables'], axis=1)
#     vif_frame = calc_vif(X_reduced).sort_values('VIF', ascending=False).reset_index()
#     cnt += 1
# vif5_list = vif_frame['variables'].to_list()
# X_vif5 = X_reduced

In [17]:
# vif5 = pd.read_csv('vif_5.csv', index_col=1)
# vif10 = pd.read_csv('vif_10.csv')

### Linear Regression with VIF variables

In [20]:
# lm = LinearRegression()
# X_train, X_test, y_train, y_test = train_test_split(X_vif10, y, test_size=0.33, random_state=42)
# lm.fit(X_train, y_train)
# print('VIF greater than 10')
# print(f'train_score: {lm.score(X_train, y_train)}')
# print(f'test_score: {lm.score(X_test, y_test)}')

In [21]:
# lm = LinearRegression()
# X_train, X_test, y_train, y_test = train_test_split(X_vif5, y, test_size=0.33, random_state=42)
# lm.fit(X_train, y_train)
# print('VIF greater than 5')
# print(f'train_score: {lm.score(X_train, y_train)}')
# print(f'test_score: {lm.score(X_test, y_test)}')

In [22]:
# ## Print coefficient table

# coef_table = pd.DataFrame({'Feature':X_test.columns, 'Coef':lm.coef_, 'AbsVal':np.abs(lm.coef_)})
# coef_table = coef_table.sort_values('AbsVal', ascending=False)
# top30 = coef_table.head(30)
# top30

### PCA

In [23]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])

In [24]:
principalDf

Unnamed: 0,principal component 1,principal component 2
0,11.873935,-10.197237
1,-0.884364,-3.287247
2,-8.488271,-2.369734
3,3.472410,-0.812510
4,-1.058387,2.500116
...,...,...
13049,10.099415,4.445361
13050,-6.010913,4.329936
13051,-6.827104,0.848958
13052,-5.860584,13.464483


In [25]:
finalDf = pd.concat([principalDf, y], axis = 1)

In [26]:
finalDf

Unnamed: 0,principal component 1,principal component 2,zori_ssa
0,11.873935,-10.197237,0.453464
1,-0.884364,-3.287247,0.608869
2,-8.488271,-2.369734,-0.819182
3,3.472410,-0.812510,-0.676377
4,-1.058387,2.500116,-0.457969
...,...,...,...
13049,10.099415,4.445361,0.461864
13050,-6.010913,4.329936,0.407262
13051,-6.827104,0.848958,0.722273
13052,-5.860584,13.464483,1.453099


### Lasso

In [43]:
# acs = pd.read_csv('cleaned_data/acs2_clean.csv', index_col=0, parse_dates = ['Time'])
acs = pd.read_csv('cleaned_data/acs_clean.csv', index_col=0, parse_dates = ['Time'])
acs2 = pd.read_csv('cleaned_data/acs2_clean.csv', index_col=0, parse_dates = ['Time'])
zri = pd.read_csv('cleaned_data/target.csv', index_col=0, parse_dates=['Time'])

In [44]:
main_frame = helper.time_lag_merge(zri, acs)
main_frame = helper.time_lag_merge(zri, acs2)

In [45]:
main_frame.head()

Unnamed: 0,zip_code,City,State,Time,zori_ssa,zori_ssa_diff,commuting_population,divorced_men,driving_alone_population,housing_units_built_1940_to_1949,...,housing_units_over_50_units,women_with_bachelors_degree,male_45_to_49,housing_units_built_1960_to_1969,housing_units_duplex_owned,married_women_spouse_present,men_with_hs_diploma,quintile_5_mean_income,quintile_2_mean_income,male_75_to_79
0,77494,Houston,TX,2014-01-01,1390.0,,35176.0,959.0,27840.0,115.0,...,506.0,9997.0,3109.0,0.0,0.0,19112.0,1407.0,303338.0,91207.0,250.0
1,77449,Houston,TX,2014-01-01,1202.0,,47123.0,2666.0,38322.0,15.0,...,580.0,5190.0,2612.0,110.0,0.0,17308.0,7542.0,153318.0,42882.0,184.0
2,77084,Houston,TX,2014-01-01,1058.0,,50487.0,2548.0,41739.0,38.0,...,1286.0,6440.0,3664.0,378.0,9.0,18792.0,7972.0,175714.0,41101.0,372.0
3,79936,El Paso,TX,2014-01-01,887.0,,51933.0,3367.0,43398.0,251.0,...,610.0,6311.0,3564.0,794.0,132.0,19443.0,8410.0,155233.0,33693.0,788.0
4,78130,San Antonio,TX,2014-01-01,1118.0,,28403.0,2442.0,22842.0,1026.0,...,870.0,4207.0,1773.0,1768.0,105.0,12144.0,5169.0,147086.0,35776.0,455.0


In [30]:
# acs = acs.drop('zip_code', axis=1)

In [46]:
numerical_features = main_frame.select_dtypes(['datetime','float64', 'int64'])
numerical_features = numerical_features.drop(['Time', 'zip_code'], axis=1)

numerical_features = numerical_features.dropna(axis='index')

In [47]:
## Scale all variables
scaler = StandardScaler()
scaled_frame = pd.DataFrame(scaler.fit_transform(numerical_features), columns=list(numerical_features.columns))

In [48]:
## Separate target from dataframe
y = scaled_frame['zori_ssa']
X = scaled_frame.drop(['zori_ssa'], axis=1)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [50]:
las = Lasso(max_iter = 50000, random_state = 33, alpha=0.001)

In [51]:
las.fit(X_train,y_train)
las.score(X_train,y_train)

0.791106353859387

In [52]:
las.score(X_test,y_test)

0.7890901516542532

In [53]:
las.coef_

array([ 0.3796738 ,  0.16653107, -0.16714311,  0.        ,  0.01562319,
        0.06729562, -0.13400735, -0.03477838, -0.08329014,  0.02846912,
       -0.09196558,  0.07803077, -0.        ,  0.04175991, -0.04539215,
       -0.        , -0.02710115, -0.11224543, -0.12966364, -0.        ,
        0.        ,  0.01381609, -0.0387027 ,  0.        ,  0.02496997,
        0.03870009,  0.00217177, -0.00298803, -0.00174171,  0.        ,
        0.02327549, -0.01287093,  0.02421935, -0.        ,  0.03563519,
       -0.05801574,  0.21380304, -0.        , -0.09187268, -0.        ,
       -0.08803687, -0.        , -0.        , -0.4489888 ,  0.0031907 ,
        0.0962908 , -0.26491812,  0.03322507,  0.        ,  0.        ,
       -0.        , -0.01181071, -0.02861772,  0.08161445,  0.        ,
        0.0912017 , -0.05270922, -0.03065689,  0.        , -0.55344632,
        0.        ,  0.        , -0.07709321, -0.15948258,  0.31882748,
       -0.03259323, -0.02790195,  0.06679385,  0.        ,  0.05

In [54]:

# top10_acs2 = top10.copy()

In [55]:
# print(top10_acs2['Feature'].to_list())

In [57]:
## Print coefficient table

coef_table = pd.DataFrame({'Feature':X.columns, 'Coef':las.coef_, 'AbsVal':np.abs(las.coef_)})
coef_table = coef_table.sort_values('AbsVal', ascending=False)
top50 = coef_table.head(50)
top50

Unnamed: 0,Feature,Coef,AbsVal
86,housing_units_3_to_4_units,-0.656792,0.656792
132,housing_units_3_to_4_units_rented,0.579579,0.579579
105,female_population,0.577824,0.577824
59,married_men_spouse_present,-0.553446,0.553446
43,median_household_income,-0.448989,0.448989
0,zori_ssa_diff,0.379674,0.379674
118,average_household_size,0.355974,0.355974
124,average_household_size_owners,-0.323561,0.323561
64,median_gross_rent,0.318827,0.318827
110,total_household_income,0.318098,0.318098


In [58]:
top50['Feature'].to_list()

['housing_units_3_to_4_units',
 'housing_units_3_to_4_units_rented',
 'female_population',
 'married_men_spouse_present',
 'median_household_income',
 'zori_ssa_diff',
 'average_household_size',
 'average_household_size_owners',
 'median_gross_rent',
 'total_household_income',
 'quintile_4_mean_income',
 'housing_units_single_family_detached',
 'female_40_to_44',
 'quintile_3_upper_limit',
 'carpool_population',
 'housing_units_built_1960_to_1969',
 'top_5_percent_lower_limit',
 'housing_units_20_to_49_units',
 'male_85_over',
 'divorced_men',
 'commuting_population',
 'female_50_to_54',
 'men_with_bachelors_degree',
 'housing_units_single_family_attached_owned',
 'quintile_1_upper_limit',
 'housing_units_10_to_19_units_rented',
 'men_with_professional_degree',
 'male_80_to_84',
 'housing_units_single_family_attached',
 'male_21',
 'housing_units_built_1990_to_1999',
 'housing_units_duplex_owned',
 'total_population_in_owner_occupied_units',
 'male_70_to_74',
 'female_21',
 'female_45_

In [42]:
STOP!

SyntaxError: invalid syntax (2635050600.py, line 1)

In [None]:
top10_acs1 = top10.copy()

In [None]:
# print(top10_acs2['Feature'].to_list())

In [None]:
X_vals = ['zori_ssa_12_month_shift',
       'zori_ssa_13_month_shift', 'zori_ssa_18_month_shift',
       'zori_ssa_24_month_shift','zori_ssa_1_diff_lag_12', 'zori_ssa_6_diff_lag_12',
                      'zori_ssa_12_diff_lag_12'
                       ]
y_val = 'zori_ssa'
# for metro in metros:
train = zri_shift.loc[zri_shift['Time']<datetime.datetime(2020,7,2),:].reset_index(drop=True)
test = zri_shift.loc[zri_shift['Time']>datetime.datetime(2020,7,2),:].reset_index(drop=True)
cat_feats_=[]
scaler = StandardScaler(with_mean=False)
lasso = Lasso(max_iter = 50000, random_state = 33)
X = train[X_vals]
transformer = ColumnTransformer([("Cat",
                                  OneHotEncoder(handle_unknown = 'ignore'),
                                  cat_feats_)], remainder='passthrough')
X = transformer.fit_transform(X)
X = scaler.fit_transform(X)
y = train[y_val]
# Grid Search set up.
alphas = [0.3, 0.6, 1]
tuned_parameters = [{'alpha': alphas}]
print(f'Performing Grid Search with alphas of: {alphas}')
clf = GridSearchCV(lasso, tuned_parameters,
                    cv=3,n_jobs = -1,
                  scoring = 'neg_root_mean_squared_error')
# best_alpha = clf.best_params_['alpha']
clf.fit(X, y)
X_test = test[X_vals]
X_test = transformer.transform(X_test)
X_test = scaler.transform(X_test)
y_test = test[y_val]
y_predicted = clf.predict(X_test)
test.loc[:,'pred_difference'] = test.loc[:,y_val] - y_predicted
rms = mean_squared_error(y_test, y_predicted, squared=False)
test.loc[:,'pred'] = y_predicted