In [19]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from statsmodels.stats.outliers_influence import variance_inflation_factor

## Load main frame

In [2]:
# Read the combined dataset. The first CSV column is used as the row index
# and the 'Time' column is parsed into datetimes.
all_vars = pd.read_csv(
    'all_vars.csv',
    index_col=0,
    parse_dates=['Time'],
)
# all_vars.head()

In [3]:
## Set target here: name of the column in all_vars that is used as the
## regression target by the cells below.
target_name: str = 'zori_ssa'

In [17]:
# ## View all of the info
# all_vars.info()

In [5]:
## Pull out the target series and split the columns by dtype.
## Note: the "numerical" frame also picks up datetime columns (e.g. 'Time'),
## and the "categorical" frame is simply everything that is not
## float64/int64 -- so datetime columns appear in both frames.
numerical_features = all_vars.select_dtypes(include=['datetime', 'float64', 'int64'])
categorical_features = all_vars.select_dtypes(exclude=['float64', 'int64'])
target = all_vars[target_name]

In [18]:
# numerical_features.info()

## First Multilinear model - only numerical

In [8]:

# housing_df_standard_scale=pd.DataFrame(StandardScaler().fit_transform(housing_df))

In [10]:
## Remove 'Time' and 'zip_code' so they are neither scaled nor used as model
## inputs. errors='ignore' keeps this cell re-runnable: the frame is
## reassigned to its own name, so on a second execution the columns are
## already gone and a plain drop would raise KeyError.
numerical_features = numerical_features.drop(columns=['Time', 'zip_code'], errors='ignore')

## Scale all variables (the target column is still part of the frame here).
## NOTE(review): the scaler is fit on every row, before the train/test split
## performed later in the notebook, so test-set statistics leak into the
## scaling. Consider fitting the scaler on the training rows only.
scaler = StandardScaler()
scaled_frame = pd.DataFrame(
    scaler.fit_transform(numerical_features),
    columns=numerical_features.columns,
    # Keep the original row labels so the scaled rows can still be aligned
    # with all_vars / categorical_features.
    index=numerical_features.index,
)

In [12]:
## Separate target from dataframe.
## Both sides use target_name: with a hard-coded column in the drop, changing
## target_name would leave the new target column inside X and leak it into
## the features.
y = scaled_frame[target_name]
X = scaled_frame.drop(columns=[target_name])

In [13]:
# ## Linear Model -- baseline on every scaled numerical feature
# Scores recorded from the last run:
#   train_score: 0.23589181089194566
#   test_score: 0.21079698925917212

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
lm = LinearRegression().fit(X_train, y_train)
print(f'train_score: {lm.score(X_train, y_train)}')
print(f'test_score: {lm.score(X_test, y_test)}')

train_score: 0.23589181089194566
test_score: 0.21079698925917212


In [16]:
## Coefficient table for the model fitted above, ranked by absolute size

coef_table = (
    pd.DataFrame({'Feature': X.columns, 'Coef': lm.coef_})
    .assign(AbsVal=lambda frame: frame['Coef'].abs())
    .sort_values('AbsVal', ascending=False)
)
top10 = coef_table.head(10)
top10

Unnamed: 0,Feature,Coef,AbsVal
48,tx_is_worse,-1441243000000.0,1441243000000.0
44,tx_is_a_little_worse,961408000000.0,961408000000.0
45,tx_is_a_lot_worse,594705900000.0,594705900000.0
37,monthly_avg_gas_price,18.51607,18.51607
24,Retail Gasoline Price TX,-18.38947,18.38947
29,Nonfarm Employment Texas,-12.12122,12.12122
13,Nonfarm Employment TX,11.8454,11.8454
49,fatalities,-0.7999325,0.7999325
50,caseCount,0.7862346,0.7862346
1,maxtempC,-0.668868,0.668868


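#### Putting all numerical features in, train_score (R²) is 23.5892% and test_score (R²) is 21.0797%

The coefficients of order 1e11–1e12 on the `tx_is_*worse` features point to near-perfect collinearity among the inputs, which is why the next section screens the features with VIF.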

## Testing multicollinearity with VIF

In [None]:
def calc_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; each column is scored against all the other columns.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X``, in the original column order, with the
        column name in ``'variables'`` and its score in ``'VIF'``.
    """
    design = X.values
    scores = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ]
    return pd.DataFrame({'variables': X.columns, 'VIF': scores})

In [53]:
# Backward elimination on VIF: start from the full feature set and keep
# discarding the single worst feature until no remaining feature has a
# VIF above 10.
X_reduced = X.copy()
cnt = 1  # pass counter, only used by the optional progress print

while True:
    # Rank the remaining features, worst (highest VIF) first
    vif_frame = (
        calc_vif(X_reduced)
        .sort_values('VIF', ascending=False)
        .reset_index()
    )
    if not (vif_frame.loc[0, 'VIF'] > 10):
        break
    # print(f'run: {cnt}, shape: {X_reduced.shape}')
    X_reduced = X_reduced.drop(columns=vif_frame.loc[0, 'variables'])
    cnt += 1

# Surviving features: as a frame, and as names ordered by descending VIF
X_vif10 = X_reduced
vif10_list = vif_frame['variables'].to_list()

In [54]:
# Same backward elimination as for the VIF-10 set, with the stricter
# cutoff: restart from the full feature set and discard the worst feature
# until no remaining feature has a VIF above 5.
X_reduced = X.copy()
cnt = 1  # pass counter, only used by the optional progress print

while True:
    # Rank the remaining features, worst (highest VIF) first
    vif_frame = (
        calc_vif(X_reduced)
        .sort_values('VIF', ascending=False)
        .reset_index()
    )
    if not (vif_frame.loc[0, 'VIF'] > 5):
        break
    # print(f'run: {cnt}, shape: {X_reduced.shape}')
    X_reduced = X_reduced.drop(columns=vif_frame.loc[0, 'variables'])
    cnt += 1

# Surviving features: as a frame, and as names ordered by descending VIF
X_vif5 = X_reduced
vif5_list = vif_frame['variables'].to_list()

In [55]:
# Refit the linear model on the features kept by the VIF-10 screen, using
# the same 33% hold-out and seed as the baseline so scores are comparable.
# The printed header names the elimination cutoff.
X_train, X_test, y_train, y_test = train_test_split(
    X_vif10, y, test_size=0.33, random_state=42
)
lm = LinearRegression().fit(X_train, y_train)
print('VIF greater than 10')
print(f'train_score: {lm.score(X_train, y_train)}')
print(f'test_score: {lm.score(X_test, y_test)}')

VIF greater than 10
train_score: 0.2031480864669415
test_score: 0.1738112155826106


In [56]:
# Refit the linear model on the features kept by the VIF-5 screen, using
# the same 33% hold-out and seed as the baseline so scores are comparable.
# The printed header names the elimination cutoff.
X_train, X_test, y_train, y_test = train_test_split(
    X_vif5, y, test_size=0.33, random_state=42
)
lm = LinearRegression().fit(X_train, y_train)
print('VIF greater than 5')
print(f'train_score: {lm.score(X_train, y_train)}')
print(f'test_score: {lm.score(X_test, y_test)}')

VIF greater than 5
train_score: 0.20018106377819123
test_score: 0.16992138468043017


In [58]:
## Coefficient table, ranked by absolute size.
## lm and X_test are whatever the most recently executed fit cell left
## behind; run in notebook order, that is the VIF-5 model.

coef_table = (
    pd.DataFrame({'Feature': X_test.columns, 'Coef': lm.coef_})
    .assign(AbsVal=lambda frame: frame['Coef'].abs())
    .sort_values('AbsVal', ascending=False)
)
top10 = coef_table.head(10)
top10

Unnamed: 0,Feature,Coef,AbsVal
0,maxtempC,-0.207766,0.207766
12,Gross Value Natural Gas Production,-0.18605,0.18605
9,Existing Single Family Home Sales TX,0.155406,0.155406
21,tx_is_dont_know,0.139795,0.139795
29,taxpayer_org_type_foreign,0.131856,0.131856
24,outlet_org_type_cl,0.12342,0.12342
25,outlet_org_type_is,-0.110677,0.110677
11,Gross Value Crude Oil Production,0.099112,0.099112
34,sap_case_shiller_index,0.097008,0.097008
32,total_sales_tax,-0.089613,0.089613
