*******
# Inference

In [1]:
# import tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV, train_test_split
import sklearn.linear_model as lm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# change directory
os.chdir('C:/Users/hubst/ECON_490/Final Project')

# increase size of figures
sns.set(rc = {'axes.titlesize': 20,
              'axes.labelsize': 15,
              'xtick.labelsize': 10,
              'ytick.labelsize': 10,
              'figure.figsize': (10, 5)})

In [2]:
df = pd.read_pickle('C:/Users/hubst/ECON_490/Final Project/final_project.pkl')

In [3]:
y = df['log Unemployment Rate']
x = df.drop(columns = 'log Unemployment Rate')

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 3/10, random_state = 490)

x_train_std = pd.DataFrame(StandardScaler().fit(x_train).transform(x_train), 
                           columns = x_train.columns, index = x_train.index)
x_test_std = pd.DataFrame(StandardScaler().fit(x_test).transform(x_test), 
                          columns = x_test.columns, index = x_test.index)

x_train_std = sm.add_constant(x_train_std)
x_test_std = sm.add_constant(x_test_std)
x_train = sm.add_constant(x_train)
x_test = sm.add_constant(x_test)

In [4]:
param_grid = {'alpha': 10.**np.arange(-10, -1, 1)}

cv_enet = lm.ElasticNet(fit_intercept = False, normalize = False,
                        random_state = 490)
grid_search = GridSearchCV(cv_enet, param_grid, cv = 5,
                           scoring = 'neg_root_mean_squared_error',
                           n_jobs = 10)
grid_search.fit(x_train_std, y_train)
best_ols = grid_search.best_params_
best_ols

{'alpha': 0.001}

In [5]:
lasso_model = sm.OLS(y_train, x_train_std).fit_regularized(alpha = best_ols['alpha'])
lasso_model.params

const                                    0.590848
Female Labor Force Participation Rate    0.016863
Median Age                               0.000000
Average Household Size                   0.032254
Median Household Income                 -0.000124
Average Rent                             0.015247
Average Commute Time                     0.006330
Metro Status                             0.000000
cube Female                              0.003942
log African American                     0.006214
log Hispanic                             0.000000
log Bachelor's or more                  -0.010293
log Agriculture and Mining              -0.024129
log Poverty Level                        0.075322
log Immigrant                           -0.002630
log Manufacturing                       -0.003822
dtype: float64

In [6]:
x_train_std = x_train_std.drop(columns = ['Median Age', 'Metro Status', 'log Hispanic'])
x_test_std = x_test_std.drop(columns = ['Median Age', 'Metro Status', 'log Hispanic'])

In [7]:
model = sm.OLS(y_train, x_train_std).fit()
print(model.summary2())

                           Results: Ordinary least squares
Model:                    OLS                       Adj. R-squared:         0.404     
Dependent Variable:       log Unemployment Rate     AIC:                    -1471.7320
Date:                     2021-04-19 20:39          BIC:                    -1408.4078
No. Observations:         964                       Log-Likelihood:         748.87    
Df Model:                 12                        F-statistic:            55.44     
Df Residuals:             951                       Prob (F-statistic):     7.59e-101 
R-squared:                0.412                     Scale:                  0.012551  
--------------------------------------------------------------------------------------
                                       Coef.  Std.Err.    t     P>|t|   [0.025  0.975]
--------------------------------------------------------------------------------------
const                                  0.5918   0.0036 164.0255 0.0000 

In [8]:
print(10**0.0341 - 1)
print(10**0.0719 - 1)
print(10**-0.0243 - 1)

0.08168298933793317
0.18004888806896657
-0.05441625040245435


Average Household Size:

A one standard deviation increase in the average household size in a county is associated with an increase in the unemployment rate by 0.081683.

Log Poverty Level:

A one standard deviation increase in the log of (proportion of individuals within a county living at or below the poverty level + 1) is associated with an increase in the unemployment rate by 0.180049.

Log Agriculture and Mining:

A one standard deviation increase in the log of (proportion of adults within a county working in agriculture or mining occupations + 1) is associated with a decrease in the unemployment rate by 0.054416.

*******
# Prediction

In [9]:
rmse_inf = mean_squared_error(model.predict(x_test_std), y_test, squared = False)
rmse_inf

0.11582119920527055

# SVM

In [10]:
param_grid1 = {
    'C': 10.**np.arange(2, 4, step = 1),
    'degree': [1, 2, 3],
    'epsilon': 10.**np.arange(-10, -8, step = 1)
}

poly_cv = SVR(kernel = 'poly')

grid_search1 = GridSearchCV(poly_cv, param_grid1, cv = 5,
                           scoring = 'neg_root_mean_squared_error', 
                            n_jobs = 10).fit(x_train_std, y_train)

best_poly = grid_search1.best_params_
best_poly

{'C': 1000.0, 'degree': 1, 'epsilon': 1e-10}

In [11]:
svmr_poly = SVR(kernel = 'poly', C = best_poly['C'], epsilon = best_poly['epsilon'],
               degree = best_poly['degree']).fit(x_train_std, y_train)

In [12]:
rmse_poly = mean_squared_error(svmr_poly.predict(x_test_std), y_test, squared = False)
rmse_poly

0.11670821874878633

In [13]:
param_grid = {
    'C': 10.**np.arange(3, 5, step = 1), 
    'gamma': 10.**np.arange(-4, -2, step = 1),
}

rbf_cv = SVR(kernel = 'rbf')

grid_search = GridSearchCV(rbf_cv, param_grid, cv = 5,
                           scoring = 'neg_root_mean_squared_error', 
                           n_jobs = 10).fit(x_train_std, y_train)

best_rbf = grid_search.best_params_
best_rbf

{'C': 1000.0, 'gamma': 0.0001}

In [14]:
svmr_rbf = SVR(kernel = 'rbf', C = best_rbf['C'],
               gamma = best_rbf['gamma']).fit(x_train_std, y_train)

In [15]:
rmse_rbf = mean_squared_error(svmr_rbf.predict(x_test_std), y_test, squared = False)
rmse_rbf

0.11464457574785258

# XGBoost

In [16]:
x_train_train, x_train_test, y_train_train, y_train_test = train_test_split(x_train_std, y_train,
                                                                            train_size = 4/5,
                                                                            random_state = 490)

In [17]:
reg_xgb = xgb.XGBRegressor(n_estimators = 200,
                           max_depth = 2,
                           learning = 0.1,
                           random_state = 490)

reg_xgb.fit(x_train_train, y_train_train,
            eval_set = [(x_train_test, y_train_test)],
            early_stopping_rounds = 4)

Parameters: { learning } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:0.13474
[1]	validation_0-rmse:0.12572
[2]	validation_0-rmse:0.12172
[3]	validation_0-rmse:0.11955
[4]	validation_0-rmse:0.11744
[5]	validation_0-rmse:0.11667
[6]	validation_0-rmse:0.11621
[7]	validation_0-rmse:0.11727
[8]	validation_0-rmse:0.11630
[9]	validation_0-rmse:0.11552
[10]	validation_0-rmse:0.11639
[11]	validation_0-rmse:0.11770
[12]	validation_0-rmse:0.11744
[13]	validation_0-rmse:0.11701


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='', learning=0.1,
             learning_rate=0.300000012, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=12, num_parallel_tree=1, random_state=490,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
best_xgboost = reg_xgb.best_iteration
best_xgboost

9

In [19]:
xgboost_refit = xgb.XGBRegressor(n_estimators = best_xgboost,
                                 max_depth = 2,
                                 learning = 0.1,
                                 random_state = 490)

xgboost_refit.fit(x_train_train, y_train_train,
                  eval_set = [(x_train_test, y_train_test)],
                  early_stopping_rounds = 4)

Parameters: { learning } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:0.13474
[1]	validation_0-rmse:0.12572
[2]	validation_0-rmse:0.12172
[3]	validation_0-rmse:0.11955
[4]	validation_0-rmse:0.11744
[5]	validation_0-rmse:0.11667
[6]	validation_0-rmse:0.11621
[7]	validation_0-rmse:0.11727
[8]	validation_0-rmse:0.11630


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='', learning=0.1,
             learning_rate=0.300000012, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=9, n_jobs=12, num_parallel_tree=1, random_state=490,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [20]:
rmse_xgb = mean_squared_error(xgboost_refit.predict(x_test_std), y_test, squared = False)
rmse_xgb

0.12433332211428724

# Stacking

In [21]:
# can't use OLS in stacking so used linearregression insead
estimators = [
    ('xgb', xgb.XGBRegressor(n_estimators = best_xgboost,
                             max_depth = 2,
                             learning = 0.1,
                             random_state = 490)),
    ('ols', LinearRegression()),
    ('svm', SVR(kernel = 'rbf', C = best_rbf['C'],
     gamma = best_rbf['gamma']))
]
stack = StackingRegressor(estimators = estimators, 
                          final_estimator = RandomForestRegressor(n_estimators = 50,
                                                                  max_features = 'sqrt',
                                                                  max_depth = 2,
                                                                  random_state = 490), cv = 5)
stack.fit(x_train_std, y_train)

Parameters: { learning } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { learning } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down 

StackingRegressor(cv=5,
                  estimators=[('xgb',
                               XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning=0.1, learning_rate=None,
                                            max_delta_step=None, max_depth=2,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=9, n_jobs=None,
                                            num_parallel_tree=None,
                                            ra

In [22]:
rmse_stack = mean_squared_error(stack.predict(x_test_std), y_test, squared = False)
rmse_stack

0.11735549308532334

*******
# Comparison

Model | RMSE
------|------
OLS	| 0.115821
SVM RBF | 0.114645
SVM Poly | 0.116708
XGBoost | 0.124333
Stacking | 0.117355

Flexibility (from most to least flexible):

1. Stacking
2. XGBoost
3. SVM
4. OLS

Ease of Interpretation (from most interpretable to least interpretable):

1. OLS
2. SVM
3. XGBoost
4. Stacking

The best performing model was SVM using an RBF kernel.