 # Feature Selection Using Lasso

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Custom functions
from sample_panel.merge_datasets import merge_bank_macro_datasets
from supervised_learning.cross_validation import PanelDataSplit
from supervised_learning.cross_validation import search_best_model
from supervised_learning.cross_validation import Lasso_chosen_features
from supervised_learning.cross_validation import find_Lasso_coef

from supervised_learning.estimate_errors import estimate_median_relative_error
from supervised_learning.estimate_errors import estimate_mean_relative_error
from supervised_learning.estimate_errors import estimate_errors

## Preparing Data

In [3]:
# Load the bank panel data (the bank-level input of the merge performed below).
# NOTE(review): presumably one row per bank and report date — confirm against the CSV.
bank_data = pd.read_csv('df_response_vars.csv')

In [4]:
# Load macroeconomic data
macro_data = pd.read_csv('macro_features.csv')
macro_columns = macro_data.columns

# Lagged factors are not used in the model: keep only the columns
# whose name does not contain '_lag'
new_macro_columns = list(filter(lambda name: '_lag' not in name, macro_columns))
macro_data = macro_data[new_macro_columns]

In [5]:
# Load PCA components.
# The model definitions below use every column of this frame except the last one as features.
pca_data = pd.read_csv('macro_pca_df.csv')

In [6]:
# Load additional macro variables and replace line breaks inside the
# column names with spaces
macro_data1 = pd.read_csv('macro_most_inf_df.csv').rename(
    columns=lambda name: name.replace('\n', ' ')
)

In [7]:
# Merge the bank panel with the macroeconomic indicators, the PCA components and the
# additional macro variables into a single data set used for modelling.
data_set = merge_bank_macro_datasets(bank_data, macro_data, pca_data, macro_data1)

In [8]:
# Drop the rows where 'Provision_Lag1' is missing (NaNs created by lagging the
# response variable) and renumber the index from zero
data_set = data_set.dropna(subset=['Provision_Lag1']).reset_index(drop=True)

## Parameters

In [9]:
# Response variable: name of the target column in the merged data set
y_col = 'Provision for Loan Lease Losses as % of Aver. Assets'

In [10]:
# Candidate feature sets.
# There is no need to enumerate every possible combination of factors: within each set
# Lasso chooses the model structure on its own, keeping the factors whose
# coefficients are not equal to zero.
shared_factors = ['Provision_Lag1', 'Real GDP growth_ema3', 'BBB corporate yield',
                  '3-month Treasury rate change', 'Dow Jones Total Stock Market Index change']

# Every column except the last one is used as a feature.
# NOTE(review): presumably the last column is a date/key column — confirm.
pca_factors = list(pca_data.columns[:-1])            # all pca components
extra_macro_factors = list(macro_data1.columns[:-1]) # additional macro indicators

models = {
    'model1': shared_factors + ['Market Volatility Index'],
    'model2': shared_factors + ['Market Volatility Index change'],
    'model3': ['Provision_Lag1'] + pca_factors,
    'model4': ['Provision_Lag1'] + extra_macro_factors,
}

## Additional features

In [11]:
# Fixed Effects (individual intercept for each bank): one dummy column per bank ID,
# named 'IDRSSD1_<id>'. drop_first=True omits the first bank, which acts as the baseline.
fixed_effects_dummies = pd.get_dummies(data_set['IDRSSD'], prefix='IDRSSD1', drop_first=True)

# Remove dummies left over from a previous run of this cell, so that re-running it
# does not append a second, duplicated set of 'IDRSSD1_*' columns.
stale_dummy_columns = [col for col in data_set.columns if col.startswith('IDRSSD1_')]
data_set = pd.concat([data_set.drop(columns=stale_dummy_columns), fixed_effects_dummies], axis=1)

fixed_effects_features = [col for col in data_set.columns if col.startswith('IDRSSD1_')]

## Train data set

In [12]:
# The last year is held out as the test set, so it must not influence feature selection.
# NOTE(review): the comparison is made against a date string — confirm that
# 'Report Date' holds ISO-formatted strings or datetimes.
train_cutoff = '2021-12-31'
report_dates = data_set['Report Date']

data_set_train = data_set.loc[report_dates <= train_cutoff].copy()
data_set_test = data_set.loc[report_dates > train_cutoff].copy()

### Removing outliers from the train set

In [13]:
# Keep only the train observations whose response lies between the 0.5th and the
# 99th percentile (both bounds inclusive).
# NOTE(review): the bounds are asymmetric (0.5 vs 99) — confirm this is intended.
# This cell overwrites data_set_train, so re-running it trims the sample again.
train_response = data_set_train[y_col]
lower_limit = np.percentile(train_response, 0.5)
upper_limit = np.percentile(train_response, 99)

within_limits = train_response.between(lower_limit, upper_limit)
data_set_train = data_set_train.loc[within_limits].reset_index(drop=True)

## Pipeline

In [14]:
# Lasso penalizes the magnitude of the coefficients, so the features are standardized
# first: with a common scale the coefficients have comparable magnitudes and the
# regularization does not depend on the units each feature is measured in.
scaler = StandardScaler()
lasso_model = Lasso(max_iter=10000)

pipeline_steps = [
    ("scaler", scaler),
    ("lasso", lasso_model),
]
pipeline = Pipeline(steps=pipeline_steps)

## Defining cross validator and GridSearchCV

In [15]:
# Custom cross-validator that splits panel data along the date axis
# (based on sklearn.model_selection.TimeSeriesSplit)
panel_cv = PanelDataSplit(
    n_splits=5,
    test_size=4,
    date_axis=data_set_train['Report Date'],
)

In [16]:
# Regularization strengths tried for the 'lasso' step of the pipeline
lasso_alphas = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.75, 1]
param_grid = {'lasso__alpha': lasso_alphas}

# Grid search scored by R^2 on the panel cross-validation splits
search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=panel_cv, scoring='r2')

##  Model Selection

In [17]:
# Select the best candidate feature set on the train data.
# NOTE(review): presumably runs `search` once per entry of `models`; `estimators` is later
# indexed positionally (estimators[0] ... estimators[3]), which assumes it follows the
# order of the `models` dict — confirm in search_best_model.
best_model_name, best_score, best_model, models_results, estimators = \
    search_best_model(data_set_train, models, search, y_col, fixed_effects_features)

In [18]:
# Standard error of the mean of the cross-validation R^2:
# standard deviation across the splits divided by sqrt(number of splits)
n_cv_splits = panel_cv.get_n_splits()
r2_std = models_results['Cross-Validation R^2 std']
models_results['Cross-Validation R^2 Standard Error of the Mean'] = r2_std / n_cv_splits**0.5

models_results

Unnamed: 0,Cross-Validation R^2,Cross-Validation R^2 std,Best Hyperparameters,Cross-Validation R^2 Standard Error of the Mean
model1,0.438239,0.538243,{'lasso__alpha': 0.01},0.240709
model2,0.493577,0.426238,{'lasso__alpha': 0.01},0.190619
model3,0.315647,0.523222,{'lasso__alpha': 0.1},0.233992
model4,0.416651,0.621058,{'lasso__alpha': 0.01},0.277746


##  Features chosen with Lasso

### Model1

In [19]:
# Lasso coefficients of the model1 factors
# (estimators[0] is assumed to belong to 'model1' — same order as the `models` dict)
Lasso_chosen_features(
    estimators[0],
    models['model1'],
    y_col,
    fixed_effects_features,
    data_set_train,
)

Unnamed: 0,factors,coef
0,Provision_Lag1,0.400529
1,Real GDP growth_ema3,-0.08687
2,BBB corporate yield,0.053627
3,3-month Treasury rate change,-0.03527
4,Dow Jones Total Stock Market Index change,0.0
5,Market Volatility Index,0.040364


### Model2

In [20]:
# Lasso coefficients of the model2 factors
# (estimators[1] is assumed to belong to 'model2' — same order as the `models` dict)
Lasso_chosen_features(
    estimators[1],
    models['model2'],
    y_col,
    fixed_effects_features,
    data_set_train,
)

Unnamed: 0,factors,coef
0,Provision_Lag1,0.400867
1,Real GDP growth_ema3,-0.099587
2,BBB corporate yield,0.054853
3,3-month Treasury rate change,-0.053521
4,Dow Jones Total Stock Market Index change,-0.0
5,Market Volatility Index change,-0.0


### Model3

In [21]:
# Lasso coefficients of the model3 factors
# (estimators[2] is assumed to belong to 'model3' — same order as the `models` dict)
Lasso_chosen_features(
    estimators[2],
    models['model3'],
    y_col,
    fixed_effects_features,
    data_set_train,
)

Unnamed: 0,factors,coef
0,Provision_Lag1,0.494657
1,PC1,-0.0
2,PC2,0.008507
3,PC3,-0.0
4,PC4,0.0
5,PC5,0.0
6,PC6,0.0
7,PC7,0.0
8,PC8,0.0
9,PC9,-0.0


### Model4

In [22]:
# Lasso coefficients of the model4 factors
# (estimators[3] is assumed to belong to 'model4' — same order as the `models` dict)
factors = Lasso_chosen_features(
    estimators[3],
    models['model4'],
    y_col,
    fixed_effects_features,
    data_set_train,
)

# Show only the factors that Lasso kept (non-zero coefficient)
kept_by_lasso = factors['coef'].ne(0)
factors[kept_by_lasso]

Unnamed: 0,factors,coef
0,Provision_Lag1,0.381043
3,Japan bilateral dollar exchange rate (yen/USD),-0.005234
4,Euro area bilateral dollar exchange rate (USD/...,0.018461
5,NBER_Recession_Indicator_Peak_through_Trough,0.031737
6,Commercial_Banks_Treasury_and_Agency_Securities,0.016445
8,Real disposable income growth,-0.016507
9,U.K. bilateral dollar exchange rate (USD/pound),-0.050345
12,Unemployment rate,0.099282
17,BBB corporate yield,0.076716
18,Households_Net_Worth,-0.014169


##   The chosen model's performance on the test dataset

In [23]:
# Name of the feature set returned as the best one by search_best_model
best_model_name

'model2'

In [24]:
# Estimator returned as the best one by search_best_model
best_model

Pipeline(steps=[('scaler', StandardScaler()),
                ('lasso', Lasso(alpha=0.01, max_iter=10000))])

In [25]:
# Fit the pipeline on the whole train set.
# The feature set is looked up through best_model_name instead of a hard-coded key,
# so this cell stays consistent with the model selection if the winner changes.
model_factors_all = models[best_model_name] + fixed_effects_features
X_train = data_set_train[model_factors_all]
y_train = data_set_train[y_col]
best_model.fit(X_train, y_train)

# In-sample predictions, used for the train-sample error measures
y_pred = best_model.predict(X_train)

### Train sample

In [26]:
# Error measures on the train sample (y_pred holds the in-sample predictions at this point;
# the same name is overwritten with the test predictions further down)
estimate_errors(y_train, y_pred, lower_limit, upper_limit)

Unnamed: 0,measure
R squared,0.651785
RMSE,0.512533
"median relative error, %",43.443012


### Test sample

In [27]:
# Predict the held-out period with the pipeline fitted on the train set
# (y_pred is overwritten: from here on it holds the test predictions)
y_test = data_set_test[y_col]
X_test = data_set_test.loc[:, model_factors_all]
y_pred = best_model.predict(X_test)

In [28]:
# Error measures on the test sample; lower_limit / upper_limit are the outlier bounds
# computed on the train set
estimate_errors(y_test, y_pred, lower_limit, upper_limit)

Unnamed: 0,measure
R squared,0.746256
RMSE,0.264592
"median relative error, %",34.475139
