# Supervised Learning: Autocorrelation Time Series Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Custom functions
from sample_panel.merge_datasets import merge_bank_macro_datasets
from supervised_learning.estimate_errors import estimate_errors

from supervised_learning.time_series import create_bank_train_set
from supervised_learning.time_series import choose_bank_best_model
from supervised_learning.time_series import bank_prediction

## Loading Data

In [3]:
# Load bank panel data.
# The path is relative, so the CSV must be in the notebook's working directory.
bank_data = pd.read_csv('df_response_vars.csv')

In [4]:
# Read the macroeconomic feature table
macro_data = pd.read_csv('macro_features.csv')
macro_columns = macro_data.columns

# Lagged factors are not used in the model: flag every column whose name
# contains the '_lag' marker and keep only the remaining ones
is_lagged = macro_columns.str.contains('_lag', regex=False)
new_macro_columns = macro_columns[~is_lagged].tolist()
macro_data = macro_data[new_macro_columns]

In [5]:
# Load PCA components.
# The path is relative, so the CSV must be in the notebook's working directory.
pca_data = pd.read_csv('macro_pca_df.csv')

In [6]:
# Load additional macro variables and normalise their headers in one pass:
# every newline inside a column name is replaced by a single space
macro_data1 = (
    pd.read_csv('macro_most_inf_df.csv')
    .rename(columns=lambda name: name.replace('\n', ' '))
)

In [7]:
# Merge the bank panel and macroeconomic indicators.
# Inputs: bank panel, macro features (lag columns removed), PCA components and
# the additional macro variables. The join logic lives in the project helper.
data_set = merge_bank_macro_datasets(bank_data, macro_data, pca_data, macro_data1)

In [8]:
# Placeholder column for predicted values; rows stay NaN until a prediction is written.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data_set['y_pred'] = np.nan

## Params

DataFrame "factors_df" contains the chosen factors and specifies how these factors should be utilized.
- The "sign" column indicates the expected sign of the relationship with the response variable. The sign is verified automatically for each model, and models with incorrect signs are discarded.
- The "group" column: In the case of multiple factors within one group, the factor with the highest correlation with the response variable will be included in the model.
- The "calc_ema" flag determines whether to apply the Exponential Moving Average (EMA) transformation to a factor. If "calc_ema" is set to 1, the EMA's center of mass parameter will be automatically selected from a predefined range of values in a manner that maximizes the correlation between the factor and the residuals of the AutoRegressive (AR) model.

In [9]:
# Factor configuration: factor name -> [sign, group, calc_ema]
selected_factors = {
    'Real GDP growth': [-1, 0, 1],
    'BBB corporate yield': [1, 1, 1],
    '3-month Treasury rate change': [-1, 2, 1],
    'Dow Jones Total Stock Market Index change': [-1, 3, 1],
    'Market Volatility Index': [1, 4, 1],
    'Market Volatility Index change': [1, 4, 1],
}

# One row per factor (indexed by factor name), one column per setting
factors_df = pd.DataFrame(
    data=list(selected_factors.values()),
    index=list(selected_factors.keys()),
    columns=['sign', 'group', 'calc_ema'],
)

In [10]:
# Display the factor configuration table (sign, group, calc_ema per factor)
factors_df

Unnamed: 0,sign,group,calc_ema
Real GDP growth,-1,0,1
BBB corporate yield,1,1,1
3-month Treasury rate change,-1,2,1
Dow Jones Total Stock Market Index change,-1,3,1
Market Volatility Index,1,4,1
Market Volatility Index change,1,4,1


In [11]:
# Name of the response-variable column; used for the outlier limits, model fitting and error estimation
y_col = 'Provision for Loan Lease Losses as % of Aver. Assets'

In [12]:
# Training cut-off date: banks are selected for modeling only if they have rows
# with a 'Report Date' after this value, and the modeling helpers receive it as an argument
train_up_to = '2021-12-31'

In [13]:
# Center of mass parameter possible values for exponential moving average (EMA) transformation.
# This candidate grid is passed to choose_bank_best_model for each bank.
com_vals = [0, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]

## Defining outliers

In [14]:
# The last year is for testing; the remaining data is for training.
# The split uses the shared `train_up_to` cut-off instead of a repeated date literal,
# so changing the cut-off in the params section keeps this sample in sync.
data_set_train = data_set[data_set['Report Date'] <= train_up_to].copy()

In [15]:
# Outlier thresholds for the response variable, estimated on the training sample:
# the 0.5th percentile is the lower bound and the 99th percentile is the upper bound.
# NOTE(review): the tails are cut asymmetrically (0.5% vs 1%) — confirm this is intended.
lower_limit, upper_limit = np.percentile(data_set_train[y_col], [0.5, 99])

## Modeling

### List of banks for modeling

In [16]:
# Banks with at least one report dated after the training cut-off,
# i.e. banks that have observations in the test period
in_test_period = data_set['Report Date'] > train_up_to
banks = data_set.loc[in_test_period, 'IDRSSD'].unique()
len(banks)

181

#### Sample size for each bank

In [17]:
# Keep only the rows of the banks selected for modeling (independent copy of the panel)
is_modeled_bank = data_set['IDRSSD'].isin(banks)
data_set_banks = data_set.loc[is_modeled_bank].copy()

In [18]:
# Observations per bank: count of non-null 'Financial Institution Name' entries within each IDRSSD group
sample_size = data_set_banks.groupby('IDRSSD')['Financial Institution Name'].count()

In [19]:
# Summary statistics of the per-bank sample sizes
sample_size.describe()

count    181.000000
mean      74.828729
std       13.026829
min       18.000000
25%       81.000000
50%       81.000000
75%       81.000000
max       81.000000
Name: Financial Institution Name, dtype: float64

#### Exclude outliers

In [20]:
# A bank is dropped as soon as any of its observations lies outside
# [lower_limit, upper_limit] for the response variable.
# NOTE(review): data_set_banks is not restricted to the training period, so
# test-period values also decide which banks are excluded — confirm this is intended.
response = data_set_banks[y_col]
is_outlier = (response < lower_limit) | (response > upper_limit)
banks_exclude = data_set_banks.loc[is_outlier, 'IDRSSD'].unique()

# Set membership keeps the original order of `banks` while avoiding repeated array scans
outlier_ids = set(banks_exclude)
banks_upd = [bank_id for bank_id in banks if bank_id not in outlier_ids]

#### Exclude banks with zero variance of the response variable

In [21]:
# Standard deviation of the response variable for every bank
response_variance = data_set_banks.groupby('IDRSSD')[[y_col]].std()

# Banks whose response never changes (standard deviation exactly zero)
has_zero_variance = response_variance[y_col].eq(0)
banks_exclude = response_variance.index[has_zero_variance].tolist()
banks_exclude

[5278251]

In [22]:
# Remove the zero-variance banks from the modeling list, preserving its order
zero_variance_ids = set(banks_exclude)
banks_upd = [bank_id for bank_id in banks_upd if bank_id not in zero_variance_ids]

In [23]:
# Number of banks left for modeling after the outlier and zero-variance exclusions
len(banks_upd)

147

### Modeling
A separate AutoRegressive (AR) time series model is created for each bank, with an assumed autocorrelation lag of one. For each bank, multiple models are created, encompassing all possible combinations of factors. When the "calc_ema" flag is set to 1 for a specific factor, that factor undergoes transformation using the EMA, with the EMA parameter selected to maximize the correlation between the factor and the residuals of the AR(1) model. From these various models, the best one for each bank is determined based on the Akaike Information Criterion (AIC) score.

In [24]:
# Fit and apply one model per bank.
# Loop-local names (`bank_sample`, `bank_sample_train`) are used on purpose: assigning
# to `bank_data` here would overwrite the bank panel loaded at the top of the notebook,
# so re-running the merge cell after this loop would use a single bank's sample.
for bank_id in banks_upd:

    # Create samples for the bank: the bank-level sample and its training part
    bank_sample, bank_sample_train = create_bank_train_set(bank_id, data_set,
                                                           y_col, factors_df,
                                                           train_up_to)

    # Select the best model for the bank by evaluating all possible combinations of factors.
    # The criterion for selecting the best model is the Akaike Information Criterion (AIC)
    # score on the training dataset.
    # `models_info` and `best_model` are not used further in this loop.
    models_info, best_model, best_params, factors_bank = \
        choose_bank_best_model(bank_id, bank_sample_train, y_col, factors_df, com_vals)

    # Make predictions for the test sample using the model chosen in the previous step.
    # `data_set` is passed as the data set to write the prediction into.
    bank_prediction(bank_id, bank_sample,  # bank info
                    best_params, factors_bank,  # model info
                    data_set,  # data set to write the prediction
                    y_col, train_up_to)

#### Checking that predictions have been computed for all banks

In [25]:
# Number of distinct banks that have at least one non-missing prediction
has_prediction = data_set['y_pred'].notna()
data_set.loc[has_prediction, 'IDRSSD'].nunique(dropna=False)

147

### Results on the test sample

In [26]:
# Rows with a prediction, reduced to the identifying columns, the actual value and the prediction
report_columns = ['Report Date', 'IDRSSD', 'Financial Institution Name', y_col, 'y_pred']
has_prediction = data_set['y_pred'].notna()
test_sample = data_set.loc[has_prediction, report_columns]

In [27]:
# Estimate errors excluding outliers.
# Arguments: actual values, predicted values, then the outlier limits defined earlier.
# NOTE(review): the trailing positional `False` is an opaque flag of estimate_errors —
# confirm its meaning and consider passing it by keyword.
estimate_errors(test_sample[y_col], test_sample['y_pred'], lower_limit, upper_limit, False)

Unnamed: 0,measure
RMSE,0.200016
"median relative error, %",56.320164
