# Machine Learning Benchmark

One possible way to forecast the equity premium is to use machine learning (ML) models. At each time $t$, we fit an ML model $f_t^i$ on the data available up to $t$ and use it to predict the equity premium $r_{t+1}$ of the next period $t+1$.

$$
f_t^i(X_t) \rightarrow r_{t+1}
$$

The ML model $f_t^i$ is re-estimated at every time $t$, and it also depends on the method $i$ (e.g. elastic net, random forest or support vector regression).

We use the same economic variables and equity premium data as in [MODEL_linear_prediction.ipynb](MODEL_linear_prediction.ipynb).

In [2]:
from __future__ import annotations
import pandas as pd
import numpy as np
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import warnings
import sys
import os
sys.path.append('../module')

import yfinance as yf

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet

from analysis import get_return_forecast_performance
from analysis import get_period_return
from data_handler import get_econ_predictors
from data_handler import get_monthly_date_format
from model import get_benchmark_of_equity_premium_prediction

## Data Processing

In [3]:
# Load the monthly predictor data for the sample period.
START_DATE = '1947-01'
END_DATE = '2005-04'

econ_predictors = get_econ_predictors(START_DATE=START_DATE, END_DATE=END_DATE, data_freq='monthly')
# Split the target off: `pop` removes the 'Equity Premium' column from the predictors and returns it.
equity_premium = econ_predictors.pop('Equity Premium')

# Pair the predictors observed at t with the equity premium realized at t+1.
econ_predictors = econ_predictors[:-1] # drop the last row: there is no next-period realized value to evaluate its forecast against
equity_premium = equity_premium[1:] # drop the first label so that row k of the predictors lines up with the NEXT period's equity premium (one period is one month here, since data_freq='monthly')

Create a function to calculate the monthly return from daily returns.

In [4]:
def get_period_return(return_series:pd.DataFrame) -> float:
    """
    Compound a series of simple returns into a single period return.

    Parameters
    ----------
    return_series : pd.DataFrame
        Simple returns of the sub-periods (e.g. daily returns within a month).

    Returns
    -------
    period_return : float
        The compounded return, i.e. prod(1 + r) - 1 over all entries.
    """
    # Turn each simple return into a gross growth factor ...
    growth_factors = return_series + 1
    # ... chain the factors together and convert back to a simple return.
    compounded_growth = growth_factors.prod()
    return compounded_growth - 1

In [4]:
# Standardize every predictor column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the whole sample, so each rolling
# window below sees means/variances that include future observations.
# Confirm this look-ahead is acceptable for the out-of-sample evaluation.
scaler = StandardScaler()
X = scaler.fit_transform(econ_predictors)

In [8]:
# Labels as an (n_samples, 1) column vector, row-aligned with X.
y = equity_premium.to_numpy().reshape(-1, 1)

## Model

In [27]:
def get_ml_ret_prediction(model,
                          param_dict:dict,
                          cv_generator,
                          selection_criterion:str,
                          X_train:np.ndarray,
                          y_train:np.ndarray,
                          X_test:np.ndarray,
                          y_test:np.ndarray|None = None,
                          n_jobs:int = 20,
                          ) -> tuple[float, dict]:
    """
    Tune an ML model on the training window and forecast the next-period return.

    A randomized hyperparameter search is run on the training data; the best
    estimator found is then used to predict on ``X_test``. Only the first
    element of that prediction is returned.
    If monthly data is used, the model is predicting the next month return.

    Parameters
    ----------
    model : object
        ML model from the scikit-learn package.
    param_dict : dict
        Maps each hyperparameter name to its candidate values.
    cv_generator : object
        Cross validation generator passed to the search as ``cv``.
    selection_criterion : str
        Scoring name passed to the search (e.g. ``'r2'``); it is also the key
        of the returned performance dictionary.
    X_train : np.ndarray
        Training data.
    y_train : np.ndarray
        Training labels.
    X_test : np.ndarray
        Test data.
    y_test : np.ndarray, optional
        Test labels. Not used, because the OOS performance is NOT evaluated
        for a single prediction.
    n_jobs : int, optional
        Number of parallel jobs used by the search. Defaults to 20.

    Returns
    -------
    pred
        First element of the best estimator's prediction on ``X_test``.
    dict
        ``{selection_criterion: best cross-validated score}`` of the search.
    """

    # Size of the full grid = product of the number of candidates per parameter.
    # `len` is used (not `np.count_nonzero`) so that candidate values equal to 0
    # are counted too; entries without a length count as one candidate.
    candidates_per_param = [len(values) if hasattr(values, '__len__') else 1
                            for values in param_dict.values()]
    grid_size = int(np.prod(candidates_per_param))

    # Sample half of the grid. `n_iter` must be an integer of at least 1.
    n_sampled = max(1, grid_size // 2)

    search = RandomizedSearchCV(estimator=model,
                                param_distributions=param_dict,
                                cv=cv_generator,
                                scoring=selection_criterion,
                                n_jobs=n_jobs,
                                n_iter=n_sampled)
    search_result = search.fit(X_train, y_train)

    performance_in_sample = search_result.best_score_
    best_model = search_result.best_estimator_
    # The rolling scheme passes one test observation, so keep the first forecast only.
    pred = best_model.predict(X_test)[0]

    return pred, {selection_criterion:performance_in_sample}

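Rolling window estimation: each model is trained on the most recent `WINDOW_SIZE` years of monthly data and predicts the single following month.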

In [16]:
DATA_FREQUENCY = 12  # observations per year (monthly data)
WINDOW_SIZE = 5  # rolling window length, in years
SAMPLE_SIZE = len(X)

# One split per out-of-sample observation: train on (at most) the preceding
# DATA_FREQUENCY * WINDOW_SIZE observations and test on the single next one.
tscv = TimeSeriesSplit(
    max_train_size=DATA_FREQUENCY * WINDOW_SIZE,
    test_size=1,
    n_splits=SAMPLE_SIZE - DATA_FREQUENCY * WINDOW_SIZE,
)

In [10]:
# Silence all warnings, unless the interpreter was started with explicit
# warning options (in which case those are respected).
if not sys.warnoptions:
    # Environment variable: presumably so that worker processes started by
    # the parallel search inherit the same setting.
    os.environ["PYTHONWARNINGS"] = "ignore"
    # Filter for the current process.
    warnings.simplefilter("ignore")

The hyperparameter search takes more than half an hour to run. A possible way to speed up the process is to use the [Ray](https://docs.ray.io/en/latest/tune/index.html) library, which can distribute the search and tune the hyperparameters adaptively instead of evaluating a fixed set of candidates.

In [29]:
# Elastic net: search over the penalty strength (alpha) and the L1/L2 mix (l1_ratio).
training_settings = {
    'model': ElasticNet(random_state=0),
    'param_dict': {
        'alpha': np.arange(0.01, 1, 0.05),
        'l1_ratio': np.arange(0, 1, 0.05),
    },
    'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
    'selection_criterion': 'r2',
}
# One (forecast, in-sample score) pair per rolling window produced by `tscv`.
performance_elastic = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

In [17]:
# Random forest: search over forest size, tree depth and the leaf/split size limits.
training_settings = {
    'model': RandomForestRegressor(random_state=0),
    'param_dict': {
        'n_estimators': [200, 600, 1000],
        'max_depth': [10, 30, 50],
        'min_samples_leaf': [1, 4],
        'min_samples_split': [2, 10],
    },
    'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
    'selection_criterion': 'r2',
}
# One (forecast, in-sample score) pair per rolling window produced by `tscv`.
performance_rf = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

In [None]:
# Support vector regression (RBF kernel): search over the regularisation
# strength C and the kernel width gamma, both on a base-2 logarithmic grid.
training_settings = {'model':svm.SVR(kernel="rbf"),
                     'param_dict': dict(C = np.logspace(-5,15,num = 20,base=2), gamma = np.logspace(-15,3,num = 18,base=2)),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2'}
# Train on the equity premium labels `y` built above. This cell used to read
# `y_ivv`, which is not defined anywhere in this notebook and raised NameError.
# The result name is kept unchanged in case other cells refer to it.
performance_ivv_svm = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

## Result output

Combine prediction and actual return and historical average return into a dataframe.

In [26]:
# Build the benchmark table for the same dates as the ML forecasts: the number
# of forecasts (len(performance_rf)) fixes both the prediction size and how
# many trailing dates of `equity_premium` are used as the index.
# NOTE(review): the helper lives in the project `model` module; judging from
# the displayed output it returns the columns 'Historical Average' and
# 'Equity Premium' -- confirm against its implementation.
benchmark = get_benchmark_of_equity_premium_prediction(equity_premium=equity_premium.values,
                                                          prediction_size=len(performance_rf),
                                                          holdout_size=0,
                                                          prediction_index=equity_premium.index[-len(performance_rf):])
# Display the table in the notebook.
benchmark

Unnamed: 0,Historical Average,Equity Premium
1952-03,0.006699,0.046621
1952-04,0.007354,-0.044286
1952-05,0.006521,0.021856
1952-06,0.006764,0.044602
1952-07,0.007355,0.016128
...,...,...
2004-12,0.003276,0.030858
2005-01,0.003316,-0.026890
2005-02,0.003273,0.017303
2005-03,0.003293,-0.021218


In [30]:
# Column 0 of each result list holds the forecasts (column 1 holds the
# in-sample score dictionaries); put the two forecast columns side by side.
prediction_df = pd.concat(
    [pd.DataFrame(results)[0] for results in (performance_elastic, performance_rf)],
    axis=1,
)
# Label the rows with the trailing dates of the equity premium series.
prediction_df.index = equity_premium.index[-len(prediction_df):]
# Append the benchmark columns and give every column a readable name.
prediction_df = pd.concat([prediction_df, benchmark], axis=1)
prediction_df.columns = [
    'Elastic Net',
    'Random Forest',
    'Historical Average',
    'Equity Premium',
]
prediction_df

Unnamed: 0,Elastic Net,Random Forest,Historical Average,Equity Premium
1952-03,0.018907,-0.010133,0.006699,0.046621
1952-04,0.013030,0.000027,0.007354,-0.044286
1952-05,0.026350,0.003778,0.006521,0.021856
1952-06,0.001888,-0.006035,0.006764,0.044602
1952-07,0.002243,-0.009551,0.007355,0.016128
...,...,...,...,...
2004-12,0.010536,0.018050,0.003276,0.030858
2005-01,0.003752,0.025157,0.003316,-0.026890
2005-02,-0.003855,0.006044,0.003273,0.017303
2005-03,-0.003160,0.018595,0.003293,-0.021218


In [24]:
# Persist the combined forecast/benchmark table; the path is relative to the
# notebook's working directory.
prediction_df.to_csv('../../data/ml_prediction_1947_2005.csv')

Output performance results

In [32]:
def post_ml_performance(performance_list:list, date_index:pd.Index, name:str, criterion:str = 'r2') -> None:
    """
    Write the forecasts and in-sample scores of one model to a CSV file.

    Parameters
    ----------
    performance_list : list
        One ``(prediction, info_dict)`` pair per forecast date, where
        ``prediction`` is either a scalar or an indexable whose first element
        is the forecast, and ``info_dict`` maps the scoring name to the
        in-sample score.
    date_index : pd.Index
        Index labels of the output rows; one label per entry of
        ``performance_list``.
    name : str
        File name without extension. The table is written to
        ``'../../data/<name>.csv'`` relative to the working directory.
    criterion : str, optional
        Key under which the in-sample score is stored in each ``info_dict``.
        Defaults to ``'r2'``.

    Returns
    -------
    None.
    """

    # Accept both scalar forecasts and one-element arrays/lists: indexing a
    # scalar with [0] would raise, so only index when there is a dimension.
    pred_vec = [pred if np.ndim(pred) == 0 else pred[0] for pred, _ in performance_list]
    performance_in_sample = [info_dict[criterion] for _, info_dict in performance_list]

    performance_df = pd.DataFrame(list(zip(pred_vec, performance_in_sample)),
                                  columns=['return forecast', 'in sample performance'],
                                  index=date_index)
    performance_df.to_csv('../../data/' + name + '.csv')



In [51]:
# Export the `performance_ivv_elastic` results; the row labels skip the first
# rolling window and the last date of `feature_df`.
# NOTE(review): `performance_ivv_elastic` and `feature_df` are not defined in
# the visible part of this notebook -- confirm where they come from.
ivv_result_dict = dict(
    performance_list=performance_ivv_elastic,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ivv_elastic_new',
)
post_ml_performance(**ivv_result_dict)

In [52]:
# Export the `performance_ief_elastic` results; the row labels skip the first
# rolling window and the last date of `feature_df`.
# NOTE(review): `performance_ief_elastic` and `feature_df` are not defined in
# the visible part of this notebook -- confirm where they come from.
ief_result_dict = dict(
    performance_list=performance_ief_elastic,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ief_elastic_new',
)
post_ml_performance(**ief_result_dict)

In [53]:
# Random forest, IEF: export the `performance_ief_rf` results; the row labels
# skip the first rolling window and the last date of `feature_df`.
# NOTE(review): `performance_ief_rf` and `feature_df` are not defined in the
# visible part of this notebook -- confirm where they come from.
ief_result_dict = dict(
    performance_list=performance_ief_rf,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ief_rf_new',
)
post_ml_performance(**ief_result_dict)

In [54]:
# Random forest, IVV: export the `performance_ivv_rf` results; the row labels
# skip the first rolling window and the last date of `feature_df`.
# NOTE(review): `performance_ivv_rf` and `feature_df` are not defined in the
# visible part of this notebook -- confirm where they come from.
ivv_result_dict = dict(
    performance_list=performance_ivv_rf,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ivv_rf_new',
)
post_ml_performance(**ivv_result_dict)

In [55]:
# Reload the exported file, keeping only the first two CSV columns: column 0
# becomes the index and column 1 ('return forecast' in the displayed output)
# is the only data column. This rebinds `performance_ivv_rf` to a DataFrame.
performance_ivv_rf = pd.read_csv('../../data/performance_ivv_rf_new.csv', index_col=0, usecols=[0,1])

In [56]:
# Display the reloaded forecasts in the notebook.
performance_ivv_rf

Unnamed: 0,return forecast
2007-08,0.019275
2007-09,0.021874
2007-10,0.019016
2007-11,0.008545
2007-12,0.028623
...,...
2019-07,0.004680
2019-08,0.030029
2019-09,0.004477
2019-10,-0.000102
