# Machine Learning Benchmark

One possible way to forecast the expected return is to use machine learning models. At each time $t$, we fit an ML model $f_t^i$ to predict the return $r_{t+1}^i$ of equity $i$ at time $t+1$.

$$
\begin{equation}
f_t^i(X_t) \rightarrow r_{t+1}^i
\end{equation}
$$

The ML model $f_t^i$ is re-estimated at every time $t$, and a separate model is fitted for each equity $i$.

In [None]:
from __future__ import annotations
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import yfinance as yf
import warnings
import sys
import os
sys.path.append('../module')
from analysis import get_return_forecast_performance


## Data Processing

In [None]:
# Load the raw inputs. Both CSV files sit in the same directory, so the
# directory is declared once in DATA_DIR instead of being repeated per file.
# NOTE(review): this is an absolute, machine-specific path (it looks like a
# Google Colab Drive mount) -- point DATA_DIR at your own copy of the data
# when running elsewhere.
DATA_DIR = '/content/drive/MyDrive/PhD/Research/Non-Myopic Equity Risk Premium/data'

# The first column of each file is parsed as a date index.
feature_df = pd.read_csv(os.path.join(DATA_DIR, 'econ_predictors.csv'), index_col=0, parse_dates=True)
IVV_IEF_daily_ret = pd.read_csv(os.path.join(DATA_DIR, 'IVV_IEF_daily_ret.csv'), index_col=0, parse_dates=True)

In [None]:
# Study window; label-based slicing below includes both end points.
DATE_START = '2002-08-01'
DATE_END = '2019-12-31'
feature_df = feature_df.loc[DATE_START:DATE_END]

# Collapse the daily returns to one row per calendar month by averaging the
# daily values inside each month, then keep only the months in the window.
IVV_IEF_monthly_ret = (
    IVV_IEF_daily_ret
    .groupby(pd.Grouper(freq='M'))
    .mean()
    .loc[DATE_START:DATE_END]
)

In [None]:
# Standardize every predictor column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the whole sample, so statistics from
# the later (test) period influence the training features -- consider fitting
# on training rows only if strict out-of-sample evaluation is required.
scaler = StandardScaler()
X = scaler.fit_transform(feature_df)

In [None]:
# Extract the monthly return series of each asset as plain NumPy vectors.
# These are continuous regression targets (no binary up/down label is built).
monthly_ret_ivv = IVV_IEF_monthly_ret['IVV'].to_numpy()
monthly_ret_ief = IVV_IEF_monthly_ret['IEF'].to_numpy()

In [None]:
# Align predictors with next-period returns: drop the last feature row and the
# first return, so that row k of X is paired with the return one step later.
# X is rebuilt from the fitted scaler instead of being trimmed in place
# (`X = X[:-1, :]`), which silently dropped one more row each time the cell
# was re-run.
X = scaler.transform(feature_df)[:-1, :]
# Targets are kept as (n, 1) columns so they can be concatenated with X below.
y_ivv = monthly_ret_ivv[1:].reshape((-1, 1))
y_ief = monthly_ret_ief[1:].reshape((-1, 1))

# Combined [features | target] matrices, one per asset.
data_ivv = np.concatenate((X, y_ivv), axis=1)
data_ief = np.concatenate((X, y_ief), axis=1)

## Model

In [None]:
# Hold out the final 20% of the sample as the test set; shuffle=False keeps
# the rows in their original (chronological) order.
X_train_ivv, X_test_ivv, y_train_ivv, y_test_ivv = train_test_split(
    X,
    y_ivv,
    test_size=0.2,
    shuffle=False,
)

In [None]:
# Baseline: support vector regression with default hyper-parameters.
regr = svm.SVR()
# The targets are stored as (n, 1) columns; flatten them for fitting to avoid
# scikit-learn's column-vector conversion warning.
regr.fit(X_train_ivv, y_train_ivv.ravel())
pred = regr.predict(X_test_ivv)
pred_in_sample = regr.predict(X_train_ivv)

# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so passing
# the predictions first yields meaningless values (0.0 or hugely negative).
print(mean_squared_error(y_test_ivv, pred))      # out-of-sample MSE
print(r2_score(y_test_ivv, pred))                # out-of-sample R^2
print(r2_score(y_train_ivv, pred_in_sample))     # in-sample R^2

4.627100598544023e-06
0.0
-1.117519324245484e+32


  y = column_or_1d(y, warn=True)


In [None]:
# Baseline: elastic net with default penalty settings.
regr = ElasticNet(random_state=0)
regr.fit(X_train_ivv, y_train_ivv)
pred = regr.predict(X_test_ivv)
pred_in_sample = regr.predict(X_train_ivv)

# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so passing
# the predictions first yields meaningless values (0.0 or hugely negative).
print(mean_squared_error(y_test_ivv, pred))      # out-of-sample MSE
print(r2_score(y_test_ivv, pred))                # out-of-sample R^2
print(r2_score(y_train_ivv, pred_in_sample))     # in-sample R^2

2.512464440110739e-06
0.0
-1.2568885222316553e+33


### Cross-validation

In [None]:
# Randomized hyper-parameter search for the elastic net on the first 60
# training observations, validated with expanding time-series splits
# (8 folds, 6 observations per validation block).
elastic_net = ElasticNet(random_state=0)
distributions = dict(alpha = np.arange(0.01,1,0.01), l1_ratio = np.arange(0,1.01,0.01))
tscv = TimeSeriesSplit(n_splits=8, test_size=6)
# random_state pins which 50 candidates are sampled, so the cell gives the
# same result after Restart & Run All.
grid_search = RandomizedSearchCV(elastic_net, distributions, cv=tscv, scoring='r2', n_jobs=-1, n_iter=50, random_state=0)
search = grid_search.fit(X_train_ivv[0:60], y_train_ivv[0:60])
# best_score_ is the mean cross-validated R^2 of the selected candidate.
r2_in_sample = search.best_score_
best_model = search.best_estimator_
pred = best_model.predict(X_test_ivv)
pred

array([0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912, 0.00047912, 0.00047912, 0.00047912,
       0.00047912, 0.00047912])

In [None]:
def get_ml_ret_prediction(model,
                          param_dict: dict,
                          cv_generator,
                          selection_criterion: str,
                          X_train: np.ndarray,
                          y_train: np.ndarray,
                          X_test: np.ndarray,
                          y_test: np.ndarray | None = None,
                          random_state: int | None = None,
                          ) -> tuple[np.ndarray, dict]:
    """
    Tune a model with a randomized hyper-parameter search and predict on the test data.

    Half of the candidate combinations described by ``param_dict`` (at least
    one) are sampled, each is scored with ``selection_criterion`` on the folds
    produced by ``cv_generator``, and the best estimator found by the search
    is used to predict ``X_test``.

    Parameters
    ----------
    model : object
        Scikit-learn estimator to tune.
    param_dict : dict
        Maps each hyper-parameter name to the sequence of candidate values.
        A value without a length counts as a single candidate when sizing the
        search.
    cv_generator : object
        Cross-validation splitter passed to ``RandomizedSearchCV`` as ``cv``.
    selection_criterion : str
        Scoring name used to rank candidates (e.g. ``'r2'``); also used as the
        key of the returned performance dictionary.
    X_train : np.ndarray
        Training data
    y_train : np.ndarray
        Training labels. A single-column 2-D array is flattened to 1-D.
    X_test : np.ndarray
        Test data
    y_test : np.ndarray, optional
        Test labels. Not used: the out-of-sample performance is NOT evaluated
        for a single prediction.
    random_state : int, optional
        Seed forwarded to ``RandomizedSearchCV`` so that the sampled candidates
        are reproducible. ``None`` (default) leaves the search unseeded.

    Returns
    -------
    np.ndarray
        Expected return prediction for ``X_test``
    dict
        ``{selection_criterion: best cross-validated score}``
    """

    # Size of each candidate list. len() is used rather than counting non-zero
    # entries, which under-counted grids that contain the value 0.
    param_num = [len(values) if hasattr(values, '__len__') else 1
                 for values in param_dict.values()]
    n_candidates = 1
    for size in param_num:
        n_candidates *= size
    # n_iter must be a positive integer; sample half of the grid.
    cv_sample_size = max(1, n_candidates // 2)

    # Flatten an (n, 1) target to (n,) to avoid scikit-learn's column-vector
    # conversion warning; genuine multi-output targets are left untouched.
    y_train = np.asarray(y_train)
    if y_train.ndim == 2 and y_train.shape[1] == 1:
        y_train = y_train.ravel()

    grid_search_generator = RandomizedSearchCV(estimator=model,
                                               param_distributions=param_dict,
                                               cv=cv_generator,
                                               scoring=selection_criterion,
                                               n_jobs=-1,
                                               n_iter=cv_sample_size,
                                               random_state=random_state)
    grid_search_result = grid_search_generator.fit(X_train, y_train)
    # Mean cross-validated score of the selected candidate.
    performance_in_sample = grid_search_result.best_score_
    best_model = grid_search_result.best_estimator_
    pred = best_model.predict(X_test)

    return pred, {selection_criterion: performance_in_sample}


    

In [None]:
# Unless warning options were given on the command line, silence warnings in
# this process and export the same setting through the environment
# (presumably so worker processes started by n_jobs=-1 inherit it).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

# One-off check of the helper: tune an elastic net on the first 60 training
# observations and predict the first test observation.
training_settings = dict(
    model=ElasticNet(random_state=0),
    param_dict={'alpha': np.arange(0.01, 1, 0.01),
                'l1_ratio': np.arange(0, 1.01, 0.01)},
    cv_generator=TimeSeriesSplit(n_splits=8, test_size=6),
    selection_criterion='r2',
    X_train=X_train_ivv[:60],
    y_train=y_train_ivv[:60],
    X_test=X_test_ivv[:1],
    y_test=y_test_ivv[:1],
)
pred, info = get_ml_ret_prediction(**training_settings)

In [None]:
# Same one-off check with a random forest: tune on the first 60 training
# observations and predict the first test observation.
training_settings = dict(
    model=RandomForestRegressor(random_state=0),
    param_dict={'n_estimators': [200, 600, 1000],
                'max_depth': [10, 30, 50],
                'min_samples_leaf': [1, 4],
                'min_samples_split': [2, 10]},
    cv_generator=TimeSeriesSplit(n_splits=8, test_size=6),
    selection_criterion='r2',
    X_train=X_train_ivv[:60],
    y_train=y_train_ivv[:60],
    X_test=X_test_ivv[:1],
    y_test=y_test_ivv[:1],
)
pred, info = get_ml_ret_prediction(**training_settings)

  self.best_estimator_.fit(X, y, **fit_params)


In [None]:
# Display the performance dictionary returned by the search above: the
# selection criterion mapped to the best cross-validated score.
info

{'r2': -0.23896953400941387}

### Rolling-window estimation

In [None]:
# Rolling-window design: each split trains on at most
# DATA_FREQUENCY * WINDOW_SIZE consecutive observations and tests on the single
# observation that follows (presumably 12 monthly observations x 5 years).
DATA_FREQUENCY = 12
WINDOW_SIZE = 5
SAMPLE_SIZE = len(X)
tscv = TimeSeriesSplit(
    n_splits=SAMPLE_SIZE - DATA_FREQUENCY * WINDOW_SIZE,
    max_train_size=DATA_FREQUENCY * WINDOW_SIZE,
    test_size=1,
)

In [None]:
# Unless warning options were given on the command line, silence warnings in
# this process and export the same setting through the environment.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

# Elastic net, re-tuned on every rolling window.
training_settings = {'model':ElasticNet(random_state=0),
                     'param_dict': dict(alpha = np.arange(0.01,1,0.01), l1_ratio = np.arange(0,1.01,0.01)),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2'}

# IVV forecasts. These were previously never computed in the notebook although
# `performance_ivv_elastic` is exported further down, so a fresh
# Restart & Run All failed with a NameError.
performance_ivv_elastic = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y_ivv[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

# IEF forecasts.
performance_ief_elastic = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y_ief[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

In [None]:
# RBF-kernel SVR for IVV, re-tuned on every rolling window over base-2
# logarithmic grids of C and gamma.
training_settings = dict(
    model=svm.SVR(kernel="rbf"),
    param_dict={'C': np.logspace(-5, 15, num=20, base=2),
                'gamma': np.logspace(-15, 3, num=18, base=2)},
    cv_generator=TimeSeriesSplit(n_splits=8, test_size=6),
    selection_criterion='r2',
)
# One (prediction, performance-dict) pair per rolling split.
performance_ivv_svm = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y_ivv[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

In [None]:
# RBF-kernel SVR for IEF, re-tuned on every rolling window over base-2
# logarithmic grids of C and gamma.
training_settings = dict(
    model=svm.SVR(kernel="rbf"),
    param_dict={'C': np.logspace(-5, 15, num=20, base=2),
                'gamma': np.logspace(-15, 3, num=18, base=2)},
    cv_generator=TimeSeriesSplit(n_splits=8, test_size=6),
    selection_criterion='r2',
)
# One (prediction, performance-dict) pair per rolling split.
performance_ief_svm = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y_ief[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

In [None]:
# Random forest for IEF, re-tuned on every rolling window.
training_settings = dict(
    model=RandomForestRegressor(random_state=0),
    param_dict={'n_estimators': [200, 600, 1000],
                'max_depth': [10, 30, 50],
                'min_samples_leaf': [1, 4],
                'min_samples_split': [2, 10]},
    cv_generator=TimeSeriesSplit(n_splits=8, test_size=6),
    selection_criterion='r2',
)

# One (prediction, performance-dict) pair per rolling split.
performance_ief_rf = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y_ief[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

In [None]:
# Random forest for IVV, re-tuned on every rolling window.
training_settings = dict(
    model=RandomForestRegressor(random_state=0),
    param_dict={'n_estimators': [200, 600, 1000],
                'max_depth': [10, 30, 50],
                'min_samples_leaf': [1, 4],
                'min_samples_split': [2, 10]},
    cv_generator=TimeSeriesSplit(n_splits=8, test_size=6),
    selection_criterion='r2',
)

# One (prediction, performance-dict) pair per rolling split.
performance_ivv_rf = [
    get_ml_ret_prediction(X_train=X[train_index],
                          y_train=y_ivv[train_index],
                          X_test=X[test_index],
                          **training_settings)
    for train_index, test_index in tscv.split(X)
]

Output performance results

In [None]:
def post_ml_performance(performance_list, date_index, name, metric='r2', output_dir='../../data/'):
    """
    Write rolling forecasts and their selection scores to ``<output_dir>/<name>.csv``.

    :param performance_list: sequence of ``(pred, info_dict)`` pairs, where
        ``pred`` is indexable (only its first element is kept) and
        ``info_dict`` maps a metric name to the score of that fit.
    :param date_index: index for the output rows; must have one entry per
        element of ``performance_list``.
    :param name: output file name without the ``.csv`` extension.
    :param metric: key read from each ``info_dict`` (default ``'r2'``).
    :param output_dir: directory the CSV is written to (default ``'../../data/'``).
    :return: None. The table is written to disk with the columns
        ``'return forecast'`` and ``'in sample performance'``.
    """

    # Each prediction holds a single forecast; keep it as a scalar.
    pred_vec = [pred[0] for pred, _ in performance_list]
    performance_in_sample = [info_dict[metric] for _, info_dict in performance_list]
    performance_df = pd.DataFrame(
        {'return forecast': pred_vec,
         'in sample performance': performance_in_sample},
        index=date_index,
    )
    performance_df.to_csv(os.path.join(output_dir, name + '.csv'))



In [None]:
# Export the elastic-net IVV forecasts; rows are labelled with the feature
# dates from the end of the first training window up to the second-to-last date.
ivv_result_dict = dict(
    performance_list=performance_ivv_elastic,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ivv_elastic',
)
post_ml_performance(**ivv_result_dict)

In [None]:
# Export the elastic-net IEF forecasts; rows are labelled with the feature
# dates from the end of the first training window up to the second-to-last date.
ief_result_dict = dict(
    performance_list=performance_ief_elastic,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ief_elastic',
)
post_ml_performance(**ief_result_dict)

In [None]:
# Export the SVM IVV forecasts; rows are labelled with the feature dates from
# the end of the first training window up to the second-to-last date.
ivv_result_dict = dict(
    performance_list=performance_ivv_svm,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ivv_svm',
)
post_ml_performance(**ivv_result_dict)

In [None]:
# Export the SVM IEF forecasts; rows are labelled with the feature dates from
# the end of the first training window up to the second-to-last date.
ief_result_dict = dict(
    performance_list=performance_ief_svm,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ief_svm',
)
post_ml_performance(**ief_result_dict)

In [None]:
# Export the random-forest IEF forecasts; rows are labelled with the feature
# dates from the end of the first training window up to the second-to-last date.
ief_result_dict = dict(
    performance_list=performance_ief_rf,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ief_rf',
)
post_ml_performance(**ief_result_dict)

In [None]:
# Export the random-forest IVV forecasts; rows are labelled with the feature
# dates from the end of the first training window up to the second-to-last date.
ivv_result_dict = dict(
    performance_list=performance_ivv_rf,
    date_index=feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
    name='performance_ivv_rf',
)
post_ml_performance(**ivv_result_dict)

In [None]:
# Reload the saved random-forest IVV results, keeping only the first two CSV
# columns: the date index and the return forecast.
# NOTE(review): this absolute, user-specific path differs from the '../../data/'
# directory the results are written to -- confirm both point at the same file.
# NOTE(review): this rebinds `performance_ivv_rf` from the list of rolling
# results to a DataFrame.
performance_ivv_rf = pd.read_csv(
    '/Users/cheng/Google Drive/PhD/Research/Non-Myopic Equity Risk Premium/data/performance_ivv_rf.csv',
    usecols=[0, 1],
    index_col=0,
)

In [None]:
# Display the reloaded random-forest IVV return forecasts (indexed by date).
performance_ivv_rf

Unnamed: 0,return forecast
2007-08-01,0.000170
2007-09-01,0.000218
2007-10-01,-0.000107
2007-11-01,0.000005
2007-12-01,0.000518
...,...
2019-07-01,0.000468
2019-08-01,0.001054
2019-09-01,0.000434
2019-10-01,0.000230
