# Machine Learning Benchmark

One possible way to forecast the expected return is using machine learning models. At each time $t$, we find a ML model $f_t^i$ to predict the expected return $r_{t+1}^i$of equity $i$ at time $t+1$.

$$
\begin{equation}
f_t^i(X_t) \rightarrow r_{t+1}
\end{equation}
$$

The ML model $f_t^i$ changes over time $t$ and it also changes with equity $i$.

In [1]:
from __future__ import annotations
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import yfinance as yf
import warnings
import sys
import os
sys.path.append('../module')
from analysis import get_return_forecast_performance
from analysis import get_period_return
from data_handler import get_econ_predictors
from data_handler import get_monthly_date_format
from model import get_benchmark_of_equity_premium_prediction


## Data Processing

In [3]:
# load data 
econ_data_settings = {'data_freq':'monthly', 'START_DATE':'1947-01', 'END_DATE':'2020-04'}
feature_df = get_econ_predictors(**econ_data_settings)
IVV_IEF_daily_ret = pd.read_csv('../../data/IVV_IEF_daily_ret.csv', index_col=0, parse_dates=True)

create a function to calculate the monthly return form daily return.

In [4]:
def get_period_return(return_series:pd.DataFrame) -> float:
    """
    Calculate the period return for a given time series.

    Parameters
    ----------
    return_series : pd.DataFrame
        A time series of daily returns.
    
    Returns
    -------
    period_return : float
        The period return for the given time series.
    """
    period_return = (1 + return_series).product() - 1

    return period_return

In [5]:
# get monthly data 
IVV_IEF_monthly_ret = IVV_IEF_daily_ret.groupby(pd.Grouper(freq='M')).apply(get_period_return)
IVV_IEF_monthly_ret.index = [get_monthly_date_format(x) for x in IVV_IEF_monthly_ret.index]

In [6]:
# truncate data to target period
DATE_START = '2002-08-01'
DATE_END = '2019-12-31'
feature_df = feature_df[DATE_START: DATE_END]
IVV_IEF_monthly_ret = IVV_IEF_monthly_ret[DATE_START: DATE_END]

In [7]:
# standardize feature set
scaler = StandardScaler()
scaler.fit(feature_df)
X = scaler.transform(feature_df)

In [8]:
# create label (monthly return)
monthly_ret_ivv = IVV_IEF_monthly_ret['IVV'].values
monthly_ret_ief = IVV_IEF_monthly_ret['IEF'].values

In [9]:
# remove the last row of the feature matrix X and remove the first row of the label vector
# this shift adjustment fit the time series prediction
X = X[:-1,:]
y_ivv = monthly_ret_ivv[1:].reshape((-1,1))
y_ief = monthly_ret_ief[1:].reshape((-1,1))

# combine X and y
data_ivv = np.concatenate((X, y_ivv), axis=1)
data_ief = np.concatenate((X, y_ief), axis=1)

## Model

In [10]:
def get_ml_ret_prediction(model,
                          param_dict:dict,
                          cv_generator,
                          selection_criterion:str,
                          X_train:np.array, 
                          y_train:np.array, 
                          X_test:np.array, 
                          y_test:np.array = None
                          ) -> np.array|dict:
    """
    This function is used to get the expected return prediction from ML model on the test data.
    The model is predicting the next period return. 
    If monthly data is used, the model is predicting the next month return.

    Parameters
    ----------
    model : object
        ML model from SKlearn package
    param_dict : dict
        Dictionary containing the model parameters
    cv_generator : object
        Cross validation generator
    X_train : np.array
        Training data
    y_train : np.array
        Training labels
    X_test : np.array
        Test data
    y_test : np.array
        Test labels. Not needed. Because the OOS performance is NOT evaluated for the single prediction.
    
    Returns
    -------
    np.array
        Expected return prediction
    dict
        In sample performance (e.g. R square)
    """

    param_num = [np.count_nonzero(param_vector) for _, param_vector in param_dict.items()]
    cv_sample_size = 0.5 * np.prod(param_num)
    grid_search_generator = RandomizedSearchCV(estimator = model, 
                                                param_distributions = param_dict,
                                                cv=cv_generator, 
                                                scoring=selection_criterion,
                                                n_jobs=-1,
                                                n_iter=cv_sample_size)
    grid_search_result = grid_search_generator.fit(X_train, y_train)
    performance_in_sample = grid_search_result.best_score_
    best_model = grid_search_result.best_estimator_
    pred = best_model.predict(X_test)[0]

    return pred, {selection_criterion:performance_in_sample}

In [27]:
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"
training_settings = {'model':ElasticNet(random_state=0),
                     'param_dict': dict(alpha = np.arange(0.01,1,0.01), l1_ratio = np.arange(0,1.01,0.01)),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2',
                     'X_train': X_train_ivv[0:60],
                     'y_train': y_train_ivv[0:60],
                     'X_test': X_test_ivv[0:1],
                     'y_test': y_test_ivv[0:1]}
pred, info = get_ml_ret_prediction(**training_settings)

NameError: name 'X_train_ivv' is not defined

In [None]:
training_settings = {'model':RandomForestRegressor(random_state = 0),
                     'param_dict': dict(n_estimators = [200, 600, 1000],
                                        max_depth = [10, 30, 50],
                                        min_samples_leaf =  [1, 4],
                                        min_samples_split =  [2, 10]),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2',
                     'X_train': X_train_ivv[0:60],
                     'y_train': y_train_ivv[0:60],
                     'X_test': X_test_ivv[0:1],
                     'y_test': y_test_ivv[0:1]}
pred, info = get_ml_ret_prediction(**training_settings)

  self.best_estimator_.fit(X, y, **fit_params)


rolling window estimation

In [11]:
DATA_FREQUENCY = 12
WINDOW_SIZE = 5
SAMPLE_SIZE = X.shape[0]
tscv = TimeSeriesSplit(n_splits = SAMPLE_SIZE - DATA_FREQUENCY * WINDOW_SIZE,
                       test_size=1, 
                       max_train_size= DATA_FREQUENCY * WINDOW_SIZE)

In [13]:
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

In [14]:
training_settings = {'model':ElasticNet(random_state=0),
                     'param_dict': dict(alpha = np.arange(0.01,1,0.01), l1_ratio = np.arange(0,1.01,0.01)),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2'}
performance_ief_elastic = [get_ml_ret_prediction(X_train=X[train_index], y_train=y_ief[train_index], X_test=X[test_index], **training_settings) for i, (train_index, test_index) in enumerate(tscv.split(X))]

In [15]:
training_settings = {'model':ElasticNet(random_state=0),
                     'param_dict': dict(alpha = np.arange(0.01,1,0.01), l1_ratio = np.arange(0,1.01,0.01)),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2'}
performance_ivv_elastic = [get_ml_ret_prediction(X_train=X[train_index], y_train=y_ivv[train_index], X_test=X[test_index], **training_settings) for i, (train_index, test_index) in enumerate(tscv.split(X))]

In [16]:
training_settings = {'model':RandomForestRegressor(random_state = 0),
                     'param_dict': dict(n_estimators = [200, 600, 1000],
                                        max_depth = [10, 30, 50],
                                        min_samples_leaf =  [1, 4],
                                        min_samples_split =  [2, 10]),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2'}
performance_ief_rf = [get_ml_ret_prediction(X_train=X[train_index], y_train=y_ief[train_index], X_test=X[test_index], **training_settings) for i, (train_index, test_index) in enumerate(tscv.split(X))]

In [17]:
training_settings = {'model':RandomForestRegressor(random_state = 0),
                     'param_dict': dict(n_estimators = [200, 600, 1000],
                                        max_depth = [10, 30, 50],
                                        min_samples_leaf =  [1, 4],
                                        min_samples_split =  [2, 10]),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2'}
performance_ivv_rf = [get_ml_ret_prediction(X_train=X[train_index], y_train=y_ivv[train_index], X_test=X[test_index], **training_settings) for i, (train_index, test_index) in enumerate(tscv.split(X))]

In [None]:
training_settings = {'model':svm.SVR(kernel="rbf"),
                     'param_dict': dict(C = np.logspace(-5,15,num = 20,base=2), gamma = np.logspace(-15,3,num = 18,base=2)),
                     'cv_generator': TimeSeriesSplit(n_splits=8, test_size=6),
                     'selection_criterion':'r2'}
performance_ivv_svm = [get_ml_ret_prediction(X_train=X[train_index], y_train=y_ivv[train_index], X_test=X[test_index], **training_settings) for i, (train_index, test_index) in enumerate(tscv.split(X))]

## Result output

Combine prediction and actual return and historical average return into a dataframe.

In [22]:
benchmark_ivv = get_benchmark_of_equity_premium_prediction(equity_premium=monthly_ret_ivv,
                                                          prediction_size=len(performance_ivv_elastic),
                                                          holdout_size=0,
                                                          prediction_index=IVV_IEF_monthly_ret.index[-len(performance_ivv_elastic):])

benchmark_ief = get_benchmark_of_equity_premium_prediction(equity_premium=monthly_ret_ief,
                                                          prediction_size=len(performance_ief_elastic),
                                                          holdout_size=0,
                                                          prediction_index=IVV_IEF_monthly_ret.index[-len(performance_ief_elastic):])
benchmark_ief

Unnamed: 0,Historical Average,Equity Premium
2007-09,0.003831,0.000886
2007-10,0.003783,0.010767
2007-11,0.003894,0.040441
2007-12,0.004465,0.000474
2008-01,0.004404,0.033559
...,...,...
2019-08,0.004053,0.039492
2019-09,0.004225,-0.011862
2019-10,0.004147,0.001872
2019-11,0.004136,-0.006868


In [23]:
prediction_df = pd.concat([pd.DataFrame(performance_ief_elastic)[0],
                           pd.DataFrame(performance_ief_rf)[0],
                           pd.DataFrame(performance_ivv_elastic)[0],
                           pd.DataFrame(performance_ivv_rf)[0]],
                           axis=1)
prediction_df.index = IVV_IEF_monthly_ret.index[-len(prediction_df):]
prediction_df = pd.concat([prediction_df, benchmark_ivv, benchmark_ief], axis=1)
prediction_df.columns = ['ief_elastic', 'ief_rf', 'ivv_elastic', 'ivv_rf', 'ivv_average', 'ivv_return', 'ief_average', 'ief_return']
prediction_df

Unnamed: 0,ief_elastic,ief_rf,ivv_elastic,ivv_rf,ivv_average,ivv_return,ief_average,ief_return
2007-09,0.004453,0.011143,0.009917,0.019275,0.009832,0.040906,0.003831,0.000886
2007-10,0.002847,0.009290,0.012299,0.021874,0.010333,0.014120,0.003783,0.010767
2007-11,0.002159,0.007992,0.011175,0.019016,0.010393,-0.041127,0.003894,0.040441
2007-12,0.004257,-0.004143,0.009535,0.008545,0.009588,-0.008199,0.004465,0.000474
2008-01,0.003030,0.001289,0.011099,0.028623,0.009314,-0.062696,0.004404,0.033559
...,...,...,...,...,...,...,...,...
2019-08,0.002816,-0.002078,0.009544,0.004680,0.008300,-0.016576,0.004053,0.039492
2019-09,0.003161,0.001108,0.008607,0.030029,0.008179,0.019498,0.004225,-0.011862
2019-10,0.003138,-0.000896,0.005825,0.002511,0.008234,0.021607,0.004147,0.001872
2019-11,0.002913,-0.001618,0.009123,-0.000102,0.008298,0.036364,0.004136,-0.006868


In [24]:
prediction_df.to_csv('../../data/prediction_of_equity_return_ivv_ief_ml.csv')

Output performance results

In [32]:
def post_ml_performance(performance_list:np.ndarray, date_index:pd.Index, name:str) -> None:
    """
    
    Parameters
    ----------
    performance_list : np.ndarray
        DESCRIPTION.
    date_index : pd.Index
        DESCRIPTION.
    name : str
        DESCRIPTION.
    
    Returns
    -------
    None.
    """

    pred_vec = [pred[0] for pred, _ in performance_list]
    performance_in_sample = [info_dict['r2'] for _, info_dict in performance_list]
    performance_ivv_elastic_df = pd.DataFrame(list(zip(pred_vec, performance_in_sample)), columns=['return forecast', 'in sample performance'], index=date_index)
    performance_ivv_elastic_df.to_csv('../../data/' + name + '.csv')



In [51]:
ivv_result_dict = {'performance_list': performance_ivv_elastic,
                   'date_index': feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
                   'name': 'performance_ivv_elastic_new'}
post_ml_performance(**ivv_result_dict)

In [52]:
ief_result_dict = {'performance_list': performance_ief_elastic,
                   'date_index': feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
                   'name': 'performance_ief_elastic_new'}
post_ml_performance(**ief_result_dict)

In [53]:
# rf ief
ief_result_dict = {'performance_list': performance_ief_rf,
                   'date_index': feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
                   'name': 'performance_ief_rf_new'}
post_ml_performance(**ief_result_dict)

In [54]:
# rf ivv
ivv_result_dict = {'performance_list': performance_ivv_rf,
                   'date_index': feature_df.index[DATA_FREQUENCY * WINDOW_SIZE:-1],
                   'name': 'performance_ivv_rf_new'}
post_ml_performance(**ivv_result_dict)

In [55]:
performance_ivv_rf = pd.read_csv('../../data/performance_ivv_rf_new.csv', index_col=0, usecols=[0,1])

In [56]:
performance_ivv_rf

Unnamed: 0,return forecast
2007-08,0.019275
2007-09,0.021874
2007-10,0.019016
2007-11,0.008545
2007-12,0.028623
...,...
2019-07,0.004680
2019-08,0.030029
2019-09,0.004477
2019-10,-0.000102
