# Asset Return Prediction with XGBoost

In [14]:
import xgboost as xgb
import pandas as pd
import numpy as np
import json

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV

In [5]:
returns = pd.read_csv("../data/weekly_returns.csv",index_col="Date")
returns.index = pd.to_datetime(returns.index)

returns

Unnamed: 0_level_0,ABEV3.SA,BBAS3.SA,BBDC4.SA,CPLE6.SA,CSAN3.SA,CSNA3.SA,ELET3.SA,ENBR3.SA,GGBR4.SA,ITUB4.SA,JBSS3.SA,PETR4.SA,SANB11.SA,SULA11.SA,TIMS3.SA,USIM5.SA,VALE3.SA,VIVT3.SA,WEGE3.SA,Selic
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2010-01-04,0.038579,0.029609,0.018519,0.032145,-0.054668,0.039757,0.054168,0.056991,0.019790,0.008492,0.048338,0.007062,-0.019008,-0.010050,0.027588,0.042368,0.080260,-0.027217,0.054612,0.001647
2010-01-11,0.019564,-0.019300,-0.026413,-0.014484,-0.010191,0.005034,-0.067417,0.047165,-0.023635,-0.045886,0.016924,-0.033015,-0.023734,-0.030364,-0.013699,-0.010875,-0.006713,-0.012510,-0.029853,0.001647
2010-01-18,0.000126,-0.024573,-0.126903,-0.010667,-0.022880,-0.074747,0.154428,-0.023305,-0.102092,-0.027749,-0.055824,-0.028371,-0.053824,0.039237,-0.006920,-0.078553,-0.056575,-0.006653,-0.044017,0.001647
2010-01-25,-0.058269,-0.015537,-0.014260,0.032703,-0.072430,0.025784,-0.047791,-0.019608,-0.027634,-0.003316,-0.025371,-0.016832,0.037091,-0.038828,-0.028171,0.064538,-0.032190,0.025805,0.002774,0.001647
2010-02-01,-0.030459,-0.007143,-0.041714,-0.034313,0.094029,-0.012809,-0.507667,-0.044838,-0.026796,-0.037506,-0.035971,-0.080726,-0.106248,-0.044016,-0.049791,-0.039059,-0.036436,-0.030543,-0.048236,0.001647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-29,-0.042221,0.059396,0.016513,0.026103,0.049014,0.043905,0.081197,0.016467,0.086196,0.019370,-0.028620,0.010135,-0.017778,0.038511,-0.007236,0.097350,0.045983,0.000197,0.002749,0.001464
2021-12-06,-0.003130,0.009274,-0.025893,0.033257,-0.004929,0.046168,0.000283,-0.013152,0.060604,-0.035547,0.051907,0.146801,-0.049957,-0.002269,-0.039250,0.066601,0.051258,-0.035021,0.088129,0.001464
2021-12-13,0.007495,-0.069103,-0.019486,0.000000,-0.005405,0.043738,-0.045660,-0.004264,-0.019796,-0.032564,0.051236,-0.022511,-0.008838,-0.015646,-0.014454,0.021464,0.045480,0.003050,-0.026895,0.001629
2021-12-20,-0.032894,-0.031840,-0.029700,-0.015699,-0.028727,-0.032724,-0.016998,-0.013384,-0.073657,-0.005141,0.045104,-0.023030,-0.034842,-0.021381,-0.013889,-0.032169,-0.000253,0.012510,-0.040404,0.001739


## Reshaping the data

We need to reshape the Time Series into a Cross-Section shape so that we can pass it to the model.

**ts2cross(serie,lags)** reshapes the time series into a cross-section table that has, in each collumn, the series lagged values.

In [6]:
def ts2cross(serie,lags):

    columns = ["Y_{t}"]
    for i in range(1,lags+1):
        columns.append("Y_{t-%s}"%str(i))

    columns.reverse()

    moving_window = pd.DataFrame(columns=columns)

    for i in range(lags,len(serie)):
        moving_window.loc[len(moving_window.index)] = serie[i-lags:i+1].values

    moving_window.set_index(serie.index[lags:],inplace=True)

    return moving_window

In [7]:
def get_X_y(df,ylabel):
    y = df[ylabel]
    cols = list(df.columns.values)
    cols.remove(ylabel)
    X = df[cols]
    return X,y

In [8]:
def train_test_split(cs,end_train_year):
    train = cs[cs.index.year<end_train_year]  
    test = cs[cs.index.year>=end_train_year]
    return train,test

## Select best hyperparameters for each Asset

In [12]:
lags = 5

param_grid = {
    'learning_rate': [0.1, 0.01, 0.05],
    'min_split_loss': [1000, 2000, 500], # gamma
    'reg_lambda': [0, 0.5, 10.0], # lambda
    'max_depth': [8, 9, 10]}

best_parameters = {}

for asset in returns.columns:
    serie = returns[asset]
    serie_cs = ts2cross(serie=serie,lags=5)

    train,test = train_test_split(serie_cs,2020)
    
    X_train, y_train = get_X_y(train,"Y_{t}")
    X_test, y_test = get_X_y(test,"Y_{t}")


    optimal_params = GridSearchCV(estimator = xgb.XGBRegressor(objective = 'reg:squarederror', eval_metric='mae', n_estimators = 300, early_stopping_rounds=10),
                                param_grid = param_grid,
                                scoring = 'neg_mean_absolute_error', 
                                verbose = 0)

    optimal_params.fit(X_train,
                        y_train,
                        eval_set=[(X_test, y_test)],
                        verbose=False)

    best_parameters[asset] = optimal_params.best_params_
    

In [17]:
with open("../output/hyperparameters/hyperparameters_xgboost.json", "w") as outfile:
    json.dump(best_parameters, outfile)

best_parameters["ABEV3.SA"]

{'learning_rate': 0.1,
 'max_depth': 8,
 'min_split_loss': 1000,
 'reg_lambda': 10.0}

## Train XGBoost Model for each asset using the best hyperparameters chosen

In [29]:
predicted_returns_train = pd.DataFrame()
predicted_returns_test = pd.DataFrame()

for asset in returns.columns:
    serie = returns[asset]
    serie_cs = ts2cross(serie=serie,lags=5)

    train,test = train_test_split(serie_cs,2020)
    
    X_train, y_train = get_X_y(train,"Y_{t}")
    X_test, y_test = get_X_y(test,"Y_{t}")

    params = best_parameters[asset]

    model = xgb.XGBRegressor(n_estimators=300,random_state=0,
                            objective = 'reg:squarederror', eval_metric='mae',
                            early_stopping_rounds=10,
                            learning_rate=params['learning_rate'], max_depth=params["max_depth"],
                            min_split_loss=params["min_split_loss"],reg_lambda=params["reg_lambda"]).fit(X_train,y_train,eval_set=[(X_test, y_test)],verbose=False)
    
    train_prediction = model.predict(X_train)
    predicted_returns_train[asset] = train_prediction

    test_prediction = model.predict(X_test)
    predicted_returns_test[asset] = test_prediction
    
predicted_returns_train.set_index(train.index,inplace=True)
predicted_returns_test.set_index(test.index,inplace=True)

In [31]:
predicted_returns_test

Unnamed: 0_level_0,ABEV3.SA,BBAS3.SA,BBDC4.SA,CPLE6.SA,CSAN3.SA,CSNA3.SA,ELET3.SA,ENBR3.SA,GGBR4.SA,ITUB4.SA,JBSS3.SA,PETR4.SA,SANB11.SA,SULA11.SA,TIMS3.SA,USIM5.SA,VALE3.SA,VIVT3.SA,WEGE3.SA,Selic
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2020-01-06,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
2020-01-13,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
2020-01-20,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
2020-01-27,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
2020-02-03,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-29,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
2021-12-06,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
2021-12-13,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821
2021-12-20,0.002955,0.002523,0.002686,0.003666,0.002981,0.014331,0.00316,0.002412,0.013873,0.002474,0.005846,0.002388,0.002842,0.003949,0.002086,0.016247,0.010115,0.002184,0.005513,0.001821


In [33]:
predicted_returns = pd.concat([predicted_returns_train,predicted_returns_test])

predicted_returns.to_csv("../data/predicted_returns_xgboost.csv")

## To-Do

Verificar porque o XGBoost está prevendo sempre o mesmo valor
Pensar se essa estratégia seccional já está fazendo a janela rolante e prevendo sempre 1 passo à frente. Eu acho que sim.