In [280]:
# Python
import pandas as pd
import torch
#
import xgboost
from xgboost import XGBRegressor
#
import plotly.graph_objects as go

from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
import importlib
import utilities.train_test as train_test
import utilities.variables as variables

In [281]:
# Load the monthly-returns CSV; its 'Date' column becomes the row index.
df = pd.read_csv('../../../data/df_monthly_returns_complete_percentage.csv', index_col='Date')

In [282]:
# Re-import the helper module so local edits to utilities.train_test are
# picked up without restarting the kernel.
importlib.reload(train_test)
# Build the tabular frame used for modelling. Judging by the output displayed
# below it holds lagged m_return columns plus a t+1 target per row — the exact
# transformation lives in train_test.get_dataframe_tabular.
df_tabular = train_test.get_dataframe_tabular(df)

In [283]:
# Display the tabular frame (rendered truncated by pandas).
df_tabular 

Unnamed: 0,month,year,date,m_return(t-11),m_return(t-10),m_return(t-9),m_return(t-8),m_return(t-7),m_return(t-6),m_return(t-5),m_return(t-4),m_return(t-3),m_return(t-2),m_return(t-1),m_return(t),m_return_target(t+1),stock_ticker_label
12,10,2000,2000-10-01,1.13,1.11,1.02,1.00,1.00,1.00,1.00,1.00,1.01,1.00,1.00,1.00,1.00,1371
13,11,2000,2000-11-01,1.11,1.02,1.00,1.00,1.00,1.00,1.00,1.01,1.00,1.00,1.00,1.00,1.00,1371
14,12,2000,2000-12-01,1.02,1.00,1.00,1.00,1.00,1.00,1.01,1.00,1.00,1.00,1.00,1.00,1.01,1371
15,1,2001,2001-01-01,1.00,1.00,1.00,1.00,1.00,1.01,1.00,1.00,1.00,1.00,1.00,1.01,1.00,1371
16,2,2001,2001-02-01,1.00,1.00,1.00,1.00,1.01,1.00,1.00,1.00,1.00,1.00,1.01,1.00,1.00,1371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495894,4,2024,2024-04-01,0.98,1.07,1.13,0.95,0.93,1.02,1.09,1.09,1.00,1.04,0.97,0.96,0.98,1549
495895,5,2024,2024-05-01,1.07,1.13,0.95,0.93,1.02,1.09,1.09,1.00,1.04,0.97,0.96,0.98,0.98,1549
495896,6,2024,2024-06-01,1.13,0.95,0.93,1.02,1.09,1.09,1.00,1.04,0.97,0.96,0.98,0.98,1.09,1549
495897,7,2024,2024-07-01,0.95,0.93,1.02,1.09,1.09,1.00,1.04,0.97,0.96,0.98,0.98,1.09,1.04,1549


In [284]:
# Sanity check: list the (date, ticker label) pairs whose next-month target is
# missing. An empty result means every row has a target.
df_tabular[df_tabular["m_return_target(t+1)"].isna()][["date", "stock_ticker_label"]]


Unnamed: 0,date,stock_ticker_label


In [285]:
# Methods
def generate_plot(df, y_train_pred, y_test_pred):
    """Show an interactive chart of actual returns against model predictions.

    Three line traces are drawn on one figure: the row-wise mean of ``df``
    (actual), ``y_train_pred`` (red) and ``y_test_pred`` (green). The green
    trace is prefixed with the last train value so the two prediction lines
    join up. A dashed vertical line is placed ``len(y_test_pred)`` months
    before the latest date found in ``df_tabular``.

    NOTE(review): the vertical marker reads the notebook-level ``df_tabular``
    rather than an argument, so that frame must exist before this is called.

    Args:
        df: DataFrame whose index provides the x values and whose row means
            form the "Actual returns" trace.
        y_train_pred: Predictions plotted against the first
            ``len(y_train_pred)`` index entries of ``df``.
        y_test_pred: Predictions plotted against the index entries that
            follow, starting at the last train position.

    Returns:
        None. The figure is displayed with ``show()``.
    """
    figure = go.Figure()
    x_values = df.index.tolist()
    horizon = len(y_test_pred)

    # Date string at which the dashed vertical marker is drawn.
    latest_date = pd.to_datetime(df_tabular['date'].max())
    marker_date = (latest_date - pd.DateOffset(months=horizon)).strftime('%Y-%m-%d')

    # Actual series: mean over all columns of df for every index entry.
    actual_trace = go.Scatter(
        x=x_values,
        y=df.mean(axis=1),
        mode='lines',
        name='Actual returns',
        line={'color': '#5c839f', 'width': 2},
    )

    n_train = len(y_train_pred)

    # Train predictions cover the first n_train index entries.
    train_trace = go.Scatter(
        x=x_values[:n_train],
        y=y_train_pred,
        mode='lines',
        name='Train returns',
        line={'color': 'red', 'width': 2},
    )

    # Test predictions start one step early, reusing the final train value,
    # so the green line connects to the red one.
    test_trace = go.Scatter(
        x=x_values[n_train - 1:],
        y=[y_train_pred[n_train - 1], *y_test_pred],
        mode='lines',
        name='Test returns',
        line={'color': 'green', 'width': 2},
    )

    for trace in (actual_trace, train_trace, test_trace):
        figure.add_trace(trace)

    figure.add_vline(x=marker_date, line_color='red', line_dash='dash', line_width=1)

    # Titles, axis formatting and theme.
    figure.update_layout(
        title='{0} Month Prediction vs Actual Plot'.format(horizon),
        xaxis={'title': 'Date'},
        yaxis={
            'title': 'Day closing return (%)',
            'tickformat': '.0%',
            'range': [0.75, 1.6],
        },
        legend={'title': "Legend"},
        template="plotly_white",
    )

    figure.show()

def _mean_per_group(flat_pred, group_size):
    """Average each consecutive run of ``group_size`` values of a flat array.

    Args:
        flat_pred: NumPy array of predictions whose length is a multiple of
            ``group_size``.
        group_size: Number of consecutive values averaged into one output value.

    Returns:
        pandas Series with ``len(flat_pred) // group_size`` means.

    Raises:
        ValueError: If ``len(flat_pred)`` is not a multiple of ``group_size``.
    """
    n_groups, leftover = divmod(len(flat_pred), group_size)
    if leftover:
        raise ValueError(
            "Expected a multiple of {0} predictions, got {1}".format(
                group_size, len(flat_pred)
            )
        )
    # One row per group, so the row-wise mean collapses each group to one value.
    return pd.DataFrame(flat_pred.reshape(n_groups, group_size)).mean(axis=1)


def get_tt_pred(y_train_pred_1m, y_test_pred_1m, n_series=1653):
    """Collapse flat train/test predictions into one mean value per group.

    Each input is cut into consecutive chunks of ``n_series`` values and every
    chunk is replaced by its mean.

    NOTE(review): this assumes the predictions are ordered so that each
    consecutive run of ``n_series`` values belongs to the same time step
    (presumably one value per stock ticker) — confirm against the row order
    produced by ``train_test.get_train_test``.

    Args:
        y_train_pred_1m: NumPy array of train predictions (e.g. the output of
            ``model.predict``).
        y_test_pred_1m: NumPy array of test predictions.
        n_series: Chunk size used for averaging. Defaults to 1653, the value
            previously hard-coded here.

    Returns:
        Tuple ``(y_train_pred, y_test_pred)`` of pandas Series holding the
        per-chunk means.

    Raises:
        ValueError: If ``n_series`` is not positive, or if either input length
            is not a multiple of ``n_series``.
    """
    if n_series <= 0:
        raise ValueError("n_series must be a positive integer, got {0}".format(n_series))

    y_train_pred = _mean_per_group(y_train_pred_1m, n_series)
    y_test_pred = _mean_per_group(y_test_pred_1m, n_series)

    return y_train_pred, y_test_pred

## Model

In [286]:
# Exploratory check: the date 30 months before the latest date in df_tabular.
# The splits in the cells below use months=60, 1, 6 and 12, not 30.
pd.to_datetime(df_tabular['date']).max() - pd.DateOffset(months=30)

Timestamp('2022-02-01 00:00:00')

In [287]:
# Pick up any edits to the helper module without restarting the kernel.
importlib.reload(train_test)

# Split used for fitting; `months=60` is forwarded to the project helper.
X_train, y_train, X_test, y_test = train_test.get_train_test(df_tabular, months=60)

# Gradient-boosted regressor with 1000 boosting rounds.
model = XGBRegressor(n_estimators=1000)

# Both splits are passed as evaluation sets; per-round logging is silenced.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False,
)

## 1 Month

In [288]:
# Horizon for this section, forwarded as `months` to the split helper.
months_1m = 1
X_train_1m, y_train_1m, X_test_1m, y_test_1m = train_test.get_train_test(df_tabular, months=months_1m)

In [289]:
# Predictions for the 1-month split, using the already fitted `model`.
# NOTE(review): `model` was fit on the split from get_train_test(..., months=60).
# If `months` sets the size of the held-out tail, X_train_1m contains rows the
# model never trained on, so the "train" predictions are partly out-of-sample —
# confirm against train_test.get_train_test.
y_train_pred_1m = model.predict(X_train_1m)
y_test_pred_1m = model.predict(X_test_1m)

#### Train-Data

In [290]:
# Collapse the flat per-row predictions into one mean value per group.
# The raw prediction arrays are overwritten by the aggregated Series, so this
# cell must not be re-run without first re-running the prediction cell.
y_train_pred_1m, y_test_pred_1m = get_tt_pred(y_train_pred_1m, y_test_pred_1m)

#### Actual vs Prediction plot

In [291]:
# Plot the mean actual returns of df against the aggregated 1-month predictions.
generate_plot(df, y_train_pred_1m, y_test_pred_1m)

## 6 Months

In [292]:
# Horizon for this section, forwarded as `months` to the split helper.
months_6m = 6
X_train_6m, y_train_6m, X_test_6m, y_test_6m = train_test.get_train_test(df_tabular, months=months_6m)

In [293]:
# Predictions for the 6-month split, using the already fitted `model`.
# NOTE(review): `model` was fit on the months=60 split, so part of X_train_6m
# may be data the model never trained on — confirm against
# train_test.get_train_test.
y_train_pred_6m = model.predict(X_train_6m)
y_test_pred_6m = model.predict(X_test_6m)

#### Train-Data

In [294]:
# Collapse the flat per-row predictions into one mean value per group.
# The raw prediction arrays are overwritten by the aggregated Series, so this
# cell must not be re-run without first re-running the prediction cell.
y_train_pred_6m, y_test_pred_6m = get_tt_pred(y_train_pred_6m, y_test_pred_6m)

#### Actual vs Prediction plot

In [295]:
# Plot the mean actual returns of df against the aggregated 6-month predictions.
generate_plot(df, y_train_pred_6m, y_test_pred_6m)

## 12 Months

In [296]:
# Horizon for this section, forwarded as `months` to the split helper.
months_12m = 12
X_train_12m, y_train_12m, X_test_12m, y_test_12m = train_test.get_train_test(df_tabular, months=months_12m)

In [297]:
# Predictions for the 12-month split, using the already fitted `model`.
# NOTE(review): `model` was fit on the months=60 split, so part of X_train_12m
# may be data the model never trained on — confirm against
# train_test.get_train_test.
y_train_pred_12m = model.predict(X_train_12m)
y_test_pred_12m = model.predict(X_test_12m)

#### Train-Data

In [298]:
# Collapse the flat per-row predictions into one mean value per group.
# The raw prediction arrays are overwritten by the aggregated Series, so this
# cell must not be re-run without first re-running the prediction cell.
y_train_pred_12m, y_test_pred_12m = get_tt_pred(y_train_pred_12m, y_test_pred_12m)

#### Actual vs Prediction plot

In [299]:
# Plot the mean actual returns of df against the aggregated 12-month predictions.
generate_plot(df, y_train_pred_12m, y_test_pred_12m)