Creates data for days 0-750, fits a forecaster on it to predict log returns, then exports the model and attempts to run it in a simulation without retraining.

In [1]:
import glob
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from IPython.display import display
from skforecast.plot import set_dark_theme
from sklearn.multioutput import MultiOutputRegressor
from tqdm import tqdm

import sklearn
import skforecast
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from skforecast.recursive import ForecasterRecursive, ForecasterRecursiveMultiSeries
from skforecast.model_selection import (
    TimeSeriesFold,
    OneStepAheadFold,
    backtesting_forecaster,
    bayesian_search_forecaster,
    backtesting_forecaster_multiseries,
    bayesian_search_forecaster_multiseries
)
from skforecast.preprocessing import RollingFeatures, series_long_to_dict, exog_long_to_dict
from skforecast.exceptions import OneStepAheadValidationWarning

import warnings

In [2]:
# Load the 750-day price file. The transpose puts instruments on axis 0 and
# days on axis 1 (the code below reads `prices750Days[0, -8:]` as "inst 0").
prices750Days = np.loadtxt("../../sourceCode/prices.txt").T

from strategies.ms_forecasting.main import createGreeksManager
from strategies.ms_forecasting.main import GreeksManager
from strategies.ms_forecasting.main import PRICE_LAGS
from strategies.ms_forecasting.main import WINDOW_SIZES

# Lag count and rolling-feature window handed to the forecaster later on.
LAGS = 100
ROLLING_FEATURES_SIZE = 100

# Warm-up days skipped before training: the largest look-back among the greek
# lags/windows and the forecaster's own lags and rolling window.
# NOTE(review): assumes PRICE_LAGS and WINDOW_SIZES are lists, so `+` concatenates - confirm.
preTrainingDays = max(max(PRICE_LAGS + WINDOW_SIZES), LAGS, ROLLING_FEATURES_SIZE)
firstTrainingDay = preTrainingDays + 1
# 749 because 1 extra day is needed for the windows; this makes
# firstTrainingDay + trainingDataWindowSize == 750, the exclusive stop of exogIndex.
trainingDataWindowSize = 749 - preTrainingDays

print(f"PreTrainingDays        = {preTrainingDays}")
print(f"trainingDataWindowSize = {trainingDataWindowSize}")

# NOTE(review): the saved output of this cell stops after the two prints above with
# "IndexError: index 750 is out of bounds for axis 1 with size 750". The raising call
# is not identifiable from this cell alone - check createGreeksManager /
# getGreeksHistoryDict against a 750-column price array.
gm: GreeksManager = createGreeksManager(prices750Days, trainingDataWindowSize)

# One exog row per training day, labelled firstTrainingDay .. 749.
exogIndex = pd.RangeIndex(firstTrainingDay, firstTrainingDay + trainingDataWindowSize)
exogDict = gm.getGreeksHistoryDict(exogIndex)

# Visual sanity checks on the first instrument.
print("Inst 0 days [742, 750]:\n", prices750Days[0, -8:])
display("inst 0 head:", exogDict["inst_0"].head(1))
display("inst 0 tail:", exogDict["inst_0"].tail(1))
# Inner subscripts use single quotes: reusing the outer quote character inside an
# f-string expression is only valid syntax from Python 3.12 onwards.
print(f"each inst-greek has {len(exogDict['inst_0']['greek_lag_1'])} days")

# Greeks for the first out-of-sample day (750), i.e. the exog for the first prediction.
display("exogDict: ", gm.getGreeksDict(pd.RangeIndex(750, 751))["inst_0"])

PreTrainingDays        = 248
trainingDataWindowSize = 501


IndexError: index 750 is out of bounds for axis 1 with size 750

In [None]:
# Load the 1000-day price file, transposed to (instrument, day) like prices750Days.
prices1000Days = np.loadtxt("../../sourceCode/1000Prices.txt").T

# Eyeball check that both files agree around the end of the 750-day file
# (the second slice is shorter if prices750Days only holds 750 days).
print(prices1000Days[0, 747:752])
print(prices750Days[0, 747:752])

# Log returns over the full history, starting at the last warm-up day so that
# column k holds log(price[preTrainingDays + k + 1] / price[preTrainingDays + k]).
logReturnsPricesWindow = prices1000Days[:, preTrainingDays:]
logReturnsNp = np.log(logReturnsPricesWindow[:, 1:] / logReturnsPricesWindow[:, :-1])

# Each return is labelled with the day of its *later* price, so the first label
# is firstTrainingDay and the last is 999.
logReturnsIndex = pd.RangeIndex(firstTrainingDay, 1000)
logReturnsSeries = pd.DataFrame(logReturnsNp.T,
                          index = logReturnsIndex,
                          columns = [f"inst_{i}" for i in range(logReturnsNp.shape[0])])


# Create the training window separately so as not to leak data: it is built
# from the 750-day file only.
logReturnsTrainingWindow = prices750Days[:, preTrainingDays:]
logReturnsTrainingNp = np.log(logReturnsTrainingWindow[:, 1:] / logReturnsTrainingWindow[:, :-1])

logReturnsTrainingIndex = pd.RangeIndex(firstTrainingDay, 750)
logReturnsTrainingSeries = pd.DataFrame(logReturnsTrainingNp.T,
                          index = logReturnsTrainingIndex,
                          columns = [f"inst_{i}" for i in range(logReturnsTrainingNp.shape[0])])

display("Log returns training series tail:", logReturnsTrainingSeries.tail())
display("Log returns series around day 750 (Make sure it matches the above + extra):", logReturnsSeries.loc[745:750])
display("Log returns tail (1) (should be index 999):", logReturnsSeries.tail(1))

# Recompute the last training return by hand for instrument 0. The labels use
# the same 0-based day numbering as the series index: the return labelled 749
# is log(price[749] / price[748]).
print("Inst_0:")
priceA = prices750Days[0, 748]
priceB = prices750Days[0, 749]
logReturnAToB = np.log(priceB / priceA)
print(f"Price on day 748 = {priceA}")
print(f"Price on day 749 = {priceB}")
print(f"The log return for day 749 = {logReturnAToB}")

# The target series and the exogenous features must start on the same day.
assert logReturnsTrainingSeries.index[0] == exogDict['inst_0'].index[0]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Last look at the inputs before fitting: tail of the exog features, tail of
# the training targets, the closing prices, and the greeks for day 750.
display(exogDict["inst_0"].tail(3))
display(logReturnsTrainingSeries.tail(3))
print("inst_0 prices days 745: = ", prices750Days[0, 745:])
display(gm.getGreeksDict(pd.RangeIndex(750, 751))["inst_0"])

# Gradient-boosted trees as the underlying regressor.
# Alternative tried earlier: HistGradientBoostingRegressor(random_state=8523, learning_rate=0.05)
xgbRegressor = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    random_state=8523,
    verbosity=0,
)

# Rolling min/max of the target over the last ROLLING_FEATURES_SIZE observations.
rollingMinMax = RollingFeatures(
    stats=["min", "max"],
    window_sizes=ROLLING_FEATURES_SIZE,
)

# One global model across all instruments; only the exogenous features are
# scaled, the log-return series themselves are left untransformed.
logReturnsForecaster = ForecasterRecursiveMultiSeries(
    regressor=xgbRegressor,
    transformer_series=None,
    transformer_exog=StandardScaler(),
    lags=LAGS,
    window_features=rollingMinMax,
)

logReturnsForecaster.fit(series=logReturnsTrainingSeries, exog=exogDict)

# Notebook echo: shows whether the fit completed.
logReturnsForecaster.is_fitted

# Simulate updating the greeks daily and making a one step prediction:

In [None]:
# Walk forward over the held-out days 750..999 without refitting: for each day,
# predict one step ahead from information available up to day-1, then reveal
# that day's prices to the greeks manager.
predictions = []

# Number of trailing observations `predict` needs in `last_window`. The
# forecaster's `window_size` covers both the lags and the rolling window
# features, whereas `max(lags)` alone would be too short whenever
# ROLLING_FEATURES_SIZE exceeds LAGS.
requiredWindowSize = logReturnsForecaster.window_size

for day in range(750, 1000):

    # Exogenous features for the day being predicted.
    predictionExogs = gm.getGreeksDict(pd.RangeIndex(day, day+1))
    # `.loc` slicing is label-inclusive, so this ends at day-1 and never sees
    # the return being predicted.
    last_window = logReturnsSeries.loc[:day-1].tail(requiredWindowSize)

    prediction = logReturnsForecaster.predict(
        last_window = last_window,
        exog        = predictionExogs,
        steps       = 1,
        levels      = list(last_window.columns),
    )

    # NOTE(review): assumes the "pred" column is ordered like `levels`, so that
    # position i corresponds to inst_i - confirm against the skforecast version in use.
    predictions.append(prediction["pred"].values)

    # Only now feed the day's prices in, so the next iteration's greeks include them.
    gm.updateGreeks(prices1000Days[:, day])


In [None]:
def plotPredictedVsActualLogReturns(predictions, actualLogReturns, instrumentIndex, startDay=750):
    """Plot predicted against realised log returns for one instrument.

    Args:
        predictions: Sequence with one entry per predicted day; each entry is
            indexable by instrument position and yields that day's prediction.
        actualLogReturns: 2-D array indexed as [instrument, day offset] holding
            the realised log returns for the same days as ``predictions``.
        instrumentIndex: Position of the instrument to plot.
        startDay: Day number shown on the x-axis for the first prediction.
            Defaults to 750, the first out-of-sample day used in this notebook.

    The x-axis length follows the number of predictions instead of being fixed
    to days 750-999, so partial or longer simulation runs can be plotted too.
    """
    instrumentPredictions = np.array([prediction[instrumentIndex] for prediction in predictions])
    actualInstrumentLogReturns = actualLogReturns[instrumentIndex, :]

    print(f"instrumentPredictions shape = {instrumentPredictions.shape}")
    print(f"actualInstrumentLogReturns shape = {actualInstrumentLogReturns.shape}")

    # One x value per prediction, starting at startDay.
    days = np.arange(startDay, startDay + len(instrumentPredictions))

    plt.figure(figsize=(14, 8))
    plt.plot(days, instrumentPredictions, label = "Predictions")
    plt.plot(days, actualInstrumentLogReturns, label = "Actual")
    plt.xlabel("day")
    plt.ylabel("Log Return")
    plt.legend()
    plt.grid(True)
    plt.show()

# Realised log returns for days 750..999: each day's price divided by the
# previous day's price, for every instrument.
pricesOnDay = prices1000Days[:, 750:]
pricesOnPreviousDay = prices1000Days[:, 749:-1]
actualLogReturns = np.log(pricesOnDay / pricesOnPreviousDay)

# Compare the walk-forward predictions with reality for the first instrument.
plotPredictedVsActualLogReturns(predictions, actualLogReturns, instrumentIndex=0)