In [1]:
import sys
import os

# Resolve the parent of the current working directory to an absolute path;
# the `src` package imported below is expected to be found there.
project_dir = os.path.abspath("..")

# Make that directory importable. The guard keeps sys.path free of duplicate
# entries when this cell is executed more than once.
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)
    
from src.predictionModule.FilterSamples import FilterSamples
from src.predictionModule.LoadupSamples import LoadupSamples

import pandas as pd
import numpy as np
import polars as pl
import datetime
import seaborn as sns
import lightgbm as lgb
import random
import matplotlib.pyplot as plt
import logging
import time
import re
import copy

# Timestamp (day, abbreviated month, two-digit year, hour, minute; lower-cased)
# used to give every notebook run its own log file name.
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

# Configure the ROOT logger so that records from imported modules are captured too.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")

# Detach and close every handler that is already installed. Re-running this
# cell used to drop the previous FileHandler from the list without closing it,
# leaking one open log file per execution.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: writes to stdout so messages show up in the cell output.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)

# Output file handler: mode="w" truncates a file of the same name, i.e. a
# re-run within the same minute starts the log file from scratch.
file_handler = logging.FileHandler(f"notebook-{formatted_date}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Usage
logger.info("This will print to the notebook's output cell")

2025-08-05 15:38:55,980 - This will print to the notebook's output cell


In [2]:
# Settings handed to LoadupSamples through its `params` argument.
# NOTE(review): the TreeTime_lstm_* entries are not read by any visible cell;
# presumably they configure an LSTM model elsewhere — confirm before pruning.
params = {
    # Sample construction
    "idxAfterPrediction": 5,
    "timesteps": 25,
    "target_option": "last",

    # Network size and layout
    "TreeTime_lstm_units": 64,
    "TreeTime_lstm_num_layers": 3,
    "TreeTime_lstm_dropout": 1e-5,
    "TreeTime_lstm_recurrent_dropout": 1e-5,

    # Optimisation
    "TreeTime_lstm_learning_rate": 1e-3,
    "TreeTime_lstm_optimizer": "adam",
    "TreeTime_lstm_bidirectional": True,
    "TreeTime_lstm_batch_size": 2 ** 14,
    "TreeTime_lstm_epochs": 4,

    # Regularisation
    "TreeTime_lstm_l1": 1e-5,
    "TreeTime_lstm_l2": 1e-5,
    "TreeTime_inter_dropout": 1e-5,
    "TreeTime_input_gaussian_noise": 1e-5,

    # Convolution front-end and loss
    "TreeTime_lstm_conv1d": True,
    "TreeTime_lstm_conv1d_kernel_size": 3,
    "TreeTime_lstm_loss": "mse",
}

In [3]:
# Identifier passed to the loader as its `group` argument.
stock_group = "group_finanTo2011"

# First date of the training range and the single date used as test date.
start_Date = datetime.date(2020, 1, 1)
eval_date = datetime.date(year=2025, month=5, day=1)

# Build the sample loader; nothing is read from disk until load_samples() is
# called in the next cell.
ls = LoadupSamples(
    params=params,
    group=stock_group,
    group_type="Tree",
    train_start_date=start_Date,
    test_dates=[eval_date],
)

In [4]:
# Load the samples from the given directory; the path is relative to the
# notebook's working directory.
ls.load_samples(
    main_path="../src/featureAlchemy/bin/",
)

In [5]:
# Options handed to FilterSamples through its `params` argument.
fs_params = {
    # Overwritten with the per-run training horizon by the evaluation loop below.
    "FilterSamples_days_to_train_end": 365,

    # Settings of the "lincomb" filter.
    # NOTE(review): the long decimals look like the output of a hyper-parameter
    # search — confirm their origin before hand-editing them.
    "FilterSamples_lincomb_q_up": 0.99,
    "FilterSamples_lincomb_lr": 0.0008384895113158295,
    "FilterSamples_lincomb_epochs": 800,
    "FilterSamples_lincomb_probs_noise_std": 0.010503627436184224,
    "FilterSamples_lincomb_subsample_ratio": 0.2424109747001177,
    "FilterSamples_lincomb_sharpness": 0.59226051089996,
    "FilterSamples_lincomb_featureratio": 0.33269072850403053,
    "FilterSamples_lincomb_itermax": 2,
    "FilterSamples_lincomb_show_progress": True,
    "FilterSamples_lincomb_init_toprand": 2,
}

In [6]:
# Training features and targets exactly as exposed by the loader.
Xtree_all, ytree_all = ls.train_Xtree, ls.train_ytree

# The "date" column of the loader's training metadata, as a polars Series.
# NOTE(review): presumably one entry per row of Xtree_all / ytree_all — confirm.
dates = (
    ls.meta_pl_train
    .select(pl.col("date"))
    .to_series()
)

# Reference cut-off for the training data and a default test-window length in
# days (about one month); the evaluation cell below assigns its own test_days.
end_train_date = datetime.date(year=2025, month=4, day=1)
test_days = 30
        
def est_samples(X_all, y_all, dates: pl.Series, end_tr_date, test_days):
    """Split samples chronologically into a training part and a test window.

    The training part holds every sample dated from the earliest entry of
    ``dates`` up to and including ``end_tr_date``. The test window spans
    ``end_tr_date + 1 day`` through ``end_tr_date + test_days`` days, both
    ends inclusive. Samples whose date is null end up in neither part.

    Args:
        X_all: Feature container that is indexed with a boolean NumPy mask
            (presumably an array row-aligned with ``dates`` — confirm).
        y_all: Target container, indexed with the same masks as ``X_all``.
        dates: Series holding one date per sample.
        end_tr_date: Last day (inclusive) of the training part.
        test_days: Number of days added to ``end_tr_date`` to obtain the last
            day of the test window.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test, dates_train,
        dates_test)``.
    """

    def _inclusive_window(first_day, last_day):
        # Boolean NumPy mask of the samples dated within [first_day, last_day];
        # null comparisons are mapped to False.
        inside = (dates >= first_day) & (dates <= last_day)
        return inside.fill_null(False).to_numpy()

    earliest_day = dates.min()
    test_first_day = end_tr_date + datetime.timedelta(days=1)
    test_last_day = end_tr_date + datetime.timedelta(days=test_days)

    in_train = _inclusive_window(earliest_day, end_tr_date)
    in_test = _inclusive_window(test_first_day, test_last_day)

    X_train, X_test = X_all[in_train], X_all[in_test]
    y_train, y_test = y_all[in_train], y_all[in_test]
    dates_train, dates_test = dates.filter(in_train), dates.filter(in_test)

    return X_train, X_test, y_train, y_test, dates_train, dates_test

In [None]:
# Walk-forward evaluation: five training cut-offs spaced about 60 days apart,
# each jittered by up to +/-10 days, counted back from `end_train_date`.
# NOTE(review): `random` is not seeded in this notebook, so the cut-offs (and
# therefore the scores) differ from run to run.
end_train_dates = sorted([
    end_train_date - datetime.timedelta(days=i*60 + random.randint(-10,10)) 
    for i in range(5)
])

# Loop-invariant settings, hoisted out of the loop.
split_f = 0.90
train_days = 390  # roughly 13 months of training data
# round() instead of int(): 390 * (1 - 0.90) evaluates to 38.99999999999999 in
# floating point, which int() truncated to 38 instead of the intended 39.
test_days = round(train_days * (1 - split_f))

test_scores = []
for end_date in end_train_dates:
    # Chronological split: everything up to `end_date` is used for training,
    # the following `test_days` days form the test window.
    (
        Xtree_train,
        Xtree_test,
        ytree_train,
        ytree_test,
        samples_dates_train,
        samples_dates_test
    ) = est_samples(Xtree_all, ytree_all, dates, end_date, test_days=test_days)

    fs_params["FilterSamples_days_to_train_end"] = train_days
    fs = FilterSamples(
        Xtree_train = Xtree_train,
        ytree_train = ytree_train,
        treenames = ls.featureTreeNames,
        Xtree_test = Xtree_test,
        samples_dates_train = samples_dates_train,
        samples_dates_test = samples_dates_test,
        ytree_test = ytree_test,
        params = fs_params
    )

    # Only the last two of the four returned values (train and test score) are used.
    _, _, s_tr, s_te = fs.categorical_masks()
    # NOTE(review): a missing test score, or one below 0.5, is recorded as 1.0.
    # Presumably 1.0 is the neutral value of the score — confirm this is intended.
    s_te = 1.0 if s_te is None or s_te < 0.5 else s_te
    test_scores.append(s_te)

    # Keep only real numbers for the running mean. The previous filter,
    # `~np.isnan(s) or s is not None`, let every NaN through (turning the mean
    # into NaN) and would have raised on None, since np.isnan ran first.
    valid_scores = [s for s in test_scores if s is not None and not np.isnan(s)]
    mean_test_score = np.mean(valid_scores) if valid_scores else float("nan")

    logger.info("")
    logger.info(f"END DATE {end_date}")
    logger.info(f"Mean of all test scores: {mean_test_score}")
    logger.info("")

    logger.info(f"Training score: {(s_tr)}, testing score: {(s_te)}")

