In [19]:
import sys
import os

# Resolve the parent of the current working directory to an absolute path;
# presumably this is the repository root that contains the `src` package.
project_dir = os.path.abspath(os.pardir)

# Register that directory on the import path only when it is not there yet,
# so re-running this cell never adds a duplicate entry.
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)
    
from src.predictionModule.FilterSamples import FilterSamples
from src.predictionModule.LoadupSamples import LoadupSamples

import pandas as pd
import numpy as np
import polars as pl
import datetime
import seaborn as sns
import lightgbm as lgb
import random
import matplotlib.pyplot as plt
import logging
import time
import re
import copy

# Run timestamp such as "28aug25_1541"; it gives every run its own log file name.
formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

# One shared layout for console and file records: "<timestamp> - <message>".
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")

# Configure the root logger (getLogger() without a name) at DEBUG level.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Detach AND close every handler that is currently installed before adding new
# ones. Merely overwriting `logger.handlers` (as a re-run of this cell used to
# do) drops the previous FileHandler without closing it, leaving its log file
# open for the lifetime of the kernel.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: writes to stdout so records show up in the cell output.
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
logger.addHandler(handler)

# Output file handler: mode="w" truncates an existing file with the same
# minute-resolution name instead of appending to it.
file_handler = logging.FileHandler(f"notebook-{formatted_date}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Smoke test: this record should appear below the cell and in the log file.
logger.info("This will print to the notebook's output cell")

2025-08-28 15:41:59,543 - This will print to the notebook's output cell


In [None]:
# Single configuration dictionary handed to LoadupSamples and FilterSamples.
# Keys are grouped by their prefix; the per-key semantics live in the consuming
# modules and are not restated here.
params = {
    # --- Target / sample construction (presumably read by the loader; verify) ---
    "daysAfterPrediction": None,
    "idxAfterPrediction": 5,
    "timesteps": 60,
    "target_option": "mean",

    # --- LoadupSamples_* options ---
    "LoadupSamples_tree_scaling_standard": True,
    "LoadupSamples_time_scaling_stretch": True,
    "LoadupSamples_time_inc_factor": 10,

    # --- LSTM_* hyperparameters ---
    # NOTE(review): no LSTM model is built in the cells visible here; these are
    # presumably consumed elsewhere -- confirm before pruning.
    "LSTM_units": 32,
    "LSTM_num_layers": 1,
    "LSTM_dropout": 1e-3,
    "LSTM_recurrent_dropout": 1e-3,
    "LSTM_learning_rate": 1e-5,
    "LSTM_optimizer": "adam",
    "LSTM_bidirectional": True,
    "LSTM_batch_size": 4096,  # 2**12
    "LSTM_epochs": 2,
    "LSTM_l1": 1e-4,
    "LSTM_l2": 1e-4,
    "LSTM_inter_dropout": 1e-4,
    "LSTM_input_gaussian_noise": 1e-4,
    "LSTM_conv1d": True,
    "LSTM_conv1d_kernel_size": 3,
    "LSTM_loss": "mse",

    # --- FilterSamples_* : quantile and lincomb options ---
    "FilterSamples_q_up": 0.9,
    "FilterSamples_lincomb_epochs": 5,
    "FilterSamples_lincomb_probs_noise_std": 0.05,
    "FilterSamples_lincomb_subsample_ratio": 0.5,
    "FilterSamples_lincomb_sharpness": 0.6,
    "FilterSamples_lincomb_show_progress": False,
    "FilterSamples_lincomb_featureratio": 0.5,
    "FilterSamples_lincomb_itermax": 2,
    "FilterSamples_lincomb_init_toprand": 1,

    # --- FilterSamples_* : train window, categorical and Taylor options ---
    "FilterSamples_days_to_train_end": 10,
    "FilterSamples_cat_over20": True,
    "FilterSamples_cat_posOneYearReturn": False,
    "FilterSamples_cat_posFiveYearReturn": False,
    "FilterSamples_taylor_horizon_days": 20,
    "FilterSamples_taylor_roll_window_days": 20,
}

In [21]:
# Identifiers of the stored feature groups passed to LoadupSamples.
treegroup = "group_debug"
timegroup = "group_regOHLCV_over5years"

# Key dates of the experiment.
start_train_date = datetime.date(2023, 1, 1)
split_Date = datetime.date(2025, 1, 1)
eval_date = datetime.date(2025, 6, 13)

# The five calendar days immediately before eval_date, most recent first.
# NOTE(review): these are calendar days, not trading days, so the list can
# contain dates missing from the database (the loader logs and omits them).
evaldates = [eval_date - datetime.timedelta(days=offset) for offset in range(1, 6)]

# Build the loader, read the stored samples from disk, then split them.
ls = LoadupSamples(
    params=params,
    treegroup=treegroup,
    timegroup=timegroup,
    train_start_date=start_train_date,
    test_dates=evaldates,
)
ls.load_samples(main_path="../src/featureAlchemy/bin/")
ls.split_dataset(
    start_date=start_train_date,
    last_train_date=split_Date,
    last_test_date=eval_date,
)

2025-08-28 15:42:03,725 - Test date 2025-06-08 not found in the database. Omitting.


In [22]:
# Take the splits from the loader. Re-reading them from `ls` at the top of the
# cell means the masks below are always applied to the loader's arrays, so the
# cell can be re-run without filtering twice.
train_Xtree, train_ytree = ls.train_Xtree, ls.train_ytree
train_Xtime, train_ytime = ls.train_Xtime, ls.train_ytime

test_Xtree, test_ytree = ls.test_Xtree, ls.test_ytree
test_Xtime, test_ytime = ls.test_Xtime, ls.test_ytime

treenames, timenames = ls.featureTreeNames, ls.featureTimeNames
meta_pl_train, meta_pl_test = ls.meta_pl_train, ls.meta_pl_test

# First pass: a FilterSamples built on the unfiltered splits, used only to
# obtain the categorical row masks.
fs_pre = FilterSamples(
    Xtree_train=train_Xtree,
    ytree_train=train_ytree,
    meta_train=meta_pl_train,
    Xtree_test=test_Xtree,
    ytree_test=test_ytree,
    meta_test=meta_pl_test,
    treenames=treenames,
    params=params,
)
cat_mask_train, cat_mask_test = fs_pre.categorical_masks()

# Apply each mask to every array of its split so tree features, time features
# and targets stay row-aligned.
train_Xtree, train_ytree, train_Xtime, train_ytime = [
    split[cat_mask_train]
    for split in (train_Xtree, train_ytree, train_Xtime, train_ytime)
]
test_Xtree, test_ytree, test_Xtime, test_ytime = [
    split[cat_mask_test]
    for split in (test_Xtree, test_ytree, test_Xtime, test_ytime)
]

# The metadata frames are filtered with the same masks, wrapped as polars Series.
meta_pl_train = meta_pl_train.filter(pl.Series(cat_mask_train))
meta_pl_test = meta_pl_test.filter(pl.Series(cat_mask_test))

# Second pass: the FilterSamples instance used by the following cells, built on
# the category-filtered data.
fs = FilterSamples(
    Xtree_train=train_Xtree,
    ytree_train=train_ytree,
    meta_train=meta_pl_train,
    Xtree_test=test_Xtree,
    ytree_test=test_ytree,
    meta_test=meta_pl_test,
    treenames=treenames,
    params=params,
)


In [23]:
# Compute the lincomb sample masks for the train and test splits.
lincomb_mask_train, lincomb_mask_test = fs.lincomb_masks()

# Score each mask against the dates and tree targets of its own split.
score_train = fs.evaluate_mask(
    lincomb_mask_train,
    meta_pl_train["date"],
    train_ytree,
)
score_test = fs.evaluate_mask(
    lincomb_mask_test,
    meta_pl_test["date"],
    test_ytree,
)

# Report both scores, framed by empty records for readability in the log.
logger.info("")
logger.info(f"Mean of all train scores: {score_train}")
logger.info(f"Mean of all test scores: {score_test}")
logger.info("")

2025-08-28 15:42:04,841 - FilterSamples: (train) mean of y values 1.003165115940096.
2025-08-28 15:42:04,848 - FilterSamples: (test) mean of y values 1.002741455933534.
2025-08-28 15:42:04,850 - FilterSamples: Starting Lincomb
2025-08-28 15:42:04,855 -   FilterSamples: Lincomb Iteration 0/2 running.
2025-08-28 15:42:04,957 - FilterSamples: Best init score 1.0173
2025-08-28 15:42:05,016 -   Mean target (train): 1.0196064656231731
2025-08-28 15:42:05,022 -   Number of features selected: 454
2025-08-28 15:42:05,022 -   Max distance between idces: 35
2025-08-28 15:42:05,023 -   Mean distance between idces: 8.933333333333334
2025-08-28 15:42:05,024 -   Median distance between idces: 8.0
2025-08-28 15:42:05,032 -   Mean target (test): 1.007680621914807
2025-08-28 15:42:05,037 -   Fraction of days with no coverage by test mask: 9.01%
2025-08-28 15:42:05,037 -   w quantile     : 6.108936786651611
2025-08-28 15:42:05,037 -   w_test quantile: 4.538480758666992
2025-08-28 15:42:05,039 -     Top-0

In [24]:
# Compute the Taylor-feature sample masks for the train and test splits.
taylor_mask_train, taylor_mask_test = fs.taylor_feature_masks()

# NOTE(review): the original cell stores these Taylor masks under the
# `lincomb_mask_*` names, replacing the lincomb masks computed earlier. The
# rebinding is kept so any later cell sees the same globals as before; prefer
# the `taylor_mask_*` names in new code.
lincomb_mask_train, lincomb_mask_test = taylor_mask_train, taylor_mask_test

# Score each mask against the dates and tree targets of its own split.
score_train = fs.evaluate_mask(
    taylor_mask_train,
    meta_pl_train["date"],
    train_ytree,
)
score_test = fs.evaluate_mask(
    taylor_mask_test,
    meta_pl_test["date"],
    test_ytree,
)

# Report both scores, framed by empty records for readability in the log.
logger.info("")
logger.info(f"Mean of all train scores: {score_train}")
logger.info(f"Mean of all test scores: {score_test}")
logger.info("")

2025-08-28 15:42:05,278 - FilterSamples/Taylor: (train) mean of y values 1.003165115940096.
2025-08-28 15:42:05,282 - FilterSamples/Taylor: (test) mean of y values 1.002741455933534.
2025-08-28 15:42:05,284 - FilterSamples/Taylor: scoring features via rolling-mean Taylor approximation.
2025-08-28 15:42:05,286 -   Taylor score feat[10] FinData_quar_grossProfit_nivRev: x0=0.992583, slope=0.000000, t=6, score=0.992583
2025-08-28 15:42:05,287 -   Taylor score feat[11] FinData_quar_ebit_nivRev: x0=0.982255, slope=0.000000, t=6, score=0.982255
2025-08-28 15:42:05,288 -   Taylor score feat[12] FinData_quar_ebitda_nivRev: x0=0.984034, slope=0.000000, t=6, score=0.984034
2025-08-28 15:42:05,289 -   Taylor score feat[13] FinData_quar_totalAssets_nivRev: x0=0.998664, slope=0.000000, t=6, score=0.998664
2025-08-28 15:42:05,290 -   Taylor score feat[14] FinData_quar_totalCurrentLiabilities_nivRev: x0=0.993916, slope=0.000000, t=6, score=0.993916
2025-08-28 15:42:05,293 -   Taylor score feat[15] Fin