In [1]:
import sys
import os

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)
    
from src.predictionModule.FilterSamples import FilterSamples
from src.predictionModule.LoadupSamples import LoadupSamples

import pandas as pd
import numpy as np
import polars as pl
import datetime
import seaborn as sns
import lightgbm as lgb
import random
import matplotlib.pyplot as plt
import logging
import time
import re
import copy

formatted_date = datetime.datetime.now().strftime("%d%b%y_%H%M").lower()

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter(fmt="%(asctime)s - %(message)s")
handler.setFormatter(formatter)
if not logger.hasHandlers():
    logger.addHandler(handler)
else:
    logger.handlers[:] = [handler]

#Output File handler
file_handler = logging.FileHandler(f"notebook-{formatted_date}.log", mode="w")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Usage
logger.info("This will print to the notebook's output cell")

2025-07-17 18:27:38,295 - This will print to the notebook's output cell


In [2]:
params = {
    "idxAfterPrediction": 5,
    'timesteps': 25,
    'target_option': 'last',
    
    "TreeTime_lstm_units": 64,
    "TreeTime_lstm_num_layers": 3,
    "TreeTime_lstm_dropout": 0.00001,
    "TreeTime_lstm_recurrent_dropout": 0.00001,
    "TreeTime_lstm_learning_rate": 0.001,
    "TreeTime_lstm_optimizer": "adam",
    "TreeTime_lstm_bidirectional": True,
    "TreeTime_lstm_batch_size": 2**14,
    "TreeTime_lstm_epochs": 4,
    "TreeTime_lstm_l1": 0.00001,
    "TreeTime_lstm_l2": 0.00001,
    "TreeTime_inter_dropout": 0.00001,
    "TreeTime_input_gaussian_noise": 0.00001,
    "TreeTime_lstm_conv1d": True,
    "TreeTime_lstm_conv1d_kernel_size": 3,
    "TreeTime_lstm_loss": "mse",
}

In [3]:
stock_group = "group_snp500_finanTo2011"

eval_date = datetime.date(2025,  2,  1)

start_Date = datetime.date(year=2020, month=1, day=1)

ls = LoadupSamples(
    train_start_date=start_Date,
    test_dates=[eval_date],
    group=stock_group,
    params=params,
)

In [4]:
ls.load_samples(main_path="../src/featureAlchemy/bin/")

2025-07-17 18:28:20,565 - Test date 2025-02-01 not found in the database. Resetting to last trading day.


In [8]:
fs_params= {
    "FilterSamples_days_to_train_end": 365,
    
    "FilterSamples_lincomb_q_up": 0.95,
    "FilterSamples_lincomb_lr": 0.0008384895113158295,
    "FilterSamples_lincomb_epochs": 800,
    "FilterSamples_lincomb_probs_noise_std": 0.010503627436184224,
    "FilterSamples_lincomb_subsample_ratio": 0.2424109747001177,
    "FilterSamples_lincomb_sharpness": 0.59226051089996,
    "FilterSamples_lincomb_featureratio": 0.33269072850403053,
    "FilterSamples_lincomb_itermax": 2,
    "FilterSamples_lincomb_show_progress": True,
}

In [9]:
dates = ls.meta_pl_train.select(pl.col("date")).to_series()
Xtree_all = ls.train_Xtree
ytree_all = ls.train_ytree

end_train_date = datetime.date(2024,  2,  1)
test_days = 30  # 1 month of testing data
        
def est_samples(X_all, y_all, dates: pl.Series, end_train_date, test_days):
    train_start = dates.min()
    train_end   = end_train_date

    test_start  = end_train_date + datetime.timedelta(days=1)
    test_end    = end_train_date + datetime.timedelta(days=test_days)

    # slice train
    train_mask = (
        (dates >= train_start) &
        (dates <= train_end)
    ).fill_null(False).to_numpy()

    # slice test
    test_mask = (
        (dates >= test_start) &
        (dates <= test_end)
    ).fill_null(False).to_numpy()

    return (
        X_all[train_mask],
        X_all[test_mask],
        y_all[train_mask],
        y_all[test_mask],
        dates.filter(train_mask),
        dates.filter(test_mask)
    )

In [10]:
end_train_dates = sorted([
    datetime.date(2024,  2,  1) - datetime.timedelta(days=i*60 + random.randint(-10,10)) 
    for i in range(5)
])

test_scores = []
for i, end_date in enumerate(end_train_dates):
    split_f = 0.90
    train_days = 390  # 2 years of training data
    test_days = int(train_days * (1-split_f)) 
    
    (
        Xtree_train,
        Xtree_test,
        ytree_train,
        ytree_test,
        samples_dates_train,
        samples_dates_test
    ) = est_samples(Xtree_all, ytree_all, dates, end_date, test_days=test_days)
            
    fs_params["FilterSamples_days_to_train_end"] = train_days
    fs = FilterSamples(
        Xtree_train = Xtree_train,
        ytree_train = ytree_train,
        treenames = ls.featureTreeNames,
        Xtree_test = Xtree_test,
        samples_dates_train = samples_dates_train,
        samples_dates_test = samples_dates_test,
        ytree_test = ytree_test,
        params = fs_params
    )
    
    _, _, s_tr, s_te = fs.run()
    s_te = 1.0 if s_te is None or s_te < 0.5 else s_te
    test_scores.append(s_te)
    
    logger.info(f"")
    logger.info(f"END DATE {end_date}")
    logger.info(f"Mean of all test scores: {np.mean([s for s in test_scores if ~np.isnan(s) or s is not None])}")
    logger.info("")
    
    logger.info(f"Training score: {(s_tr)}, testing score: {(s_te)}")



2025-07-17 18:30:09,561 - FilterSamples: (train) mean of y values 1.002645514566944.
2025-07-17 18:30:09,561 - FilterSamples: (test) mean of y values 1.0077879975184139.
2025-07-17 18:30:09,567 - FilterSamples: Starting Lincomb
2025-07-17 18:30:09,573 -   FilterSamples: Lincomb Iteration 0/2 running.
2025-07-17 18:30:10,643 - FilterSamples: Best init score 1.0090


Epochs: 100%|██████████| 800/800 [00:04<00:00, 184.81it/s, mean_perdate_v=1.0153, mean_v=1.0149]


2025-07-17 18:30:16,291 -   Mean target (train): 1.0209358526935177
2025-07-17 18:30:16,292 -   Number of features selected: 455
2025-07-17 18:30:16,292 -   Max distance between idces: 230
2025-07-17 18:30:16,292 -   Mean distance between idces: 19.97417548226509
2025-07-17 18:30:16,294 -   Median distance between idces: 8.0
2025-07-17 18:30:16,316 -   Mean target (test): 1.0069624823585375
2025-07-17 18:30:16,318 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:30:16,320 -   w quantile     : 18.9870662689209
2025-07-17 18:30:16,322 -   w_test quantile: 10.236125946044922
2025-07-17 18:30:16,322 -     Top-0 feature: Fourier_ReturnLog_RSMECoeff_4_MH_8
2025-07-17 18:30:16,323 -     Top-1 feature: Fourier_Price_RSMERatioCoeff_1_MH_6
2025-07-17 18:30:16,324 -     Top-2 feature: FinData_quar_ebit_nivRevLag_qm4
2025-07-17 18:30:16,324 -     Top-3 feature: FinData_quar_commonStockSharesOutstanding_nivRev
2025-07-17 18:30:16,325 -     Top-4 feature: Fourier_Price_RSMERat

Epochs: 100%|██████████| 800/800 [00:04<00:00, 193.77it/s, mean_perdate_v=1.0119, mean_v=1.0127]


2025-07-17 18:30:21,772 -   Mean target (train): 1.0159592184944113
2025-07-17 18:30:21,772 -   Number of features selected: 263
2025-07-17 18:30:21,776 -   Max distance between idces: 163
2025-07-17 18:30:21,776 -   Mean distance between idces: 19.97417548226509
2025-07-17 18:30:21,776 -   Median distance between idces: 8.0
2025-07-17 18:30:21,787 -   Mean target (test): 1.0078734614090434
2025-07-17 18:30:21,791 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:30:21,791 -   w quantile     : 13.601014137268066
2025-07-17 18:30:21,791 -   w_test quantile: 8.556903839111328
2025-07-17 18:30:21,794 -     Top-0 feature: Fourier_ReturnLog_RSMECoeff_4_MH_8
2025-07-17 18:30:21,794 -     Top-1 feature: FinData_quar_commonStockSharesOutstanding_nivRev
2025-07-17 18:30:21,794 -     Top-2 feature: FinData_quar_ebit_nivRevLag_qm4
2025-07-17 18:30:21,794 -     Top-3 feature: FeatureTA_volatility_atr
2025-07-17 18:30:21,794 -     Top-4 feature: FinData_quar_ebitda_nivRevLag_q

Epochs: 100%|██████████| 800/800 [00:04<00:00, 188.53it/s, mean_perdate_v=1.0161, mean_v=1.0171]


2025-07-17 18:30:29,503 -   Mean target (train): 1.0203399090582588
2025-07-17 18:30:29,503 -   Number of features selected: 455
2025-07-17 18:30:29,503 -   Max distance between idces: 239
2025-07-17 18:30:29,503 -   Mean distance between idces: 20.00093341630367
2025-07-17 18:30:29,503 -   Median distance between idces: 8.0
2025-07-17 18:30:29,530 -   Mean target (test): 1.001148647361409
2025-07-17 18:30:29,531 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:30:29,533 -   w quantile     : 16.060447692871094
2025-07-17 18:30:29,534 -   w_test quantile: 7.48601770401001
2025-07-17 18:30:29,535 -     Top-0 feature: Fourier_ReturnLog_RSMECoeff_4_MH_8
2025-07-17 18:30:29,535 -     Top-1 feature: Fourier_Price_RSMERatioCoeff_1_MH_6
2025-07-17 18:30:29,536 -     Top-2 feature: FinData_quar_operatingCashflow_nivRevLag_qm11
2025-07-17 18:30:29,537 -     Top-3 feature: FinData_ann_ebit_nivRevLag_am3
2025-07-17 18:30:29,537 -     Top-4 feature: FinData_quar_commonStockSh

Epochs: 100%|██████████| 800/800 [00:04<00:00, 187.57it/s, mean_perdate_v=1.0143, mean_v=1.0154]


2025-07-17 18:30:35,130 -   Mean target (train): 1.0182402089247402
2025-07-17 18:30:35,131 -   Number of features selected: 263
2025-07-17 18:30:35,132 -   Max distance between idces: 239
2025-07-17 18:30:35,132 -   Mean distance between idces: 20.00093341630367
2025-07-17 18:30:35,133 -   Median distance between idces: 9.0
2025-07-17 18:30:35,146 -   Mean target (test): 0.9921658123721506
2025-07-17 18:30:35,146 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:30:35,150 -   w quantile     : 13.442365646362305
2025-07-17 18:30:35,151 -   w_test quantile: 6.511627197265625
2025-07-17 18:30:35,151 -     Top-0 feature: Fourier_ReturnLog_RSMECoeff_4_MH_8
2025-07-17 18:30:35,152 -     Top-1 feature: FinData_quar_operatingCashflow_nivRevLag_qm11
2025-07-17 18:30:35,152 -     Top-2 feature: Fourier_Price_RSMERatioCoeff_1_MH_6
2025-07-17 18:30:35,152 -     Top-3 feature: FinData_ann_ebit_nivRevLag_am3
2025-07-17 18:30:35,152 -     Top-4 feature: FinData_quar_grossProfit

Epochs: 100%|██████████| 800/800 [00:04<00:00, 192.82it/s, mean_perdate_v=1.0142, mean_v=1.0162]


2025-07-17 18:30:43,096 -   Mean target (train): 1.0205337751776387
2025-07-17 18:30:43,100 -   Number of features selected: 455
2025-07-17 18:30:43,101 -   Max distance between idces: 239
2025-07-17 18:30:43,101 -   Mean distance between idces: 19.865899191039205
2025-07-17 18:30:43,101 -   Median distance between idces: 9.0
2025-07-17 18:30:43,121 -   Mean target (test): 1.0119365688090924
2025-07-17 18:30:43,121 -   Fraction of days with no coverage by test mask: 3.85%
2025-07-17 18:30:43,121 -   w quantile     : 13.325568199157715
2025-07-17 18:30:43,129 -   w_test quantile: 8.555083274841309
2025-07-17 18:30:43,131 -     Top-0 feature: Fourier_Price_RSMECoeff_4_MH_8
2025-07-17 18:30:43,132 -     Top-1 feature: Fourier_Price_SignCoeff_1_MH_12
2025-07-17 18:30:43,132 -     Top-2 feature: FeatureTA_volatility_bbw
2025-07-17 18:30:43,132 -     Top-3 feature: FinData_quar_commonStockSharesOutstanding_nivRev
2025-07-17 18:30:43,133 -     Top-4 feature: Fourier_Price_RSMERatioCoeff_1_MH_

Epochs: 100%|██████████| 800/800 [00:04<00:00, 190.25it/s, mean_perdate_v=1.0117, mean_v=1.0124]


2025-07-17 18:30:48,708 -   Mean target (train): 1.016418343545219
2025-07-17 18:30:48,709 -   Number of features selected: 263
2025-07-17 18:30:48,709 -   Max distance between idces: 239
2025-07-17 18:30:48,710 -   Mean distance between idces: 19.951462352209084
2025-07-17 18:30:48,710 -   Median distance between idces: 9.0
2025-07-17 18:30:48,722 -   Mean target (test): 1.0093249537180555
2025-07-17 18:30:48,725 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:30:48,727 -   w quantile     : 11.769609451293945
2025-07-17 18:30:48,728 -   w_test quantile: 5.626058578491211
2025-07-17 18:30:48,728 -     Top-0 feature: Fourier_ReturnLog_RSMECoeff_3_MH_12
2025-07-17 18:30:48,730 -     Top-1 feature: Fourier_Price_SignCoeff_1_MH_12
2025-07-17 18:30:48,731 -     Top-2 feature: Fourier_Price_RSMERatioCoeff_1_MH_6
2025-07-17 18:30:48,731 -     Top-3 feature: Fourier_Price_RSMERatioCoeff_1_MH_2
2025-07-17 18:30:48,732 -     Top-4 feature: FinData_quar_ebit_nivRevLag_qm4


Epochs: 100%|██████████| 800/800 [00:04<00:00, 191.92it/s, mean_perdate_v=1.0130, mean_v=1.0132]


2025-07-17 18:30:56,652 -   Mean target (train): 1.0181995944527389
2025-07-17 18:30:56,652 -   Number of features selected: 455
2025-07-17 18:30:56,655 -   Max distance between idces: 239
2025-07-17 18:30:56,655 -   Mean distance between idces: 19.997510889856876
2025-07-17 18:30:56,655 -   Median distance between idces: 9.0
2025-07-17 18:30:56,677 -   Mean target (test): 1.0087204770017781
2025-07-17 18:30:56,679 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:30:56,681 -   w quantile     : 13.642681121826172
2025-07-17 18:30:56,681 -   w_test quantile: 11.481939315795898
2025-07-17 18:30:56,682 -     Top-0 feature: FeatureGroup_VolGrRk
2025-07-17 18:30:56,682 -     Top-1 feature: Fourier_Price_RSMERatioCoeff_1_MH_6
2025-07-17 18:30:56,682 -     Top-2 feature: Fourier_Price_AmpCoeff_1_MH_4
2025-07-17 18:30:56,683 -     Top-3 feature: FeatureTA_volatility_bbw
2025-07-17 18:30:56,683 -     Top-4 feature: Fourier_Price_RSMECoeff_1_MH_4
2025-07-17 18:30:56,684 -  

Epochs: 100%|██████████| 800/800 [00:04<00:00, 196.45it/s, mean_perdate_v=1.0108, mean_v=1.0108]


2025-07-17 18:31:02,142 -   Mean target (train): 1.0149261587680325
2025-07-17 18:31:02,144 -   Number of features selected: 263
2025-07-17 18:31:02,144 -   Max distance between idces: 226
2025-07-17 18:31:02,144 -   Mean distance between idces: 19.997510889856876
2025-07-17 18:31:02,147 -   Median distance between idces: 9.0
2025-07-17 18:31:02,158 -   Mean target (test): 1.0068675672786327
2025-07-17 18:31:02,162 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:31:02,163 -   w quantile     : 10.567854881286621
2025-07-17 18:31:02,163 -   w_test quantile: 8.569337844848633
2025-07-17 18:31:02,163 -     Top-0 feature: Fourier_Price_AmpCoeff_2_MH_1
2025-07-17 18:31:02,163 -     Top-1 feature: Fourier_Price_RSMERatioCoeff_1_MH_6
2025-07-17 18:31:02,163 -     Top-2 feature: Fourier_Price_AmpCoeff_1_MH_4
2025-07-17 18:31:02,163 -     Top-3 feature: FinData_quar_operatingCashflow_nivRevLag_qm7
2025-07-17 18:31:02,163 -     Top-4 feature: FinData_quar_operatingCashflow

Epochs: 100%|██████████| 800/800 [00:04<00:00, 193.80it/s, mean_perdate_v=1.0126, mean_v=1.0138]


2025-07-17 18:31:10,341 -   Mean target (train): 1.0169507561628224
2025-07-17 18:31:10,341 -   Number of features selected: 455
2025-07-17 18:31:10,341 -   Max distance between idces: 239
2025-07-17 18:31:10,341 -   Mean distance between idces: 19.9477286869944
2025-07-17 18:31:10,341 -   Median distance between idces: 10.0
2025-07-17 18:31:10,365 -   Mean target (test): 1.0124262996438576
2025-07-17 18:31:10,367 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:31:10,369 -   w quantile     : 11.497292518615723
2025-07-17 18:31:10,369 -   w_test quantile: 8.371849060058594
2025-07-17 18:31:10,370 -     Top-0 feature: MathFeature_Drawup_MH12
2025-07-17 18:31:10,370 -     Top-1 feature: FinData_metrics_log_pb_ratio
2025-07-17 18:31:10,371 -     Top-2 feature: Fourier_Price_RSMERatioCoeff_1_MH_12
2025-07-17 18:31:10,371 -     Top-3 feature: FinData_quar_grossProfit_nivRevLag_qm7
2025-07-17 18:31:10,372 -     Top-4 feature: FinData_quar_grossProfit_nivRevLag_qm11
202

Epochs: 100%|██████████| 800/800 [00:04<00:00, 191.10it/s, mean_perdate_v=1.0107, mean_v=1.0105]


2025-07-17 18:31:15,991 -   Mean target (train): 1.0153430810713975
2025-07-17 18:31:15,991 -   Number of features selected: 263
2025-07-17 18:31:15,991 -   Max distance between idces: 225
2025-07-17 18:31:15,991 -   Mean distance between idces: 19.979464841319228
2025-07-17 18:31:15,991 -   Median distance between idces: 9.0
2025-07-17 18:31:16,007 -   Mean target (test): 1.0134887691258545
2025-07-17 18:31:16,008 -   Fraction of days with no coverage by test mask: 0.00%
2025-07-17 18:31:16,010 -   w quantile     : 9.109771728515625
2025-07-17 18:31:16,011 -   w_test quantile: 7.098681926727295
2025-07-17 18:31:16,012 -     Top-0 feature: MathFeature_Drawup_MH12
2025-07-17 18:31:16,013 -     Top-1 feature: FeatureGroup_VolGrLvl
2025-07-17 18:31:16,013 -     Top-2 feature: FinData_metrics_log_pb_ratio
2025-07-17 18:31:16,013 -     Top-3 feature: FinData_quar_reportTime
2025-07-17 18:31:16,013 -     Top-4 feature: Fourier_Price_RSMERatioCoeff_1_MH_6
2025-07-17 18:31:16,013 -     Top-5 f