In [1]:
import os
import sys

# Put the parent of the current working directory on the import path so the
# `src.*` imports below resolve when the notebook is run from its own folder.
project_dir = os.path.abspath('..')
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)

import datetime
from src.predictionModule.FilterSamples import FilterSamples
from src.predictionModule.LoadupSamples import LoadupSamples

import logging
# Configure the root logger so that every record (DEBUG and above) is written
# to stdout as the bare message, which is what the notebook shows under a cell.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter(fmt="%(message)s")
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)

# Leave exactly one handler on the root logger, whether or not handlers were
# already attached; re-running this cell therefore never duplicates output.
logger.handlers[:] = [handler]

# Usage
logger.info("This will print to the notebook's output cell")


This will print to the notebook's output cell


In [2]:
# Run configuration. The same dict is handed to both LoadupSamples and
# FilterSamples through their `params` argument further down.
params = {
    # General sample / target settings
    "daysAfterPrediction": None,
    "idxAfterPrediction": 5,
    "timesteps": 60,
    "target_option": "mean",

    # Keys prefixed for LoadupSamples
    "LoadupSamples_tree_scaling_standard": True,
    "LoadupSamples_time_scaling_stretch": True,
    "LoadupSamples_time_inc_factor": 10,

    # Keys prefixed for FilterSamples: quantile threshold
    "FilterSamples_q_up": 0.90,

    # Keys prefixed for FilterSamples: lincomb settings
    "FilterSamples_lincomb_epochs": 5,
    "FilterSamples_lincomb_show_progress": False,
    "FilterSamples_lincomb_featureratio": 0.5,
    "FilterSamples_lincomb_itermax": 2,
    "FilterSamples_lincomb_init_toprand": 1,

    # Keys prefixed for FilterSamples: training window, categorical and
    # Taylor/rolling settings
    "FilterSamples_days_to_train_end": 10,
    "FilterSamples_cat_over20": True,
    "FilterSamples_cat_posOneYearReturn": True,
    "FilterSamples_cat_posFiveYearReturn": True,
    "FilterSamples_taylor_horizon_days": 20,
    "FilterSamples_roll_window_days": 20,
}

In [3]:
# Sample-group identifiers passed to the loader.
treegroup = "group_debug"
timegroup = "group_regOHLCV_over5years"

# Key dates for this run.
start_train_date = datetime.date(2023, 1, 1)
split_Date = datetime.date(2025, 1, 1)
eval_date = datetime.date(2025, 6, 13)

# The five calendar days immediately before eval_date, most recent first;
# eval_date itself is not part of the list.
# NOTE(review): these are calendar days, so some may have no data (the loader
# logs "not found ... Omitting" for such dates) - confirm this is intended.
evaldates = [eval_date - datetime.timedelta(offset) for offset in range(1, 6)]

# Build the loader, read the stored samples, then split them into train/test.
ls = LoadupSamples(
    params=params,
    treegroup=treegroup,
    timegroup=timegroup,
    train_start_date=start_train_date,
    test_dates=evaldates,
)
ls.load_samples(main_path="../src/featureAlchemy/bin/")
ls.split_dataset(
    start_date=start_train_date,
    last_train_date=split_Date,
    last_test_date=eval_date,
)

Test date 2025-06-08 not found in the database. Omitting.


In [4]:
# Pull the split results off the loader into notebook-level names that the
# following cells use directly.

# Training split
train_Xtree, train_ytree = ls.train_Xtree, ls.train_ytree
train_Xtime, train_ytime = ls.train_Xtime, ls.train_ytime

# Test split
test_Xtree, test_ytree = ls.test_Xtree, ls.test_ytree
test_Xtime, test_ytime = ls.test_Xtime, ls.test_ytime

# Feature names for the two feature sets
treenames, timenames = ls.featureTreeNames, ls.featureTimeNames

# Per-sample metadata tables for each split
meta_pl_train, meta_pl_test = ls.meta_pl_train, ls.meta_pl_test

In [5]:
# Metadata columns required by the filter. Dates are passed on as the column
# object itself; both price columns are converted with .to_numpy().
samples_dates_train = meta_pl_train['date']
closeprices_train = meta_pl_train['Close'].to_numpy()
adjcloseprices_train = meta_pl_train['AdjClose'].to_numpy()

samples_dates_test = meta_pl_test['date']
closeprices_test = meta_pl_test['Close'].to_numpy()
adjcloseprices_test = meta_pl_test['AdjClose'].to_numpy()

# Filter object over the tree features of both splits; every mask computed in
# the cells below comes from this instance.
fs = FilterSamples(
    # training split
    Xtree_train=train_Xtree,
    ytree_train=train_ytree,
    samples_dates_train=samples_dates_train,
    closeprices_train=closeprices_train,
    adjcloseprices_train=adjcloseprices_train,
    # test split
    Xtree_test=test_Xtree,
    ytree_test=test_ytree,
    samples_dates_test=samples_dates_test,
    closeprices_test=closeprices_test,
    adjcloseprices_test=adjcloseprices_test,
    # shared
    treenames=treenames,
    params=params,
)

In [6]:
# Ask the filter for its categorical masks; the call returns a pair that is
# unpacked as (train, test).
# NOTE(review): presumably driven by the FilterSamples_cat_* entries in
# `params` - confirm in FilterSamples.categorical_masks.
cat_mask_train, cat_mask_test = fs.categorical_masks()

In [7]:
# Compute the lincomb masks (train, test), then print sum/len for each mask on
# its own line, train first. For boolean masks that ratio is the fraction of
# samples selected.
lincomb_mask_train, lincomb_mask_test = fs.lincomb_masks()
print(
    *(mask.sum() / len(mask) for mask in (lincomb_mask_train, lincomb_mask_test)),
    sep="\n",
)

FilterSamples: (train) mean of y values 1.0032133198146556.
FilterSamples: (test) mean of y values 1.0034009483137694.
FilterSamples: Starting Lincomb
  FilterSamples: Lincomb Iteration 0/2 running.
FilterSamples: Best init score 1.0173
  Mean target (train): 1.0116099120367281
  Number of features selected: 454
  Max distance between idces: 19
  Mean distance between idces: 9.5625
  Median distance between idces: 10.0
  Mean target (test): 1.0059813275954963
  Fraction of days with no coverage by test mask: 40.54%
  w quantile     : 4.050098419189453
  w_test quantile: 3.3564364910125732
    Top-0 feature: FeatureTA_trend_vortex_ind_neg
    Top-1 feature: FinData_quar_ebitda_nivRevLag_qm6
    Top-2 feature: Fourier_Price_SignCoeff_2_MH_6
    Top-3 feature: Fourier_Price_SignCoeff_1_MH_2
    Top-4 feature: Fourier_Price_SignCoeff_2_MH_12
    Top-5 feature: FeatureTA_momentum_ppo_hist
    Top-6 feature: Fourier_Price_SignCoeff_1_MH_6
    Top-7 feature: FeatureTA_volatility_kchi
    Top-

In [8]:
# Compute the Taylor-feature masks (train, test), then print sum/len for each
# mask on its own line, train first. For boolean masks that ratio is the
# fraction of samples selected.
taylor_mask_train, taylor_mask_test = fs.taylor_feature_masks()
print(
    *(mask.sum() / len(mask) for mask in (taylor_mask_train, taylor_mask_test)),
    sep="\n",
)

FilterSamples/Taylor: (train) mean of y values 1.0032133198146556.
FilterSamples/Taylor: (test) mean of y values 1.0034009483137694.
FilterSamples/Taylor: scoring features via rolling-mean Taylor approximation.
  Taylor score feat[10] FinData_quar_grossProfit_nivRev: x0=0.992583, slope=0.000000, t=6, score=0.992583
  Taylor score feat[11] FinData_quar_ebit_nivRev: x0=0.982255, slope=0.000000, t=6, score=0.982255
  Taylor score feat[12] FinData_quar_ebitda_nivRev: x0=0.984034, slope=0.000000, t=6, score=0.984034
  Taylor score feat[13] FinData_quar_totalAssets_nivRev: x0=0.998664, slope=0.000000, t=6, score=0.998664
  Taylor score feat[14] FinData_quar_totalCurrentLiabilities_nivRev: x0=0.994773, slope=0.000000, t=6, score=0.994773
  Taylor score feat[15] FinData_quar_totalShareholderEquity_nivRev: x0=0.985264, slope=0.000000, t=6, score=0.985264
  Taylor score feat[16] FinData_quar_operatingCashflow_nivRev: x0=0.988847, slope=0.000000, t=6, score=0.988847
  Taylor score feat[17] FinDat