In [None]:
import sys, os
project_dir = os.path.abspath('..')
if project_dir not in sys.path:
    sys.path.append(project_dir)

import numpy as np
import polars as pl
import torch
import datetime
from src.predictionModule.FilterSamples import FilterSamples

# Seed every random number generator this notebook touches so that
# "Restart Kernel -> Run All" is reproducible. SEED is the single value to
# change when a different draw is wanted.
SEED = 0
np.random.seed(SEED)  # legacy global NumPy stream (np.random.randn / np.random.rand)
# torch.manual_seed returns a Generator object; as the last expression of the
# cell its repr would be echoed as cell output, so the trailing ';' suppresses it.
torch.manual_seed(SEED);


In [None]:
# Synthetic fixture: small deterministic dataset.
#
# The global NumPy stream is re-seeded here with the same seed the setup cell
# uses, so re-running this cell on its own regenerates exactly the same arrays
# instead of continuing wherever the stream currently is. On a fresh
# "Run All" the values are unchanged, because nothing visible draws from the
# stream between the setup cell and this one.
np.random.seed(0)

n_train, n_test, n_feat = 120, 20, 8

# Features are standard-normal draws; targets are uniform draws shifted into
# [1, 2) so that they are strictly positive (needed for a geometric mean).
Xtree_train = np.random.randn(n_train, n_feat)
ytree_train = np.random.rand(n_train) + 1.0
Xtree_test = np.random.randn(n_test, n_feat)
ytree_test = np.random.rand(n_test) + 1.0

# Eight feature names, one per column of the feature matrices. Five of them
# deliberately carry a blacklist keyword ('Category', 'Seasonal' or 'lag');
# 'Alpha', 'Epsilon' and 'Theta' carry none.
treenames = [
    'Alpha', 'Beta_Category', 'GammaSeasonal', 'Delta_lag',
    'Epsilon', 'ZetaCategory', 'EtaSeasonal', 'Theta',
]

# One sample per consecutive calendar day. With n_train = 120 the training
# dates run from 2021-01-01 through 2021-04-30, and the test dates start on
# the following day, 2021-05-01.
one_day = datetime.timedelta(days=1)
start_train = datetime.date(2021, 1, 1)
start_test = datetime.date(2021, 5, 1)

train_days = [start_train + offset * one_day for offset in range(n_train)]
test_days = [start_test + offset * one_day for offset in range(n_test)]

samples_dates_train = pl.Series(name='dates_train', values=train_days, dtype=pl.Date)
samples_dates_test = pl.Series(name='dates_test', values=test_days, dtype=pl.Date)

# Settings handed to FilterSamples for the lincomb-related checks.
# NOTE(review): the values are presumably kept small so the notebook runs
# quickly; their exact meaning is defined by FilterSamples -- confirm there.
params = {
    'FilterSamples_lincomb_epochs': 5,
    'FilterSamples_lincomb_show_progress': False,
    'FilterSamples_lincomb_featureratio': 0.5,
    'FilterSamples_lincomb_itermax': 2,
    'FilterSamples_lincomb_init_toprand': 1,
}

# Keyword arguments shared by both instances; they differ only in ytree_test.
shared_kwargs = dict(
    Xtree_train=Xtree_train,
    ytree_train=ytree_train,
    treenames=treenames,
    Xtree_test=Xtree_test,
    samples_dates_train=samples_dates_train,
    samples_dates_test=samples_dates_test,
    params=params,
)

# Instance that is given the test targets.
fs = FilterSamples(ytree_test=ytree_test, **shared_kwargs)

# Instance without test targets (ytree_test=None); the later cells expect the
# test masks it returns to be None.
fs_notest = FilterSamples(ytree_test=None, **shared_kwargs)


In [None]:
# Expected feature mask: True only for the names without a 'Category',
# 'Seasonal' or 'lag' keyword, i.e. 'Alpha', 'Epsilon' and 'Theta'.
expected_mask = [True, False, False, False, True, False, False, True]

mask_feat = fs.separate_treefeatures()

# The mask must be boolean ...
assert mask_feat.dtype == bool
# ... and must drop exactly the keyword-bearing feature names.
assert mask_feat.tolist() == expected_mask


In [None]:
# Case 1: select the first three training samples. Every sample in this
# fixture has its own date, so each selected date contributes one target value.
mask = np.arange(n_train) < 3
res = fs.evaluate_mask(mask, samples_dates_train, ytree_train)
# NOTE(review): the fixture and the original comments speak of a *geometric*
# mean over dates, yet this expectation is the *arithmetic* mean of the three
# targets. The two differ whenever the targets are not all equal -- confirm
# against the evaluate_mask implementation which one is intended.
expected = ytree_train[:3].mean()
assert np.isclose(res, expected)

# Case 2: an all-False mask over the full training set is expected to give NaN.
mask_none = np.full(n_train, False)
res_none = fs.evaluate_mask(mask_none, samples_dates_train, ytree_train)
assert np.isnan(res_none)

# Case 3: completely empty inputs (no samples, no dates) are expected to give 1.0.
empty_mask = np.array([], dtype=bool)
empty_dates = pl.Series(name='d', values=[], dtype=pl.Date)
empty_targets = np.array([])
assert fs.evaluate_mask(empty_mask, empty_dates, empty_targets) == 1.0


In [None]:
# FilterSamples_days_to_train_end=10: per the assertions below, the recent
# mask is expected to cover the last training day plus the 10 days before it.
params_recent = {'FilterSamples_days_to_train_end': 10}
fs_recent = FilterSamples(
    Xtree_train=Xtree_train, ytree_train=ytree_train, treenames=treenames,
    Xtree_test=Xtree_test, ytree_test=ytree_test,
    samples_dates_train=samples_dates_train,
    samples_dates_test=samples_dates_test,
    params=params_recent,
)

recent_mask = fs_recent.get_recent_training_mask(samples_dates_train)

# The mask must be boolean.
assert recent_mask.dtype == bool
# One sample per day -> 11 selected samples (last day + previous 10 days).
assert recent_mask.sum() == 11
# Both ends of the window are included ...
assert recent_mask[-1]
assert recent_mask[-11]
# ... and the day just before the window is excluded.
assert not recent_mask[-12]


In [None]:
# With test targets available: lincomb_masks() returns four values, of which
# the third is not inspected here.
train_mask, test_mask, _, score_test = fs.lincomb_masks()

# Training mask: boolean, one entry per training sample.
assert train_mask.dtype == bool
assert train_mask.shape[0] == Xtree_train.shape[0]
# Test mask: boolean, one entry per test sample.
assert test_mask.dtype == bool
assert test_mask.shape[0] == Xtree_test.shape[0]

# Without test targets (ytree_test=None) the test mask is expected to be None.
train_mask_nt, test_mask_nt, _, score_test_nt = fs_notest.lincomb_masks()
assert test_mask_nt is None


In [None]:
# With test targets available both returned masks must be boolean.
train_t, test_t = fs.taylor_feature_masks()
assert train_t.dtype == bool
assert test_t.dtype == bool

# Without test targets (ytree_test=None) the test mask is expected to be None.
train_t2, test_t2 = fs_notest.taylor_feature_masks()
assert test_t2 is None
