In [88]:
import sys
import os

# Resolve the parent of the current working directory; the notebook uses it as
# the root from which the `src` package is imported.
project_dir = os.path.abspath("..")

# Register that directory on the import path only once, so re-running this
# cell never adds a duplicate entry.
if sys.path.count(project_dir) == 0:
    sys.path.append(project_dir)
    
from src.predictionModule.TreeTimeML import TreeTimeML

import pandas as pd
import numpy as np
import polars as pl
import datetime
import logging
# Root logging setup: DEBUG threshold, and records rendered as the bare message
# text. Note that basicConfig does nothing when the root logger already has
# handlers attached.
logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [89]:
# Settings handed to TreeTimeML through its `params` argument.
params = {
    'daysAfterPrediction': 7,
    'timesteps': 22,
    'target_option': 'last',
    'TreeTime_isFiltered': False,
    'TreeTime_top_highest': 10,
}

# Identifier of the stock group handed to TreeTimeML as `group`.
stock_group = "group_snp500_finanTo2011"

# Single test date and the start date of the training range.
eval_date = datetime.date(2025, 2, 13)
start_train_date = datetime.date(2014, 1, 1)

# Build the model wrapper for one stock group, one training start date and a
# single test date (passed as a one-element list).
treetimeML = TreeTimeML(
    group=stock_group,
    params=params,
    train_start_date=start_train_date,
    test_dates=[eval_date],
)

In [90]:
# Load the data sets from the feature directory (path is relative to the
# notebook's working directory).
# NOTE(review): presumably this populates the train_* attributes read in the
# next cell — confirm in TreeTimeML.
treetimeML.load_and_filter_sets(main_path="../src/featureAlchemy/bin/")

In [91]:
# Short aliases for the training arrays used by the analysis cells below.
Xtree, ytree, ytime = (
    treetimeML.train_Xtree,
    treetimeML.train_ytree,
    treetimeML.train_ytime,
)

# Feature names; the cells below compare them against column names to locate
# columns of Xtree.
treenames = treetimeML.featureTreeNames

In [92]:
# Baseline: mean of `ytree` over all rows, for comparison with the masked
# means printed by the cells below.
np.mean(ytree)

np.float32(1.0024893)

# RSI Stats

In [119]:
# Mean of ytree over the rows whose stochastic-RSI feature lies at or below
# its q-quantile (lower tail only; the symmetric upper-tail cut at 1 - q is
# currently not applied).
q = 0.10
idx = np.where(treenames == 'FeatureTA_momentum_stoch_rsi')[0]
arr = Xtree[:, idx].ravel()
quant_lower = np.quantile(arr, q)
mask = arr <= quant_lower

print(mask.sum())
print(np.mean(ytree[mask]))

89643
1.0035527


# RSME Stats

### RSME 1, with intermediate value

In [94]:
def f(q, X=None, names=None, combine="or"):
    """Row mask built from a price condition and a conditional quantile cut.

    Two boolean conditions are evaluated per row:

    * ``cond``  -- ``FeatureTA_Open_lag_m10 < FeatureTA_Open_lag_m1 * 0.98``
    * ``below`` -- ``Fourier_Price_RSMERatioCoeff_1_MH_1 <= quant`` where
      ``quant`` is the ``q``-quantile of that coefficient taken over the rows
      satisfying ``cond`` only.

    Parameters
    ----------
    q : float
        Quantile level in [0, 1].
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.
    combine : {"or", "and"}, optional
        ``"or"`` (default, the historical behaviour) returns the union of the
        two conditions; ``"and"`` returns their intersection.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If a required feature name is missing or occurs more than once.
    ValueError
        If ``combine`` is invalid or no row satisfies ``cond``.
    """
    # NOTE(review): the quantile is computed inside the conditioned subset but
    # the default combines the masks with OR, so rows outside the subset are
    # selected as well. If a conditional analysis was intended, call with
    # combine="and" -- confirm with the author.
    if combine not in ("or", "and"):
        raise ValueError(f"combine must be 'or' or 'and', got {combine!r}")

    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)

    def column(feature):
        # Resolve a feature name to its single column; fail loudly otherwise.
        hits = np.flatnonzero(names == feature)
        if hits.size != 1:
            raise KeyError(
                f"expected exactly one column named {feature!r}, found {hits.size}"
            )
        return X[:, hits[0]]

    coeff = column('Fourier_Price_RSMERatioCoeff_1_MH_1')
    open_lag1 = column('FeatureTA_Open_lag_m1')
    open_lag10 = column('FeatureTA_Open_lag_m10')

    # Evaluated once and reused for both the subset quantile and the mask.
    cond = open_lag10 < open_lag1 * 0.98
    if not cond.any():
        raise ValueError(
            "no row satisfies the price condition; conditional quantile is undefined"
        )

    below = coeff <= np.quantile(coeff[cond], q)
    return (cond | below) if combine == "or" else (cond & below)

# Sweep n_q evenly spaced quantile levels; for each level record the number of
# rows selected by f and the mean of ytree over those rows.
n_q = 10
linspace = np.linspace(0.01, 0.99, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(masksum)
print(returns)

[356218 389486 422311 455693 489850 524670 559426 594495 630014 665050]
[1.00261331 1.00262392 1.00259709 1.00258422 1.00256324 1.00253797
 1.00254107 1.00252962 1.00251698 1.00249422]


### RSME 1, without intermediate value

In [117]:
def f(q, X=None, names=None):
    """Mask of rows whose RSME-ratio coefficient is at or below its q-quantile.

    Parameters
    ----------
    q : float
        Quantile level in [0, 1].
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If ``Fourier_Price_RSMERatioCoeff_1_MH_1`` is missing or duplicated.
    """
    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)

    feature = 'Fourier_Price_RSMERatioCoeff_1_MH_1'
    hits = np.flatnonzero(names == feature)
    # A missing column would otherwise surface as an obscure shape/index error.
    if hits.size != 1:
        raise KeyError(
            f"expected exactly one column named {feature!r}, found {hits.size}"
        )

    values = X[:, hits[0]]
    return values <= np.quantile(values, q)

# Sweep n_q evenly spaced quantile levels in the lower tail; for each level
# record the number of rows selected by f and the mean of ytree over them.
n_q = 15
linspace = np.linspace(0.01, 0.2, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(linspace)
print(masksum)
print(returns)

[0.01       0.02357143 0.03714286 0.05071429 0.06428571 0.07785714
 0.09142857 0.105      0.11857143 0.13214286 0.14571429 0.15928571
 0.17285714 0.18642857 0.2       ]
[  6684  15754  24824  33894  42965  52035  61105  70175  79245  88316
  97386 106456 115526 124596 133667]
[1.00497591 1.00455809 1.00379574 1.00346303 1.00320864 1.00321603
 1.00315297 1.00308752 1.00312555 1.00305247 1.00304937 1.00303054
 1.00301099 1.00299501 1.00294149]


### RSME 1 mhx, without intermediate value

In [116]:
# Suffix selecting which `..._MH_{m}` variant of the feature is analysed.
m = 2
def f(q, X=None, names=None, mh=None):
    """Mask of rows whose `..._MH_{mh}` coefficient is at or below its q-quantile.

    Parameters
    ----------
    q : float
        Quantile level in [0, 1].
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.
    mh : optional
        Value substituted into ``Fourier_Price_RSMERatioCoeff_1_MH_{mh}``.
        Defaults to the global ``m``, read at call time.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If the resolved feature name is missing or duplicated.
    """
    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)
    mh = m if mh is None else mh

    feature = f'Fourier_Price_RSMERatioCoeff_1_MH_{mh}'
    hits = np.flatnonzero(names == feature)
    # A missing column would otherwise surface as an obscure shape/index error.
    if hits.size != 1:
        raise KeyError(
            f"expected exactly one column named {feature!r}, found {hits.size}"
        )

    values = X[:, hits[0]]
    return values <= np.quantile(values, q)

# Sweep n_q evenly spaced quantile levels in the lower tail; for each level
# record the number of rows selected by f and the mean of ytree over them.
n_q = 15
linspace = np.linspace(0.01, 0.2, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(linspace)
print(masksum)
print(returns)

[0.01       0.02357143 0.03714286 0.05071429 0.06428571 0.07785714
 0.09142857 0.105      0.11857143 0.13214286 0.14571429 0.15928571
 0.17285714 0.18642857 0.2       ]
[  6684  15754  24824  33894  42965  52035  61105  70175  79245  88316
  97386 106456 115526 124596 133667]
[1.00632381 1.00552213 1.00520861 1.00492346 1.00444937 1.00419772
 1.00387132 1.00365758 1.00355947 1.00340784 1.00328755 1.00314391
 1.00304699 1.00295675 1.00287056]


# ebit

In [97]:
# Suffix selecting which `..._qm{qm}` variant of the EBIT feature is analysed.
qm = 5
def f(q, X=None, names=None, lag=None):
    """Mask of rows whose EBIT feature is at or above its (1 - q)-quantile.

    Parameters
    ----------
    q : float
        Upper-tail fraction in [0, 1]; the threshold is the ``1 - q`` quantile.
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.
    lag : optional
        Value substituted into ``FinData_quar_ebit_lagquot_qm{lag}``.
        Defaults to the global ``qm``, read at call time.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If the resolved feature name is missing or duplicated.
    """
    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)
    lag = qm if lag is None else lag

    feature = f'FinData_quar_ebit_lagquot_qm{lag}'
    hits = np.flatnonzero(names == feature)
    # A missing column would otherwise surface as an obscure shape/index error.
    if hits.size != 1:
        raise KeyError(
            f"expected exactly one column named {feature!r}, found {hits.size}"
        )

    values = X[:, hits[0]]
    return values >= np.quantile(values, 1 - q)

# Sweep n_q evenly spaced tail fractions; for each one record the number of
# rows selected by f and the mean of ytree over them.
n_q = 15
linspace = np.linspace(0.01, 0.2, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(linspace)
print(masksum)
print(returns)

[0.01       0.02357143 0.03714286 0.05071429 0.06428571 0.07785714
 0.09142857 0.105      0.11857143 0.13214286 0.14571429 0.15928571
 0.17285714 0.18642857 0.2       ]
[  6719  15868  24824  33904  43039  52061  61162  70209  79275  88361
  97393 106486 115541 124615 133726]
[1.00540709 1.0038172  1.00345945 1.00321388 1.00307107 1.0027498
 1.00262439 1.00272822 1.00267661 1.00266755 1.00261855 1.0026058
 1.00259268 1.00262463 1.0026418 ]


# Group Dynamic, FeatureGroup_AvgReturnPct

In [109]:
def f(q, X=None, names=None):
    """Mask of rows whose `FeatureGroup_AvgReturnPct` is at or above its q-quantile.

    Parameters
    ----------
    q : float
        Quantile level in [0, 1]; larger values select a smaller upper tail.
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If ``FeatureGroup_AvgReturnPct`` is missing or duplicated.
    """
    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)

    feature = 'FeatureGroup_AvgReturnPct'
    hits = np.flatnonzero(names == feature)
    # A missing column would otherwise surface as an obscure shape/index error.
    if hits.size != 1:
        raise KeyError(
            f"expected exactly one column named {feature!r}, found {hits.size}"
        )

    values = X[:, hits[0]]
    return values >= np.quantile(values, q)

# Sweep n_q evenly spaced quantile levels over (almost) the full range; for
# each level record the number of rows selected by f and the mean of ytree
# over them.
n_q = 15
linspace = np.linspace(0.01, 0.99, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(linspace)
print(masksum)
print(returns)

[0.01 0.08 0.15 0.22 0.29 0.36 0.43 0.5  0.57 0.64 0.71 0.78 0.85 0.92
 0.99]
[661663 614870 568107 521338 474545 427842 381120 334250 287388 240742
 193925 147141 100449  53493   6684]
[1.00238955 1.00203454 1.00177813 1.00186861 1.00188112 1.0020957
 1.00208485 1.00227237 1.00218022 1.00201309 1.00129652 1.00089705
 1.00063491 1.00002694 1.01071346]


# TA Features

In [122]:
def f(q, X=None, names=None):
    """Mask of rows whose `FeatureTA_trend_stc` is at or below its q-quantile.

    Parameters
    ----------
    q : float
        Quantile level in [0, 1].
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If ``FeatureTA_trend_stc`` is missing or duplicated.
    """
    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)

    feature = 'FeatureTA_trend_stc'
    hits = np.flatnonzero(names == feature)
    # A missing column would otherwise surface as an obscure shape/index error.
    if hits.size != 1:
        raise KeyError(
            f"expected exactly one column named {feature!r}, found {hits.size}"
        )

    values = X[:, hits[0]]
    return values <= np.quantile(values, q)

# Sweep n_q evenly spaced quantile levels over (almost) the full range; for
# each level record the number of rows selected by f and the mean of ytree
# over them.
n_q = 11
linspace = np.linspace(0.01, 0.99, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(linspace)
print(masksum)
print(returns)

[0.01  0.108 0.206 0.304 0.402 0.5   0.598 0.696 0.794 0.892 0.99 ]
[ 12979  72594 137677 203173 268669 334166 399662 465158 530657 596845
 668331]
[1.0064764  1.00445187 1.00316954 1.00282693 1.00277281 1.00271261
 1.00261855 1.00254726 1.0025394  1.0024879  1.00248933]


In [100]:
def f(q, X=None, names=None):
    """Mask of rows whose `FeatureTA_volatility_atr` is at or above its q-quantile.

    Parameters
    ----------
    q : float
        Quantile level in [0, 1]; larger values select a smaller upper tail.
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If ``FeatureTA_volatility_atr`` is missing or duplicated.
    """
    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)

    feature = 'FeatureTA_volatility_atr'
    hits = np.flatnonzero(names == feature)
    # A missing column would otherwise surface as an obscure shape/index error.
    if hits.size != 1:
        raise KeyError(
            f"expected exactly one column named {feature!r}, found {hits.size}"
        )

    values = X[:, hits[0]]
    return values >= np.quantile(values, q)

# Sweep n_q evenly spaced quantile levels over (almost) the full range; for
# each level record the number of rows selected by f and the mean of ytree
# over them.
n_q = 20
linspace = np.linspace(0.01, 0.99, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(linspace)
print(masksum)
print(returns)

[0.01       0.06157895 0.11315789 0.16473684 0.21631579 0.26789474
 0.31947368 0.37105263 0.42263158 0.47421053 0.52578947 0.57736842
 0.62894737 0.68052632 0.73210526 0.78368421 0.83526316 0.88684211
 0.93842105 0.99      ]
[661647 627176 592704 558232 523760 489288 454817 420345 385873 351401
 316930 282458 247986 213514 179043 144571 110099  75627  41156   6684]
[1.00250828 1.002599   1.00270605 1.00284219 1.00298643 1.00312054
 1.00328469 1.00345337 1.0036211  1.00378573 1.00398493 1.00423443
 1.00453663 1.00483906 1.00518191 1.00557363 1.00635731 1.00719464
 1.00939429 1.02458763]


# Math Features

In [101]:
# Drawdown feature names to try, in order of preference.
candidates = [
    'MathFeature_Drawdown_MH4',
    'MathFeature_Drawdown_MH5',
    'MathFeature_Drawdown_MH6',
]
# Pick the first candidate present in treenames; `name` stays None when none
# of them is present.
name = next(filter(lambda c: c in treenames, candidates), None)
def f(q, X=None, names=None, feature=None):
    """Mask of rows whose selected feature is at or below its q-quantile.

    Parameters
    ----------
    q : float
        Quantile level in [0, 1].
    X : 2-D array, optional
        Feature matrix (rows x features). Defaults to the global ``Xtree``.
    names : sequence of str, optional
        Column names of ``X``. Defaults to the global ``treenames``.
    feature : str, optional
        Column to analyse. Defaults to the global ``name``, read at call time.

    Returns
    -------
    numpy.ndarray of bool, one entry per row of ``X``.

    Raises
    ------
    KeyError
        If no feature was selected (``None``) or the selected feature name is
        missing or duplicated.
    """
    X = Xtree if X is None else X
    names = np.asarray(treenames if names is None else names)
    feature = name if feature is None else feature

    # `name` is None when none of the candidate features exists; report that
    # directly instead of failing later with a shape/index error.
    if feature is None:
        raise KeyError("no feature selected: none of the candidate names was found")

    hits = np.flatnonzero(names == feature)
    if hits.size != 1:
        raise KeyError(
            f"expected exactly one column named {feature!r}, found {hits.size}"
        )

    values = X[:, hits[0]]
    return values <= np.quantile(values, q)

# Sweep n_q evenly spaced quantile levels over (almost) the full range; for
# each level record the number of rows selected by f and the mean of ytree
# over them.
n_q = 20
linspace = np.linspace(0.01, 0.99, n_q)
masksum = np.zeros(n_q, dtype=int)
returns = np.zeros(n_q, dtype=float)
for i in range(n_q):
    q = linspace[i]
    mask = f(q)
    masksum[i] = np.count_nonzero(mask)
    returns[i] = np.mean(ytree[mask])

print(linspace)
print(masksum)
print(returns)

[0.01       0.06157895 0.11315789 0.16473684 0.21631579 0.26789474
 0.31947368 0.37105263 0.42263158 0.47421053 0.52578947 0.57736842
 0.62894737 0.68052632 0.73210526 0.78368421 0.83526316 0.88684211
 0.93842105 0.99      ]
[  6684  41156  75627 110099 144571 179044 213514 247986 282458 316930
 351401 385873 420345 454817 489288 523760 558232 592704 668331 668331]
[1.02653122 1.01019084 1.00721335 1.00587642 1.00516081 1.00469995
 1.00446928 1.00427318 1.00400794 1.00382423 1.00364411 1.00354266
 1.00341153 1.00328767 1.00315762 1.0030309  1.00291932 1.0027914
 1.00248933 1.00248933]


# ALL at once

In [102]:
# For every feature column, split the rows at the column's q-quantile and
# store the mean of ytree on each side (both sides include rows equal to the
# threshold). A side with no rows is reported as NaN.
q = 0.94
n_feats = Xtree.shape[1]
returns_le = np.full(n_feats, np.nan)
returns_ge = np.full(n_feats, np.nan)

for j in range(n_feats):
    col = Xtree[:, j]
    thresh = np.quantile(col, q)
    mask_le = col <= thresh
    mask_ge = col >= thresh
    if mask_le.any():
        returns_le[j] = np.mean(ytree[mask_le])
    if mask_ge.any():
        returns_ge[j] = np.mean(ytree[mask_ge])

# One row per feature, indexed by feature name.
summary_columns = {
    'feature': treenames,
    'return_le': returns_le,
    'return_ge': returns_ge,
}
results = pd.DataFrame(summary_columns).set_index('feature')

print(results)

                                                return_le  return_ge
feature                                                             
Category_other                                   1.002489   1.002489
Category_industrials                             1.002489   1.002457
Category_healthcare                              1.002489   1.002287
Category_technology                              1.002489   1.002904
Category_financial-services                      1.002489   1.002670
...                                                   ...        ...
FeatureGroup_WeightedIndexMHPct_lag_m500_MH_2    1.002380   1.002570
FeatureGroup_WeightedIndexMHPct_lag_m500_MH_4    1.002516   1.002077
FeatureGroup_WeightedIndexMHPct_lag_m500_MH_6    1.002535   1.003021
FeatureGroup_WeightedIndexMHPct_lag_m500_MH_8    1.002800   0.998248
FeatureGroup_WeightedIndexMHPct_lag_m500_MH_12   1.002321   1.006010

[2829 rows x 2 columns]


In [103]:
# Two rankings of the per-feature summary, best mean first: one by the
# "<= threshold" side and one by the ">= threshold" side.
res_sorted_le, res_sorted_ge = (
    results.sort_values(by=column, ascending=False)
    for column in ('return_le', 'return_ge')
)