In [1]:
import sys
import os

# Get the absolute path to the project directory
project_dir = os.path.abspath("..")

# Append the project directory to sys.path
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import asdict, is_dataclass, dataclass
from pandas.api.types import is_datetime64_any_dtype
import scipy
import numpy as np
import polars as pl
from typing import Dict, List
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
import bisect

from src.common.AssetData import AssetData
from src.common.AssetDataPolars import AssetDataPolars
from src.common.AssetDataService import AssetDataService
from src.common.AssetFileInOut import AssetFileInOut 
from src.databaseService.OutsourceLoader import OutsourceLoader

from src.simulation.SimulatePortfolio import SimulatePortfolio
from src.strategy.StratBuyAndHold import StratBuyAndHold
from src.simulation.ResultAnalyzer import ResultAnalyzer
from src.common.AssetFileInOut import AssetFileInOut
from src.predictionModule.SubsetML import SubsetML
from src.common.DataFrameTimeOperations import DataFrameTimeOperationsPandas as DFTO
from src.predictionModule.CollectionModels import CollectionModels
from src.common.AssetFileInOut import AssetFileInOut

from src.common.AssetDataPolars import AssetDataPolars
from src.common.AssetDataService import AssetDataService
from src.predictionModule.ModelAnalyzer import ModelAnalyzer

from src.featureAlchemy.FeatureMain import FeatureMain
from src.common.DataFrameTimeOperations import DataFrameTimeOperationsPolars as DPl


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
assets=AssetFileInOut("../src/stockGroups/bin").loadDictFromFile("group_finanTo2011")

# Convert to Polars for speedup
assetspl: Dict[str, AssetDataPolars] = {}
for ticker, asset in assets.items():
    assetspl[ticker]= AssetDataService.to_polars(asset)

In [9]:
subsetML = SubsetML(assetspl)

In [10]:
loadup_name = "SubsetML_debug_spareDate100_dayLag0"
subsetML.load_data('../src/predictionModule/bin', loadup_name)


print(subsetML.metadata)

Data and metadata loaded from ../src/predictionModule/bin\SubsetML_debug_spareDate100_dayLag0.pkl
{'Subset_params': {'idxLengthOneMonth': 21, 'fouriercutoff': 5, 'multFactor': 6, 'daysAfterPrediction': 21, 'monthsHorizon': 6, 'timesteps': 5, 'classificationInterval': [0.05], 'optuna_trials': 10, 'LGBM_max_depth': 10, 'averageOverDays': 5}}


In [11]:
subsetML.X_train.shape

(172508, 1186)

In [7]:
X_train = pd.DataFrame(subsetML.X_train, columns=subsetML.featureColumnNames)

X_train.shape

featCol = subsetML.featureColumnNames
print(subsetML.featureColumnNames.index("Fourier_Price_RSME"))
print(subsetML.featureColumnNames.index("FinData_ann_reportedEPS"))

72
415


In [12]:
import lightgbm as lgb
import optuna
X_train = pd.DataFrame(subsetML.X_train, columns=subsetML.featureColumnNames)
X_test = pd.DataFrame(subsetML.X_test, columns=subsetML.featureColumnNames)
y_train = subsetML.y_train
y_test = subsetML.y_test
X_val = pd.DataFrame(subsetML.X_val, columns=subsetML.featureColumnNames)
y_val = subsetML.y_val
colNames = subsetML.featureColumnNames

print("Training Label Distribution:")
ModelAnalyzer().print_label_distribution(y_train)
print("Validation Label Distribution:")
ModelAnalyzer().print_label_distribution(y_val)
print("Testing Label Distribution:")
ModelAnalyzer().print_label_distribution(y_test)

print("Training Open Description:")
print(X_train["FeatureTA_Open"].describe())

print("Testing Open Description:")
print(X_test["FeatureTA_Open"].describe())

Training Label Distribution:
  Label 0: Count = 115889, Frequency = 0.67
  Label 1: Count = 56619, Frequency = 0.33

Validation Label Distribution:
  Label 0: Count = 29016, Frequency = 0.67
  Label 1: Count = 14111, Frequency = 0.33

Testing Label Distribution:
  Label 0: Count = 3196, Frequency = 0.72
  Label 1: Count = 1220, Frequency = 0.28

Training Open Description:
count    172508.000000
mean          0.999952
std           0.018741
min           0.727763
25%           0.990249
50%           0.999698
75%           1.009352
max           1.704104
Name: FeatureTA_Open, dtype: float64
Testing Open Description:
count    4416.000000
mean        0.998606
std         0.014713
min         0.832682
25%         0.990621
50%         0.998316
75%         1.006142
max         1.096435
Name: FeatureTA_Open, dtype: float64


In [13]:
# Define XGBoost parameters if not provided
def objective(trial):
    lgbm_params = {
        'verbosity': -1,
        'n_jobs': -1,
        'is_unbalance': True,
        'metric': 'binary_logloss',
        'lambda_l1': 1,
        'lambda_l2': 1,
        'n_estimators': 500,
        'feature_fraction': trial.suggest_float('feature_fraction', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 256),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
    }
    # Initialize and train LGBM model
    LGBMModel = lgb.LGBMClassifier(**lgbm_params)
    LGBMModel.fit(X_train, y_train,
                    eval_set=[(subsetML.X_val, subsetML.y_val)])
    y_val_pred = LGBMModel.predict(X_val)
    cm:np.array = confusion_matrix(subsetML.y_val, y_val_pred, labels=np.unique(subsetML.y_val))
    per_class_accuracy = cm.diagonal() / cm.sum(axis=1)
    return np.sum(per_class_accuracy)

# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 3, show_progress_bar=False)

for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

lgbm_params = {
    'verbosity': -1,
    'n_jobs': -1,
    'is_unbalance': True,
    'metric': 'binary_logloss',
    'lambda_l1': 1,
    'lambda_l2': 1,
    'n_estimators': 500,
    'feature_fraction': study.best_trial.params['feature_fraction'],
    'num_leaves': study.best_trial.params['num_leaves'],
    'max_depth': study.best_trial.params['max_depth'],
    'learning_rate': study.best_trial.params['learning_rate'],
}
# Initialize and train LGBM model
LGBMModel = lgb.LGBMClassifier(**lgbm_params)
LGBMModel.fit(X_train, y_train,
                eval_set=[(subsetML.X_val, subsetML.y_val)])

y_pred_val = LGBMModel.predict(X_val)
y_pred_proba_val = LGBMModel.predict_proba(X_val)
val_acc = accuracy_score(y_val, y_pred_val)
val_loss = log_loss(y_val, y_pred_proba_val)

print(f"Validation accuracy: {val_acc}")
print(f"Validation loss: {val_loss}")

ModelAnalyzer().print_classification_metrics(y_val, y_pred_val, y_pred_proba_val)

y_pred_test = LGBMModel.predict(X_test)
y_pred_proba_test = LGBMModel.predict_proba(X_test)
test_acc = accuracy_score(y_test, y_pred_test)
test_loss = log_loss(y_test, y_pred_proba_test)

print(f"test accuracy: {test_acc}")
print(f"test loss: {test_loss}")

ModelAnalyzer().print_classification_metrics(y_test, y_pred_test, y_pred_proba_test)

[I 2025-01-11 21:57:37,841] A new study created in memory with name: no-name-e6e30bdf-1669-4f12-8b20-d671c09098ba
[I 2025-01-11 21:57:53,279] Trial 0 finished with value: 1.80207919174917 and parameters: {'feature_fraction': 0.00964565408341876, 'num_leaves': 236, 'max_depth': 7, 'learning_rate': 0.1393003111577421}. Best is trial 0 with value: 1.80207919174917.
[I 2025-01-11 21:58:07,560] Trial 1 finished with value: 1.4606926307444206 and parameters: {'feature_fraction': 0.009304944660567017, 'num_leaves': 217, 'max_depth': 5, 'learning_rate': 0.010003134445663326}. Best is trial 0 with value: 1.80207919174917.
[I 2025-01-11 21:58:25,424] Trial 2 finished with value: 1.8212693376749787 and parameters: {'feature_fraction': 0.0700941999833709, 'num_leaves': 115, 'max_depth': 7, 'learning_rate': 0.10063789483503538}. Best is trial 2 with value: 1.8212693376749787.


  feature_fraction: 0.0700941999833709
  num_leaves: 115
  max_depth: 7
  learning_rate: 0.10063789483503538
Validation accuracy: 0.9119808936397152
Validation loss: 0.2401235669695423

  Overall Accuracy: 0.91
  Log Loss: 0.2401

  Metrics per Class:
    Class 0:
      TPR: 0.91, FPR: 0.09, TNR: 0.91, FNR: 0.09
    Class 1:
      TPR: 0.91, FPR: 0.09, TNR: 0.91, FNR: 0.09

test accuracy: 0.626358695652174
test loss: 0.8965110141687969

  Overall Accuracy: 0.63
  Log Loss: 0.8965

  Metrics per Class:
    Class 0:
      TPR: 0.79, FPR: 0.81, TNR: 0.19, FNR: 0.21
    Class 1:
      TPR: 0.19, FPR: 0.21, TNR: 0.79, FNR: 0.81



In [17]:
print(colNames.index('FinData_quar_surprise'))
colNames.index("FinData_quar_surprise")
colNames.index("Fourier_Price_RSME")
columnToSubset = colNames.index("Fourier_Price_RSME") #np.random.randint(0, len(colNames))
print(columnToSubset)
mask_quantile = np.quantile(subsetML.X_test[:, columnToSubset], 0.05)
print(colNames[columnToSubset])
print(pl.DataFrame(subsetML.X_train[:, columnToSubset]).describe())
print(pl.DataFrame(subsetML.X_test[:, columnToSubset]).describe())

201
72
Fourier_Price_RSME
shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ column_0 │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 172508.0 │
│ null_count ┆ 0.0      │
│ mean       ┆ 0.026054 │
│ std        ┆ 0.022904 │
│ min        ┆ 0.001329 │
│ 25%        ┆ 0.012946 │
│ 50%        ┆ 0.019037 │
│ 75%        ┆ 0.030017 │
│ max        ┆ 0.280279 │
└────────────┴──────────┘
shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ column_0 │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 4416.0   │
│ null_count ┆ 0.0      │
│ mean       ┆ 0.02319  │
│ std        ┆ 0.021151 │
│ min        ┆ 0.001512 │
│ 25%        ┆ 0.011492 │
│ 50%        ┆ 0.016934 │
│ 75%        ┆ 0.026965 │
│ max        ┆ 0.339561 │
└────────────┴──────────┘


In [18]:
mask = subsetML.X_train[:, columnToSubset] < mask_quantile
mask_X_Train = subsetML.X_train[mask,:]
mask_y_Train = subsetML.y_train[mask]

mask = subsetML.X_val[:,columnToSubset] < mask_quantile
mask_X_val = subsetML.X_val[mask,:]
mask_y_val = subsetML.y_val[mask]

In [29]:
LGBMModel2 = lgb.LGBMClassifier()
def objective(trial):
    lgbm_params = {
        'verbosity': -1,
        'n_jobs': -1,
        'is_unbalance': True,
        'metric': 'binary_logloss',
        'lambda_l1': 1,
        'lambda_l2': 1,
        'n_estimators': 500,
        'feature_fraction': trial.suggest_float('feature_fraction', 0.001, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 64, 512),
        'max_depth': trial.suggest_int('max_depth', 2, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01,0.2, log=True),
    }
    # Initialize and train LGBM model
    LGBMModel2 = lgb.LGBMClassifier(**lgbm_params)
    LGBMModel2.fit(mask_X_Train, mask_y_Train,
                    eval_set=[(mask_X_val, mask_y_val)])
    mask_y_val_pred = LGBMModel2.predict(mask_X_val)
    cm:np.array = confusion_matrix(mask_y_val, mask_y_val_pred, labels=np.unique(mask_y_val))
    per_class_accuracy = cm.diagonal() / cm.sum(axis=1)
    return np.sum(per_class_accuracy)

# 3. Create a study2 object and optimize the objective function.
study2 = optuna.create_study(direction='maximize')
study2.optimize(objective, n_trials = 50, show_progress_bar=False)

for key, value in study2.best_trial.params.items():
    print(f"  {key}: {value}")
    
lgbm_params = {
    'verbosity': -1,
    'n_jobs': -1,
    'is_unbalance': True,
    'metric': 'binary_logloss',
    'lambda_l1': 1,
    'lambda_l2': 1,
    'n_estimators': 500,
    'feature_fraction': study2.best_trial.params['feature_fraction'],
    'num_leaves': study2.best_trial.params['num_leaves'],
    'max_depth': study2.best_trial.params['max_depth'],
    'learning_rate': study2.best_trial.params['learning_rate'],
}
# Initialize and train LGBM model
LGBMModel2 = lgb.LGBMClassifier(**lgbm_params)
LGBMModel2.fit(mask_X_Train, mask_y_Train,
                eval_set=[(mask_X_val, mask_y_val)])

mask_y_pred_val = LGBMModel2.predict(mask_X_val)
mask_y_pred_proba_val = LGBMModel2.predict_proba(mask_X_val)
mask_val_acc = accuracy_score(mask_y_val, mask_y_pred_val)
mask_val_loss = log_loss(mask_y_val, mask_y_pred_proba_val)

print(f"Validation accuracy: {mask_val_acc}")
print(f"Validation loss: {mask_val_loss}")

ModelAnalyzer().print_classification_metrics(mask_y_val, mask_y_pred_val, mask_y_pred_proba_val)

[I 2025-01-11 23:31:45,102] A new study created in memory with name: no-name-3221d46d-a8fb-4562-901a-ac29a85494a7
[I 2025-01-11 23:31:45,840] Trial 0 finished with value: 1.6219495813370348 and parameters: {'feature_fraction': 0.0018414972623986723, 'num_leaves': 444, 'max_depth': 2, 'learning_rate': 0.07148764578563337}. Best is trial 0 with value: 1.6219495813370348.
[I 2025-01-11 23:31:47,043] Trial 1 finished with value: 1.8085585663161448 and parameters: {'feature_fraction': 0.05174191069191712, 'num_leaves': 297, 'max_depth': 10, 'learning_rate': 0.1328220304720362}. Best is trial 1 with value: 1.8085585663161448.
[I 2025-01-11 23:31:48,004] Trial 2 finished with value: 1.8059890675750614 and parameters: {'feature_fraction': 0.007905440634946677, 'num_leaves': 157, 'max_depth': 9, 'learning_rate': 0.10892410062605648}. Best is trial 1 with value: 1.8085585663161448.
[I 2025-01-11 23:31:49,341] Trial 3 finished with value: 1.6160975996197418 and parameters: {'feature_fraction': 0.

  feature_fraction: 0.09635713782427702
  num_leaves: 249
  max_depth: 8
  learning_rate: 0.021342333878133737
Validation accuracy: 0.9346456692913386
Validation loss: 0.1594674643591774

  Overall Accuracy: 0.93
  Log Loss: 0.1595

  Metrics per Class:
    Class 0:
      TPR: 0.95, FPR: 0.12, TNR: 0.88, FNR: 0.05
    Class 1:
      TPR: 0.88, FPR: 0.05, TNR: 0.95, FNR: 0.12



In [30]:
ModelAnalyzer().print_feature_importance_LGBM(LGBMModel2, colNames,20)

Top 20 Feature Importances:
                                        Feature Importance
Rank                                                      
1         FeatureTA_momentum_pvo_signal_lag_m21   233.0000
2            FeatureTA_trend_mass_index_lag_m21   168.0000
3                  FeatureTA_volume_vpt_lag_m21   150.0000
4                   FeatureTA_trend_adx_lag_m21   149.0000
5               FeatureTA_trend_kst_sig_lag_m21   126.0000
6                 Seasonal_week_of_year_lag_m21   121.0000
7        FeatureTA_trend_vortex_ind_pos_lag_m21   117.0000
8                FeatureTA_momentum_tsi_lag_m21   112.0000
9                  FeatureTA_volume_cmf_lag_m21   111.0000
10       FeatureTA_momentum_stoch_rsi_k_lag_m21   108.0000
11              FeatureTA_volatility_ui_lag_m10   104.0000
12    FeatureTA_trend_visual_ichimoku_b_lag_m21    94.0000
13              FeatureTA_volatility_ui_lag_m21    94.0000
14             Fourier_Price_lag_m10_AbsCoeff_2    94.0000
15              Fourier_Pric

In [31]:
mask = subsetML.X_test[:,columnToSubset] < mask_quantile
mask_X_test = subsetML.X_test[mask,:]
mask_y_test = subsetML.y_test[mask]

mask_y_pred_test = LGBMModel2.predict(mask_X_test)
mask_y_pred_proba_test = LGBMModel2.predict_proba(mask_X_test)
mask_test_acc = accuracy_score(mask_y_test, mask_y_pred_test)
mask_test_loss = log_loss(mask_y_test, mask_y_pred_proba_test)

print(f"Validation accuracy: {mask_test_acc}")
print(f"Validation loss: {mask_test_loss}")

ModelAnalyzer().print_classification_metrics(mask_y_test, mask_y_pred_test, mask_y_pred_proba_test)

Validation accuracy: 0.5746606334841629
Validation loss: 1.1236485784793753

  Overall Accuracy: 0.57
  Log Loss: 1.1236

  Metrics per Class:
    Class 0:
      TPR: 1.00, FPR: 0.98, TNR: 0.02, FNR: 0.00
    Class 1:
      TPR: 0.02, FPR: 0.00, TNR: 1.00, FNR: 0.98

