<a href="https://colab.research.google.com/github/chielgroen1998/MLOps/blob/main/full_pipeline_draft_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# my_stock_forecasting/
# ├── conf/
# │   ├── base/
# │   │   ├── catalog.yml         # Data Catalog definitions
# │   │   ├── parameters/         # Parameters for data fetching, processing, and models
# │   │   │   ├── data_fetching.yml
# │   │   │   ├── data_processing.yml
# │   │   │   ├── feature_selection.yml # Parameters for feature selection methods
# │   │   │   ├── model_params/
# │   │   │   │   ├── xgboost_params.yml
# │   │   │   │   ├── svm_params.yml
# │   │   │   │   ├── lstm_params.yml
# │   │   │   │   └── arima_garch_params.yml
# │   │   │   └── validation_params.yml # For purged k-fold and walk-forward
# │   │   ├── mlflow.yml            # MLFlow tracking parameters
# │   │   └── pipelines.yml         # Pipeline definitions
# │   └── local/                  # Local overrides (e.g., API keys, database credentials)
# │       ├── catalog.yml
# │       └── parameters.yml
# ├── docs/                       # Project documentation
# ├── src/
# │   └── my_stock_forecasting/
# │       ├── __init__.py
# │       ├── hooks.py            # Optional: For customizing Kedro behavior
# │       ├── pipeline.py         # Main pipeline definition
# │       ├── pipelines/
# │       │   ├── data_fetching/    # Pipeline for fetching raw data
# │       │   │   ├── __init__.py
# │       │   │   ├── nodes.py
# │       │   │   └── pipeline.py
# │       │   ├── data_processing/  # Pipeline for technical indicators and cleaning
# │       │   │   ├── __init__.py
# │       │   │   ├── nodes.py
# │       │   │   └── pipeline.py
# │       │   ├── data_splitting/   # Pipeline for splitting data
# │       │   │   ├── __init__.py
# │       │   │   ├── nodes.py
# │       │   │   └── pipeline.py
# │       │   ├── feature_selection/  # New pipeline for feature selection
# │       │   │   ├── __init__.py
# │       │   │   ├── nodes.py
# │       │   │   └── pipeline.py
# │       │   └── model_training/   # Pipeline for training and evaluating models
# │       │       ├── __init__.py
# │       │       ├── nodes.py
# │       │       └── pipeline.py
# │       ├── tests/                # Unit and integration tests
# │       │   ├── __init__.py
# │       │   ├── pipelines/
# │       │   │   ├── data_fetching/
# │       │   │   │   └── test_nodes.py
# │       │   │   ├── data_processing/
# │       │   │   │   └── test_nodes.py
# │       │   │   ├── data_splitting/
# │       │   │   │   └── test_nodes.py
# │       │   │   ├── feature_selection/
# │       │   │   │   └── test_nodes.py
# │       │   │   └── model_training/
# │       │   │       └── test_nodes.py
# ├── .gitignore
# ├── environment.yml             # Conda environment file < docker
# ├── pyproject.toml              # Project metadata (for packaging)
# ├── README.md
# ├── setup.cfg                   # Project configuration
# └── kedro.yml                   # Kedro project configuration



In [5]:
!pip3 install kedro

Defaulting to user installation because normal site-packages is not writeable


In [13]:

!pip3 install xgboost
!pip3 install pandas_ta==0.3.14b0
!pip3 install optuna
!pip3 install --upgrade pyspark==3.3.0 delta-spark==2.2.0
!pip3 install --upgrade numpy
!pip3 install --upgrade pandas_ta
!pip3 install statsmodels
!pip3 install arch
!pip3 install pytorch-forecasting
!pip3 install prophet
!pip3 install optuna
!pip3 install xgboost
!pip3 install torch
!pip3 install yfinance
!pip3 install numpy
!pip install MLflow



Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools

In [4]:
import os
import logging
from datetime import timedelta
import numpy as np
import pandas as pd
np.NaN = np.nan
import pandas_ta as ta
import optuna
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance
from joblib import Parallel, delayed
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster
import yfinance as yf
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Define the symbols to fetch data for
# Replace with the actual ticker for Nasdaq composite and the top 25 Nasdaq tickers
symbols = ['QQQ', 'AAPL', 'MSFT', 'AMZN', 'GOOG', 'GOOGL', 'NVDA', 'META', 'TSLA', 'AVGO', 'COST', 'PEP', 'CMCSA', 'ADBE', 'CSCO', 'INTC', 'NFLX', 'AMGN', 'PYPL', 'QCOM', 'TXN', 'SBUX', 'MDLZ', 'ISRG', 'BKNG', 'GILD']

# Fetch data from Yahoo Finance
all_data = {}
for symbol in symbols:
    # Download data
    data = yf.download(symbol, start='2000-01-01')
    all_data[symbol] = data

    # Print the first few rows of the dataset
    print(f"Data for {symbol}:")
    print(data.head())

    # Plot the data
    data['Close'].plot(figsize=(12, 6), title=f'{symbol} Daily Closing Price (from 2012)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.grid(True)
    plt.show()

# Adjust column names based on the number of columns
for symbol, df in all_data.items():
    if len(df.columns) == 6:
        df.columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    else:
        df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']

# Verify the column names
for symbol, df in all_data.items():
    print(f"{symbol} columns: {df.columns}")

In [6]:
all_data['QQQ'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['AAPL'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['MSFT'].columns = ['open', 'high', 'low', 'close', 'volume']
all_data['AMZN'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['GOOG'].columns = ['open', 'high', 'low', 'close', 'volume']
all_data['GOOGL'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['NVDA'].columns = ['open', 'high', 'low', 'close', 'volume']
all_data['META'].columns = ['open', 'high', 'low', 'close', 'volume']
all_data['TSLA'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['AVGO'].columns = ['open', 'high', 'low', 'close', 'volume']
all_data['COST'].columns = ['open', 'high', 'low', 'close', 'volume']
all_data['PEP'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['CMCSA'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['ADBE'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['CSCO'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['INTC'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['NFLX'].columns = ['open', 'high', 'low', 'close', 'volume']
all_data['AMGN'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['PYPL'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['QCOM'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['TXN'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['SBUX'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['MDLZ'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['ISRG'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['BKNG'].columns = ['open', 'high', 'low', 'close',  'volume']
all_data['GILD'].columns = ['open', 'high', 'low', 'close', 'volume']

In [None]:
# Apply technical indicators to each dataframe
for symbol, data in all_data.items():
    # Skip completely if DataFrame is empty or too small
    if len(data) <= 1:
        print(f"Warning: Ticker {symbol} has insufficient data (length={len(data)}). Skipping technical indicators.")
        continue

    # Rename columns if needed (ensure they match pandas_ta requirements)
    if data.columns[0] != 'open':  # Only rename if not already renamed
        data.columns = ['open', 'high', 'low', 'close', 'volume']

    indicator_count = 0  # Track the number of successfully added indicators

    # Trend Indicators
    try:
        data.ta.sma(length=100, append=True)
        data.ta.sma(length=50, append=True)  # Half length
        data.ta.sma(length=200, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding SMA to {symbol}: {e}")

    try:
        data.ta.ema(length=50, append=True)
        data.ta.ema(length=25, append=True)  # Half length
        data.ta.ema(length=100, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding EMA to {symbol}: {e}")

    try:
        data.ta.wma(length=50, append=True)
        data.ta.wma(length=25, append=True)  # Half length
        data.ta.wma(length=100, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding WMA to {symbol}: {e}")

    try:
        data.ta.hma(length=50, append=True)
        data.ta.hma(length=25, append=True)  # Half length
        data.ta.hma(length=100, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding HMA to {symbol}: {e}")

    try:
        data.ta.vwma(length=50, append=True)
        data.ta.vwma(length=25, append=True)  # Half length
        data.ta.vwma(length=100, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding VWMA to {symbol}: {e}")

    try:
        data.ta.macd(fast=24, slow=52, signal=18, append=True)
        data.ta.macd(fast=12, slow=26, signal=9, append=True)  # Half lengths
        data.ta.macd(fast=48, slow=104, signal=36, append=True)  # Double lengths
        indicator_count += 3
    except Exception as e:
        print(f"Error adding MACD to {symbol}: {e}")

    try:
        data.ta.adx(length=28, append=True)
        data.ta.adx(length=14, append=True)  # Half length
        data.ta.adx(length=56, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding ADX to {symbol}: {e}")

    try:
        data.ta.cci(length=20, append=True)
        data.ta.cci(length=10, append=True)  # Half length
        data.ta.cci(length=40, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding CCI to {symbol}: {e}")

    try:
        data.ta.rsi(length=28, append=True)
        data.ta.rsi(length=14, append=True)  # Half length
        data.ta.rsi(length=56, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding RSI to {symbol}: {e}")

    try:
        data.ta.stoch(k=28, d=6, append=True)
        data.ta.stoch(k=14, d=3, append=True)  # Half lengths
        data.ta.stoch(k=56, d=12, append=True)  # Double lengths
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Stochastic Oscillator to {symbol}: {e}")

    try:
        data.ta.ichimoku(tenkan=18, kijun=52, senkou=104, append=True)
        data.ta.ichimoku(tenkan=9, kijun=26, senkou=52, append=True)  # Half lengths
        data.ta.ichimoku(tenkan=36, kijun=104, senkou=208, append=True)  # Double lengths
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Ichimoku Cloud to {symbol}: {e}")

    try:
        data.ta.supertrend(length=14, multiplier=4, append=True)
        data.ta.supertrend(length=7, multiplier=2, append=True)  # Half parameters
        data.ta.supertrend(length=28, multiplier=8, append=True)  # Double parameters
        indicator_count += 3
    except Exception as e:
        print(f"Error adding SuperTrend to {symbol}: {e}")

    # Check for PSAR which needs more data
    try:
        if len(data) > 2:
            data.ta.psar(step=0.04, max_step=0.4, append=True)
            data.ta.psar(step=0.02, max_step=0.2, append=True)  # Half steps
            data.ta.psar(step=0.08, max_step=0.8, append=True)  # Double steps
            indicator_count += 3
        else:
            data['PSARl_0.02_0.2'] = float('nan')
            data['PSARs_0.02_0.2'] = float('nan')
            data['PSARl_0.01_0.1'] = float('nan')
            data['PSARs_0.01_0.1'] = float('nan')
            data['PSARl_0.04_0.4'] = float('nan')
            data['PSARs_0.04_0.4'] = float('nan')
    except Exception as e:
        print(f"Error adding PSAR to {symbol}: {e}")

    # Momentum Indicators
    try:
        data.ta.mom(length=50, append=True)
        data.ta.mom(length=25, append=True)  # Half length
        data.ta.mom(length=100, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Momentum to {symbol}: {e}")

    try:
        data.ta.roc(length=30, append=True)
        data.ta.roc(length=15, append=True)  # Half length
        data.ta.roc(length=60, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding ROC to {symbol}: {e}")

    try:
        data.ta.willr(length=14, append=True)
        data.ta.willr(length=7, append=True)  # Half length
        data.ta.willr(length=28, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Williams %R to {symbol}: {e}")

    try:
        data.ta.uo(append=True)
        # UO doesn't have configurable length, so we can't create variants
        indicator_count += 1
    except Exception as e:
        print(f"Error adding Ultimate Oscillator to {symbol}: {e}")

    try:
        data.ta.ao(fast=5, slow=34, append=True)
        data.ta.ao(fast=3, slow=17, append=True)  # Half lengths
        data.ta.ao(fast=10, slow=68, append=True)  # Double lengths
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Awesome Oscillator to {symbol}: {e}")

    try:
        data.ta.kama(length=10, append=True)
        data.ta.kama(length=5, append=True)  # Half length
        data.ta.kama(length=20, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding KAMA to {symbol}: {e}")

    try:
        data.ta.cg(length=10, append=True)
        data.ta.cg(length=5, append=True)  # Half length
        data.ta.cg(length=20, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Center of Gravity to {symbol}: {e}")

    # Volatility Indicators
    try:
        data.ta.bbands(length=40, std=2, append=True)
        data.ta.bbands(length=20, std=1, append=True)  # Half parameters
        data.ta.bbands(length=80, std=4, append=True)  # Double parameters
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Bollinger Bands to {symbol}: {e}")

    try:
        data.ta.atr(length=14, append=True)
        data.ta.atr(length=7, append=True)  # Half length
        data.ta.atr(length=28, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding ATR to {symbol}: {e}")

    try:
        data.ta.kc(length=40, scalar=3, append=True)
        data.ta.kc(length=20, scalar=1.5, append=True)  # Half parameters
        data.ta.kc(length=80, scalar=6, append=True)  # Double parameters
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Keltner Channel to {symbol}: {e}")

    try:
        data.ta.donchian(lower_length=40, upper_length=40, append=True)
        data.ta.donchian(lower_length=20, upper_length=20, append=True)  # Half lengths
        data.ta.donchian(lower_length=80, upper_length=80, append=True)  # Double lengths
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Donchian Channel to {symbol}: {e}")

    try:
        data.ta.rvi(length=14, append=True)
        data.ta.rvi(length=7, append=True)  # Half length
        data.ta.rvi(length=28, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding RVI to {symbol}: {e}")

    # Volume Indicators - OBV needs a length check
    try:
        if len(data) > 1:
            data.ta.obv(append=True)
            # OBV doesn't have configurable length, so we can't create variants
            indicator_count += 1
        else:
            data['OBV'] = float('nan')
    except Exception as e:
        print(f"Error adding OBV to {symbol}: {e}")

    try:
        data.ta.cmf(length=30, append=True)
        data.ta.cmf(length=15, append=True)  # Half length
        data.ta.cmf(length=60, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding CMF to {symbol}: {e}")

    try:
        data.ta.adosc(fast=5, slow=17, append=True)
        data.ta.adosc(fast=3, slow=9, append=True)  # Half lengths (rounded)
        data.ta.adosc(fast=10, slow=34, append=True)  # Double lengths
        indicator_count += 3
    except Exception as e:
        print(f"Error adding AD Oscillator to {symbol}: {e}")

    try:
        data.ta.mfi(length=28, append=True)
        data.ta.mfi(length=14, append=True)  # Half length
        data.ta.mfi(length=56, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding MFI to {symbol}: {e}")

    try:
        data.ta.vwap(append=True)
        # VWAP doesn't have configurable length, so we can't create variants
        indicator_count += 1
    except Exception as e:
        print(f"Error adding VWAP to {symbol}: {e}")

    try:
        data.ta.eom(length=28, append=True)
        data.ta.eom(length=14, append=True)  # Half length
        data.ta.eom(length=56, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Ease of Movement to {symbol}: {e}")

    try:
        data.ta.nvi(length=255, append=True)
        data.ta.nvi(length=128, append=True)  # Half length (rounded)
        data.ta.nvi(length=510, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding NVI to {symbol}: {e}")

    # Other Indicators
    try:
        data.ta.squeeze(append=True)
        # Squeeze doesn't have easily configurable length, so we can't create variants
        indicator_count += 1
    except Exception as e:
        print(f"Error adding TTM Squeeze to {symbol}: {e}")

    try:
        data.ta.fisher(length=18, append=True)
        data.ta.fisher(length=9, append=True)  # Half length
        data.ta.fisher(length=36, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Fisher Transform to {symbol}: {e}")

    try:
        data.ta.decay(length=10, mode="linear", append=True)
        data.ta.decay(length=5, mode="linear", append=True)  # Half length
        data.ta.decay(length=20, mode="linear", append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Linear Decay to {symbol}: {e}")

    try:
        data.ta.decay(length=10, mode="exponential", append=True)
        data.ta.decay(length=5, mode="exponential", append=True)  # Half length
        data.ta.decay(length=20, mode="exponential", append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Exponential Decay to {symbol}: {e}")

    try:
        data.ta.vortex(length=28, append=True)
        data.ta.vortex(length=14, append=True)  # Half length
        data.ta.vortex(length=56, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Vortex Indicator to {symbol}: {e}")

    try:
        data.ta.zscore(length=40, append=True)
        data.ta.zscore(length=20, append=True)  # Half length
        data.ta.zscore(length=80, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Z-Score to {symbol}: {e}")

    try:
        data.ta.entropy(length=20, append=True)
        data.ta.entropy(length=10, append=True)  # Half length
        data.ta.entropy(length=40, append=True)  # Double length
        indicator_count += 3
    except Exception as e:
        print(f"Error adding Entropy to {symbol}: {e}")

    print(f"\nData for {symbol}:")
    print(data.head())  # Print the first few rows of the dataset

    # Plot the data
    data['close'].plot(figsize=(12, 6), title=f'{symbol} Daily Closing Price (from 2018)')
    plt.xlabel('Date')
    plt.ylabel('Closing Price')
    plt.grid(True)
    plt.show()

    # Print the number of technical indicators added
    print(f"Number of technical indicators successfully added to {symbol}: {indicator_count}")

In [8]:
for symbol, data in all_data.items():
    # Get the original column names
    original_columns = data.columns.tolist()
    # Create the new column names with the symbol prefix
    new_columns = [f"{symbol}_{col}" for col in original_columns]
    # Rename the columns in the DataFrame
    data.columns = new_columns
    print(f"\nColumns for {symbol} after renaming:")
    print(data.columns)


Columns for QQQ after renaming:
Index(['QQQ_open', 'QQQ_high', 'QQQ_low', 'QQQ_close', 'QQQ_volume',
       'QQQ_SMA_100', 'QQQ_SMA_50', 'QQQ_SMA_200', 'QQQ_EMA_50', 'QQQ_EMA_25',
       ...
       'QQQ_VTXP_14', 'QQQ_VTXM_14', 'QQQ_VTXP_56', 'QQQ_VTXM_56', 'QQQ_ZS_40',
       'QQQ_ZS_20', 'QQQ_ZS_80', 'QQQ_ENTP_20', 'QQQ_ENTP_10', 'QQQ_ENTP_40'],
      dtype='object', length=181)

Columns for AAPL after renaming:
Index(['AAPL_open', 'AAPL_high', 'AAPL_low', 'AAPL_close', 'AAPL_volume',
       'AAPL_SMA_100', 'AAPL_SMA_50', 'AAPL_SMA_200', 'AAPL_EMA_50',
       'AAPL_EMA_25',
       ...
       'AAPL_VTXP_14', 'AAPL_VTXM_14', 'AAPL_VTXP_56', 'AAPL_VTXM_56',
       'AAPL_ZS_40', 'AAPL_ZS_20', 'AAPL_ZS_80', 'AAPL_ENTP_20',
       'AAPL_ENTP_10', 'AAPL_ENTP_40'],
      dtype='object', length=181)

Columns for MSFT after renaming:
Index(['MSFT_open', 'MSFT_high', 'MSFT_low', 'MSFT_close', 'MSFT_volume',
       'MSFT_SMA_100', 'MSFT_SMA_50', 'MSFT_SMA_200', 'MSFT_EMA_50',
       'MSFT_EMA_2

In [9]:

dataframes_to_join = []

for symbol, data in all_data.items():
    dataframes_to_join.append(data)

merged_data = pd.concat(dataframes_to_join, axis=1)

print("\nMerged DataFrame head:")
print(merged_data.head())

print("\nMerged DataFrame info:")
merged_data.info()


Merged DataFrame head:
             QQQ_open   QQQ_high    QQQ_low  QQQ_close  QQQ_volume  \
Date                                                                 
2000-01-03  80.314308  81.532797  76.923731  81.532797  36345200.0   
2000-01-04  74.804626  79.254760  74.539738  77.983293  33786600.0   
2000-01-05  72.897453  75.970166  71.414075  74.168921  42496600.0   
2000-01-06  67.890984  74.592669  67.599606  73.639069  37134800.0   
2000-01-07  76.287987  76.287987  69.930655  70.301499  28138200.0   

            QQQ_SMA_100  QQQ_SMA_50  QQQ_SMA_200  QQQ_EMA_50  QQQ_EMA_25  ...  \
Date                                                                      ...   
2000-01-03          NaN         NaN          NaN         NaN         NaN  ...   
2000-01-04          NaN         NaN          NaN         NaN         NaN  ...   
2000-01-05          NaN         NaN          NaN         NaN         NaN  ...   
2000-01-06          NaN         NaN          NaN         NaN         NaN  ...   

In [10]:
merged_data.head(5)

Unnamed: 0_level_0,QQQ_open,QQQ_high,QQQ_low,QQQ_close,QQQ_volume,QQQ_SMA_100,QQQ_SMA_50,QQQ_SMA_200,QQQ_EMA_50,QQQ_EMA_25,...,GILD_VTXP_14,GILD_VTXM_14,GILD_VTXP_56,GILD_VTXM_56,GILD_ZS_40,GILD_ZS_20,GILD_ZS_80,GILD_ENTP_20,GILD_ENTP_10,GILD_ENTP_40
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-03,80.314308,81.532797,76.923731,81.532797,36345200.0,,,,,,...,,,,,,,,,,
2000-01-04,74.804626,79.25476,74.539738,77.983293,33786600.0,,,,,,...,,,,,,,,,,
2000-01-05,72.897453,75.970166,71.414075,74.168921,42496600.0,,,,,,...,,,,,,,,,,
2000-01-06,67.890984,74.592669,67.599606,73.639069,37134800.0,,,,,,...,,,,,,,,,,
2000-01-07,76.287987,76.287987,69.930655,70.301499,28138200.0,,,,,,...,,,,,,,,,,


In [11]:
merged_data['target'] = (merged_data['QQQ_close'] - merged_data['QQQ_SMA_200']).shift(-1)

In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import iqr

def fill_nan_with_rolling_median(data, window_size=21):
    """
    Fill NaN values in a DataFrame using forward fill first, then rolling median.
    Any remaining NaNs (e.g., at the start) are set to zero.
    """
    # Forward fill first
    data_ffill = data.ffill()
    # Fill remaining NaNs with rolling median
    data_filled = data_ffill.apply(lambda x: x.fillna(x.rolling(window=window_size, min_periods=1).median()), axis=0)
    # Fill any remaining NaNs (e.g., at the very start) with zero
    data_filled = data_filled.fillna(0)
    return data_filled

# Usage example with your existing pipeline
# First handle missing values
merged_data_filled = fill_nan_with_rolling_median(merged_data.copy(), window_size=21)

KeyboardInterrupt: 

In [11]:
target_column = 'target'

In [12]:


# Print statistics to prove no NaNs remain
print('NaN count per column after filling:')
print(merged_data_filled.isna().sum())
print('Any NaNs left in data:', merged_data_filled.isna().any().any())
print('Any zeros in data:', (merged_data_filled == 0).any().any())

class FeatureScaler(BaseEstimator, TransformerMixin):
    """
    Automatically detects and applies optimal scaling for financial time series features.
    Handles mixed-scale features like RSI (0-100) and ATR (volatility) without hardcoding.
    """

    def __init__(self):
        self.scalers = {}
        self.feature_stats = {}

    def _auto_detect_scaler(self, data):
        """Heuristic-based scaler selection using financial data characteristics"""
        # Calculate distribution properties
        q1, q3 = np.percentile(data, [25, 75])
        iqr_val = q3 - q1
        range_val = np.max(data) - np.min(data)
        has_outliers = (iqr_val > 0) and (range_val / iqr_val > 4)

        # Check for common financial indicator ranges
        is_bounded = (
            (np.min(data) >= 0) and
            (np.max(data) <= 100) and
            (range_val >= 50)  # Typical for RSI-like features
        )

        if is_bounded:
            return MinMaxScaler(feature_range=(0, 1)), 'minmax'
        elif has_outliers:
            return RobustScaler(), 'robust'
        else:
            return StandardScaler(), 'standard'

    def fit(self, X, y=None):
        for col in X.columns:
            column_data = X[col].values.reshape(-1, 1)
            scaler, scaler_type = self._auto_detect_scaler(column_data)
            self.scalers[col] = scaler.fit(column_data)

            # Store metadata for MLOps monitoring
            self.feature_stats[col] = {
                'min': np.min(column_data),
                'max': np.max(column_data),
                'iqr': iqr(column_data),
                'scaler': scaler_type
            }
        return self

    def transform(self, X):
        X_scaled = X.copy()
        for col, scaler in self.scalers.items():
            X_scaled[col] = scaler.transform(X_scaled[col].values.reshape(-1, 1)).flatten()
        return X_scaled

    def get_feature_stats(self):
        """For MLOps monitoring and model interpretation"""
        return pd.DataFrame(self.feature_stats).T



NaN count per column after filling:
QQQ_open        0
QQQ_high        0
QQQ_low         0
QQQ_close       0
QQQ_volume      0
               ..
GILD_ZS_80      0
GILD_ENTP_20    0
GILD_ENTP_10    0
GILD_ENTP_40    0
target          0
Length: 4707, dtype: int64
Any NaNs left in data: False
Any zeros in data: True


In [13]:
# Instantiate the FeatureScaler
feature_scaler = FeatureScaler()

# Fit the scaler to the data excluding the target column
feature_scaler.fit(merged_data_filled.drop(columns=[target_column]))

# Transform the data excluding the target column
scaled_features = feature_scaler.transform(merged_data_filled.drop(columns=[target_column]))

# Combine the scaled features with the unscaled target column
scaled_data_df = pd.DataFrame(scaled_features, columns=merged_data_filled.columns.drop(target_column), index=merged_data_filled.index)
scaled_data_df[target_column] = merged_data_filled[target_column]


In [14]:
from sklearn.model_selection import train_test_split
import os
import pandas as pd

#target_column = merged_data['target'] # < remove hardcording to make any symbol feasible at target stock price ?

columns_to_drop = []

if target_column in scaled_data_df.columns:
    scaled_data_df['QQQ_target'] = scaled_data_df[target_column].shift(-1)

    scaled_data_df.dropna(subset=['QQQ_target'], inplace=True)

    X = scaled_data_df.drop(columns=['QQQ_target', target_column] + columns_to_drop, errors='ignore') # Add columns to drop
    y = scaled_data_df['QQQ_target']

    train_size = int(len(scaled_data_df) * 0.8)

    X_train = X.iloc[:train_size]
    X_test = X.iloc[train_size:]
    y_train = y.iloc[:train_size]
    y_test = y.iloc[train_size:]

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)

    output_dir = 'data_split'

    if os.path.exists(output_dir):
      import shutil
      shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    X_train.to_csv(os.path.join(output_dir, 'X_train.csv'), index=True) # Include index for timestamp
    X_test.to_csv(os.path.join(output_dir, 'X_test.csv'), index=True)   # Include index for timestamp
    y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=True, header=True) # Include index and header
    y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=True, header=True)     # Include index and header

    print(f"Training and testing data saved to CSV files in the '{output_dir}' directory.")
    print(f"- {os.path.join(output_dir, 'X_train.csv')}")
    print(f"- {os.path.join(output_dir, 'X_test.csv')}")
    print(f"- {os.path.join(output_dir, 'y_train.csv')}")
    print(f"- {os.path.join(output_dir, 'y_test.csv')}")

else:
    print(f"Target column '{target_column}' not found in the DataFrame.")
    print("Please ensure the target column name is correct.")

X_train shape: (5109, 4706)
X_test shape: (1278, 4706)
y_train shape: (5109,)
y_test shape: (1278,)
Training and testing data saved to CSV files in the 'data_split' directory.
- data_split/X_train.csv
- data_split/X_test.csv
- data_split/y_train.csv
- data_split/y_test.csv


Before applying computationally heavy FS methods we need to thin down the dataset from the obvious useless features, in the beginning the approach was to maximalize the option to now thin it down to the best ones and heavily test those for useability and for holding any predictive significance at all, this will be done with variance treshold and correlation

In [15]:
def drop_highly_correlated_features(df, threshold=0.95):
    """
    Drops features from df with absolute correlation greater than the threshold.
    Keeps the first feature in each correlated group.
    """
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return df.drop(columns=to_drop), to_drop

from sklearn.feature_selection import VarianceThreshold

def variance_threshold_filter(df, threshold=0.0):
    """
    Drops features with variance below the threshold.
    Returns the filtered DataFrame and the dropped column names.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(df)
    kept_columns = df.columns[selector.get_support()]
    dropped_columns = df.columns[~selector.get_support()]
    return df[kept_columns], list(dropped_columns)

from sklearn.feature_selection import mutual_info_regression

def mutual_information_filter(X, y, top_n=50):
    """
    Selects the top_n features with the highest mutual information with the target y.
    Returns the filtered DataFrame and the dropped column names.
    """
    mi = mutual_info_regression(X, y, random_state=42)
    mi_series = pd.Series(mi, index=X.columns)
    top_features = mi_series.sort_values(ascending=False).head(top_n).index
    dropped = [col for col in X.columns if col not in top_features]
    return X[top_features], dropped

In [16]:
def granularity_filter(df, min_unique_ratio=0.01):
    """
    Removes features with too few unique values relative to sample size.
    """
    min_unique = df.shape[0] * min_unique_ratio
    to_keep = [col for col in df.columns if df[col].nunique() > min_unique]
    dropped = [col for col in df.columns if col not in to_keep]
    return df[to_keep], dropped

from scipy.stats import entropy

def entropy_filter(df, threshold=1.0):
    """
    Drop features with Shannon entropy below the threshold.
    """
    def shannon_entropy(col):
        p = col.value_counts(normalize=True)
        return entropy(p, base=2)

    entropies = df.apply(shannon_entropy)
    to_keep = entropies[entropies > threshold].index
    dropped = entropies[entropies <= threshold].index
    return df[to_keep], list(dropped)

def target_correlation_filter(X, y, method='spearman', min_corr=0.05):
    corrs = X.corrwith(y, method=method).abs()
    kept = corrs[corrs > min_corr].index
    dropped = corrs[corrs <= min_corr].index
    return X[kept], list(dropped)

from statsmodels.stats.outliers_influence import variance_inflation_factor

def vif_filter(df, threshold=10.0):
    """
    Drop features with VIF above the threshold (multicollinearity).
    """
    dropped = []
    while True:
        vif = pd.Series(
            [variance_inflation_factor(df.values, i) for i in range(df.shape[1])],
            index=df.columns
        )
        max_vif = vif.max()
        if max_vif <= threshold:
            break
        drop_col = vif.idxmax()
        df = df.drop(columns=[drop_col])
        dropped.append(drop_col)
    return df, dropped


In [17]:
def mutual_information_threshold_filter(X, y, mi_threshold=0.01):
    """
    Keeps all features with mutual information above the threshold.
    Returns the filtered DataFrame and the dropped column names.
    """
    mi = mutual_info_regression(X, y, random_state=42)
    mi_series = pd.Series(mi, index=X.columns)
    to_keep = mi_series[mi_series > mi_threshold].index
    dropped = mi_series[mi_series <= mi_threshold].index
    return X[to_keep], list(dropped)

In [18]:
# 1. Drop highly correlated features
X_filt, dropped_corr = drop_highly_correlated_features(X_train, threshold=0.8)
print(f"Dropped due to correlation: {len(dropped_corr)} columns")
print(dropped_corr)

# 2. Drop low-variance features
X_filt, dropped_var = variance_threshold_filter(X_filt, threshold=0.15)
print(f"Dropped due to low variance: {len(dropped_var)} columns")
print(dropped_var)

# 3. Drop features with low granularity
X_filt, dropped_gran = granularity_filter(X_filt, min_unique_ratio=0.15)
print(f"Dropped due to low granularity: {len(dropped_gran)} columns")
print(dropped_gran)

# 4. Drop features with low entropy
X_filt, dropped_entropy = entropy_filter(X_filt, threshold=5.0)
print(f"Dropped due to low entropy: {len(dropped_entropy)} columns")
print(dropped_entropy)

# 5. Drop features with low correlation to target
X_filt, dropped_corr_target = target_correlation_filter(X_filt, y_train, method='spearman', min_corr=0.15)
print(f"Dropped due to low target correlation: {len(dropped_corr_target)} columns")
print(dropped_corr_target)

# 6. Select features by MI threshold
X_filt, dropped_mi = mutual_information_threshold_filter(X_filt, y_train, mi_threshold=0.15)
print(f"Dropped due to low mutual information: {len(dropped_mi)} columns")
print(dropped_mi)

# 7. VIF (only if not too many features)
if X_filt.shape[1] < 250:
    X_final, dropped_vif = vif_filter(X_filt, threshold=10.0)
    print(f"Dropped due to high VIF: {len(dropped_vif)} columns")
    print(dropped_vif)
else:
    print(f"Skipping VIF: too many features ({X_filt.shape[1]})")
    X_final = X_filt


Dropped due to correlation: 3941 columns
['QQQ_high', 'QQQ_low', 'QQQ_close', 'QQQ_SMA_100', 'QQQ_SMA_50', 'QQQ_SMA_200', 'QQQ_EMA_50', 'QQQ_EMA_25', 'QQQ_EMA_100', 'QQQ_WMA_50', 'QQQ_WMA_25', 'QQQ_WMA_100', 'QQQ_HMA_50', 'QQQ_HMA_25', 'QQQ_HMA_100', 'QQQ_VWMA_50', 'QQQ_VWMA_25', 'QQQ_VWMA_100', 'QQQ_MACDs_24_52_18', 'QQQ_MACD_12_26_9', 'QQQ_MACDs_12_26_9', 'QQQ_MACD_48_104_36', 'QQQ_MACDh_48_104_36', 'QQQ_MACDs_48_104_36', 'QQQ_DMP_14', 'QQQ_DMN_14', 'QQQ_DMP_56', 'QQQ_DMN_56', 'QQQ_CCI_10_0.015', 'QQQ_CCI_40_0.015', 'QQQ_RSI_28', 'QQQ_RSI_14', 'QQQ_RSI_56', 'QQQ_STOCHk_28_6_3', 'QQQ_STOCHd_28_6_3', 'QQQ_STOCHk_14_3_3', 'QQQ_STOCHd_14_3_3', 'QQQ_STOCHk_56_12_3', 'QQQ_STOCHd_56_12_3', 'QQQ_ISA_18', 'QQQ_ISB_52', 'QQQ_ITS_18', 'QQQ_IKS_52', 'QQQ_ICS_52', 'QQQ_ISA_9', 'QQQ_ISB_26', 'QQQ_ITS_9', 'QQQ_IKS_26', 'QQQ_ICS_26', 'QQQ_ISA_36', 'QQQ_ISB_104', 'QQQ_ITS_36', 'QQQ_IKS_104', 'QQQ_ICS_104', 'QQQ_SUPERT_14_4.0', 'QQQ_SUPERTl_14_4.0', 'QQQ_SUPERTs_14_4.0', 'QQQ_SUPERT_7_2.0', 'QQQ_SUPER

In [19]:

X_final.describe()

Unnamed: 0,QQQ_volume,QQQ_MACD_24_52_18,QQQ_MACDh_24_52_18,QQQ_ADX_28,QQQ_DMP_28,QQQ_DMN_28,QQQ_ADX_56,QQQ_ROC_30,QQQ_ROC_15,QQQ_ROC_60,...,BKNG_EOM_14_100000000,BKNG_EOM_56_100000000,BKNG_VTXM_28,GILD_volume,GILD_MACD_24_52_18,GILD_BBB_20_1.0,GILD_BBB_80_4.0,GILD_CMF_60,GILD_EOM_28_100000000,GILD_EOM_56_100000000
count,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,...,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0
mean,0.308114,-0.139047,-0.049758,0.005655,-0.103286,0.018325,-0.006251,-0.157096,-0.126815,-0.180548,...,-0.164426,-0.187235,-0.000606,0.484405,0.104192,0.386262,0.242552,0.07969,-0.375638,-0.585716
std,0.940019,0.920914,1.127006,1.029516,0.709102,0.783539,1.04921,0.967884,0.963519,0.941852,...,4.244385,2.86922,0.822554,1.242513,1.344203,1.111568,0.970916,0.724403,5.228235,6.025705
min,-0.886162,-5.754817,-12.844602,-2.674483,-3.039872,-3.244642,-2.344337,-4.6574,-5.799074,-4.086288,...,-56.392396,-32.681512,-5.532642,-1.098003,-5.928871,-1.342885,-1.517972,-2.146428,-72.199113,-57.226568
25%,-0.411163,-0.461927,-0.379715,-0.758817,-0.607466,-0.509797,-0.74924,-0.572278,-0.564884,-0.595399,...,-0.05872,-0.033478,-0.505806,-0.243439,-0.315335,-0.333587,-0.424948,-0.414209,-0.151773,-0.17828
50%,0.118994,-0.079782,-0.015351,-0.183642,-0.061207,-0.005064,-0.215219,-0.021888,-0.024647,-0.026703,...,-0.000857,-0.002018,0.04444,0.176869,-0.020154,0.090734,0.069055,0.036639,-0.003112,-0.003174
75%,0.732864,0.281079,0.39654,0.686308,0.367724,0.514249,0.662231,0.396325,0.411337,0.391512,...,0.148362,0.177494,0.526832,0.814036,0.41215,0.741431,0.548513,0.565731,0.230904,0.236506
max,10.126276,3.792276,7.105553,2.839183,1.989256,4.942531,3.024711,4.564826,5.125005,3.56672,...,26.464845,11.418278,2.162059,22.826057,7.067198,8.137816,5.019421,2.156156,18.764396,22.334524


In [20]:
# Load the data
X_train = pd.read_csv('data_split/X_train.csv', index_col=0)
X_test = pd.read_csv('data_split/X_test.csv', index_col=0)

# List of columns to drop (combine all dropped columns from each step)
dropped_columns = set(dropped_corr + dropped_var + dropped_gran + dropped_entropy + dropped_corr_target + dropped_mi)
if 'dropped_vif' in locals():
    dropped_columns.update(dropped_vif)

# Drop the columns from both train and test sets
X_train_dropped = X_train.drop(columns=dropped_columns, errors='ignore')
X_test_dropped = X_test.drop(columns=dropped_columns, errors='ignore')

# Save the modified DataFrames back to CSV
X_train_dropped.to_csv('data_split/X_train.csv')
X_test_dropped.to_csv('data_split/X_test.csv')

print(f"Dropped columns: {dropped_columns}")

Dropped columns: {'PYPL_VTXP_14', 'GOOG_KCLe_80_6.0', 'MDLZ_KCUe_40_3.0', 'GOOGL_ICS_52', 'GOOGL_SMA_50', 'NVDA_ATRr_7', 'TXN_DMN_14', 'PEP_BBB_40_2.0', 'AVGO_IKS_26', 'COST_KCUe_20_1.5', 'PYPL_HMA_100', 'NVDA_WMA_50', 'QCOM_LDECAY_5', 'TXN_MACDh_24_52_18', 'META_STOCHk_56_12_3', 'ADBE_CCI_40_0.015', 'GOOGL_CCI_40_0.015', 'CSCO_PSARs_0.02_0.2', 'META_VTXM_28', 'TSLA_PSARl_0.02_0.2', 'TXN_ITS_18', 'BKNG_ADX_28', 'TSLA_SQZ_NO', 'QCOM_STOCHk_28_6_3', 'TXN_NVI_510', 'AMGN_ICS_52', 'MDLZ_VTXM_56', 'AAPL_EMA_50', 'GILD_STOCHd_56_12_3', 'ADBE_ZS_20', 'ADBE_DCL_20_20', 'AMGN_SUPERTl_7_2.0', 'CMCSA_IKS_26', 'INTC_RVI_28', 'ADBE_ISB_26', 'MSFT_SUPERTs_14_4.0', 'GOOGL_ICS_26', 'MSFT_KAMA_20_2_30', 'INTC_KAMA_5_2_30', 'PEP_SMA_200', 'PYPL_VTXM_14', 'AVGO_DCM_20_20', 'SBUX_IKS_104', 'NVDA_CG_5', 'QQQ_PSARs_0.02_0.2', 'SBUX_SMA_200', 'ISRG_MACD_12_26_9', 'CMCSA_RVI_14', 'AMZN_KCLe_80_6.0', 'AVGO_SUPERTl_28_8.0', 'META_IKS_26', 'NFLX_FISHERT_36_1', 'AVGO_RVI_7', 'COST_ADOSC_5_17', 'ISRG_BBU_40_2.0', 

In [23]:
output_dir = 'data_split/'

# Load all data
X_train_full = pd.read_csv(os.path.join(output_dir, 'X_train.csv'), index_col=0, parse_dates=True)
X_test_full = pd.read_csv(os.path.join(output_dir, 'X_test.csv'), index_col=0, parse_dates=True)
y_train_full = pd.read_csv(os.path.join(output_dir, 'y_train.csv'), index_col=0, parse_dates=True).squeeze()
y_test_full = pd.read_csv(os.path.join(output_dir, 'y_test.csv'), index_col=0, parse_dates=True).squeeze()


In [24]:
# Function for purged k-fold split
def purged_kfold_split(data, n_splits=5, purging_window=timedelta(days=1)):
    kf = KFold(n_splits=n_splits)
    indices = np.arange(len(data))
    dates = data.index

    for train_index, test_index in kf.split(indices):
        test_start_date = dates[test_index[0]]
        test_end_date = dates[test_index[-1]]

        purge_start_date = test_start_date - purging_window
        purge_end_date = test_end_date + purging_window

        train_mask = (dates[train_index] < purge_start_date) | (dates[train_index] > purge_end_date)

        purged_train_index = train_index[train_mask]

        yield purged_train_index, test_index


In [25]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
def get_random_forest_regressor(
    n_estimators=100,
    criterion='squared_error',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',  # Use a valid option
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=None,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None,
    **kwargs
):
    return RandomForestRegressor(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start,
        ccp_alpha=ccp_alpha,
        max_samples=max_samples,
        **kwargs
    )

In [27]:
def compute_mda_importance(X_train_full, y_train_full, X_test_full, y_test_full, purged_kfold_split, n_splits=5, n_repeats=10, n_jobs=-1):
    feature_importances = np.zeros(X_train_full.shape[1])
    fold_count = 0

    # Accumulate permutation importances over folds
    for train_index, val_index in purged_kfold_split(X_train_full, n_splits=n_splits):
        X_train = X_train_full.iloc[train_index]
        y_train = y_train_full.iloc[train_index]
        X_val = X_train_full.iloc[val_index]
        y_val = y_train_full.iloc[val_index]

        model = get_random_forest_regressor()
        model.fit(X_train, y_train)

        perm_result = permutation_importance(
            model,
            X_val,
            y_val,
            scoring='neg_mean_squared_error',
            n_repeats=n_repeats,
            n_jobs=n_jobs
        )
        feature_importances += perm_result.importances_mean
        fold_count += 1

    feature_importances /= fold_count
    mda_ranking = pd.Series(feature_importances, index=X_train_full.columns).sort_values(ascending=False)

    # Train model on full data
    model = get_random_forest_regressor()
    model.fit(X_train_full, y_train_full)

    # Walk-forward prediction
    preds = []
    trues = []
    for i in range(len(X_test_full)):
        X_test_row = X_test_full.iloc[[i]]
        y_test_row = y_test_full.iloc[i]
        pred = model.predict(X_test_row)[0]
        preds.append(pred)
        trues.append(y_test_row)

    mse = mean_squared_error(trues, preds)

    return mse, mda_ranking


In [28]:
def boruta_mda(
    X_train, y_train, X_test, y_test,
    purged_kfold_split,
    min_features=10,
    drop_fraction=0.1,
    n_splits=5,
    n_repeats=10,
    n_jobs=-1,
    verbose=True
):
    """
    Boruta-style feature selection using MDA as the optimization metric.
    """
    features = list(X_train.columns)
    best_mda_ranking = None
    best_features = features.copy()
    history = []

    while len(features) > min_features:
        # Compute MDA importance
        mse, mda_ranking = compute_mda_importance(
            X_train[features], y_train, X_test[features], y_test,
            purged_kfold_split, n_splits=n_splits, n_repeats=n_repeats, n_jobs=n_jobs
        )
        history.append((len(features), mda_ranking))
        if verbose:
            print(f"Features: {len(features)}, MSE: {mse:.6f}")
            print("MDA Scores:")
            print(mda_ranking)

        # If this is the best so far, save the features and MDA ranking
        if best_mda_ranking is None or mda_ranking.sum() > best_mda_ranking.sum():
            best_mda_ranking = mda_ranking
            best_features = features.copy()

        # Drop the lowest MDA features
        n_drop = max(1, int(len(features) * drop_fraction))
        to_drop = mda_ranking.index[-n_drop:]
        features = [f for f in features if f not in to_drop]

        # Stopping criterion: if only min_features left, break
        if len(features) <= min_features:
            break

    # Final run with best features
    if verbose:
        print(f"Best feature set: {len(best_features)} features")
        print("Best MDA Scores:")
        print(best_mda_ranking)
    return best_features, best_mda_ranking, history

In [29]:
selected_features, best_mse, history = boruta_mda(
    X_train_full, y_train_full, X_test_full, y_test_full,
    purged_kfold_split,
    min_features=20, drop_fraction=0.1, n_splits=5, n_repeats=10, n_jobs=-1
)
print("Selected features:", selected_features)

Features: 270, MSE: 927.886421
MDA Scores:
QQQ_MACD_24_52_18         1.913115
QQQ_ROC_60                1.084496
QQQ_VTXM_56               0.559835
QQQ_ENTP_40               0.458523
AAPL_MACD_24_52_18        0.402915
                            ...   
CMCSA_EOM_56_100000000   -0.042652
TXN_DMP_56               -0.054813
AVGO_ROC_60              -0.056448
MSFT_EOM_56_100000000    -0.064995
MSFT_MACD_24_52_18       -0.204496
Length: 270, dtype: float64
Features: 243, MSE: 953.598487
MDA Scores:
QQQ_MACD_24_52_18     1.573067
QQQ_ROC_60            1.017671
QQQ_VTXM_56           0.828892
AAPL_MACD_24_52_18    0.626431
QQQ_ENTP_40           0.468888
                        ...   
INTC_BBB_80_4.0      -0.023051
CMCSA_BBB_20_1.0     -0.023174
NVDA_CMF_60          -0.026862
PEP_ROC_60           -0.030024
CMCSA_ROC_60         -0.048115
Length: 243, dtype: float64
Features: 219, MSE: 916.380759
MDA Scores:
QQQ_MACD_24_52_18        1.775439
QQQ_ROC_60               1.151043
QQQ_VTXM_56          

In [31]:
# Load the data
X_train = pd.read_csv('data_split/X_train.csv', index_col=0)
X_test = pd.read_csv('data_split/X_test.csv', index_col=0)

# Convert the set to a list
selected_features_list = list(selected_features)

# Use the list to index the DataFrame
X_train_selected = X_train[selected_features_list]
X_test_selected = X_test[selected_features_list]

# Override the original CSV files with the filtered DataFrames
X_train_selected.to_csv('data_split/X_train.csv')
X_test_selected.to_csv('data_split/X_test.csv')

print(f"Selected features: {selected_features}")

Selected features: ['QQQ_MACD_24_52_18', 'QQQ_ADX_28', 'QQQ_DMP_28', 'QQQ_DMN_28', 'QQQ_ADX_56', 'QQQ_ROC_30', 'QQQ_ROC_60', 'QQQ_BBB_80_4.0', 'QQQ_EOM_56_100000000', 'QQQ_VTXM_56', 'AAPL_MACD_24_52_18', 'MSFT_ADX_56', 'AMZN_ADX_56', 'AVGO_CCI_20_0.015', 'ADBE_MACD_24_52_18', 'ADBE_ROC_60', 'CSCO_EOM_28_100000000', 'CSCO_EOM_14_100000000', 'TXN_MACD_24_52_18', 'ISRG_DMN_28', 'ISRG_ROC_60', 'ISRG_EOM_56_100000000']


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from datetime import timedelta
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from prophet import Prophet
import warnings
warnings.filterwarnings('ignore')

# Purged K-Fold implementation (López de Prado method)
def purged_kfold_split(data, n_splits=5, purging_window=timedelta(days=1)):
    """
    Implementation of Purged K-fold cross-validation following Marcos López de Prado's
    methodology to prevent forward-looking bias in financial time series.

    Parameters:
    -----------
    data : DataFrame with DatetimeIndex
        The data to split
    n_splits : int, default=5
        Number of folds
    purging_window : timedelta, default=timedelta(days=1)
        Window to purge around test set

    Returns:
    --------
    Generator of train_indices, test_indices for each fold
    """
    kf = KFold(n_splits=n_splits)
    indices = np.arange(len(data))
    dates = data.index

    for train_index, test_index in kf.split(indices):
        test_start_date = dates[test_index[0]]
        test_end_date = dates[test_index[-1]]

        purge_start_date = test_start_date - purging_window
        purge_end_date = test_end_date + purging_window

        train_mask = (dates[train_index] < purge_start_date) | (dates[train_index] > purge_end_date)

        purged_train_index = train_index[train_mask]

        yield purged_train_index, test_index

# Walk-Forward Validation for final model evaluation
class WalkForwardValidator:
    def __init__(self, window_size=252, test_period=21):
        self.window_size = window_size
        self.test_period = test_period

    def split(self, X, y):
        """Generate walk-forward splits with daily testing and monthly retraining"""
        total_size = len(X)
        start_idx = 0

        while start_idx + self.window_size + self.test_period <= total_size:
            train_end = start_idx + self.window_size
            test_end = train_end + self.test_period

            # Yield daily test sets within the test period
            for test_start in range(train_end, test_end):
                yield (
                    X.iloc[start_idx:train_end],
                    y.iloc[start_idx:train_end],
                    X.iloc[test_start:test_start + 1],
                    y.iloc[test_start:test_start + 1]
                )

            # Move the training window forward by the test period
            start_idx += self.test_period

def create_objective(model_type):
    def objective(trial):
        # Load data (already scaled)
        X = pd.read_csv('data_split/X_train.csv', index_col=0, parse_dates=True)
        y = pd.read_csv('data_split/y_train.csv', index_col=0, parse_dates=True).squeeze()

        # Define parameters only for the model being optimized
        if model_type == 'xgboost':
            params = {
                'objective': 'reg:squarederror',
                'n_estimators': trial.suggest_int('n_estimators', 100, 350),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'subsample': trial.suggest_float('subsample', 0.5, 0.8),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.7),
                'gamma': trial.suggest_float('gamma', 0, 5),
                'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0, 2.0),
                'random_state': 42
            }

            def fit_predict(X_train, y_train, X_test):
                model = xgb.XGBRegressor(**params)
                model.fit(X_train, y_train)
                return model.predict(X_test)

        elif model_type == 'svm':
            kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
            params = {
                'C': trial.suggest_float('C', 0.1, 10.0, log=True),
                'epsilon': trial.suggest_float('epsilon', 0.01, 0.5),
                'kernel': kernel,
                'max_iter': 1000
            }
            if kernel == 'rbf':
                params['gamma'] = trial.suggest_float('gamma', 1e-3, 1.0, log=True)

            import logging

            logging.basicConfig(level=logging.INFO)

            def fit_predict(X_train, y_train, X_test):
                try:
                    logging.info(f"Starting SVM trial with params: {params}")
                    y_train_1d = np.ravel(y_train)
                    model = SVR(**params)
                    model.fit(X_train, y_train_1d)
                    return model.predict(X_test)
                except Exception as e:
                    print(f'SVM error: {e}')
                    return np.ones(len(X_test)) * y_train.mean()

        elif model_type == 'lstm':
            params = {
                'hidden_size': trial.suggest_int('hidden_size', 64, 1024),
                'num_layers': trial.suggest_int('num_layers', 2, 4),
                'dropout': trial.suggest_float('dropout', 0.1, 0.5),
                'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
                'batch_size': 64,
                'epochs': 50
            }

            def fit_predict(X_train, y_train, X_test):
                try:
                    # Convert to tensors
                    X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
                    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
                    X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

                    # Simple LSTM model
                    model = nn.LSTM(
                        input_size=X_train.shape[1],
                        hidden_size=params['hidden_size'],
                        num_layers=params['num_layers'],
                        dropout=params['dropout'],
                        batch_first=True
                    )

                    fc = nn.Linear(params['hidden_size'], 1)
                    optimizer = torch.optim.Adam([*model.parameters(), *fc.parameters()],
                                              lr=params['learning_rate'])

                    # Train
                    model.train()
                    for _ in range(params['epochs']):
                        X_batch = X_train_tensor.unsqueeze(1)
                        outputs, _ = model(X_batch)
                        predictions = fc(outputs[:, -1, :])
                        loss = nn.MSELoss()(predictions, y_train_tensor)
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    # Predict
                    model.eval()
                    with torch.no_grad():
                        X_test_batch = X_test_tensor.unsqueeze(1)
                        test_outputs, _ = model(X_test_batch)
                        predictions = fc(test_outputs[:, -1, :])
                        return predictions.numpy().flatten()
                except Exception as e:
                    print(f"LSTM error: {e}")
                    return np.ones(len(X_test)) * y_train.mean()

        elif model_type == 'arima_garch':
            params = {
                'p' : trial.suggest_int('p', 0, 8),
                'd' : trial.suggest_int('d', 0, 2),
                'q' : trial.suggest_int('q', 0, 8),
                'use_garch' : True
            }

            def fit_predict(X_train, y_train, X_test):
                try:
                    model = ARIMA(y_train, order=(params['p'], params['d'], params['q']))
                    results = model.fit()
                    return results.forecast(steps=len(X_test))
                except Exception as e:
                    print(f"ARIMA error: {e}")
                    return np.ones(len(X_test)) * y_train.mean()

        # Using purged k-fold for hyperparameter optimization
        # This follows López de Prado's approach for preventing leakage
        purging_window = timedelta(days=1)  # 1 trading day
        errors = []

        try:
            # Apply purged k-fold cross-validation
            for train_idx, test_idx in purged_kfold_split(X, n_splits=5, purging_window=purging_window):
                X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
                y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

                # Skip if not enough data
                if len(X_train_fold) < 50 or len(X_test_fold) < 5:
                    continue

                # Apply model
                y_pred = fit_predict(X_train_fold, y_train_fold, X_test_fold)

                # Calculate error
                error = mean_squared_error(y_test_fold, y_pred)
                errors.append(error)

            # Return mean error across all folds
            if not errors:
                return float('inf')
            return np.mean(errors)

        except Exception as e:
            print(f"Error in purged k-fold: {e}")
            return float('inf')

    return objective

# Function to evaluate final model performance using walk-forward validation
def evaluate_model(model_type, best_params):
    print(f"\n=== Evaluating {model_type.upper()} with Walk-Forward Validation ===")

    # Load data
    X = pd.read_csv('data_split/X_train.csv', index_col=0, parse_dates=True)
    y = pd.read_csv('data_split/y_train.csv', index_col=0, parse_dates=True).squeeze()

    # Walk-forward validation
    validator = WalkForwardValidator(window_size=252, test_period=21)
    errors = []
    predictions = []

    print("Starting walk-forward validation...")

    # Create model with best parameters
    if model_type == 'xgboost':
        params = {**best_params, 'objective': 'reg:squarederror', 'random_state': 42}

        for X_train, y_train, X_test, y_test in validator.split(X, y):
            model = xgb.XGBRegressor(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            error = mean_squared_error(y_test, y_pred)
            errors.append(error)
            predictions.extend(list(zip(y_test.index, y_test, y_pred)))

    elif model_type == 'svm':
        params = best_params

        for X_train, y_train, X_test, y_test in validator.split(X, y):
            model = SVR(**params)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            error = mean_squared_error(y_test, y_pred)
            errors.append(error)
            predictions.extend(list(zip(y_test.index, y_test, y_pred)))

    elif model_type == 'lstm':
        params = best_params

        for X_train, y_train, X_test, y_test in validator.split(X, y):
            try:
                # Convert to tensors
                X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
                y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
                X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

                # Simple LSTM model
                model = nn.LSTM(
                    input_size=X_train.shape[1],
                    hidden_size=params['hidden_size'],
                    num_layers=params['num_layers'],
                    dropout=params['dropout'],
                    batch_first=True
                )

                fc = nn.Linear(params['hidden_size'], 1)
                optimizer = torch.optim.Adam([*model.parameters(), *fc.parameters()],
                                          lr=params['learning_rate'])

                # Train
                model.train()
                for _ in range(params['epochs']):
                    X_batch = X_train_tensor.unsqueeze(1)
                    outputs, _ = model(X_batch)
                    predictions = fc(outputs[:, -1, :])
                    loss = nn.MSELoss()(predictions, y_train_tensor)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                # Predict
                model.eval()
                with torch.no_grad():
                    X_test_batch = X_test_tensor.unsqueeze(1)
                    test_outputs, _ = model(X_test_batch)
                    y_pred = fc(test_outputs[:, -1, :]).numpy().flatten()

                error = mean_squared_error(y_test, y_pred)
                errors.append(error)
                predictions.extend(list(zip(y_test.index, y_test, y_pred)))
            except Exception as e:
                print(f"LSTM error during evaluation: {e}")

    elif model_type == 'arima_garch':
        params = best_params

        for X_train, y_train, X_test, y_test in validator.split(X, y):
            try:
                model = ARIMA(y_train, order=(params['p'], params['d'], params['q']))
                results = model.fit()
                y_pred = results.forecast(steps=len(X_test))

                error = mean_squared_error(y_test, y_pred)
                errors.append(error)
                predictions.extend(list(zip(y_test.index, y_test, y_pred)))
            except Exception as e:
                print(f"ARIMA error during evaluation: {e}")

    # Store predictions for analysis
    if len(predictions) > 0:
        pred_df = pd.DataFrame(predictions, columns=['date', 'actual', 'predicted'])
        pred_df.to_csv(f'predictions_{model_type}_walkforward.csv', index=False)

    avg_error = np.mean(errors) if errors else float('inf')
    print(f"Average MSE: {avg_error:.6f}")
    return avg_error

# Run optimization
models = [ 'xgboost', 'lstm',  'svm', 'arima_garch',]
best_models = {}

for model_type in models:
    print(f"\nOptimizing {model_type.upper()} using purged k-fold cross-validation...")
    study = optuna.create_study(direction='minimize')
    study.optimize(create_objective(model_type), n_trials=150)

    best_models[model_type] = {
        'value': study.best_trial.value,
        'params': study.best_trial.params
    }

    print(f"Best {model_type} trial:")
    print(f"  MSE: {study.best_trial.value:.6f}")
    print("  Params:", study.best_trial.params)

# Compare models using k-fold results
print("\n=== MODEL COMPARISON (Purged K-Fold) ===")
for model_type, results in sorted(best_models.items(), key=lambda x: x[1]['value']):
    print(f"{model_type.upper()}: MSE = {results['value']:.6f}")

# Evaluate best models with walk-forward validation
print("\n=== FINAL WALK-FORWARD EVALUATION ===")
walk_forward_results = {}

for model_type, results in best_models.items():
    wf_error = evaluate_model(model_type, results['params'])
    walk_forward_results[model_type] = wf_error

# Final comparison
print("\n=== FINAL MODEL COMPARISON (Walk-Forward) ===")
for model_type, error in sorted(walk_forward_results.items(), key=lambda x: x[1]):
    print(f"{model_type.upper()}: MSE = {error:.6f}")

[I 2025-05-27 16:14:22,489] A new study created in memory with name: no-name-43345fa8-34f8-4298-970a-ad8f2959fef1



Optimizing XGBOOST using purged k-fold cross-validation...


[I 2025-05-27 16:14:25,964] Trial 0 finished with value: 24.318708557504813 and parameters: {'n_estimators': 251, 'learning_rate': 0.07182164579919523, 'max_depth': 11, 'subsample': 0.7325760047893526, 'min_child_weight': 4, 'gamma': 1.0609726605211716, 'reg_alpha': 0.5289386949818147, 'reg_lambda': 1.513845794644027}. Best is trial 0 with value: 24.318708557504813.
[I 2025-05-27 16:14:31,242] Trial 1 finished with value: 21.933543648872437 and parameters: {'n_estimators': 322, 'learning_rate': 0.022673795780329284, 'max_depth': 7, 'subsample': 0.5754446204230412, 'min_child_weight': 10, 'gamma': 0.4655268529749673, 'reg_alpha': 0.5157226447415877, 'reg_lambda': 1.7564270482012496}. Best is trial 1 with value: 21.933543648872437.
[I 2025-05-27 16:14:31,821] Trial 2 finished with value: 21.065766605083933 and parameters: {'n_estimators': 137, 'learning_rate': 0.09524252360781085, 'max_depth': 3, 'subsample': 0.5645122771492764, 'min_child_weight': 1, 'gamma': 2.0606859738253522, 'reg_al

Best xgboost trial:
  MSE: 20.276290
  Params: {'n_estimators': 312, 'learning_rate': 0.031620261698627444, 'max_depth': 4, 'subsample': 0.6604581561396677, 'min_child_weight': 13, 'gamma': 1.4560635237102806, 'reg_alpha': 0.7789262479335402, 'reg_lambda': 0.11770408407457993}

Optimizing LSTM using purged k-fold cross-validation...


[I 2025-05-27 16:24:45,158] Trial 0 finished with value: 54.51683842907907 and parameters: {'hidden_size': 905, 'num_layers': 2, 'dropout': 0.34660212691577763, 'learning_rate': 0.00026335510592872966}. Best is trial 0 with value: 54.51683842907907.
[I 2025-05-27 16:25:32,977] Trial 1 finished with value: 29.694462953232676 and parameters: {'hidden_size': 281, 'num_layers': 2, 'dropout': 0.22301331248343717, 'learning_rate': 0.003394169285444056}. Best is trial 1 with value: 29.694462953232676.
[I 2025-05-27 16:28:08,753] Trial 2 finished with value: 51.848265704299976 and parameters: {'hidden_size': 374, 'num_layers': 4, 'dropout': 0.29767311760252235, 'learning_rate': 0.0006183981533581072}. Best is trial 1 with value: 29.694462953232676.
[I 2025-05-27 16:33:44,001] Trial 3 finished with value: 59.59560050511122 and parameters: {'hidden_size': 726, 'num_layers': 3, 'dropout': 0.10114142067874701, 'learning_rate': 0.0001265186034493552}. Best is trial 1 with value: 29.694462953232676.

In [None]:
# import pandas as pd
# import numpy as np
# import xgboost as xgb
# import optuna
# from datetime import timedelta
# from sklearn.metrics import mean_squared_error
# from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
# from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.ensemble import HistGradientBoostingRegressor

# # Define a function to create an objective function for each model type
# def create_objective(model_type):
#     def objective(trial):
#         X = pd.read_csv('data_split/X_train.csv', index_col=0, parse_dates=True)
#         y = pd.read_csv('data_split/y_train.csv', index_col=0, parse_dates=True).squeeze() # Use squeeze to get a Series

#         # Define parameters based on model type
#         if model_type == 'xgboost':
#             params = {
#                 'objective': 'reg:squarederror',
#                 'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#                 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
#                 'max_depth': trial.suggest_int('max_depth', 5, 15),
#                 'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#                 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),
#                 'gamma': trial.suggest_float('gamma', 0, 0.5),
#                 'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#                 'random_state': 42
#             }
#             model_class = xgb.XGBRegressor

#         elif model_type == 'random_forest':
#             params = {
#                 'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#                 'max_depth': trial.suggest_int('max_depth', 5, 30),
#                 'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
#                 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
#                 'max_features': trial.suggest_float('max_features', 0.3, 1.0),
#                 'random_state': 42
#             }
#             model_class = RandomForestRegressor

#         elif model_type == 'extra_trees':
#             params = {
#                 'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#                 'max_depth': trial.suggest_int('max_depth', 5, 30),
#                 'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
#                 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
#                 'max_features': trial.suggest_float('max_features', 0.3, 1.0),
#                 'random_state': 42
#             }
#             model_class = ExtraTreesRegressor

#         elif model_type == 'hist_gradient_boosting':
#             params = {
#                 'max_iter': trial.suggest_int('max_iter', 100, 1000),
#                 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
#                 'max_depth': trial.suggest_int('max_depth', 5, 15),
#                 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
#                 'l2_regularization': trial.suggest_float('l2_regularization', 0, 10),
#                 'random_state': 42
#             }
#             model_class = HistGradientBoostingRegressor

#         n_splits = 3
#         purging_window = timedelta(days=5) # Adjust as needed
#         fold_errors = []

#         # Perform purged k-fold cross-validation
#         for train_indices, test_indices in purged_kfold_split(X, n_splits=n_splits, purging_window=purging_window):
#             X_train_fold, X_test_fold = X.iloc[train_indices], X.iloc[test_indices]
#             y_train_fold, y_test_fold = y.iloc[train_indices], y.iloc[test_indices]

#             model = model_class(**params)
#             model.fit(X_train_fold, y_train_fold)

#             y_pred_fold = model.predict(X_test_fold)
#             mse = mean_squared_error(y_test_fold, y_pred_fold)
#             fold_errors.append(mse)

#         return np.mean(fold_errors)

#     return objective

# # Dictionary to store the best trials for each model
# best_models = {}

# # List of models to evaluate
# models = ['xgboost', 'random_forest', 'extra_trees', 'hist_gradient_boosting']

# # Run optimization for each model
# for model_type in models:
#     print(f"\n\nOptimizing {model_type.upper()}...")
#     study = optuna.create_study(direction='minimize', study_name=model_type)
#     study.optimize(create_objective(model_type), n_trials=150)

#     print(f"Best {model_type} trial:")
#     print(f"  Value: {study.best_trial.value}")
#     print("  Params: ")
#     for key, value in study.best_trial.params.items():
#         print(f"    {key}: {value}")

#     # Store the best trial
#     best_models[model_type] = {
#         'value': study.best_trial.value,
#         'params': study.best_trial.params
#     }

# # Compare all models
# print("\n\n=== MODEL COMPARISON ===")
# for model_type, results in sorted(best_models.items(), key=lambda x: x[1]['value']):
#     print(f"{model_type.upper()}: MSE = {results['value']:.6f}")

# # Identify the best overall model
# best_model = min(best_models.items(), key=lambda x: x[1]['value'])
# print(f"\nBest overall model: {best_model[0].upper()} with MSE = {best_model[1]['value']:.6f}")