In [8]:
# 05_multihorizon_advanced_ratio.ipynb

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For advanced ML
from sklearn.ensemble import RandomForestRegressor
try:
    from xgboost import XGBRegressor
    xgb_available = True
except ImportError:
    xgb_available = False

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Apply seaborn's theme to subsequent plots; font_scale=1.1 scales the font
# elements slightly above the default size.
sns.set_theme(font_scale=1.1)

# Folder (relative to this notebook) that holds the per-coin train/test CSVs
# and receives the results CSV written at the end of the cell.
DATA_FOLDER = '../data'

# Coins to evaluate. Each name is lower-cased to build the CSV file names
# "<coin>_train_multi.csv" / "<coin>_test_multi.csv".
all_coins = [
    "Aave", "BinanceCoin", "Bitcoin", "Cardano",
    "ChainLink", "Cosmos", "CryptocomCoin", "Dogecoin",
    "EOS", "Ethereum", "Iota", "Litecoin",
    "Monero", "NEM", "Polkadot", "Solana",
    "Stellar", "Tether", "Tron", "Uniswap",
    "USDCoin", "WrappedBitcoin", "XRP",
]

# Target columns, one per forecast horizon (same names as in Notebooks 3 & 4).
horizons = [
    'Close_t+1',
    'Close_t+7',
    'Close_t+30',
    'Close_t+90',
]

# Model input columns, as produced by the feature-engineering step.
feature_cols = [
    'Close',
    'MA_20',
    'BB_upper',
    'BB_lower',
    'RSI_14',
    'MACD',
    'MACD_Signal',
    'MACD_Hist',
    'Stoch_%K',
    'Stoch_%D',
]

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    The value of ``mean_absolute_error(y_true, y_pred)``.
    """
    absolute_error_mean = mean_absolute_error(y_true, y_pred)
    return absolute_error_mean

# Row-count thresholds below which a coin (or a single horizon) is skipped.
MIN_TRAIN_ROWS_RAW = 20     # checked on the training CSV as loaded
MIN_TRAIN_ROWS_CLEAN = 10   # checked after dropping rows with missing values

# Explicit column order for the saved results. Passing it to the DataFrame
# constructor also ensures a header row is written when `results` is empty.
RESULT_COLUMNS = ['Coin', 'Horizon', 'RF_RMSE', 'RF_MAE', 'XGB_RMSE', 'XGB_MAE']

# One dict per (coin, horizon) that was successfully modelled.
results = []

for coin in all_coins:
    print(f"\n=== Advanced Multi-Horizon for {coin} ===")

    # 1. Load the ratio-based train/test from Notebook 3
    train_csv = f"{coin.lower()}_train_multi.csv"
    test_csv  = f"{coin.lower()}_test_multi.csv"
    train_path = os.path.join(DATA_FOLDER, train_csv)
    test_path  = os.path.join(DATA_FOLDER, test_csv)

    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        print(f"[{coin}] Missing train/test CSV, skipping...")
        continue

    train_df = pd.read_csv(train_path, parse_dates=['Date'])
    test_df  = pd.read_csv(test_path,  parse_dates=['Date'])

    # 2. Check we have enough data
    if len(train_df) < MIN_TRAIN_ROWS_RAW:
        print(f"[{coin}] Train set <{MIN_TRAIN_ROWS_RAW} rows, skipping coin.")
        continue
    if len(test_df) == 0:
        print(f"[{coin}] Test set empty, skipping coin.")
        continue

    # 3. Rows with missing features/targets are dropped per horizon below,
    #    because each horizon column has its own pattern of missing values.
    for horizon_col in horizons:
        required_cols = feature_cols + [horizon_col]

        # Skip (rather than crash with KeyError) when either split lacks a
        # feature column or the horizon column.
        missing_cols = [
            col for col in required_cols
            if col not in train_df.columns or col not in test_df.columns
        ]
        if missing_cols:
            print(f"{coin} - {horizon_col}: missing columns {missing_cols}, skipping horizon.")
            continue

        # dropna returns new frames, so train_df/test_df stay intact for the
        # next horizon without an explicit copy.
        train_local = train_df.dropna(subset=required_cols)
        test_local  = test_df.dropna(subset=required_cols)

        # If no data left
        if len(train_local) < MIN_TRAIN_ROWS_CLEAN or len(test_local) < 1:
            print(f"{coin} - {horizon_col}: not enough data after dropna, skipping horizon.")
            continue

        X_train = train_local[feature_cols]
        y_train = train_local[horizon_col]
        X_test  = test_local[feature_cols]
        y_test  = test_local[horizon_col]

        print(f"{coin} - {horizon_col}: Train={len(X_train)}, Test={len(X_test)}")

        # 4. RANDOM FOREST
        rf = RandomForestRegressor(
            n_estimators=100,
            max_depth=5,
            random_state=42
        )
        rf.fit(X_train, y_train)
        rf_pred = rf.predict(X_test)
        rf_rmse_val = rmse(y_test, rf_pred)
        rf_mae_val  = mae(y_test, rf_pred)

        # Three most important RF features, highest first. Scores are cast to
        # plain floats so the printout is not cluttered with np.float64(...).
        rf_importance = rf.feature_importances_
        ranked_feats = sorted(
            zip(feature_cols, rf_importance),
            key=lambda pair: pair[1],
            reverse=True,
        )
        top_feats = [(name, float(score)) for name, score in ranked_feats[:3]]

        # 5. XGBOOST (metrics stay NaN when xgboost is not installed)
        xgb_rmse_val, xgb_mae_val = np.nan, np.nan
        if xgb_available:
            xgb_model = XGBRegressor(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=42
            )
            xgb_model.fit(X_train, y_train)
            xgb_pred = xgb_model.predict(X_test)

            xgb_rmse_val = rmse(y_test, xgb_pred)
            xgb_mae_val  = mae(y_test, xgb_pred)

            # Not used in this cell; kept as a module-level name in case a
            # later cell inspects it (NOTE(review): confirm, else remove).
            xgb_importance = xgb_model.feature_importances_

        print(f"{coin} / {horizon_col} => RF: RMSE={rf_rmse_val:.2f}, MAE={rf_mae_val:.2f}; "
              f"XGB: RMSE={xgb_rmse_val:.2f}, MAE={xgb_mae_val:.2f}")
        print(f"   Top RF feats: {top_feats}")

        results.append({
            'Coin': coin,
            'Horizon': horizon_col,
            'RF_RMSE': rf_rmse_val,
            'RF_MAE': rf_mae_val,
            'XGB_RMSE': xgb_rmse_val,
            'XGB_MAE': xgb_mae_val
        })

# 6. Save all results
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
advanced_csv = os.path.join(DATA_FOLDER, 'advanced_results_multihorizon.csv')
results_df.to_csv(advanced_csv, index=False)
print(f"\n=== Saved multi-horizon advanced results to: {advanced_csv}")
print(results_df.head(30))



=== Advanced Multi-Horizon for Aave ===
Aave - Close_t+1: Train=132, Test=33
Aave / Close_t+1 => RF: RMSE=24.35, MAE=19.67; XGB: RMSE=22.37, MAE=16.66
   Top RF feats: [('Close', np.float64(0.36058163338769184)), ('BB_upper', np.float64(0.34155453384498635)), ('MA_20', np.float64(0.2178436924953899))]
Aave - Close_t+7: Train=132, Test=33
Aave / Close_t+7 => RF: RMSE=35.04, MAE=30.59; XGB: RMSE=28.89, MAE=23.93
   Top RF feats: [('BB_upper', np.float64(0.4773638842599168)), ('MA_20', np.float64(0.30898816371515875)), ('Close', np.float64(0.17079234276870697))]
Aave - Close_t+30: Train=132, Test=33
Aave / Close_t+30 => RF: RMSE=66.66, MAE=51.18; XGB: RMSE=55.86, MAE=44.98
   Top RF feats: [('MA_20', np.float64(0.406297285820723)), ('Close', np.float64(0.2852897544276636)), ('BB_upper', np.float64(0.23327707814065518))]
Aave - Close_t+90: Train=132, Test=33
Aave / Close_t+90 => RF: RMSE=128.77, MAE=114.76; XGB: RMSE=137.71, MAE=123.41
   Top RF feats: [('RSI_14', np.float64(0.25287627408