In [3]:
# 07_advanced_tuning.ipynb (or as a .py script)
# ----------------------------------------------------------------------
# PURPOSE:
#   1) Load {coin}_features.csv for each coin (which you already prepared).
#   2) Create multi-horizon targets (Close_t+1, +7, +30, +90).
#   3) Time-based split into train/test.
#   4) Hyperparam tuning for RandomForest & XGBoost (via RandomizedSearchCV).
#   5) Pick whichever model yields lowest RMSE => save as .pkl.
#   6) Save overall results to a CSV, plus a JSON listing final feature columns.
# ----------------------------------------------------------------------

import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import dump

# ---------------- CONFIG --------------------
DATA_FOLDER   = "../data"   # where your {coin}_features.csv are
MODELS_FOLDER = "../models" # folder to save .pkl
OUTPUT_CSV    = os.path.join(DATA_FOLDER, "advanced_tuning_results.csv")
FEATURE_MAP_JSON = os.path.join(MODELS_FOLDER, "final_feature_columns.json")

# Make sure you adjust this list to match the coins you actually have:
ALL_COINS = [
    "ADA","AVAX","BCH","BNB","BTC",
    "DOGE","DOT","ETH","LEO","LINK",
    "LTC","MATIC","NEAR","SHIB","SOL",
    "TON","TRX","UNI","XRP"
]

HORIZONS = [1, 7, 30, 90]
TRAIN_RATIO = 0.80  # 80% train, 20% test

# Number of random-search iterations:
N_ITER_RF  = 20
N_ITER_XGB = 20

# -------------- Helper Functions --------------
def create_multi_horizon_targets(df, horizon_list):
    """
    Creates columns like 'Close_t+1','Close_t+7','Close_t+30','Close_t+90'
    by shifting the 'Close' column up by each horizon.
    """
    for h in horizon_list:
        df[f"Close_t+{h}"] = df["Close"].shift(-h)
    # Drop rows at the end that no longer have targets:
    df.dropna(inplace=True)
    return df

def time_based_split(df, ratio=0.8):
    """
    Splits the DataFrame by time index into train/test by ratio.
    """
    cutoff = int(len(df) * ratio)
    train_df = df.iloc[:cutoff].copy()
    test_df  = df.iloc[cutoff:].copy()
    return train_df, test_df

def tune_random_forest(X_train, y_train, n_iter=20, random_state=42):
    """Random-search hyperparam tuning for RandomForest."""
    rf = RandomForestRegressor(random_state=random_state)
    param_distributions = {
        "n_estimators":      [50, 100, 200, 300, 500],
        "max_depth":         [3, 5, 7, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf":  [1, 2, 4],
    }
    search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring="neg_mean_squared_error",
        cv=3,
        n_jobs=-1,
        random_state=random_state
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

def tune_xgboost(X_train, y_train, n_iter=20, random_state=42):
    """Random-search hyperparam tuning for XGBoost."""
    xgb = XGBRegressor(
        objective="reg:squarederror",
        random_state=random_state,
        n_jobs=-1
    )
    param_distributions = {
        "n_estimators":     [50, 100, 200, 300, 500],
        "max_depth":        [3, 5, 7, 10],
        "learning_rate":    [0.01, 0.05, 0.1, 0.2],
        "subsample":        [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0]
    }
    search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring="neg_mean_squared_error",
        cv=3,
        n_jobs=-1,
        random_state=random_state
    )
    search.fit(X_train, y_train)
    return search.best_estimator_

def main():
    if not os.path.exists(MODELS_FOLDER):
        os.makedirs(MODELS_FOLDER)
    
    all_results = []
    feature_map = {}  # will store { (coin, horizon_col) : [feature_cols,...] }

    for coin in ALL_COINS:
        print(f"\n=== Hyperparam Tuning for {coin} ===")
        features_file = os.path.join(DATA_FOLDER, f"{coin}_features.csv")
        if not os.path.exists(features_file):
            print(f"  [Skipping] CSV not found: {features_file}")
            continue

        # 1) Load features
        df = pd.read_csv(features_file, parse_dates=["Date"])
        df.sort_values("Date", inplace=True)
        df.set_index("Date", inplace=True)

        # 2) Create multi-horizon targets
        df = create_multi_horizon_targets(df, HORIZONS)
        if len(df) < 50:
            print(f"  [Skipping] Not enough rows (<50) after shift for {coin}.")
            continue
        
        # 3) Train/test split
        train_df, test_df = time_based_split(df, TRAIN_RATIO)
        if len(test_df) < 10:
            print(f"  [Skipping] Not enough test rows (<10) for {coin}.")
            continue
        
        # Identify which columns are numeric but not horizon columns
        horizon_cols = [c for c in df.columns if c.startswith("Close_t+")]
        # Exclude any non-numeric or these target columns
        exclude_list = set(horizon_cols + ["Name","Symbol","SNo","High","Low","Open","Volume","Marketcap"])
        feature_cols = [
            c for c in df.columns
            if (c not in exclude_list) and pd.api.types.is_numeric_dtype(df[c])
        ]
        
        # For each horizon
        for horizon_col in horizon_cols:
            # Make sure we have enough data not only in train but also in test
            tmp_train = train_df.dropna(subset=feature_cols + [horizon_col])
            tmp_test  = test_df.dropna(subset=feature_cols + [horizon_col])
            if (len(tmp_train) < 40) or (len(tmp_test) < 10):
                print(f"  {coin}/{horizon_col} => Not enough data after dropna. Skipping.")
                continue
            
            X_train = tmp_train[feature_cols]
            y_train = tmp_train[horizon_col]
            X_test  = tmp_test[feature_cols]
            y_test  = tmp_test[horizon_col]
            
            # 4) Tune RF
            print(f"  {coin}/{horizon_col} => Tuning RandomForest ...")
            best_rf  = tune_random_forest(X_train, y_train, n_iter=N_ITER_RF)
            # 5) Tune XGB
            print(f"  {coin}/{horizon_col} => Tuning XGBoost ...")
            best_xgb = tune_xgboost(X_train, y_train, n_iter=N_ITER_XGB)
            
            # Evaluate
            rf_pred  = best_rf.predict(X_test)
            xgb_pred = best_xgb.predict(X_test)
            
            rf_rmse  = np.sqrt(mean_squared_error(y_test, rf_pred))
            rf_mae   = mean_absolute_error(y_test, rf_pred)
            xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_pred))
            xgb_mae  = mean_absolute_error(y_test, xgb_pred)
            
            # Decide best
            if xgb_rmse < rf_rmse:
                best_model = best_xgb
                best_model_type = "XGB"
                best_rmse = xgb_rmse
            else:
                best_model = best_rf
                best_model_type = "RF"
                best_rmse = rf_rmse
            
            # Save best model
            horizon_sanitized = horizon_col.replace("+","_plus_")
            model_filename = f"{coin}_{horizon_sanitized}_{best_model_type}.pkl"
            model_path     = os.path.join(MODELS_FOLDER, model_filename)
            dump(best_model, model_path)
            
            print(f"    => Best: {best_model_type} (RMSE={best_rmse:.3f}). Saved => {model_path}")
            
            all_results.append({
                "Coin"      : coin,
                "Horizon"   : horizon_col,
                "RF_RMSE"   : rf_rmse,
                "RF_MAE"    : rf_mae,
                "XGB_RMSE"  : xgb_rmse,
                "XGB_MAE"   : xgb_mae,
                "BestModel" : best_model_type,
                "BestRMSE"  : best_rmse
            })
            
            # record feature columns for later usage (fetch_and_predict)
            feature_map_key = (coin, horizon_col)
            feature_map[feature_map_key] = feature_cols

    # Wrap up
    if len(all_results) == 0:
        print("\n[Info] No valid data found for any coins. No models have been saved.")
        return
    
    # Create a results DataFrame
    results_df = pd.DataFrame(all_results)
    results_df.sort_values(["Coin","Horizon"], inplace=True)
    results_df.to_csv(OUTPUT_CSV, index=False)
    print(f"\n[Info] Saved advanced tuning results => {OUTPUT_CSV}")
    
    # Save feature map as JSON
    fm_dict = {}
    for (coin,horizon_col), feats in feature_map.items():
        fm_dict[f"{coin}__{horizon_col}"] = feats
    
    with open(FEATURE_MAP_JSON, "w") as f:
        json.dump(fm_dict, f, indent=2)
    print(f"[Info] Saved final feature columns => {FEATURE_MAP_JSON}")


if __name__ == "__main__":
    main()



=== Hyperparam Tuning for ADA ===
  ADA/Close_t+1 => Tuning RandomForest ...
  ADA/Close_t+1 => Tuning XGBoost ...
    => Best: RF (RMSE=0.026). Saved => ../models\ADA_Close_t_plus_1_RF.pkl
  ADA/Close_t+7 => Tuning RandomForest ...
  ADA/Close_t+7 => Tuning XGBoost ...
    => Best: XGB (RMSE=0.075). Saved => ../models\ADA_Close_t_plus_7_XGB.pkl
  ADA/Close_t+30 => Tuning RandomForest ...
  ADA/Close_t+30 => Tuning XGBoost ...
    => Best: RF (RMSE=0.194). Saved => ../models\ADA_Close_t_plus_30_RF.pkl
  ADA/Close_t+90 => Tuning RandomForest ...
  ADA/Close_t+90 => Tuning XGBoost ...
    => Best: XGB (RMSE=0.268). Saved => ../models\ADA_Close_t_plus_90_XGB.pkl

=== Hyperparam Tuning for AVAX ===
  AVAX/Close_t+1 => Tuning RandomForest ...
  AVAX/Close_t+1 => Tuning XGBoost ...
    => Best: RF (RMSE=2.339). Saved => ../models\AVAX_Close_t_plus_1_RF.pkl
  AVAX/Close_t+7 => Tuning RandomForest ...
  AVAX/Close_t+7 => Tuning XGBoost ...
    => Best: RF (RMSE=5.922). Saved => ../models\AVAX