### Imports & Configuration

In [2]:
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_score, accuracy_score, classification_report

# --- CONFIG ---
# Directory layout, relative to the notebook's working directory.
DATA_DIR, MODELS_DIR = "../data", "../models"

# Make sure the folder for model artifacts exists before anything is saved.
os.makedirs(MODELS_DIR, exist_ok=True)


def _in_data_dir(filename):
    """Return the path of `filename` inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


# Input files: the full labelled dataset for each trade direction.
FILE_BUY = _in_data_dir("Train_Buy_Sniper.parquet")
FILE_SELL = _in_data_dir("Train_Sell_Sniper.parquet")

# Vault files: where the held-out test rows are written.
VAULT_BUY_TEST = _in_data_dir("Vault_Test_Buy.parquet")
VAULT_SELL_TEST = _in_data_dir("Vault_Test_Sell.parquet")

# Model inputs. The list is explicit so that no other column of the
# datasets can slip into the feature matrix (leakage guard).
FEATURES = ['EMA_50_Slope', 'Dist_from_200', 'RSI', 'ATR', 'Rel_Vol', 'hour']

print("‚úÖ Config Loaded. Vault paths established.")

‚úÖ Config Loaded. Vault paths established.


### Load Data

In [3]:
def _load_complete_rows(path):
    """Read a parquet file and keep only the rows that have no missing value."""
    return pd.read_parquet(path).dropna()


# Full datasets, one per trade direction (buy = bull, sell = bear).
df_buy_full = _load_complete_rows(FILE_BUY)
df_sell_full = _load_complete_rows(FILE_SELL)

print(f"üìä Total Bull Rows: {len(df_buy_full)}")
print(f"üìä Total Bear Rows: {len(df_sell_full)}")

üìä Total Bull Rows: 3103
üìä Total Bear Rows: 3115


### The Vault (Split & Hide)

In [4]:
def vault_data(df, save_path, split_pct=0.20):
    """
    Split a frame by row position and write its tail to disk (the "vault").

    The last `split_pct` share of the rows is saved to `save_path` as parquet
    and the leading rows are returned as the training set. Nothing is shuffled
    or sorted here: the split is chronological only if `df` is already ordered
    oldest-to-newest, which this function does not check.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; only `len`, `.iloc` and `.to_parquet` are used.
    save_path : str or path-like
        Destination of the held-out (test) rows. An existing file is overwritten.
    split_pct : float, default 0.20
        Fraction of rows, counted from the end, to hold out. Must lie in [0, 1].

    Returns
    -------
    pandas.DataFrame
        A copy of the leading rows (the training set).

    Raises
    ------
    ValueError
        If `split_pct` is outside [0, 1]. Such values used to be accepted
        silently: they produce a negative split index, so e.g. passing 20
        instead of 0.20 returned an empty training set and vaulted every row.
    """
    if not 0 <= split_pct <= 1:
        raise ValueError(
            f"split_pct must be a fraction between 0 and 1, got {split_pct!r}"
        )

    # Position of the first held-out row (truncates toward zero)
    split_idx = int(len(df) * (1 - split_pct))

    # Positional split; copies keep both parts independent of `df`
    train_df = df.iloc[:split_idx].copy()
    test_df = df.iloc[split_idx:].copy()

    # Save Test Data to Vault
    test_df.to_parquet(save_path)
    print(f"üîí Vaulted {len(test_df)} rows to {save_path}")

    return train_df

# --- EXECUTE VAULTING ---
print("--- üîê SECURING TEST DATA ---")

# Each call writes the tail of one dataset to its vault file and returns
# the head, which becomes the training frame for that direction.
df_buy_train = vault_data(df_buy_full, save_path=VAULT_BUY_TEST)     # buy side
df_sell_train = vault_data(df_sell_full, save_path=VAULT_SELL_TEST)  # sell side

# Report how many rows are left for training
print(
    "\nüìâ Training Data Remaining:",
    f"   Buy Model: {len(df_buy_train)} rows",
    f"   Sell Model: {len(df_sell_train)} rows",
    sep="\n",
)

--- üîê SECURING TEST DATA ---
üîí Vaulted 621 rows to ../data\Vault_Test_Buy.parquet
üîí Vaulted 623 rows to ../data\Vault_Test_Sell.parquet

üìâ Training Data Remaining:
   Buy Model: 2482 rows
   Sell Model: 2492 rows


### Training Function (The Engine)

In [38]:
# def train_evaluate_classifier(df, model_name, target_col):
#     """
#     Trains a Random Forest Classifier using Time Series Cross Validation.
#     """
#     print(f"\nüöÄ Training {model_name}...")
    
#     X = df[FEATURES]
#     y = df[target_col]
    
#     # 1. Time Series Split (Internal Validation)
#     # We use 5 splits to ensure stability
#     tscv = TimeSeriesSplit(n_splits=5)
#     fold = 1
#     precisions = []
    
#     for train_index, val_index in tscv.split(X):
#         # Validation Split (NOT the Vault)
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
#         # Train (Conservative Params)
#         clf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=10, random_state=42)
#         clf.fit(X_train, y_train)
        
#         # Predict
#         preds = clf.predict(X_val)
        
#         # Check Precision (Avoid division by zero if no trades taken)
#         prec = precision_score(y_val, preds, zero_division=0)
#         precisions.append(prec)
        
#         print(f"  > Fold {fold}: Precision = {prec:.2%}")
#         fold += 1
        
#     print(f"üèÜ Avg CV Precision: {np.mean(precisions):.2%}")
    
#     # 2. Final Fit on ALL TRAINING Data
#     final_model = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=5, random_state=42)
#     final_model.fit(X, y)
    
#     # Save
#     save_path = os.path.join(MODELS_DIR, f"{model_name}.joblib")
#     joblib.dump(final_model, save_path)
#     return final_model

# def train_evaluate_regressor(df, model_name, target_col):
#     print(f"\nüìè Training Regressor {model_name}...")
#     X = df[FEATURES]
#     y = df[target_col]
    
#     reg = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=10, random_state=42)
#     reg.fit(X, y)
    
#     save_path = os.path.join(MODELS_DIR, f"{model_name}.joblib")
#     joblib.dump(reg, save_path)
#     return reg

### Train the BULL Models (Buy)

In [39]:
# # Train Buy Models
# rf_buy_class = train_evaluate_classifier(df_buy_train, "RF_Buy_Class", "LABEL_BUY")
# rf_buy_reg = train_evaluate_regressor(df_buy_train, "RF_Buy_Reg", "REG_TARGET_HIGH")



### Train the BEAR Models (Sell)

In [40]:
# # Train Sell Models
# rf_sell_class = train_evaluate_classifier(df_sell_train, "RF_Sell_Class", "LABEL_SELL")
# rf_sell_reg = train_evaluate_regressor(df_sell_train, "RF_Sell_Reg", "REG_TARGET_LOW")

### The Tuning Code

In [6]:
# Cell: "Production Scale" Tuning Strategy

from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer, fbeta_score
from scipy.stats import randint

# --- SCORER: F0.5 (Sniper Metric) ---
# F-beta with beta=0.5 weights precision (safety) more heavily than recall
# (volume). It replaces the earlier 'Profit Scorer', which was too harsh on
# volume. zero_division=0 makes the score 0 when the metric is undefined.
sniper_scorer = make_scorer(
    fbeta_score,
    beta=0.5,
    zero_division=0,
)

def tune_production_model(df, target_col, model_type="Buy",
                          n_iter=50, n_splits=3, random_state=42):
    """
    Randomized hyper-parameter search for a RandomForestClassifier.

    Candidates are scored with `sniper_scorer` (F0.5) on the folds of a
    TimeSeriesSplit, so every validation fold lies after its training rows
    in `df`'s row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data; must contain every column in `FEATURES` plus `target_col`.
    target_col : str
        Name of the label column to predict.
    model_type : str, default "Buy"
        Only used in the progress banner.
    n_iter : int, default 50
        Number of parameter settings sampled by the search. Lower it for a
        quick smoke run; total fits = n_iter * n_splits.
    n_splits : int, default 3
        Number of TimeSeriesSplit folds.
    random_state : int or None, default 42
        Seed shared by the forest and the parameter sampler.

    Returns
    -------
    RandomForestClassifier
        `best_estimator_` of the search, i.e. the best setting refit by
        RandomizedSearchCV (default `refit=True`) on all rows of `df`.
    """
    print(f"\nüè≠ LAUNCHING PRODUCTION TUNING FOR: {model_type} Model")
    print(f"{'='*60}")

    X = df[FEATURES]
    y = df[target_col]

    # 1. THE PRODUCTION GRID
    # Note: scipy's randint(low, high) samples low .. high-1 (upper bound exclusive).
    param_dist = {
        # Large forest (1500-2499 trees) to average out noise
        'n_estimators': randint(1500, 2500),

        # Depth cap 20-49: lets trees grow deep; the leaf/split minimums
        # below are what actually limit tree size
        'max_depth': randint(20, 50),

        # Higher minimums (split 15-39, leaf 5-19) prevent memorizing single candles
        'min_samples_split': randint(15, 40),
        'min_samples_leaf': randint(5, 20),

        # Fixed, not searched: sqrt(n_features) candidates per split
        'max_features': ['sqrt'],
        'bootstrap': [True],

        # Balanced weights: class weights inversely proportional to class
        # frequency (e.g. roughly 1:2.3 for a 30% positive rate), computed
        # on the whole training fold or per bootstrap sample.
        'class_weight': ['balanced', 'balanced_subsample']
    }

    # Parallelism lives at the forest level (n_jobs=-1 here); the search itself
    # runs candidates serially (its n_jobs is left at the default).
    rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)

    tscv = TimeSeriesSplit(n_splits=n_splits)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=sniper_scorer,
        cv=tscv,
        verbose=1,
        # n_jobs=-1,
        random_state=random_state
    )

    print("‚è≥ Running Optimization...")
    random_search.fit(X, y)

    print(f"\nüèÜ BEST PARAMS: {random_search.best_params_}")
    print(f"‚ú® Best F0.5 Score: {random_search.best_score_:.4f}")

    return random_search.best_estimator_

# --- EXECUTE ---
# Requires df_buy_train / df_sell_train (the training frames) to be loaded.
best_buy_model = tune_production_model(df_buy_train, target_col="LABEL_BUY", model_type="Buy")
best_sell_model = tune_production_model(df_sell_train, target_col="LABEL_SELL", model_type="Sell")

import joblib

# Persist both tuned estimators in the models folder
for _tuned_model, _artifact_name in (
    (best_buy_model, "RF_Buy_Production.joblib"),
    (best_sell_model, "RF_Sell_Production.joblib"),
):
    joblib.dump(_tuned_model, os.path.join(MODELS_DIR, _artifact_name))

print("\nüíæ Production Models Saved.")


üè≠ LAUNCHING PRODUCTION TUNING FOR: Buy Model
‚è≥ Running Optimization...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

üèÜ BEST PARAMS: {'bootstrap': True, 'class_weight': 'balanced_subsample', 'max_depth': 35, 'max_features': 'sqrt', 'min_samples_leaf': 19, 'min_samples_split': 29, 'n_estimators': 2457}
‚ú® Best F0.5 Score: 0.3700

üè≠ LAUNCHING PRODUCTION TUNING FOR: Sell Model
‚è≥ Running Optimization...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

üèÜ BEST PARAMS: {'bootstrap': True, 'class_weight': 'balanced_subsample', 'max_depth': 48, 'max_features': 'sqrt', 'min_samples_leaf': 19, 'min_samples_split': 32, 'n_estimators': 2229}
‚ú® Best F0.5 Score: 0.3516

üíæ Production Models Saved.


### Save the best models

In [42]:
# Save
# NOTE(review): this writes the current `best_buy_model` / `best_sell_model`.
# The tuning cell in this notebook scores with F0.5, so the "profit_based"
# file names presumably date from an earlier profit-scorer run; re-running
# this cell overwrites those artifacts. Confirm this is intended.
import joblib

_hail_mary_artifacts = {
    "RF_Buy_HailMary_profit_based.joblib": best_buy_model,
    "RF_Sell_HailMary_profit_based.joblib": best_sell_model,
}
for _artifact_name, _tuned_model in _hail_mary_artifacts.items():
    joblib.dump(_tuned_model, os.path.join(MODELS_DIR, _artifact_name))

print("\nüíæ Hail Mary Models Saved.")


üíæ Hail Mary Models Saved.


In [None]:

# Write the tuned estimators to the models folder under the "Tuned" names
buy_model_path = os.path.join(MODELS_DIR, "RF_Buy_Tuned.joblib")
sell_model_path = os.path.join(MODELS_DIR, "RF_Sell_Tuned.joblib")

joblib.dump(best_buy_model, buy_model_path)
joblib.dump(best_sell_model, sell_model_path)

print("\n‚úÖ Models Saved.")