### Imports & Configuration

In [35]:
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_score, accuracy_score, classification_report

# --- CONFIG ---
# Root folders, given relative to the notebook's working directory.
DATA_DIR = "../data"
MODELS_DIR = "../models"

# Input files: the full labelled datasets, one per trade direction.
FILE_BUY, FILE_SELL = (
    os.path.join(DATA_DIR, parquet_name)
    for parquet_name in ("Train_Buy_Sniper.parquet", "Train_Sell_Sniper.parquet")
)

# Vault files: where the held-out test tail of each dataset is written.
VAULT_BUY_TEST, VAULT_SELL_TEST = (
    os.path.join(DATA_DIR, parquet_name)
    for parquet_name in ("Vault_Test_Buy.parquet", "Vault_Test_Sell.parquet")
)

# Model inputs are listed explicitly so that no other column (labels,
# regression targets) can reach the estimator by accident.
FEATURES = ['EMA_50_Slope', 'Dist_from_200', 'RSI', 'ATR', 'Rel_Vol', 'hour']

# Fitted models are written here; create the folder up front.
os.makedirs(MODELS_DIR, exist_ok=True)

print("‚úÖ Config Loaded. Vault paths established.")

‚úÖ Config Loaded. Vault paths established.


### Load Data

In [36]:
# Read both labelled datasets and drop every row that has a missing value
# in any column (axis=0 / how="any" are the pandas defaults, spelled out).
df_buy_full, df_sell_full = (
    pd.read_parquet(parquet_path).dropna(axis=0, how="any")
    for parquet_path in (FILE_BUY, FILE_SELL)
)

print(f"üìä Total Bull Rows: {len(df_buy_full)}")
print(f"üìä Total Bear Rows: {len(df_sell_full)}")

üìä Total Bull Rows: 3103
üìä Total Bear Rows: 3115


### The Vault (Split & Hide)

In [37]:
def vault_data(df, save_path, split_pct=0.20):
    """
    Split ``df`` once by row position and write the trailing slice to disk.

    No shuffling is performed: the first ``1 - split_pct`` share of the rows
    is returned for training and the remaining tail is saved to ``save_path``
    as parquet (overwriting whatever the writer overwrites at that path).
    The cut is purely positional, so it is a time-based split only if ``df``
    is already in chronological order.
    NOTE(review): row order is not checked here - confirm upstream.

    Args:
        df: DataFrame to split.
        save_path: Destination of the held-out tail (parquet file).
        split_pct: Fraction of rows to hold out, strictly between 0 and 1.

    Returns:
        A copy of the leading rows (the training set).

    Raises:
        ValueError: If ``split_pct`` is not strictly between 0 and 1. Without
            this check a value such as ``20`` (percent instead of fraction)
            would silently vault every row and return an empty training set.
    """
    if not 0 < split_pct < 1:
        raise ValueError(
            f"split_pct must be a fraction strictly between 0 and 1, got {split_pct!r}"
        )

    # Position of the first held-out row.
    split_idx = int(len(df) * (1 - split_pct))

    # Copies keep both halves independent of the caller's frame.
    train_df = df.iloc[:split_idx].copy()
    test_df = df.iloc[split_idx:].copy()

    # Save Test Data to Vault
    test_df.to_parquet(save_path)
    print(f"üîí Vaulted {len(test_df)} rows to {save_path}")

    return train_df

# --- EXECUTE VAULTING ---
print("--- üîê SECURING TEST DATA ---")

# Hold out the tail of each dataset on disk; only the leading rows are kept
# in memory for training. Buy is processed first, then Sell.
df_buy_train = vault_data(df_buy_full, save_path=VAULT_BUY_TEST)
df_sell_train = vault_data(df_sell_full, save_path=VAULT_SELL_TEST)

# Report how many rows are left to train on.
print("\nüìâ Training Data Remaining:")
print(f"   Buy Model: {len(df_buy_train)} rows")
print(f"   Sell Model: {len(df_sell_train)} rows")

--- üîê SECURING TEST DATA ---
üîí Vaulted 621 rows to ../data\Vault_Test_Buy.parquet
üîí Vaulted 623 rows to ../data\Vault_Test_Sell.parquet

üìâ Training Data Remaining:
   Buy Model: 2482 rows
   Sell Model: 2492 rows


### Training Function (The Engine)

In [38]:
# def train_evaluate_classifier(df, model_name, target_col):
#     """
#     Trains a Random Forest Classifier using Time Series Cross Validation.
#     """
#     print(f"\nüöÄ Training {model_name}...")
    
#     X = df[FEATURES]
#     y = df[target_col]
    
#     # 1. Time Series Split (Internal Validation)
#     # We use 5 splits to ensure stability
#     tscv = TimeSeriesSplit(n_splits=5)
#     fold = 1
#     precisions = []
    
#     for train_index, val_index in tscv.split(X):
#         # Validation Split (NOT the Vault)
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
#         # Train (Conservative Params)
#         clf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=10, random_state=42)
#         clf.fit(X_train, y_train)
        
#         # Predict
#         preds = clf.predict(X_val)
        
#         # Check Precision (Avoid division by zero if no trades taken)
#         prec = precision_score(y_val, preds, zero_division=0)
#         precisions.append(prec)
        
#         print(f"  > Fold {fold}: Precision = {prec:.2%}")
#         fold += 1
        
#     print(f"üèÜ Avg CV Precision: {np.mean(precisions):.2%}")
    
#     # 2. Final Fit on ALL TRAINING Data
#     final_model = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=5, random_state=42)
#     final_model.fit(X, y)
    
#     # Save
#     save_path = os.path.join(MODELS_DIR, f"{model_name}.joblib")
#     joblib.dump(final_model, save_path)
#     return final_model

# def train_evaluate_regressor(df, model_name, target_col):
#     print(f"\nüìè Training Regressor {model_name}...")
#     X = df[FEATURES]
#     y = df[target_col]
    
#     reg = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=10, random_state=42)
#     reg.fit(X, y)
    
#     save_path = os.path.join(MODELS_DIR, f"{model_name}.joblib")
#     joblib.dump(reg, save_path)
#     return reg

### Train the BULL Models (Buy)

In [39]:
# # Train Buy Models
# rf_buy_class = train_evaluate_classifier(df_buy_train, "RF_Buy_Class", "LABEL_BUY")
# rf_buy_reg = train_evaluate_regressor(df_buy_train, "RF_Buy_Reg", "REG_TARGET_HIGH")



### Train the BEAR Models (Sell)

In [40]:
# # Train Sell Models
# rf_sell_class = train_evaluate_classifier(df_sell_train, "RF_Sell_Class", "LABEL_SELL")
# rf_sell_reg = train_evaluate_regressor(df_sell_train, "RF_Sell_Reg", "REG_TARGET_LOW")

### The Tuning Code

In [41]:
# Cell: Aggressive "Hail Mary" Tuning
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer, fbeta_score, precision_score
from scipy.stats import randint, uniform

# --- CUSTOM SCORER: NET PROFIT ---
def net_profit_score(y_true, y_pred, min_trades=10, penalty=-1.0):
    """
    Net profit in R-multiples for a set of binary trade signals.

    A prediction of 1 means "take the trade". With the assumed 1:1
    risk/reward, each taken trade whose true label is 1 earns +1R and each
    taken trade whose true label is 0 costs 1R.

    Inputs are converted to NumPy arrays and compared by position. Combining
    two pandas Series with ``&`` aligns them by index label instead, which
    gives wrong win/loss counts when the indices differ.

    Args:
        y_true: True labels (0/1), array-like.
        y_pred: Predicted labels (0/1), array-like of the same length.
        min_trades: Minimum number of predicted trades required before the
            profit is scored at all.
        penalty: Score returned when fewer than ``min_trades`` trades are
            predicted ("doing nothing").

    Returns:
        float: ``wins - losses``, or ``penalty`` when too few trades were taken.

    Note:
        With the default ``penalty`` of -1.0, a model that never trades still
        outranks any model whose net result is worse than -1R. Pass a more
        negative ``penalty`` if inactivity must always rank last.
    """
    actual = np.asarray(y_true).ravel()
    signal = np.asarray(y_pred).ravel()

    took_trade = signal == 1
    if np.count_nonzero(took_trade) < min_trades:
        return penalty  # Penalize "doing nothing"

    wins = np.count_nonzero(took_trade & (actual == 1))
    losses = np.count_nonzero(took_trade & (actual == 0))

    # 1:1 risk/reward: every win is +1R, every loss is -1R.
    return float(wins - losses)

# Wrap the profit metric as a scikit-learn scorer; a larger net R is better.
profit_scorer = make_scorer(net_profit_score, greater_is_better=True)

def tune_hail_mary(df, target_col, model_type="Buy", n_iter=200, n_splits=3,
                   scoring=None, n_jobs=None):
    """
    Randomised hyper-parameter search for a RandomForestClassifier.

    Candidates are drawn from a wide parameter space and ranked by the mean
    score over the folds of a ``TimeSeriesSplit``. The best candidate is
    refit on all of ``df`` (the ``RandomizedSearchCV`` default) and returned.

    Args:
        df: Training frame containing the ``FEATURES`` columns and ``target_col``.
        target_col: Name of the binary label column.
        model_type: Label used only in the log output (e.g. "Buy" / "Sell").
        n_iter: Number of random parameter settings to evaluate.
        n_splits: Number of ``TimeSeriesSplit`` folds.
        scoring: Scorer handed to the search. ``None`` selects the module-level
            ``profit_scorer`` (net profit in R-multiples). Another scorer can be
            passed instead, e.g.
            ``make_scorer(fbeta_score, beta=0.5, zero_division=0)``.
        n_jobs: Parallelism of the search (``None`` keeps scikit-learn's default).

    Returns:
        The refit best estimator (``random_search.best_estimator_``).
    """
    print(f"\nüöÄ LAUNCHING HAIL MARY TUNING FOR: {model_type} Model")
    print(f"{'='*50}")

    if scoring is None:
        scoring = profit_scorer

    X = df[FEATURES]
    y = df[target_col]

    # 1. THE CHAOS GRID (Maximizing Randomness)
    # scipy.stats.randint(low, high) draws integers from [low, high - 1],
    # so the search can pick any integer in each range below.
    param_dist = {
        # Trees: 500 to 2999.
        'n_estimators': randint(500, 3000),

        # Depth: 5 to 59. Deep trees can fit very specific scenarios.
        'max_depth': randint(5, 60),

        # Split & Leaf: low numbers = riskier, high numbers = more conservative.
        'min_samples_split': randint(5, 50),
        'min_samples_leaf': randint(1, 50),

        # Features per split: 'sqrt', 'log2', or None (use all features).
        'max_features': ['sqrt', 'log2', None],

        # Bootstrap: sample rows with replacement or not.
        'bootstrap': [True, False],

        # Class weight: None or the two automatic balancing modes. Explicit
        # dicts such as {0: 1, 1: 2} could be added here to weight wins (1)
        # more heavily than non-wins (0).
        'class_weight': ['balanced', 'balanced_subsample', None],
    }

    # 2. The Model
    rf = RandomForestClassifier(random_state=42)

    # 3. The Search Engine
    # Few folds keep each candidate cheap so that more candidates can be tried.
    tscv = TimeSeriesSplit(n_splits=n_splits)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=scoring,
        cv=tscv,
        verbose=1,
        n_jobs=n_jobs,
        random_state=42
    )

    print(f"‚è≥ Running Optimization ({n_iter} Iterations)...")
    random_search.fit(X, y)

    # 4. Results
    # best_score_ is the mean cross-validated value of `scoring` for the
    # winning candidate, in whatever unit that scorer uses.
    print(f"\nüèÜ BEST PARAMS: {random_search.best_params_}")
    print(f"‚ú® Best CV Score: {random_search.best_score_:.4f}")

    # Quick Training Check
    # In-sample only: the refit model is scored on the rows it was trained on,
    # so these numbers are optimistic and are not a validation result.
    best_model = random_search.best_estimator_
    preds = best_model.predict(X)
    trades = preds.sum()
    precision = precision_score(y, preds, zero_division=0)

    print(f"üìä Training Performance Check:")
    print(f"   ‚ñ∫ Trades Found: {trades}")
    print(f"   ‚ñ∫ Precision:    {precision:.2%}")

    return best_model

# --- EXECUTE ---
print("--- üîê RELOADING TRAINING DATA (Checking Vault Status) ---")
# Nothing is re-read from disk here: df_buy_train and df_sell_train must
# already be in memory from the vault cell, i.e. the TRAINING split and not
# the full file.
# To rebuild them on a fresh kernel, re-run the vault cell, or reproduce its
# split exactly. dropna() must come BEFORE slicing; otherwise the cut point
# moves and rows that were vaulted as test data can end up in training:
# _buy_full = pd.read_parquet(FILE_BUY).dropna()
# df_buy_train = _buy_full.iloc[:int(len(_buy_full) * (1 - 0.20))]
# _sell_full = pd.read_parquet(FILE_SELL).dropna()
# df_sell_train = _sell_full.iloc[:int(len(_sell_full) * (1 - 0.20))]

best_buy_model = tune_hail_mary(df_buy_train, "LABEL_BUY", "Buy")
best_sell_model = tune_hail_mary(df_sell_train, "LABEL_SELL", "Sell")



--- üîê RELOADING TRAINING DATA (Checking Vault Status) ---

üöÄ LAUNCHING HAIL MARY TUNING FOR: Buy Model
‚è≥ Running Optimization (100 Iterations)...
Fitting 3 folds for each of 200 candidates, totalling 600 fits

üèÜ BEST PARAMS: {'bootstrap': True, 'class_weight': None, 'max_depth': 46, 'max_features': None, 'min_samples_leaf': 47, 'min_samples_split': 48, 'n_estimators': 1767}
‚ú® Best F0.5 Score: -1.0000
üìä Training Performance Check:
   ‚ñ∫ Trades Found: 25
   ‚ñ∫ Precision:    84.00%

üöÄ LAUNCHING HAIL MARY TUNING FOR: Sell Model
‚è≥ Running Optimization (100 Iterations)...
Fitting 3 folds for each of 200 candidates, totalling 600 fits

üèÜ BEST PARAMS: {'bootstrap': True, 'class_weight': None, 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 9, 'min_samples_split': 41, 'n_estimators': 1757}
‚ú® Best F0.5 Score: 1.3333
üìä Training Performance Check:
   ‚ñ∫ Trades Found: 90
   ‚ñ∫ Precision:    88.89%


### Save the best models

In [42]:
# Save
# Persist each tuned classifier under MODELS_DIR, Buy first and then Sell.
for fitted_model, file_name in (
    (best_buy_model, "RF_Buy_HailMary_profit_based.joblib"),
    (best_sell_model, "RF_Sell_HailMary_profit_based.joblib"),
):
    joblib.dump(fitted_model, os.path.join(MODELS_DIR, file_name))
print("\nüíæ Hail Mary Models Saved.")


üíæ Hail Mary Models Saved.


In [None]:

# Write the same tuned classifiers under the generic "Tuned" file names.
for fitted_model, file_name in (
    (best_buy_model, "RF_Buy_Tuned.joblib"),
    (best_sell_model, "RF_Sell_Tuned.joblib"),
):
    joblib.dump(fitted_model, os.path.join(MODELS_DIR, file_name))

print("\n‚úÖ Models Saved.")