### Imports & Configuration

In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer, fbeta_score
from scipy.stats import randint

# --- 1. CONFIGURATION ---
DATA_DIR = "../data"
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

# Input File
FILE_FULL = os.path.join(DATA_DIR, "Train_Sniper_Full.parquet")

# Vault Output Files (Separated for clarity)
VAULT_BUY_TEST = os.path.join(DATA_DIR, "Vault_Buy_Test.parquet")
VAULT_SELL_TEST = os.path.join(DATA_DIR, "Vault_Sell_Test.parquet")

# Features & Targets
FEATURES = ['EMA_50_Slope', 'Dist_from_200', 'RSI', 'ATR', 'Rel_Vol', 'hour']
LABEL_BUY = "LABEL_BUY"   # Ensure these match your actual column names
LABEL_SELL = "LABEL_SELL" # Ensure these match your actual column names

# --- 2. LOAD & PREPARE ---
print(f"‚è≥ Loading data from {FILE_FULL}...")
df_full = pd.read_parquet(FILE_FULL)

# Drop any rows where Features are NaN (Critical for RF)
df_full = df_full.dropna(subset=FEATURES)

# Create the specific Buy/Sell datasets
# (Assuming your dataset has both columns. If they are separate files, load them separately)
df_buy_full = df_full.dropna(subset=[LABEL_BUY]).copy()
df_sell_full = df_full.dropna(subset=[LABEL_SELL]).copy()

print(f"üìä Total Data Loaded: {len(df_full)}")
print(f"   Buy Candidates:  {len(df_buy_full)}")
print(f"   Sell Candidates: {len(df_sell_full)}")

‚è≥ Loading data from ../data\Train_Sniper_Full.parquet...
üìä Total Data Loaded: 18669
   Buy Candidates:  18669
   Sell Candidates: 18669


### The Vault (Split & Hide)

In [2]:
# --- 3. VAULT FUNCTION ---
def vault_data(df, save_path, split_pct=0.20):
    """Splits data by time (no shuffling). Saves the tail (Test) to disk."""
    split_idx = int(len(df) * (1 - split_pct))
    
    train_df = df.iloc[:split_idx].copy()
    test_df = df.iloc[split_idx:].copy()
    
    # Save Test Data to Vault
    test_df.to_parquet(save_path)
    print(f"üîí Vaulted {len(test_df)} rows to {save_path}")
    
    return train_df

# --- EXECUTE VAULTING ---
print("\n--- üîê SECURING TEST DATA ---")
df_buy_train = vault_data(df_buy_full, VAULT_BUY_TEST)
df_sell_train = vault_data(df_sell_full, VAULT_SELL_TEST)


--- üîê SECURING TEST DATA ---
üîí Vaulted 3734 rows to ../data\Vault_Buy_Test.parquet
üîí Vaulted 3734 rows to ../data\Vault_Sell_Test.parquet


### Training Function (The Engine)

In [38]:
# def train_evaluate_classifier(df, model_name, target_col):
#     """
#     Trains a Random Forest Classifier using Time Series Cross Validation.
#     """
#     print(f"\nüöÄ Training {model_name}...")
    
#     X = df[FEATURES]
#     y = df[target_col]
    
#     # 1. Time Series Split (Internal Validation)
#     # We use 5 splits to ensure stability
#     tscv = TimeSeriesSplit(n_splits=5)
#     fold = 1
#     precisions = []
    
#     for train_index, val_index in tscv.split(X):
#         # Validation Split (NOT the Vault)
#         X_train, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
#         # Train (Conservative Params)
#         clf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=10, random_state=42)
#         clf.fit(X_train, y_train)
        
#         # Predict
#         preds = clf.predict(X_val)
        
#         # Check Precision (Avoid division by zero if no trades taken)
#         prec = precision_score(y_val, preds, zero_division=0)
#         precisions.append(prec)
        
#         print(f"  > Fold {fold}: Precision = {prec:.2%}")
#         fold += 1
        
#     print(f"üèÜ Avg CV Precision: {np.mean(precisions):.2%}")
    
#     # 2. Final Fit on ALL TRAINING Data
#     final_model = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_leaf=5, random_state=42)
#     final_model.fit(X, y)
    
#     # Save
#     save_path = os.path.join(MODELS_DIR, f"{model_name}.joblib")
#     joblib.dump(final_model, save_path)
#     return final_model

# def train_evaluate_regressor(df, model_name, target_col):
#     print(f"\nüìè Training Regressor {model_name}...")
#     X = df[FEATURES]
#     y = df[target_col]
    
#     reg = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_leaf=10, random_state=42)
#     reg.fit(X, y)
    
#     save_path = os.path.join(MODELS_DIR, f"{model_name}.joblib")
#     joblib.dump(reg, save_path)
#     return reg

### Train the BULL Models (Buy)

In [39]:
# # Train Buy Models
# rf_buy_class = train_evaluate_classifier(df_buy_train, "RF_Buy_Class", "LABEL_BUY")
# rf_buy_reg = train_evaluate_regressor(df_buy_train, "RF_Buy_Reg", "REG_TARGET_HIGH")



### Train the BEAR Models (Sell)

In [40]:
# # Train Sell Models
# rf_sell_class = train_evaluate_classifier(df_sell_train, "RF_Sell_Class", "LABEL_SELL")
# rf_sell_reg = train_evaluate_regressor(df_sell_train, "RF_Sell_Reg", "REG_TARGET_LOW")

### The Tuning Code

In [None]:
# --- 4. PRODUCTION TUNING STRATEGY ---
# Scorer: F0.5 favors Precision over Recall (Safety > Volume)
sniper_scorer = make_scorer(fbeta_score, beta=1, zero_division=0)

def tune_production_model(df, target_col, model_type="Buy"):
    print(f"\nüè≠ LAUNCHING PRODUCTION TUNING FOR: {model_type} Model")
    print(f"{'='*60}")
    
    X = df[FEATURES]
    y = df[target_col]
    
    # THE PRODUCTION GRID
    param_dist = {
        'n_estimators': randint(1000, 2000),      # High trees to reduce variance
        'max_depth': randint(15, 40),             # Deep enough for patterns, not for noise
        'min_samples_split': randint(20, 50),     # Prevent overfitting specific candles
        'min_samples_leaf': randint(10, 25),      # Ensure generic rules
        'max_features': ['sqrt'],
        'bootstrap': [True],
        # BALANCED WEIGHTS: Critical for your 33% Win Rate
        'class_weight': ['balanced', 'balanced_subsample'] 
    }
    
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    tscv = TimeSeriesSplit(n_splits=3)
    
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=20,  # 20 iters is usually enough for a "Good Enough" production model
        scoring=sniper_scorer,
        cv=tscv,
        verbose=1,
        random_state=42,
        # n_jobs=-1 
    )
    
    print("‚è≥ Running Optimization (This may take 5-10 mins)...")
    random_search.fit(X, y)
    
    print(f"\nüèÜ BEST PARAMS: {random_search.best_params_}")
    print(f"‚ú® Best F0.5 Score: {random_search.best_score_:.4f}")
    
    return random_search.best_estimator_

# --- 5. EXECUTE TRAINING ---
# Train Buy Model 

# [Image of Random Forest Decision Trees]

best_buy_model = tune_production_model(df_buy_train, LABEL_BUY, "Buy")

# Train Sell Model
best_sell_model = tune_production_model(df_sell_train, LABEL_SELL, "Sell")

# --- 6. SAVE MODELS ---
path_buy = os.path.join(MODELS_DIR, "RF_Buy_Production.joblib")
path_sell = os.path.join(MODELS_DIR, "RF_Sell_Production.joblib")

joblib.dump(best_buy_model, path_buy)
joblib.dump(best_sell_model, path_sell)

print(f"\n‚úÖ SUCCESS. Models saved to:")
print(f"   {path_buy}")
print(f"   {path_sell}")


üè≠ LAUNCHING PRODUCTION TUNING FOR: Buy Model
‚è≥ Running Optimization (This may take 5-10 mins)...
Fitting 3 folds for each of 20 candidates, totalling 60 fits

üèÜ BEST PARAMS: {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 34, 'max_features': 'sqrt', 'min_samples_leaf': 22, 'min_samples_split': 34, 'n_estimators': 1106}
‚ú® Best F0.5 Score: 0.2079

üè≠ LAUNCHING PRODUCTION TUNING FOR: Sell Model
‚è≥ Running Optimization (This may take 5-10 mins)...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


### # Save the best models

In [42]:
# Save
import joblib
joblib.dump(best_buy_model, os.path.join(MODELS_DIR, "RF_Buy_HailMary_profit_based.joblib"))
joblib.dump(best_sell_model, os.path.join(MODELS_DIR, "RF_Sell_HailMary_profit_based.joblib"))
print("\nüíæ Hail Mary Models Saved.")


üíæ Hail Mary Models Saved.


In [None]:

joblib.dump(best_buy_model, os.path.join(MODELS_DIR, "RF_Buy_Tuned.joblib"))
joblib.dump(best_sell_model, os.path.join(MODELS_DIR, "RF_Sell_Tuned.joblib"))

print("\n‚úÖ Models Saved.")