# 🧠 NeuroNautilus AI Trader - Ultimate Pipeline

**Version:** 1.0 (Auto-Discovery Edition)
**Updated:** January 2026

---

## 🚀 Overview

This notebook provides a complete **End-to-End Pipeline** for training and validating the NeuroNautilus AI Trading System.

### ✨ Key Features:
- **🤖 Smart Auto-Discovery:** Automatically finds the best data and trained models.
- **🧹 Auto-Cleanup:** Deletes intermediate training checkpoints once the final model has been saved, keeping storage use low.
- **📊 Robust Backtesting:** Full validation and test period analysis with equity curves.
- **🛡️ Risk-Adjusted Training:** Uses research-based PPO hyperparameters.

---


## 1️⃣ Environment Setup


In [None]:
# @title Install Dependencies & Setup Workspace
# Force re-install if needed
!pip install -q stable-baselines3[extra] gymnasium pandas numpy tao-ta
!pip install -q nautilus_trader

import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Mount Drive
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IS_COLAB = True
    print("✅ Google Drive Mounted")
except:
    IS_COLAB = False
    print("💻 Local Environment Detected")

# Clone/Pull Repository
if IS_COLAB:
    if not os.path.exists('/content/NeuroTrader'):
        !git clone https://github.com/MaDoHee33/NeuroTrader.git /content/NeuroTrader
    else:
        !cd /content/NeuroTrader && git pull

    sys.path.insert(0, '/content/NeuroTrader')

# Define Workspace
if IS_COLAB:
    WORKSPACE = Path('/content/drive/MyDrive/NeuroTrader_Workspace')
    WORKSPACE.mkdir(parents=True, exist_ok=True)
else:
    WORKSPACE = Path.cwd()

DATA_DIR = WORKSPACE / 'data'
MODELS_DIR = WORKSPACE / 'models'
LOGS_DIR = WORKSPACE / 'logs'

# Create directories
for d in [DATA_DIR, MODELS_DIR, LOGS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print(f"\n📂 Workspace: {WORKSPACE}")
print("✅ Setup Complete!")


## 2️⃣ Data Configuration (Auto-Discovery)


In [None]:
# @title Smart Data Auto-Configuration
from src.brain.data_discovery import auto_configure_training

print("🔍 Scanning for best data...")

try:
    # Ask the discovery helper for a bar type and the train/val/test windows.
    # NOTE(review): this catalog path is relative, while later cells load from
    # DATA_DIR / 'nautilus_catalog' — confirm the helper resolves it against
    # `workspace`.
    config = auto_configure_training(catalog_path="data/nautilus_catalog", workspace=WORKSPACE)

    # Publish the selection as notebook-level names for the following cells.
    BAR_TYPE = config["bar_type"]
    TRAIN_START = config["train_start"]
    TRAIN_END = config["train_end"]
    VAL_START = config["val_start"]
    VAL_END = config["val_end"]
    TEST_START = config["test_start"]
    TEST_END = config["test_end"]

    print(f"\n✅ Data Selected: {BAR_TYPE}")

except Exception as err:
    # Best-effort: report the problem and leave manual configuration to the user.
    print(f"⚠️ Error: {err}")
    print("Please perform manual configuration or upload data.")


## 3️⃣ Model Training (Research-Optimized)


In [None]:
# @title Train PPO Model (10M Steps)
# Trains a PPO agent on the training window, checkpoints periodically, saves
# the final model, and then deletes the intermediate checkpoints.
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback
from src.brain.env.trading_env import TradingEnv
from src.brain.train import load_data, add_features
import time

# --- CONFIGURATION ---
TOTAL_TIMESTEPS = 10_000_000  # 10 Million Steps
MODEL_NAME = "ppo_neurotrader"
# Shared by the checkpoint callback and the cleanup step so the two cannot
# drift apart.
CHECKPOINT_DIR = MODELS_DIR / 'checkpoints'
CHECKPOINT_PREFIX = 'ppo_checkpoint'
# ---------------------

print(f"🚀 Initializing Training Pipeline for {TOTAL_TIMESTEPS:,} steps...")

# 1. Load & Prep Data
df = load_data(str(DATA_DIR / 'nautilus_catalog'), BAR_TYPE)
df = add_features(df)
# Both window bounds are inclusive.
train_df = df[(df.index >= TRAIN_START) & (df.index <= TRAIN_END)]

print(f"📉 Training Data: {len(train_df):,} bars ({TRAIN_START} to {TRAIN_END})")

# Fail fast with a clear message instead of starting a multi-hour run on an
# empty slice.
if len(train_df) == 0:
    raise ValueError(
        f"No training data for {BAR_TYPE} between {TRAIN_START} and {TRAIN_END}; "
        "check the data configuration cell."
    )

# 2. Environment
env = TradingEnv(train_df)
vec_env = DummyVecEnv([lambda: env])

# 3. Model Setup (Research Based)
model = PPO(
    "MlpPolicy",
    vec_env,
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    verbose=0,
    tensorboard_log=str(LOGS_DIR)
)

# 4. Checkpoint Callback
# The callback only writes checkpoints; removal happens in step 7 below.
checkpoint_callback = CheckpointCallback(
    save_freq=1_000_000,
    save_path=str(CHECKPOINT_DIR),
    name_prefix=CHECKPOINT_PREFIX,
    save_replay_buffer=False,
    verbose=1
)

# 5. Train
print("\n🏃 Training started... (This may take 5-8 hours)")
start_time = time.time()

try:
    model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=checkpoint_callback, progress_bar=True)

    # 6. Save Final
    final_path = MODELS_DIR / f"{MODEL_NAME}.zip"
    model.save(str(final_path))
    print(f"\n💾 Final model saved: {final_path}")

    # 7. Auto-Cleanup
    # Runs only after the final save succeeded, so a failed save never leaves
    # the run without any stored weights.
    print("\n🧹 Auto-cleaning intermediate checkpoints...")
    for ckpt in CHECKPOINT_DIR.glob(f'{CHECKPOINT_PREFIX}_*_steps.zip'):
        ckpt.unlink(missing_ok=True)
    print("✅ Cleanup complete. Only final model kept.")

except KeyboardInterrupt:
    # Manual stop: keep the current weights under a separate name and leave
    # the checkpoints in place.
    print("\n⚠️ Training interrupted. Saving current state...")
    model.save(str(MODELS_DIR / f"{MODEL_NAME}_interrupted.zip"))

elapsed = (time.time() - start_time) / 3600
print(f"⏱️ Total Time: {elapsed:.2f} hours")


In [None]:
# @title (Optional) Visualize Training with TensorBoard
# Loads the TensorBoard notebook extension and opens it on LOGS_DIR, the
# directory handed to PPO as `tensorboard_log` in the training cell.
# IPython substitutes the Python variable LOGS_DIR for "$LOGS_DIR" before the
# magic runs.
%load_ext tensorboard
%tensorboard --logdir "$LOGS_DIR"


## 4️⃣ Validation & Backtesting


In [None]:
# @title 🔍 Smart Model Discovery & Validation
from src.brain.model_discovery import find_best_model
from src.neuro_nautilus.runner import simple_backtest, analyze_results
import matplotlib.pyplot as plt
import pandas as pd

# Step 1: ask the discovery helper which saved model to evaluate.
best_model_path = find_best_model(workspace=WORKSPACE)

if best_model_path:
    print(f"✅ Using Best Model: {best_model_path.name}")

    # Step 2: backtest the chosen model over the validation window.
    print(f"\n📊 Running Validation Backtest ({VAL_START} to {VAL_END})...")

    val_results = simple_backtest(
        data_path=str(DATA_DIR / "nautilus_catalog"),
        model_path=str(best_model_path),
        bar_type=BAR_TYPE,
        start_date=VAL_START,
        end_date=VAL_END,
        initial_balance=10000,
    )

    # Step 3: summarise the run and report the headline metrics.
    metrics = analyze_results(val_results)
    print("\n📈 Validation Results:")
    print(f"   Total Return: {metrics['total_return']:.2%}")
    print(f"   Sharpe Ratio: {metrics['sharpe_ratio']:.3f}")
    print(f"   Max Drawdown: {metrics['max_drawdown']:.2%}")
    print(f"   Win Rate:     {metrics['win_rate']:.2%}")

    # Step 4: draw the equity curve when the backtest returned one.
    if "equity_curve" in val_results:
        eq = val_results["equity_curve"]
        plt.figure(figsize=(12, 5))
        plt.plot(pd.to_datetime(eq["timestamp"]), eq["balance"])
        plt.title(f'Equity Curve (Validation) - Sharpe: {metrics["sharpe_ratio"]:.2f}')
        plt.grid(True, alpha=0.3)
        plt.show()
else:
    print("❌ No trained model found!")


In [None]:
# @title 🧪 Final Test (Unseen Data)
# Backtests the model chosen in the validation cell over the test window,
# prints the metrics, applies a Sharpe-ratio pass/fail gate, and plots the
# equity curve. Relies on names created by the validation cell
# (best_model_path, simple_backtest, analyze_results, plt, pd).

# Minimum test-period Sharpe ratio required to pass.
MIN_TEST_SHARPE = 0.5

print(f"📊 Running Test Backtest ({TEST_START} to {TEST_END})...")

try:
    # Without this guard a missing model would reach the backtest as the
    # literal path "None" via str().
    if not best_model_path:
        raise FileNotFoundError(
            "No trained model found; run the training and validation cells first."
        )

    test_results = simple_backtest(
        data_path=str(DATA_DIR / 'nautilus_catalog'),
        model_path=str(best_model_path),
        bar_type=BAR_TYPE,
        start_date=TEST_START,
        end_date=TEST_END,
        initial_balance=10000
    )

    test_metrics = analyze_results(test_results)

    print("\n📈 Test Results (Unseen Data):")
    print(f"   Total Return: {test_metrics['total_return']:.2%}")
    print(f"   Sharpe Ratio: {test_metrics['sharpe_ratio']:.3f}")
    print(f"   Max Drawdown: {test_metrics['max_drawdown']:.2%}")
    print(f"   Win Rate:     {test_metrics['win_rate']:.2%}")

    # Pass/Fail
    if test_metrics['sharpe_ratio'] > MIN_TEST_SHARPE:
        print("\n✅ PASSED: Ready for Paper Trading (Week 2)")
    else:
        print("\n❌ FAILED: Needs improvement/retraining")

    # Plot the equity curve when the backtest returned one.
    if 'equity_curve' in test_results:
        eq = test_results['equity_curve']
        plt.figure(figsize=(12, 5))
        plt.plot(pd.to_datetime(eq['timestamp']), eq['balance'], color='green')
        plt.title(f'Equity Curve (TEST) - Sharpe: {test_metrics["sharpe_ratio"]:.2f}')
        plt.grid(True, alpha=0.3)
        plt.show()

except Exception as e:
    # Notebook boundary: report the failure instead of raising a traceback.
    print(f"⚠️ Error running test backtest: {e}")
