# Complete Training Notebook - Prediction Models + PPO Agent

This notebook trains both the prediction models (LSTM/GRU/BiLSTM/DLSTM) and a PPO trading agent that uses multi-horizon future predictions.

**Configuration:**
- Dataset: Configurable below (default: `ADA-EUR_1H_20240101-20251231`)
- Task: Classification (3 classes: Fall, Stationary, Rise)
- Sequence Length: 60
- Models: All 4 models for ensemble
- Prediction Horizons: [1, 2, 3, 5, 10] steps ahead (short-term + medium-term)

**Usage:**
1. Set your dataset name in Cell 1
2. Run all cells (Runtime > Run All)
3. All outputs are saved to Google Drive automatically


In [3]:
# Cell 1: Setup and Mount Drive
# ==============================

# CONFIGURATION: Set your dataset name here
DATASET_NAME = "ADA-EUR_1H_20240101-20251231"  # Change this to your dataset

import os
import sys
from pathlib import Path

# Folder names that mark a mounted Drive. Both are probed because the
# project-path lookup later in this cell also searches under both names.
DRIVE_ROOT_FOLDERS = ('MyDrive', 'Mijn Drive')


def is_drive_mounted(mount_point):
    """Return True if `mount_point` exists and contains a known Drive root folder.

    Args:
        mount_point: Directory where Google Drive is expected to be mounted.

    Returns:
        bool: True when one of DRIVE_ROOT_FOLDERS exists below `mount_point`.
    """
    mount_point = Path(mount_point)
    return mount_point.exists() and any(
        (mount_point / folder).exists() for folder in DRIVE_ROOT_FOLDERS
    )


# Mount Google Drive (handle already-mounted case)
drive_mounted = False
try:
    from google.colab import drive  # type: ignore
    drive_path = Path('/content/drive')

    # Check if Drive is already mounted
    if is_drive_mounted(drive_path):
        print("‚úì Google Drive already mounted")
        drive_mounted = True
    else:
        print("Mounting Google Drive...")
        print("=" * 60)
        print("‚ö† IMPORTANT: If using ipykernel from local IDE:")
        print("   The authentication popup may not appear.")
        print("   Please mount Drive manually in Colab web interface:")
        print("   1. Open the notebook in Colab web interface")
        print("   2. Run: from google.colab import drive; drive.mount('/content/drive')")
        print("   3. Complete authentication in the web browser")
        print("   4. Then return to your IDE")
        print("=" * 60)

        try:
            # Try to mount (will show auth URL if interactive)
            # Note: When using ipykernel, the popup may not appear
            # The auth URL will be printed - copy it and open in browser
            print("\nüìã If you see an authentication URL below, copy it and:")
            print("   1. Open the URL in your web browser")
            print("   2. Sign in and authorize access")
            print("   3. Copy the authorization code")
            print("   4. Paste it in the input field below\n")

            drive.mount('/content/drive', force_remount=False)
            print("\n‚úì Google Drive mounted successfully")
            drive_mounted = True
        except KeyboardInterrupt:
            # Interrupting the auth prompt is not an error; continue unmounted.
            print("\n‚ö† Mount cancelled by user")
            print("üí° You can mount Drive manually in Colab web interface")
        except Exception as mount_error:
            print(f"\n‚ö† Could not mount Drive automatically: {mount_error}")
            print("\nüí° Solutions:")
            print("   1. Mount Drive manually in Colab web interface:")
            print("      - Open notebook in Colab (colab.research.google.com)")
            print("      - Run: from google.colab import drive; drive.mount('/content/drive')")
            print("      - Complete authentication, then return to IDE")
            print("   2. Or use the authentication URL printed above (if any)")
            print("   3. Drive mount persists for the Colab session")

            # Check if it got mounted anyway (user might have done it manually)
            if is_drive_mounted(drive_path):
                print("\n   ‚úì Drive appears to be mounted now!")
                drive_mounted = True
except ImportError:
    print("‚ö† google.colab not available - not running on Colab")
    print("  (This is OK if running locally)")
except Exception as e:
    # Best-effort step: any other failure is reported and the cell continues.
    print(f"‚ö† Drive mount check: {e}")
    print("  (This is OK if running locally)")

# Set project paths using colab_utils
# Resolves PROJECT_PATH, puts the project (and its 'PPO approach' folder) on
# sys.path, and makes the project root the working directory.
try:
    # Make colab_utils importable: the local checkout (also helps IDE static
    # analysis) and the Colab Drive locations. Entries already on sys.path are
    # skipped so re-running the cell does not pile up duplicates.
    cwd = Path.cwd()
    local_ppo_path = cwd / 'PPO approach'
    for import_dir in (
        str(local_ppo_path),
        '/content/drive/MyDrive/Bot 2026/PPO approach',
        '/content/drive/Mijn Drive/Bot 2026/PPO approach',
    ):
        if import_dir not in sys.path:
            sys.path.insert(0, import_dir)

    from colab_utils import get_project_path, setup_environment  # type: ignore

    # Setup environment (handles Colab detection, path setup)
    env_info = setup_environment(verbose=True)
    # Coerce to Path: the code below calls .exists() and uses the `/` operator.
    PROJECT_PATH = Path(env_info['project_path'])

except Exception as e:
    print(f"‚ö† Error setting up paths: {e}")
    # Fallback paths
    possible_paths = [
        Path('/content/drive/MyDrive/Bot 2026'),
        Path('/content/drive/Mijn Drive/Bot 2026'),
    ]
    # Local run: accept the current directory when it contains the
    # 'PPO approach' folder this notebook imports from.
    local_root = Path.cwd()
    if (local_root / 'PPO approach').is_dir():
        possible_paths.append(local_root)

    PROJECT_PATH = next((path for path in possible_paths if path.exists()), None)

    if PROJECT_PATH is None:
        raise FileNotFoundError(
            "Could not find 'Bot 2026' folder in Google Drive.\n"
            "Please ensure your project is synced to Google Drive."
        )

# Verify folder exists
if not PROJECT_PATH.exists():
    raise FileNotFoundError(f"Project folder not found: {PROJECT_PATH}")

# Add paths to sys.path
if str(PROJECT_PATH) not in sys.path:
    sys.path.insert(0, str(PROJECT_PATH))
if str(PROJECT_PATH / 'PPO approach') not in sys.path:
    sys.path.insert(0, str(PROJECT_PATH / 'PPO approach'))

# Change to project directory
os.chdir(PROJECT_PATH)

print(f"\n‚úì Project path: {PROJECT_PATH}")
print(f"‚úì Dataset name: {DATASET_NAME}")
print(f"‚úì Current directory: {os.getcwd()}")
print("\n" + "="*60)


Mounting Google Drive...
‚ö† IMPORTANT: If using ipykernel from local IDE:
   The authentication popup may not appear.
   Please mount Drive manually in Colab web interface:
   1. Open the notebook in Colab web interface
   2. Run: from google.colab import drive; drive.mount('/content/drive')
   3. Complete authentication in the web browser
   4. Then return to your IDE

üìã If you see an authentication URL below, copy it and:
   1. Open the URL in your web browser
   2. Sign in and authorize access
   3. Copy the authorization code
   4. Paste it in the input field below


‚ö† Mount cancelled by user
üí° You can mount Drive manually in Colab web interface
‚ö† Error setting up paths: No module named 'colab_utils'


FileNotFoundError: Could not find 'Bot 2026' folder in Google Drive.
Please ensure your project is synced to Google Drive.

In [None]:
# Cell 2: Install Dependencies
# =============================
# Installs TensorFlow, PyTorch/stable-baselines3 and the remaining packages
# with pip, and reports which of the remaining packages failed to install.

import subprocess
import sys

print("Installing dependencies...")
print("-" * 60)

# Install TensorFlow (for prediction models)
print("\n1. Installing TensorFlow...")
try:
    import tensorflow as tf
    print(f"   TensorFlow already installed: {tf.__version__}")
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "tensorflow>=2.13.0"])
    import tensorflow as tf
    print(f"   ‚úì TensorFlow installed: {tf.__version__}")

# Install PyTorch and stable-baselines3 (for PPO)
print("\n2. Installing PyTorch and stable-baselines3...")
try:
    import torch
    import stable_baselines3
    print(f"   PyTorch already installed: {torch.__version__}")
    print(f"   stable-baselines3 already installed: {stable_baselines3.__version__}")
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "torch", "stable-baselines3[extra]>=2.0.0"])
    import torch
    import stable_baselines3
    print(f"   ‚úì PyTorch installed: {torch.__version__}")
    print(f"   ‚úì stable-baselines3 installed: {stable_baselines3.__version__}")

# Install other dependencies
print("\n3. Installing other dependencies...")
dependencies = [
    'pandas>=1.5.0',
    'numpy>=1.23.0',
    'scikit-learn>=1.2.0',
    'matplotlib>=3.6.0',
    'tqdm>=4.65.0',
    'gymnasium>=0.28.0',
    'tensorboard>=2.13.0',
]

# Each package is installed independently so one failure does not stop the
# rest; failures are collected so the final banner reflects what happened.
failed_dependencies = []
for dep in dependencies:
    package_name = dep.split('>=')[0]
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", dep])
    except (subprocess.CalledProcessError, OSError) as e:
        failed_dependencies.append(package_name)
        print(f"   ‚ö† {package_name}: {e}")
    else:
        print(f"   ‚úì {package_name}")

print("\n" + "="*60)
if failed_dependencies:
    print(f"‚ö† Failed to install: {', '.join(failed_dependencies)}")
else:
    print("‚úì All dependencies installed")
print("="*60)


In [None]:
# Cell 3: Verify GPU Configuration
# ==================================
# Reports GPU visibility from TensorFlow, PyTorch and nvidia-smi, and warns
# when neither framework can see a GPU.

import shutil
import subprocess

print("Verifying GPU Configuration...")
print("-" * 60)

# Check TensorFlow GPU availability
print("\n1. TensorFlow GPU:")
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"   ‚úì Found {len(gpus)} GPU(s)")
    for i, gpu in enumerate(gpus):
        print(f"   GPU {i}: {gpu.name}")
        try:
            details = tf.config.experimental.get_device_details(gpu)
            if details:
                print(f"      Compute Capability: {details.get('compute_capability', 'Unknown')}")
        except Exception:
            # Device details are informational only; skip them on failure
            # without swallowing KeyboardInterrupt/SystemExit.
            pass
else:
    print("   ‚ö† No GPU detected for TensorFlow")
    print("   üí° In Colab: Runtime > Change runtime type > Hardware accelerator > GPU")

# Check PyTorch CUDA availability
print("\n2. PyTorch CUDA:")
import torch
if torch.cuda.is_available():
    print(f"   ‚úì CUDA available: {torch.version.cuda}")
    print(f"   ‚úì GPU: {torch.cuda.get_device_name(0)}")
    print(f"   ‚úì GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("   ‚ö† CUDA not available for PyTorch")

# Run nvidia-smi for verification
# Uses subprocess instead of the IPython `!` escape so the cell is plain
# Python and a missing binary produces a clear message.
print("\n3. nvidia-smi output:")
nvidia_smi_path = shutil.which("nvidia-smi")
if nvidia_smi_path is None:
    print("   nvidia-smi not found on PATH")
else:
    smi_result = subprocess.run(
        [nvidia_smi_path, "--query-gpu=name,memory.total,memory.free", "--format=csv,noheader"],
        capture_output=True,
        text=True,
        check=False,
    )
    # Output is captured and re-printed so it appears in the cell output.
    print((smi_result.stdout or smi_result.stderr).strip())

# Warning if no GPU
if not gpus and not torch.cuda.is_available():
    print("\n" + "‚ö†"*30)
    print("‚ö† WARNING: No GPU detected!")
    print("   Training will be VERY slow on CPU.")
    print("   In Google Colab: Runtime > Change runtime type > GPU")
    print("‚ö†"*30)
else:
    print("\n‚úì GPU configuration verified - ready for training!")

print("\n" + "="*60)


In [None]:
# Cell 4: Verify Dataset Exists
# ==============================
# Locates `<DATASET_NAME>.csv`, loads it with pandas and prints basic info.
# Raises FileNotFoundError when no matching CSV exists.

print("Verifying Dataset...")
print("-" * 60)

from pathlib import Path
import pandas as pd

# Get datasets path
try:
    from colab_utils import get_datasets_path  # type: ignore
    datasets_path = Path(get_datasets_path())
except Exception:
    # colab_utils missing or failing: fall back to the conventional folder.
    # `except Exception` (not bare) so Ctrl-C still interrupts the cell.
    datasets_path = PROJECT_PATH / 'datasets'

print(f"Datasets directory: {datasets_path}")

# Check if dataset CSV file exists
dataset_file = datasets_path / f"{DATASET_NAME}.csv"

if not dataset_file.exists():
    # Try to find with partial match; sorted so the pick is deterministic
    matches = sorted(datasets_path.glob(f"*{DATASET_NAME}*.csv"))
    if matches:
        dataset_file = matches[0]
        print(f"‚ö† Found similar file: {dataset_file.name}")
    else:
        print(f"\n‚ùå ERROR: Dataset not found: {DATASET_NAME}.csv")
        print(f"\nAvailable datasets in {datasets_path}:")
        for f in sorted(datasets_path.glob("*.csv")):
            print(f"  - {f.name}")
        raise FileNotFoundError(
            f"Dataset '{DATASET_NAME}.csv' not found in {datasets_path}\n"
            f"Please update DATASET_NAME in Cell 1 or add the dataset file."
        )

print(f"‚úì Dataset found: {dataset_file.name}")

# Load and display dataset info
print("\nLoading dataset...")
try:
    df = pd.read_csv(dataset_file)

    print(f"\nDataset Info:")
    print(f"  Rows: {len(df):,}")
    print(f"  Columns: {len(df.columns)}")
    # map(str, ...) so non-string column labels cannot break the join
    print(f"  Columns: {', '.join(map(str, df.columns[:10]))}{'...' if len(df.columns) > 10 else ''}")

    # Check required columns
    required_cols = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    missing_cols = [col for col in required_cols if col not in df.columns]

    if missing_cols:
        print(f"\n‚ö† WARNING: Missing required columns: {missing_cols}")
        print("   Dataset may not work correctly for training")
    else:
        print(f"  ‚úì All required columns present")

    # Display date range if timestamp exists
    if 'timestamp' in df.columns:
        try:
            # NOTE(review): unit='ms' assumes epoch-millisecond timestamps - confirm
            df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')
            print(f"\nDate Range:")
            print(f"  Start: {df['datetime'].min()}")
            print(f"  End: {df['datetime'].max()}")
            print(f"  Duration: {(df['datetime'].max() - df['datetime'].min()).days} days")
        except Exception:
            # The date range is informational; an unparseable column is not fatal.
            print("  (Could not parse timestamps)")

    print(f"\n‚úì Dataset verified and ready for training")

except Exception as e:
    print(f"\n‚ùå ERROR loading dataset: {e}")
    raise

print("\n" + "="*60)


In [None]:
# Cell 5: Train Prediction Models
# ================================
# Loads the training configuration (if any) and calls train_all_models for
# the four prediction models on DATASET_NAME, then prints per-model metrics.

print("Training Prediction Models...")
print("="*60)

import sys
from pathlib import Path
from configparser import ConfigParser

# Import train_models functions
sys.path.insert(0, str(PROJECT_PATH))
from train_models import train_all_models, load_config

# Load configuration
# Priority: training_config.txt > config.txt (skip if JSON) > defaults
config = None  # None is passed through to train_all_models ("use defaults")
config_path = PROJECT_PATH / 'training_config.txt'
if config_path.exists():
    print(f"Loading config from: {config_path}")
    config = load_config(str(config_path))
else:
    # Try config.txt, but skip if it's JSON format
    config_path_alt = PROJECT_PATH / 'config.txt'
    if config_path_alt.exists():
        # JSON check: first non-whitespace character is '{'. Leading
        # whitespace and a UTF-8 BOM are skipped so they cannot hide the
        # brace; undecodable bytes are replaced rather than raising.
        with open(config_path_alt, 'r', encoding='utf-8-sig', errors='replace') as f:
            looks_like_json = f.read().lstrip().startswith('{')
        if looks_like_json:
            print("‚ö† config.txt is JSON format (for crypto downloader), skipping")
            print("  Using default training configuration")
        else:
            config_path = config_path_alt
            print(f"Loading config from: {config_path}")
            config = load_config(str(config_path))
    else:
        print("‚ö† Config file not found, using defaults")

# Train all models
print(f"\nTraining models on dataset: {DATASET_NAME}")
print("Models: LSTM, GRU, BiLSTM, DLSTM")
print("Task: Classification (Fall, Stationary, Rise)")
print("-"*60)

try:
    results = train_all_models(
        datasets_dir=str(PROJECT_PATH / 'datasets'),
        config=config,
        task='classification',
        models=['lstm', 'gru', 'bilstm', 'dlstm'],
        specific_dataset=DATASET_NAME,
        use_ensemble=False
    )

    if results:
        print("\n" + "="*60)
        print("‚úì All models trained successfully!")
        print("="*60)

        # Print summary
        # presumably each result is a dict with 'model_name' and 'metrics'
        # keys - verify against train_models.train_all_models
        print("\nTraining Summary:")
        for result in results:
            model_name = result['model_name']
            metrics = result['metrics']
            print(f"\n{model_name.upper()}:")
            for metric, value in metrics.items():
                if isinstance(value, float):
                    print(f"  {metric}: {value:.6f}")
                else:
                    print(f"  {metric}: {value}")
    else:
        print("\n‚ö† No models were trained. Check dataset name and paths.")

except Exception as e:
    # Print the full traceback for the notebook user, then re-raise so
    # "Run all" stops here instead of continuing without models.
    print(f"\n‚ùå ERROR during training: {e}")
    import traceback
    traceback.print_exc()
    raise

print("\n" + "="*60)
print("‚úì Training completed")
print("="*60)



In [None]:
# Cell 6: Display Prediction Model Metrics
# =========================================
# Lists the saved model files, prints the last values of each training
# history, inspects the saved scaler and shows the training plots.

print("Displaying Prediction Model Metrics...")
print("="*60)

import json
from pathlib import Path
import matplotlib.pyplot as plt

# Get paths
from colab_utils import get_models_path, get_scalers_path  # type: ignore
models_path = get_models_path()
results_path = PROJECT_PATH / 'results'  # Prediction model results are in project root
scalers_path = get_scalers_path()


def last_or_zero(series):
    """Return the final entry of a metric series, or 0 if it is missing or empty."""
    return series[-1] if series else 0


# Load and display metrics for each trained model
models_to_check = ['lstm', 'gru', 'bilstm', 'dlstm']
trained_models = {}

print("\nModel Files:")
print("-" * 60)
for model_name in models_to_check:
    model_pattern = f"{model_name}_{DATASET_NAME}_classification.keras"
    model_file = models_path / model_pattern

    if not model_file.exists():
        # Fall back to a looser name match; sorted so the pick is deterministic
        matches = sorted(models_path.glob(f"{model_name}*{DATASET_NAME}*.keras"))
        if matches:
            model_file = matches[0]

    if model_file.exists():
        size_mb = model_file.stat().st_size / (1024 * 1024)
        print(f"  ‚úì {model_name.upper()}: {model_file.name} ({size_mb:.2f} MB)")
        trained_models[model_name] = model_file
    else:
        print(f"  ‚úó {model_name.upper()}: Not found")

# Load training histories
print("\nTraining Histories:")
print("-" * 60)
for model_name in trained_models:
    history_file = results_path / f"history_{model_name}_{DATASET_NAME}_classification.json"
    if history_file.exists():
        with open(history_file, 'r') as f:
            history = json.load(f)

        print(f"\n{model_name.upper()} Training History:")
        # .get() for the validation series: a history saved without
        # validation data must not raise KeyError here.
        if 'accuracy' in history:
            final_acc = last_or_zero(history['accuracy'])
            val_acc = last_or_zero(history.get('val_accuracy'))
            print(f"  Final Training Accuracy: {final_acc:.4f}")
            print(f"  Final Validation Accuracy: {val_acc:.4f}")

        if 'loss' in history:
            final_loss = last_or_zero(history['loss'])
            val_loss = last_or_zero(history.get('val_loss'))
            print(f"  Final Training Loss: {final_loss:.6f}")
            print(f"  Final Validation Loss: {val_loss:.6f}")

# Verify scalers
print("\nScalers:")
print("-" * 60)
scaler_file = scalers_path / f"scaler_{DATASET_NAME}.pkl"
if scaler_file.exists():
    print(f"  ‚úì Scaler found: {scaler_file.name}")
    import pickle
    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load
    # scaler files that were produced by this project's own training run.
    with open(scaler_file, 'rb') as f:
        scaler_data = pickle.load(f)
    if isinstance(scaler_data, dict):
        print(f"    Sequence length: {scaler_data.get('sequence_length', 'N/A')}")
        print(f"    Features: {len(scaler_data.get('feature_names', []))}")
else:
    print(f"  ‚úó Scaler not found")

# Display training plots if available
print("\nTraining Plots:")
print("-" * 60)
for model_name in trained_models:
    plot_file = results_path / f"training_{model_name}_{DATASET_NAME}_classification.png"
    if plot_file.exists():
        print(f"  ‚úì {model_name.upper()} training plot: {plot_file.name}")

        # Display the plot
        try:
            img = plt.imread(plot_file)
            plt.figure(figsize=(12, 5))
            plt.imshow(img)
            plt.axis('off')
            plt.title(f"{model_name.upper()} Training History")
            plt.tight_layout()
            plt.show()
        except Exception as e:
            # Showing the image is optional; report and continue.
            print(f"    (Could not display: {e})")

    cm_file = results_path / f"confusion_{model_name}_{DATASET_NAME}.png"
    if cm_file.exists():
        print(f"  ‚úì {model_name.upper()} confusion matrix: {cm_file.name}")

print("\n" + "="*60)
print("‚úì Metrics display completed")
print("="*60)


In [None]:
# Cell 7: Validate Models Before PPO Training
# ============================================
# Loads each prediction model and the ensemble through prediction_wrapper,
# then runs single-step and multi-horizon predictions on random sequences.
# Steps 2-4 re-raise on failure; steps 5-6 only print a warning.

print("Validating Models Before PPO Training...")
print("="*60)

import numpy as np
import sys

# Import prediction wrapper
sys.path.insert(0, str(PROJECT_PATH / 'PPO approach'))
from prediction_wrapper import PredictionModel, EnsemblePredictionModel

# Index = predicted class id, as used when printing predictions below
CLASS_NAMES = ['Fall', 'Stationary', 'Rise']
DEFAULT_N_FEATURES = 27  # Default: typical number with technical indicators


def infer_n_features(ensemble_model, default=DEFAULT_N_FEATURES):
    """Work out how many input features the ensemble's models expect.

    Sources are tried in order: the first model's `feature_names`, the
    scaler's `n_features_in_`, the scaler's `feature_names_in_`, the first
    model's `scaler_data['feature_names']`, and finally `default`.

    Args:
        ensemble_model: Object exposing `models` and `feature_scaler`.
        default: Value returned when no source yields a feature count.

    Returns:
        The number of features per time step.
    """
    # Guard the index: `models` may be empty when nothing was loaded.
    first_model = ensemble_model.models[0] if ensemble_model.models else None

    feature_names = getattr(first_model, 'feature_names', None)
    if feature_names:
        return len(feature_names)

    scaler = ensemble_model.feature_scaler
    if not scaler:
        return default

    # Try to get from scaler attributes (different scikit-learn versions)
    if hasattr(scaler, 'n_features_in_'):
        return scaler.n_features_in_
    if hasattr(scaler, 'feature_names_in_'):
        return len(scaler.feature_names_in_)

    # Fallback: get from scaler data if available. An empty name list is
    # treated as "unknown" so the test sequence never has zero features.
    scaler_data = getattr(first_model, 'scaler_data', None)
    if isinstance(scaler_data, dict):
        named_count = len(scaler_data.get('feature_names', []))
        if named_count:
            return named_count
    return default


# Test loading each individual model
print("\n1. Testing Individual Model Loading:")
print("-" * 60)
models_to_test = ['lstm', 'gru', 'bilstm', 'dlstm']
loaded_models = {}

for model_name in models_to_test:
    try:
        model = PredictionModel(model_name, DATASET_NAME)
        if model.load():
            print(f"  ‚úì {model_name.upper()} loaded successfully")
            loaded_models[model_name] = model
        else:
            print(f"  ‚úó {model_name.upper()} failed to load")
    except Exception as e:
        # One broken model must not stop the others from being checked.
        print(f"  ‚úó {model_name.upper()} error: {e}")

# Test ensemble loading
print("\n2. Testing Ensemble Loading:")
print("-" * 60)
try:
    ensemble = EnsemblePredictionModel(DATASET_NAME, list(loaded_models.keys()))
    if ensemble.load():
        print(f"  ‚úì Ensemble loaded with {len(ensemble.models)} model(s)")
    else:
        print(f"  ‚úó Ensemble failed to load")
        raise RuntimeError("Ensemble loading failed")
except Exception as e:
    print(f"  ‚úó Ensemble error: {e}")
    raise

# Verify model inference works
print("\n3. Testing Model Inference:")
print("-" * 60)
try:
    # Create dummy sequence for testing
    seq_len = ensemble.sequence_length
    n_features = infer_n_features(ensemble)

    dummy_seq = np.random.randn(seq_len, n_features)
    print(f"  Test sequence shape: {dummy_seq.shape}")

    # Test single-step prediction
    pred_class, confidence, probs = ensemble.predict(dummy_seq)
    print(f"  ‚úì Single-step prediction:")
    print(f"    Class: {pred_class} ({CLASS_NAMES[pred_class]})")
    print(f"    Confidence: {confidence:.4f}")
    print(f"    Probabilities: Fall={probs[0]:.3f}, Stationary={probs[1]:.3f}, Rise={probs[2]:.3f}")

except Exception as e:
    print(f"  ‚úó Inference test failed: {e}")
    import traceback
    traceback.print_exc()
    raise

# Test multi-horizon predictions
print("\n4. Testing Multi-Horizon Predictions:")
print("-" * 60)
try:
    horizons = [1, 2, 3, 5, 10]
    multi_preds = ensemble.predict_multi_horizon(dummy_seq, horizons=horizons)

    print(f"  ‚úì Multi-horizon predictions for horizons {horizons}:")
    for horizon in horizons:
        if horizon in multi_preds:
            h_class, h_conf, h_probs = multi_preds[horizon]
            print(f"    t+{horizon}: {CLASS_NAMES[h_class]} "
                  f"(conf={h_conf:.3f}, probs=[{h_probs[0]:.2f}, {h_probs[1]:.2f}, {h_probs[2]:.2f}])")

    # Test feature extraction with multi-horizon
    print(f"\n  Testing feature extraction with horizons {horizons}:")
    features = ensemble.get_features(dummy_seq, horizons=horizons)
    print(f"    Feature vector shape: {features.shape}")
    print(f"    Feature vector (first 10): {features[:10]}")

except Exception as e:
    print(f"  ‚úó Multi-horizon test failed: {e}")
    import traceback
    traceback.print_exc()
    raise

# Check scaler compatibility
print("\n5. Checking Scaler Compatibility:")
print("-" * 60)
try:
    if ensemble.feature_scaler:
        print(f"  ‚úì Feature scaler available")
        # Get feature count from various possible sources
        first_model = ensemble.models[0] if ensemble.models else None
        if getattr(first_model, 'feature_names', None):
            print(f"    Features: {len(first_model.feature_names)} (from feature_names)")
        elif hasattr(ensemble.feature_scaler, 'n_features_in_'):
            print(f"    Features: {ensemble.feature_scaler.n_features_in_} (from scaler)")
        elif hasattr(ensemble.feature_scaler, 'feature_names_in_'):
            print(f"    Features: {len(ensemble.feature_scaler.feature_names_in_)} (from feature_names_in_)")
        else:
            print(f"    Features: N/A (scaler doesn't have feature count attribute)")
    else:
        print(f"  ‚ö† Feature scaler not found")

    print(f"  Sequence length: {ensemble.sequence_length}")

except Exception as e:
    print(f"  ‚ö† Scaler check warning: {e}")

# Display ensemble prediction example
print("\n6. Ensemble Prediction Example:")
print("-" * 60)
try:
    # Use a more realistic sequence (normalized)
    test_seq = np.random.randn(seq_len, n_features) * 0.1 + 0.5  # Normalized-like values

    # Single prediction
    pred_class, confidence, probs = ensemble.predict(test_seq)
    print(f"  Ensemble prediction (single-step):")
    print(f"    Direction: {CLASS_NAMES[pred_class]}")
    print(f"    Confidence: {confidence:.4f}")

    # Multi-horizon
    multi_preds = ensemble.predict_multi_horizon(test_seq, horizons=horizons)
    print(f"  Ensemble prediction (multi-horizon):")
    for h in horizons:
        if h in multi_preds:
            h_class, h_conf, _ = multi_preds[h]
            print(f"    t+{h}: {CLASS_NAMES[h_class]} (conf={h_conf:.3f})")

except Exception as e:
    print(f"  ‚ö† Example prediction warning: {e}")

print("\n" + "="*60)
print("‚úì Model validation completed - ready for PPO training")
print("="*60)


In [None]:
# Cell 8: Train PPO Agent
# ========================
# Loads the PPO configuration, prints the settings that will be used, and
# calls train_ppo. Errors during training are printed with a traceback and
# re-raised. Relies on PROJECT_PATH and DATASET_NAME from Cell 1.

print("Training PPO Agent...")
print("="*60)

import os
import sys
from pathlib import Path
from configparser import ConfigParser

# Change to PPO approach directory
# NOTE: the working directory is not restored at the end of this cell.
ppo_path = PROJECT_PATH / 'PPO approach'
os.chdir(ppo_path)
sys.path.insert(0, str(ppo_path))

# Import PPO training functions
# NOTE: this rebinds `load_config`, which Cell 5 imported from train_models.
from train_ppo_agent import train_ppo, load_config
from colab_utils import get_ppo_path  # type: ignore

# Load PPO configuration
config_path = ppo_path / 'ppo_config.txt'
print(f"\nLoading PPO config from: {config_path}")

if not config_path.exists():
    print("‚ö† PPO config file not found, using defaults")
    config = load_config(None)
else:
    config = load_config(str(config_path))

# Update dataset name in config if needed
# This only changes the in-memory config; the file on disk is not rewritten.
if config['models']['dataset'] != DATASET_NAME:
    print(f"\n‚ö† Updating dataset name in config: {config['models']['dataset']} -> {DATASET_NAME}")
    config['models']['dataset'] = DATASET_NAME

# Verify PPO config matches prediction model settings
print(f"\nConfiguration Check:")
print(f"  Dataset: {config['models']['dataset']}")
print(f"  Sequence length: {config['environment']['sequence_length']}")
print(f"  Prediction model: {config['models']['prediction_model']}")

# Configure prediction horizons
# NOTE(review): assumes the configured value is a list of ints - confirm
# against train_ppo_agent.load_config.
prediction_horizons = config['models'].get('prediction_horizons', [1, 2, 3, 5, 10])
print(f"\nPrediction Horizons Configuration:")
print(f"  Horizons: {prediction_horizons}")
# The split is at 3: horizons <= 3 are reported as short-term, the rest as
# medium-term, whatever their actual values.
short_term = [h for h in prediction_horizons if h <= 3]
medium_term = [h for h in prediction_horizons if h > 3]
print(f"  Short-term (1-3 steps): {short_term}")
print(f"  Medium-term (5-10 steps): {medium_term}")

# Calculate observation space dimension
# Base: 5 prediction features (t+1) + 5 price + 4 portfolio = 14
# Additional horizons: each adds 4 features
# This figure is only printed below; it is not passed to train_ppo.
base_features = 5 + 5 + 4  # prediction + price + portfolio
# NOTE: becomes -1 if prediction_horizons is empty.
additional_horizons = len(prediction_horizons) - 1
additional_features = additional_horizons * 4
total_features = base_features + additional_features

print(f"\nObservation Space:")
print(f"  Base features: {base_features} (5 prediction + 5 price + 4 portfolio)")
print(f"  Additional horizon features: {additional_features} ({additional_horizons} horizons √ó 4)")
print(f"  Total observation dimension: {total_features}")

# Train PPO agent
print(f"\n{'='*60}")
print("Starting PPO Training...")
print("="*60)

try:
    # The dataset is passed explicitly from the (possibly updated) in-memory
    # config. NOTE(review): config_path is passed even when the file does not
    # exist - confirm train_ppo handles that.
    model = train_ppo(
        model_type=config['models']['prediction_model'],
        dataset=config['models']['dataset'],
        timesteps=config['training']['total_timesteps'],
        config_path=str(config_path),
        resume=True,  # Resume from checkpoint if exists
    )

    if model is not None:
        print("\n" + "="*60)
        print("‚úì PPO training completed successfully!")
        print("="*60)

        # Display training summary
        from colab_utils import get_ppo_models_path, get_checkpoints_path  # type: ignore
        final_model_path = get_ppo_models_path() / f"ppo_{config['models']['prediction_model']}_{DATASET_NAME}.zip"
        if final_model_path.exists():
            size_mb = final_model_path.stat().st_size / (1024 * 1024)
            print(f"\nFinal model saved: {final_model_path.name} ({size_mb:.2f} MB)")

        # Checkpoints are looked up under "<prediction_model>_<DATASET_NAME>".
        checkpoint_path = get_checkpoints_path() / f"{config['models']['prediction_model']}_{DATASET_NAME}"
        if checkpoint_path.exists():
            checkpoints = list(checkpoint_path.glob("*.zip"))
            print(f"Checkpoints available: {len(checkpoints)}")
    else:
        print("\n‚ö† PPO training returned None - check for errors above")

except Exception as e:
    # Print the full traceback, then re-raise so the cell is marked failed.
    print(f"\n‚ùå Error during PPO training: {e}")
    import traceback
    traceback.print_exc()
    raise

print("\n" + "="*60)


In [None]:
# Cell 9: Summary and Verification
# ==================================
# Read-only report: lists the saved prediction models, PPO models and
# checkpoints, and prints the paths and suggested next steps.

print("Training Summary and Verification")
print("="*60)

from pathlib import Path
from datetime import datetime
import os

# The PPO settings come from the `config` created in Cell 8. If that cell did
# not run (or `config` is not a mapping with a 'models' entry), fall back to
# an empty mapping so the summary can still be printed.
try:
    ppo_model_settings = config['models']
except (NameError, TypeError, KeyError):
    ppo_model_settings = {}
prediction_model_name = ppo_model_settings.get('prediction_model', 'ensemble')

# get_datasets_path is only conditionally imported in Cell 4, so resolve the
# datasets folder here with the same fallback that cell uses.
try:
    from colab_utils import get_datasets_path  # type: ignore
    summary_datasets_path = get_datasets_path()
except Exception:
    summary_datasets_path = Path(PROJECT_PATH) / 'datasets'

# List all trained prediction models
print("\n1. Trained Prediction Models:")
print("-" * 60)
from colab_utils import get_models_path  # type: ignore
models_path = get_models_path()

model_files = list(models_path.glob(f"*{DATASET_NAME}*classification.keras"))
if model_files:
    total_size = 0
    for model_file in sorted(model_files):
        size_mb = model_file.stat().st_size / (1024 * 1024)
        total_size += size_mb
        print(f"  ‚úì {model_file.name} ({size_mb:.2f} MB)")
    print(f"  Total size: {total_size:.2f} MB")
else:
    print("  ‚ö† No prediction models found")

# List PPO checkpoints and final model
print("\n2. PPO Models and Checkpoints:")
print("-" * 60)
from colab_utils import get_ppo_models_path, get_checkpoints_path  # type: ignore
ppo_models_path = get_ppo_models_path()
checkpoints_path = get_checkpoints_path()

# Final PPO model
ppo_model_pattern = f"ppo_*{DATASET_NAME}.zip"
ppo_models = list(ppo_models_path.glob(ppo_model_pattern))
if ppo_models:
    for model_file in sorted(ppo_models):
        size_mb = model_file.stat().st_size / (1024 * 1024)
        print(f"  ‚úì Final PPO model: {model_file.name} ({size_mb:.2f} MB)")

# Checkpoints
# Cell 8 looks for checkpoints under "<prediction_model>_<DATASET_NAME>", so
# try that name first, then the historical "ensemble_" name.
checkpoint_dir_names = [f"{prediction_model_name}_{DATASET_NAME}"]
if prediction_model_name != 'ensemble':
    checkpoint_dir_names.append(f"ensemble_{DATASET_NAME}")

checkpoint_dir = checkpoints_path / checkpoint_dir_names[0]
for dir_name in checkpoint_dir_names:
    if (checkpoints_path / dir_name).exists():
        checkpoint_dir = checkpoints_path / dir_name
        break
else:
    # Try other possible names; take the first match that is a directory
    # (not just the first match, which may be a file).
    for pattern in [f"*{DATASET_NAME}*", f"*ensemble*"]:
        matching_dirs = sorted(p for p in checkpoints_path.glob(pattern) if p.is_dir())
        if matching_dirs:
            checkpoint_dir = matching_dirs[0]
            break

if checkpoint_dir.exists() and checkpoint_dir.is_dir():
    checkpoints = list(checkpoint_dir.glob("*.zip"))
    if checkpoints:
        print(f"  ‚úì Checkpoints found: {len(checkpoints)} files")
        latest = max(checkpoints, key=lambda p: p.stat().st_mtime)
        size_mb = latest.stat().st_size / (1024 * 1024)
        print(f"    Latest: {latest.name} ({size_mb:.2f} MB)")
    else:
        print(f"  ‚ö† Checkpoint directory exists but no .zip files found")
else:
    print(f"  ‚ö† No checkpoint directory found")

# Display training summary
print("\n3. Training Summary:")
print("-" * 60)
trained_model_count = sum(
    (models_path / f'{m}_{DATASET_NAME}_classification.keras').exists()
    for m in ['lstm', 'gru', 'bilstm', 'dlstm']
)
print(f"  Dataset: {DATASET_NAME}")
print(f"  Prediction models trained: {trained_model_count}/4")
print(f"  PPO model: {'‚úì Trained' if ppo_models else '‚úó Not found'}")
print(f"  Prediction horizons: {ppo_model_settings.get('prediction_horizons', [1, 2, 3, 5, 10])}")

# Verify all files saved to Google Drive
print("\n4. Google Drive Verification:")
print("-" * 60)
project_path = Path(PROJECT_PATH)
if '/content/drive' in str(project_path):
    print(f"  ‚úì Files saved to Google Drive")
    print(f"    Path: {project_path}")
else:
    print(f"  ‚ö† Not running on Colab - files saved locally")

# Show model paths for future use
print("\n5. Model Paths for Future Use:")
print("-" * 60)
print(f"  Project root: {PROJECT_PATH}")
print(f"  Prediction models: {models_path}")
print(f"  PPO models: {ppo_models_path}")
print(f"  Checkpoints: {checkpoints_path}")
print(f"  Dataset: {summary_datasets_path}")

# Display next steps
print("\n6. Next Steps:")
print("-" * 60)
print("  1. Evaluate Models:")
print(f"     python 'PPO approach/evaluate_ppo.py' --model ensemble --dataset {DATASET_NAME}")
print("  2. Backtest:")
print("     Run backtests on historical data to verify profitability")
print("  3. Deploy:")
print("     Integrate models into live trading bot (separate implementation)")

print("\n" + "="*60)
print("‚úì Training Complete!")
print("="*60)
print(f"\nAll models and results saved to: {PROJECT_PATH}")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*60)
