# PPO Training Notebook
## Train Proximal Policy Optimization Agent for Trade Execution

This notebook trains a Reinforcement Learning agent using PPO (Proximal Policy Optimization) for optimal trade execution:
1. **Custom Gym Environment**: Trading execution environment with realistic market dynamics
2. **PPO Agent**: Stable-Baselines3 PPO implementation
3. **Reward Engineering**: PnL-based rewards with transaction cost penalties
4. **Training Loop**: Multi-episode training with performance monitoring
5. **Policy Evaluation**: Backtesting and performance analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback, BaseCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
import torch
import yaml
import os
import warnings
warnings.filterwarnings('ignore')

# Import our modules
import sys
sys.path.append('../')

from algos.core.exec_rl import TradingExecutionEnv, ExecutionRL, ExecutionTrainingCallback

# Global plot styling (matplotlib style sheet + seaborn colour palette)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Report that the imports worked and which torch runtime is in use
print(
    "Libraries imported successfully",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

## 1. Environment Setup and Configuration

In [None]:
# Load configuration
# yaml.safe_load returns None for an empty file; fall back to an empty dict so
# the `.get` lookup below cannot fail with AttributeError.
with open('../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f) or {}

print("Configuration loaded:")
print(f"Execution config: {config.get('execution', {})}")

# Environment parameters (expanded as keyword arguments to TradingExecutionEnv
# by make_env)
env_config = {
    'max_position_size': 10000.0,      # Maximum position size
    'transaction_cost': 0.0005,        # 5 bps transaction cost
    'impact_cost': 0.001,              # 10 bps market impact
    'max_steps': 100                   # Maximum steps per episode
}

print(f"Environment config: {env_config}")

In [None]:
# Create and test environment
def make_env():
    """Return a new ``TradingExecutionEnv`` built from the notebook-level ``env_config``."""
    return TradingExecutionEnv(**env_config)

# Build one environment and validate it with the SB3 environment checker
test_env = make_env()
check_env(test_env)

print(
    "Environment check passed!",
    f"Observation space: {test_env.observation_space}",
    f"Action space: {test_env.action_space}",
    sep="\n",
)

# Reset once and show what the environment hands back
obs, info = test_env.reset()
print(f"\nInitial observation: {obs}", f"Initial info: {info}", sep="\n")

# Step once with an action sampled from the action space
action = test_env.action_space.sample()
obs, reward, terminated, truncated, info = test_env.step(action)

print(
    f"\nAfter random action {action}:",
    f"Observation: {obs}",
    f"Reward: {reward}",
    f"Terminated: {terminated}",
    f"Info: {info}",
    sep="\n",
)

## 2. Custom Training Callbacks

In [None]:
class TradingCallback(BaseCallback):
    """Collect per-episode statistics during PPO training and log rolling averages.

    On every step the callback scans the step's info dicts. An info dict that
    contains an ``'episode'`` entry contributes that entry's ``'r'`` and ``'l'``
    values to the reward/length histories, and its ``'total_cost'`` value (when
    present) to the execution-cost history. Every ``check_freq`` calls the mean
    of the most recent 100 entries is written to the logger.
    """

    def __init__(self, check_freq=1000, verbose=1):
        """Create the callback.

        Args:
            check_freq: Number of callback calls between periodic log records.
            verbose: Verbosity forwarded to ``BaseCallback``; when > 0 the
                periodic summary is also printed.
        """
        super().__init__(verbose)
        self.check_freq = check_freq
        self.episode_rewards = []
        self.episode_lengths = []
        self.execution_costs = []

    def _on_step(self):
        """Record finished episodes and emit periodic averages.

        Returns:
            Always ``True`` (this callback never requests that training stop).
        """
        # Inspect every info dict rather than only the first one, so episodes
        # finishing in any sub-environment are recorded and an empty or missing
        # `infos` is tolerated. NOTE(review): assumes `infos` holds one dict per
        # environment, as in SB3 vectorised rollouts.
        for info in self.locals.get('infos') or ():
            if 'episode' not in info:
                continue

            episode_info = info['episode']
            self.episode_rewards.append(episode_info['r'])
            self.episode_lengths.append(episode_info['l'])

            # Execution cost is optional; record it only when it is reported
            if 'total_cost' in info:
                self.execution_costs.append(info['total_cost'])

        # Periodic logging (skipped until at least one episode has finished)
        if self.n_calls % self.check_freq != 0 or not self.episode_rewards:
            return True

        avg_reward = np.mean(self.episode_rewards[-100:])  # Last 100 episodes
        avg_length = np.mean(self.episode_lengths[-100:])

        self.logger.record("rollout/ep_rew_mean", avg_reward)
        self.logger.record("rollout/ep_len_mean", avg_length)

        if self.execution_costs:
            avg_cost = np.mean(self.execution_costs[-100:])
            self.logger.record("trading/avg_execution_cost", avg_cost)

        if self.verbose > 0:
            print(f"Step {self.n_calls}: Avg Reward = {avg_reward:.4f}, "
                  f"Avg Length = {avg_length:.1f}")

        return True

    def get_training_metrics(self):
        """Return the collected histories.

        Returns:
            Dict with keys ``'episode_rewards'``, ``'episode_lengths'`` and
            ``'execution_costs'``. The values are the callback's own lists,
            not copies.
        """
        return {
            'episode_rewards': self.episode_rewards,
            'episode_lengths': self.episode_lengths,
            'execution_costs': self.execution_costs
        }

print("Custom callback defined")

## 3. PPO Model Setup

In [None]:
# Training environment, wrapped in Monitor for logging
env = Monitor(make_env())

# PPO hyperparameters (expanded as keyword arguments to the PPO constructor)
ppo_config = dict(
    policy='MlpPolicy',
    learning_rate=3e-4,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    verbose=1,
    tensorboard_log='./logs/ppo_execution/',
)

print("PPO Configuration:")
print("\n".join(f"  {key}: {value}" for key, value in ppo_config.items()))

# Instantiate the PPO model on the monitored environment
model = PPO(env=env, **ppo_config)

print(
    "\nPPO model created successfully",
    f"Policy network: {model.policy}",
    f"Device: {model.device}",
    sep="\n",
)

## 4. Training Setup and Execution

In [None]:
# Training parameters
TOTAL_TIMESTEPS = 100000  # Adjust based on computational resources
EVAL_FREQ = 5000
SAVE_FREQ = 10000

print(f"Training parameters:")
print(f"  Total timesteps: {TOTAL_TIMESTEPS:,}")
print(f"  Evaluation frequency: {EVAL_FREQ:,}")
print(f"  Save frequency: {SAVE_FREQ:,}")

# Imported here because the notebook's import cell does not bring in
# CheckpointCallback.
from stable_baselines3.common.callbacks import CheckpointCallback

# Custom metrics callback
training_callback = TradingCallback(check_freq=1000)

# Evaluation environment (separate instance from the training environment)
eval_env = Monitor(make_env())

# Periodic evaluation; the best model so far is written to ../policies/
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path='../policies/',
    log_path='../logs/',
    eval_freq=EVAL_FREQ,
    deterministic=True,
    render=False,
    n_eval_episodes=10
)

# Periodic checkpoints every SAVE_FREQ steps, so the advertised save frequency
# takes effect and an interrupted run still leaves intermediate policies.
checkpoint_callback = CheckpointCallback(
    save_freq=SAVE_FREQ,
    save_path='../policies/checkpoints/',
    name_prefix='ppo_execution'
)

callbacks = [training_callback, eval_callback, checkpoint_callback]

print("Callbacks configured")

In [None]:
# Kick off the PPO training run
print("Starting PPO training...")
print(f"This may take several minutes for {TOTAL_TIMESTEPS:,} timesteps")

try:
    model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=callbacks, progress_bar=True)
except KeyboardInterrupt:
    # A manual interrupt is reported but not re-raised, so later cells can
    # still work with the partially trained model.
    print("Training interrupted by user")
except Exception as exc:
    # Any other failure is reported and then propagated
    print(f"Training failed with error: {exc}")
    raise
else:
    print("Training completed successfully!")

## 5. Training Analysis and Visualization

In [None]:
def _plot_series(ax, values, title, ylabel, color=None, avg_color='red', window=50):
    """Plot a per-episode series on ``ax`` with an optional rolling-mean overlay.

    Args:
        ax: Matplotlib axes to draw on.
        values: Sequence of per-episode values.
        title: Axes title.
        ylabel: Y-axis label.
        color: Colour of the raw series; ``None`` uses the default colour cycle.
        avg_color: Colour of the rolling-mean line.
        window: Rolling window size; the overlay is drawn only when there are
            more than ``window`` values.
    """
    line_kwargs = {'alpha': 0.7}
    if color is not None:
        line_kwargs['color'] = color
    ax.plot(values, **line_kwargs)

    if len(values) > window:
        rolling_mean = pd.Series(values).rolling(window).mean()
        ax.plot(rolling_mean, color=avg_color, linewidth=2,
                label=f'{window}-episode average')
        ax.legend()

    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel(ylabel)
    ax.grid(True)


# Get training metrics
training_metrics = training_callback.get_training_metrics()
rewards = training_metrics['episode_rewards']
lengths = training_metrics['episode_lengths']
costs = training_metrics['execution_costs']

print("Training Statistics:")
print(f"  Total episodes: {len(rewards)}")
if rewards:
    print(f"  Average reward: {np.mean(rewards):.4f}")
    print(f"  Reward std: {np.std(rewards):.4f}")
    print(f"  Average episode length: {np.mean(lengths):.1f}")
else:
    # Without this guard np.mean([]) would print "nan" (its warning is
    # suppressed by the notebook-wide warnings filter), e.g. after an
    # interrupted run with no finished episode.
    print("  No completed episodes recorded; reward/length statistics unavailable")

if costs:
    print(f"  Average execution cost: {np.mean(costs):.6f}")

# Plot training progress
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Episode rewards
if rewards:
    _plot_series(axes[0, 0], rewards, 'Episode Rewards', 'Reward')

# Episode lengths
if lengths:
    _plot_series(axes[0, 1], lengths, 'Episode Lengths', 'Steps',
                 color='green', avg_color='darkgreen')

# Reward distribution
if rewards:
    mean_reward = np.mean(rewards)
    axes[1, 0].hist(rewards, bins=30, alpha=0.7, color='blue')
    axes[1, 0].axvline(mean_reward, color='red', linestyle='--',
                      label=f'Mean: {mean_reward:.3f}')
    axes[1, 0].legend()
    axes[1, 0].set_title('Reward Distribution')
    axes[1, 0].set_xlabel('Reward')
    axes[1, 0].set_ylabel('Frequency')

# Execution costs (if available)
if costs:
    _plot_series(axes[1, 1], costs, 'Execution Costs', 'Cost',
                 color='orange', avg_color='darkorange')
else:
    axes[1, 1].text(0.5, 0.5, 'No execution cost data',
                   horizontalalignment='center', verticalalignment='center',
                   transform=axes[1, 1].transAxes)
    axes[1, 1].set_title('Execution Costs (No Data)')

plt.tight_layout()
plt.show()

print("Training analysis complete")

## 6. Policy Evaluation and Testing

In [None]:
# Roll out the trained policy deterministically for a fixed number of episodes
print("Testing trained policy...")

test_episodes = 10
test_results = []

for episode in range(test_episodes):
    obs, info = env.reset()
    episode_reward, episode_steps = 0, 0
    episode_actions, episode_positions = [], []

    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        episode_reward += reward
        episode_steps += 1
        # Copy the action so later predictions cannot alias the stored array
        episode_actions.append(action.copy())
        episode_positions.append(info.get('current_position', 0))

    # Summary values come from the info dict of the episode's final step
    result = {
        'episode': episode,
        'reward': episode_reward,
        'steps': episode_steps,
        'final_position': info.get('current_position', 0),
        'total_pnl': info.get('total_pnl', 0),
        'total_cost': info.get('total_cost', 0),
        'actions': episode_actions,
        'positions': episode_positions
    }
    test_results.append(result)

    print(f"Episode {episode}: Reward = {result['reward']:.4f}, "
          f"Steps = {result['steps']}, PnL = {result['total_pnl']:.4f}")

# Per-episode columns used by the summary below and by later cells
test_rewards = [row['reward'] for row in test_results]
test_pnls = [row['total_pnl'] for row in test_results]
test_costs = [row['total_cost'] for row in test_results]

# An episode counts as a success when its total reward is positive
positive_episodes = len([r for r in test_rewards if r > 0])

print(f"\nTest Results Summary:")
print(f"  Average reward: {np.mean(test_rewards):.4f} ± {np.std(test_rewards):.4f}")
print(f"  Average PnL: {np.mean(test_pnls):.4f} ± {np.std(test_pnls):.4f}")
print(f"  Average cost: {np.mean(test_costs):.6f} ± {np.std(test_costs):.6f}")
print(f"  Success rate: {positive_episodes / len(test_rewards):.1%}")

In [None]:
# Visualize test episode behavior
if test_results:
    # The highest-reward test episode is the one shown in detail
    best_episode_idx = np.argmax(test_rewards)
    best_episode = test_results[best_episode_idx]

    print(
        f"Analyzing best episode (Episode {best_episode_idx}):",
        f"  Reward: {best_episode['reward']:.4f}",
        f"  PnL: {best_episode['total_pnl']:.4f}",
        f"  Steps: {best_episode['steps']}",
        sep="\n",
    )

    # 2x2 grid: position trace, action traces, reward histogram, PnL-vs-cost scatter
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    position_ax, action_ax = axes[0]
    reward_ax, scatter_ax = axes[1]

    # Position evolution
    positions = best_episode['positions']
    position_ax.plot(positions, marker='o', markersize=3)
    position_ax.set_title(f'Position Evolution (Episode {best_episode_idx})')
    position_ax.set_xlabel('Step')
    position_ax.set_ylabel('Position')
    position_ax.grid(True)

    # Action components: one line per action column, labelled in column order
    actions = np.array(best_episode['actions'])
    if len(actions) > 0:
        for column, label in enumerate(('Size Delta', 'Order Type', 'Limit Offset')):
            action_ax.plot(actions[:, column], label=label, alpha=0.7)
        action_ax.set_title('Action Components')
        action_ax.set_xlabel('Step')
        action_ax.set_ylabel('Action Value')
        action_ax.legend()
        action_ax.grid(True)

    # Test reward distribution with its mean marked
    mean_test_reward = np.mean(test_rewards)
    reward_ax.hist(test_rewards, bins=10, alpha=0.7, color='green')
    reward_ax.axvline(mean_test_reward, color='red', linestyle='--',
                      label=f'Mean: {mean_test_reward:.3f}')
    reward_ax.legend()
    reward_ax.set_title('Test Episode Rewards')
    reward_ax.set_xlabel('Reward')
    reward_ax.set_ylabel('Frequency')

    # PnL vs Cost scatter
    scatter_ax.scatter(test_costs, test_pnls, alpha=0.7)
    scatter_ax.set_title('PnL vs Execution Cost')
    scatter_ax.set_xlabel('Execution Cost')
    scatter_ax.set_ylabel('PnL')
    scatter_ax.grid(True)

    plt.tight_layout()
    plt.show()

print("Policy evaluation complete")

## 7. Save Trained Policy

In [None]:
# Create policies directory
os.makedirs('../policies', exist_ok=True)

# Save the trained model
policy_path = '../policies/ppo_policy.zip'
model.save(policy_path)
print(f"PPO policy saved to {policy_path}")

# Statistics over the most recent training episodes. Slicing with [-100:]
# already returns the whole list when fewer than 100 episodes exist, so no
# length check is needed; only the empty case needs handling.
recent_rewards = training_metrics['episode_rewards'][-100:]
if recent_rewards:
    final_avg_reward = float(np.mean(recent_rewards))
    final_reward_std = float(np.std(recent_rewards))
else:
    final_avg_reward = final_reward_std = float('nan')

# Fraction of test episodes with a positive total reward
success_rate = float(sum(1 for r in test_rewards if r > 0) / len(test_rewards))

# Save training configuration and results.
# Every statistic is converted to a built-in float: NumPy scalars would be
# written by PyYAML as `!!python/object/apply:numpy...` tags, which
# yaml.safe_load (used to read config in this notebook) cannot read back.
training_summary = {
    'training_config': {
        'total_timesteps': TOTAL_TIMESTEPS,
        'ppo_config': ppo_config,
        'env_config': env_config
    },
    'training_results': {
        'total_episodes': len(training_metrics['episode_rewards']),
        'final_avg_reward': final_avg_reward,
        'final_reward_std': final_reward_std,
        # NOTE(review): this only records that at least one episode finished;
        # it is not an actual convergence test.
        'convergence_achieved': len(training_metrics['episode_rewards']) > 0
    },
    'test_results': {
        'test_episodes': test_episodes,
        'avg_test_reward': float(np.mean(test_rewards)),
        'test_reward_std': float(np.std(test_rewards)),
        'avg_test_pnl': float(np.mean(test_pnls)),
        'avg_test_cost': float(np.mean(test_costs)),
        'success_rate': success_rate
    }
}

summary_path = '../policies/training_summary.yaml'
with open(summary_path, 'w', encoding='utf-8') as f:
    # safe_dump guarantees the file contains only plain YAML types
    yaml.safe_dump(training_summary, f, default_flow_style=False)
print(f"Training summary saved to {summary_path}")

# Test loading the saved model (best-effort: a failure is reported, not raised)
try:
    loaded_model = PPO.load(policy_path)
    print("Policy loading test successful")

    # Quick test of loaded model
    test_obs, _ = env.reset()
    test_action, _ = loaded_model.predict(test_obs)
    print(f"Loaded model prediction test: action = {test_action}")

except Exception as e:
    print(f"Warning: Policy loading test failed: {e}")

print("\n=== PPO TRAINING COMPLETE ===")
print(f"Final average reward: {training_summary['training_results']['final_avg_reward']:.4f}")
print(f"Test success rate: {training_summary['test_results']['success_rate']:.1%}")
print(f"Policy saved and ready for deployment!")