Complete PPO Pipeline --- Continuous with 16 Vectorized Environments

Note: results may still differ slightly between runs, because each environment runs in its own process (random seeds still help, but subtle differences can appear).

In addition, some stochastic wrappers can break the SubprocVecEnv.

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import BaseCallback
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional, Dict, Tuple, List
import imageio.v2 as imageio
import warnings

warnings.filterwarnings('ignore')

Configuration:

In [14]:
# Experiment grid: every (failure probability, seed) pair is trained/evaluated.
SEEDS = [0, 42, 1729, 6174, 196]
FAILURE_PROBS = [0.00, 0.05, 0.10, 0.15, 0.20, 0.25]
TRAINING_TIMESTEPS = 1_000_000  # passed to model.learn() as the step budget
N_EVAL_EPISODES = 100  # evaluation episodes per (model, test probability)
N_ENVS = 16  # Number of parallel training environments

# Keyword arguments forwarded verbatim to the PPO constructor.
# NOTE(review): per the Stable-Baselines3 docs `n_steps` is counted per
# environment, so with N_ENVS workers one rollout holds n_steps * N_ENVS
# transitions -- confirm this is the intended rollout size.
PPO_CONFIG = {
    'learning_rate': 3e-4,
    'n_steps': 2048,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 0.0,
    'vf_coef': 0.5,
    'max_grad_norm': 0.5,
    'verbose': 0,
}

# Global plotting defaults shared by every figure in this notebook.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 150, 'font.size': 11})

print("✓ Configuration loaded")


### Environment Wrappers -- identical to the DQN implementation

In [15]:
class StochasticLunarLanderWrapper(gym.Wrapper):
    """Inject at most one randomly chosen fault family per episode.

    Stochastic wrapper - identical to DQN implementation.

    On every ``reset`` the wrapper draws, with probability ``failure_prob``,
    one entry of ``FAMILIES`` and configures the matching fault:

    * ``sensor``       - observation corruption (drift, noise, dropout).
    * ``actuator``     - control lag plus thrust degradation or asymmetry.
    * ``dynamics``     - fuel leak or gravity change, plus random wind gusts.
    * ``catastrophic`` - main-engine cutoff or landing-gear failure.

    Public state read by the evaluation code:

    * ``active_family``    - selected family name, or ``None`` for a clean episode.
    * ``failure_occurred`` - ``True`` once a fault has actually taken effect.
    * ``failure_step``     - value of ``current_step`` when that happened.

    NOTE(review): the landing-gear fault rewrites the terminal reward but
    never sets ``failure_occurred`` / ``failure_step``; confirm that this is
    intended before relying on those fields for that fault.

    All randomness comes from a private ``np.random.RandomState(seed)``; the
    order of draws is part of the behaviour and must not be rearranged if
    results are to stay comparable.
    """

    FAMILIES = ['sensor', 'actuator', 'dynamics', 'catastrophic']

    def __init__(self, env, failure_prob=0.0, seed=None):
        """Wrap ``env``.

        Args:
            env: the environment to wrap.
            failure_prob: per-episode probability of activating a fault family.
            seed: seed for the wrapper's own random generator.
        """
        super().__init__(env)
        self.failure_prob = failure_prob
        self.rng = np.random.RandomState(seed)
        self.active_family = None
        self.failure_occurred = False
        self.failure_step = None
        self.current_step = 0
        self._reset_family_states()
        self.last_obs = None

    def _reset_family_states(self):
        """Return every per-fault flag and parameter to its fault-free value."""
        # Sensor family.
        self.altimeter_drift_enabled = False
        self.altimeter_bias = 0.0
        # Actuator family.
        self.engine_degradation_active = False
        self.thrust_multiplier = 1.0
        self.thruster_asymmetry_active = False
        self.left_mult = 1.0
        self.right_mult = 1.0
        self.control_lag_active = False
        self.action_buffer = []
        # Dynamics family.
        self.fuel_leak_active = False
        self.leak_rate = 0.0
        self.gravity_variation_active = False
        self.gravity_mult = 1.0
        # Bookkeeping only: current_mass is updated below but never written
        # back to the simulated body anywhere in this class.
        self.current_mass = 1.0
        # Catastrophic family.
        self.engine_cutoff_active = False
        self.engine_cutoff_step = -1
        self.landing_gear_failure = False

    def reset(self, **kwargs):
        """Reset the wrapped env and draw the fault family for the new episode.

        Returns:
            ``(obs, info)`` from the wrapped env, with ``info['active_family']``
            set to the chosen family name or ``None``.
        """
        obs, info = self.env.reset(**kwargs)
        self._reset_family_states()
        self.current_step = 0
        self.failure_occurred = False
        self.failure_step = None
        self.last_obs = obs.copy()

        if self.rng.random() < self.failure_prob:
            self.active_family = self.rng.choice(self.FAMILIES)
            self._initialize_active_family()
        else:
            self.active_family = None

        info['active_family'] = self.active_family
        return obs, info

    def _initialize_active_family(self):
        """Draw the concrete fault mode and its parameters for ``active_family``."""
        if self.active_family == 'sensor':
            self.altimeter_drift_enabled = True
        elif self.active_family == 'actuator':
            mode = self.rng.choice(['degradation', 'asymmetry'])
            if mode == 'degradation':
                self.engine_degradation_active = True
                self.thrust_multiplier = self.rng.uniform(0.3, 0.6)
            else:
                self.thruster_asymmetry_active = True
                weakness = self.rng.uniform(0.4, 0.8)
                # Exactly one side is weakened; the other stays at 1.0.
                if self.rng.random() < 0.5:
                    self.left_mult, self.right_mult = weakness, 1.0
                else:
                    self.left_mult, self.right_mult = 1.0, weakness
        elif self.active_family == 'dynamics':
            mode = self.rng.choice(['fuel_leak', 'gravity'])
            if mode == 'fuel_leak':
                self.fuel_leak_active = True
                self.leak_rate = self.rng.uniform(0.001, 0.003)
            else:
                self.gravity_variation_active = True
                self.gravity_mult = self.rng.uniform(0.9, 1.1)
        elif self.active_family == 'catastrophic':
            mode = self.rng.choice(['cutoff', 'landing_gear'])
            if mode == 'cutoff':
                self.engine_cutoff_active = True
                # randint's upper bound is exclusive: cutoff step in [100, 599].
                self.engine_cutoff_step = self.rng.randint(100, 600)
            else:
                self.landing_gear_failure = True

    def observation(self, obs):
        """Corrupt ``obs`` when the sensor family is active; otherwise pass through.

        Called explicitly from ``step`` (the base class is ``gym.Wrapper``, so
        nothing calls it automatically, and ``reset`` observations are never
        corrupted).  Indices follow Gymnasium's LunarLander observation
        layout: 0/1 position, 2 horizontal velocity, 4 angle.
        """
        if self.active_family != 'sensor':
            return obs

        obs = obs.copy()
        if not self.failure_occurred:
            self.failure_occurred = True
            self.failure_step = self.current_step

        if self.altimeter_drift_enabled:
            # Random-walk bias accumulated on the height reading.
            self.altimeter_bias += self.rng.uniform(-0.02, 0.02)
            obs[1] += self.altimeter_bias

        if self.rng.random() < 0.3:
            # Angle reading is scaled or sign-flipped.
            obs[4] *= self.rng.choice([0.5, 1.5, -1.0])

        if self.rng.random() < 0.3:
            obs[2] += self.rng.uniform(-1.0, 1.0)

        if self.rng.random() < 0.3:
            # Dropout: the channel reads zero or repeats the previous
            # (already corrupted) value.
            dropout_idx = self.rng.choice([0, 1, 4])
            if self.rng.random() < 0.5:
                obs[dropout_idx] = 0.0
            elif self.last_obs is not None:
                obs[dropout_idx] = self.last_obs[dropout_idx]

        self.last_obs = obs.copy()
        return obs

    def step(self, action):
        """Advance one step, applying the active fault before and after it.

        Action-level faults (actuator lag, engine cutoff) rewrite ``action``
        before it reaches the wrapped env; observation and physics faults are
        applied to the result.

        Returns:
            The usual ``(obs, reward, terminated, truncated, info)`` tuple.
            ``info['landing_gear_crash']`` is set when a positive terminal
            reward was replaced by ``-100``.
        """
        self.current_step += 1

        if self.active_family == 'actuator':
            action = self._apply_actuator_failures(action)

        if self.active_family == 'catastrophic':
            action = self._apply_catastrophic_failures(action)

        obs, reward, terminated, truncated, info = self.env.step(action)
        obs = self.observation(obs)

        if self.active_family == 'dynamics':
            self._apply_dynamics_failures(action)

        if self.active_family == 'actuator':
            self._apply_actuator_physics()

        if self.active_family == 'catastrophic' and self.landing_gear_failure:
            # A landing that would have been rewarded becomes a crash.
            if terminated and reward > 0:
                reward = -100
                info['landing_gear_crash'] = True

        return obs, reward, terminated, truncated, info

    def _apply_actuator_failures(self, action):
        """Return the action actually executed under the control-lag fault.

        Lag starts with probability 0.1 per step and then persists for the
        rest of the episode.  While lagging, actions are queued; the oldest
        one is released only when the queue is longer than a freshly drawn
        threshold in {1, 2, 3}; otherwise action 0 is sent.
        """
        if not self.failure_occurred:
            self.failure_occurred = True
            self.failure_step = self.current_step

        if not self.control_lag_active and self.rng.random() < 0.1:
            self.control_lag_active = True
            self.action_buffer = []

        if self.control_lag_active:
            self.action_buffer.append(action)
            if len(self.action_buffer) > self.rng.randint(1, 4):
                action = self.action_buffer.pop(0)
            else:
                action = 0

        return action

    def _apply_actuator_physics(self):
        """Best-effort velocity edits emulating weak or asymmetric thrust."""
        try:
            lander = self.env.unwrapped.lander
            if lander is None:
                return

            if self.engine_degradation_active:
                vel = lander.linearVelocity
                if vel.y > 0:
                    # Damp upward velocity in proportion to the lost thrust.
                    lander.linearVelocity = (
                        vel.x,
                        vel.y * (1.0 - (1.0 - self.thrust_multiplier) * 0.15)
                    )

            if self.thruster_asymmetry_active:
                ang_vel = lander.angularVelocity
                if self.left_mult < 1.0 and ang_vel > 0:
                    lander.angularVelocity = ang_vel * self.left_mult
                elif self.right_mult < 1.0 and ang_vel < 0:
                    lander.angularVelocity = ang_vel * self.right_mult
        except Exception:
            # Still best-effort (a physics edit must never abort an episode),
            # but no longer a bare ``except`` so KeyboardInterrupt and
            # SystemExit propagate.
            pass

    def _apply_dynamics_failures(self, action):
        """Best-effort velocity edits for fuel leak, gravity shift and wind.

        ``action`` is the action that was actually executed this step; the
        code treats ``2`` as the main-engine action.
        """
        if not self.failure_occurred:
            self.failure_occurred = True
            self.failure_step = self.current_step

        try:
            lander = self.env.unwrapped.lander
            if lander is None:
                return

            if action == 2:
                self.current_mass -= 0.001
                self.current_mass = max(0.5, self.current_mass)

            if self.fuel_leak_active:
                self.current_mass -= self.leak_rate
                self.current_mass = max(0.5, self.current_mass)
                if action == 2:
                    vel = lander.linearVelocity
                    lander.linearVelocity = (vel.x, vel.y * 0.95)

            if self.gravity_variation_active:
                vel = lander.linearVelocity
                correction = (self.gravity_mult - 1.0) * 0.05
                lander.linearVelocity = (vel.x, vel.y - correction)

            # Wind gusts hit every dynamics episode, whichever mode was drawn.
            if self.rng.random() < 0.2:
                wind = self.rng.uniform(-2.0, 2.0)
                vel = lander.linearVelocity
                lander.linearVelocity = (vel.x + wind * 0.1, vel.y)
        except Exception:
            # Best-effort, see _apply_actuator_physics; interpreter-level
            # exceptions (KeyboardInterrupt, SystemExit) are not swallowed.
            pass

    def _apply_catastrophic_failures(self, action):
        """Replace main-engine commands with no-ops once the cutoff step is reached."""
        if self.engine_cutoff_active and self.current_step >= self.engine_cutoff_step:
            if not self.failure_occurred:
                self.failure_occurred = True
                self.failure_step = self.current_step
            if action == 2:
                action = 0

        return action


def make_stochastic_lunar_lander(failure_prob, seed=None):
    """Build one non-vectorized, fault-injecting LunarLander for evaluation.

    Args:
        failure_prob: per-episode fault probability handed to the wrapper.
        seed: when given, seeds the wrapper RNG, the first reset and both
            spaces; when ``None`` nothing is seeded explicitly.

    Returns:
        The wrapped environment (already reset once if ``seed`` was given).
    """
    wrapped = StochasticLunarLanderWrapper(
        gym.make('LunarLander-v2'), failure_prob, seed
    )

    if seed is None:
        return wrapped

    wrapped.reset(seed=seed)
    for space in (wrapped.action_space, wrapped.observation_space):
        space.seed(seed)
    return wrapped


print("✓ Environment wrappers loaded")


### Training:

In [16]:
class TrainingProgressCallback(BaseCallback):
    """Placeholder training callback that never interrupts learning.

    ``check_freq`` is stored on the instance but not consulted by
    ``_on_step``.
    """

    def __init__(self, check_freq=10000, verbose=0):
        super().__init__(verbose=verbose)
        self.check_freq = check_freq

    def _on_step(self):
        """Always return ``True``."""
        keep_training = True
        return keep_training


def make_env(failure_prob, seed, rank=0):
    """Return a zero-argument factory for one worker of a vectorized env.

    Each worker uses ``seed + rank`` for its wrapper RNG, its first reset and
    both of its spaces, so workers with different ranks get different seeds.
    """
    worker_seed = seed + rank

    def _init():
        monitored = Monitor(
            StochasticLunarLanderWrapper(
                gym.make('LunarLander-v2'), failure_prob, worker_seed
            )
        )

        monitored.reset(seed=worker_seed)
        monitored.action_space.seed(worker_seed)
        monitored.observation_space.seed(worker_seed)

        return monitored

    return _init


def train_ppo_agent(failure_prob, seed, save_dir, n_envs=N_ENVS, verbose=1):
    """Train one PPO agent on ``n_envs`` subprocess environments and save it.

    Args:
        failure_prob: fault probability used by every training environment.
        seed: base seed; worker ``i`` is built with ``make_env(..., seed, i)``
            and the PPO model itself is seeded with ``seed``.
        save_dir: directory (``Path`` or ``str``) receiving
            ``ppo_pXX_seedN.zip``; created if missing.
        n_envs: number of parallel worker processes.
        verbose: truthy to print progress and show the progress bar.

    Returns:
        The trained ``PPO`` model.  Its training environment has been closed
        by the time this function returns.
    """
    save_dir = Path(save_dir)

    if verbose:
        print(f"\nTraining PPO | p={failure_prob:.2f} | seed={seed} | {n_envs} parallel envs")

    # Create vectorized environment for speedup
    env = SubprocVecEnv([make_env(failure_prob, seed, i) for i in range(n_envs)])

    # The worker processes must be shut down even if training or saving
    # fails; otherwise every failed run leaves n_envs orphaned processes.
    try:
        model = PPO('MlpPolicy', env, seed=seed, **PPO_CONFIG)

        callback = TrainingProgressCallback(10000, verbose)
        model.learn(TRAINING_TIMESTEPS, callback=callback, progress_bar=(verbose > 0))

        save_dir.mkdir(parents=True, exist_ok=True)
        model_path = save_dir / f"ppo_p{int(failure_prob*100):02d}_seed{seed}.zip"
        model.save(str(model_path))

        if verbose:
            print(f"✓ Saved: {model_path}")
    finally:
        env.close()

    return model


def train_experiment1(output_dir=Path("models/exp1_ppo"), n_envs=N_ENVS, verbose=1):
    """Train one agent for every (failure probability, seed) combination.

    Returns:
        Dict mapping ``(failure_prob, seed)`` to the trained model, filled in
        ``FAILURE_PROBS``-major, ``SEEDS``-minor order.
    """
    # Derived from the grid so the banner stays right if the grid changes.
    n_agents = len(FAILURE_PROBS) * len(SEEDS)

    print("\n" + "="*70)
    print(f"EXPERIMENT 1 (PPO): Training {n_agents} agents ({n_envs} parallel envs)")
    print("="*70)

    models = {}
    for p in FAILURE_PROBS:
        for seed in SEEDS:
            models[(p, seed)] = train_ppo_agent(p, seed, output_dir, n_envs, verbose)

    print("\n✓ Experiment 1 training complete")
    return models


def train_experiment2(output_dir=Path("models/exp2_ppo"), n_envs=N_ENVS, verbose=1):
    """Train one fault-free (``failure_prob = 0.0``) agent per seed.

    Returns:
        Dict mapping ``(0.0, seed)`` to the trained model.
    """
    print("\n" + "="*70)
    # Agent count derived from SEEDS so the banner stays right if it changes.
    print(f"EXPERIMENT 2 (PPO): Training {len(SEEDS)} agents ({n_envs} parallel envs)")
    print("="*70)

    models = {}
    for seed in SEEDS:
        models[(0.0, seed)] = train_ppo_agent(0.0, seed, output_dir, n_envs, verbose)

    print("\n✓ Experiment 2 training complete")
    return models


def load_all_models_exp1(model_dir=Path("models/exp1_ppo")):
    """Load every saved Experiment 1 model that exists under ``model_dir``.

    Missing files are skipped silently; the printed summary shows how many of
    the expected ``len(FAILURE_PROBS) * len(SEEDS)`` models were found.

    Returns:
        Dict mapping ``(failure_prob, seed)`` to the loaded model.
    """
    model_dir = Path(model_dir)
    expected = len(FAILURE_PROBS) * len(SEEDS)

    models = {}
    for p in FAILURE_PROBS:
        for seed in SEEDS:
            # Same file-name scheme as train_ppo_agent.
            path = model_dir / f"ppo_p{int(p*100):02d}_seed{seed}.zip"
            if path.exists():
                models[(p, seed)] = PPO.load(str(path))
    print(f"✓ Loaded {len(models)}/{expected} models")
    return models


def load_all_models_exp2(model_dir=Path("models/exp2_ppo")):
    """Load every saved Experiment 2 (fault-free) model under ``model_dir``.

    Missing files are skipped silently; the printed summary shows how many of
    the expected ``len(SEEDS)`` models were found.

    Returns:
        Dict mapping ``(0.0, seed)`` to the loaded model.
    """
    model_dir = Path(model_dir)

    models = {}
    for seed in SEEDS:
        path = model_dir / f"ppo_p00_seed{seed}.zip"
        if path.exists():
            models[(0.0, seed)] = PPO.load(str(path))
    print(f"✓ Loaded {len(models)}/{len(SEEDS)} models")
    return models


print("✓ Training module loaded")

### Evaluation:

In [17]:
@dataclass
class Exp1EpisodeMetrics:
    """Outcome of one Experiment 1 evaluation episode."""

    # Identification of the episode within its evaluation run.
    episode_id: int
    seed: int
    failure_prob: float

    # Raw episode outcome.
    total_return: float
    episode_length: int

    # Outcome flags derived from the return by the evaluator.
    success: bool
    crash: bool


@dataclass
class Exp2EpisodeMetrics:
    """Outcome of one Experiment 2 evaluation episode, including fault timing."""

    # Identification of the episode within its evaluation run.
    episode_id: int
    seed: int
    failure_prob: float

    # Raw episode outcome.
    total_return: float
    episode_length: int

    # Outcome flags derived from the return by the evaluator.
    success: bool
    crash: bool

    # Fault bookkeeping filled in by the evaluator.
    failure_occurred: bool
    time_to_failure: int
    pre_fault_return: float
    post_fault_return: float


@dataclass
class Exp1AggregatedMetrics:
    """Summary statistics over a batch of Experiment 1 episodes."""

    # Which evaluation run the summary belongs to.
    failure_prob: float
    seed: int
    n_episodes: int

    # Return statistics.
    mean_return: float
    std_return: float

    # Fractions of episodes flagged as success / crash.
    success_rate: float
    crash_rate: float


@dataclass
class Exp2AggregatedMetrics:
    """Summary statistics over a batch of Experiment 2 episodes."""

    # Which evaluation run the summary belongs to.
    failure_prob: float
    seed: int
    n_episodes: int

    # Return statistics.
    mean_return: float
    std_return: float

    # Fractions of episodes flagged as success / crash.
    success_rate: float
    crash_rate: float

    # Means of the per-episode fault bookkeeping fields.
    mean_time_to_failure: float
    mean_pre_fault_return: float
    mean_post_fault_return: float


print("✓ Metrics dataclasses loaded")


def evaluate_single_episode_exp1(model, env, ep_id, seed, failure_prob):
    """Roll out one deterministic episode and summarise it.

    Args:
        model: policy exposing ``predict(obs, deterministic=True)``.
        env: environment following the five-tuple ``step`` API.
        ep_id, seed, failure_prob: copied unchanged into the result.

    Returns:
        ``Exp1EpisodeMetrics``; ``success`` means return >= 200 and ``crash``
        means return < -100.
    """
    observation, _ = env.reset()

    episode_return = 0.0
    steps = 0

    while True:
        chosen_action, _ = model.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, _ = env.step(chosen_action)
        episode_return += reward
        steps += 1
        if terminated or truncated:
            break

    landed = episode_return >= 200
    crashed = episode_return < -100
    return Exp1EpisodeMetrics(
        episode_id=ep_id,
        seed=seed,
        failure_prob=failure_prob,
        total_return=episode_return,
        episode_length=steps,
        success=landed,
        crash=crashed,
    )


def evaluate_single_episode_exp2(model, env, ep_id, seed, failure_prob, record=False):
    """Roll out one deterministic episode, splitting the return around the fault.

    ``env`` must expose ``failure_occurred`` and ``failure_step``.  A reward
    counts as post-fault only when a fault has been registered and the
    current step number is strictly greater than ``failure_step``; every
    other reward, including the one earned on the fault step, is pre-fault.

    Args:
        model: policy exposing ``predict(obs, deterministic=True)``.
        env: fault-injecting environment.
        ep_id, seed, failure_prob: copied unchanged into the result.
        record: when true, ``env.render()`` is called before each action and
            non-``None`` frames are collected.

    Returns:
        ``(metrics, frames)``; ``frames`` is ``None`` when ``record`` is false.
        ``time_to_failure`` equals the episode length if no fault registered.
    """
    observation, _ = env.reset()

    episode_return = 0.0
    steps = 0
    before_fault = 0.0
    after_fault = 0.0
    captured = [] if record else None
    finished = False

    while not finished:
        if record:
            image = env.render()
            if image is not None:
                captured.append(image)

        chosen_action, _ = model.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, _ = env.step(chosen_action)

        episode_return += reward
        steps += 1
        finished = terminated or truncated

        fault_registered = env.failure_occurred and env.failure_step is not None
        if fault_registered and steps > env.failure_step:
            after_fault += reward
        else:
            before_fault += reward

    fault_registered = env.failure_occurred and env.failure_step is not None
    time_to_failure = env.failure_step if fault_registered else steps

    metrics = Exp2EpisodeMetrics(
        episode_id=ep_id,
        seed=seed,
        failure_prob=failure_prob,
        total_return=episode_return,
        episode_length=steps,
        success=(episode_return >= 200),
        crash=(episode_return < -100),
        failure_occurred=env.failure_occurred,
        time_to_failure=time_to_failure,
        pre_fault_return=before_fault,
        post_fault_return=after_fault,
    )

    return metrics, captured


def evaluate_agent_exp1(model, failure_prob, seed, n_episodes=100):
    """Evaluate ``model`` for ``n_episodes`` on one freshly built environment.

    Returns:
        List of per-episode ``Exp1EpisodeMetrics`` in episode order.
    """
    eval_env = make_stochastic_lunar_lander(failure_prob, seed)

    episode_metrics = [
        evaluate_single_episode_exp1(model, eval_env, episode, seed, failure_prob)
        for episode in range(n_episodes)
    ]

    eval_env.close()
    return episode_metrics


def evaluate_agent_exp2(model, failure_prob, seed, n_episodes=100, record_last=0):
    """Evaluate ``model`` for ``n_episodes``, optionally recording the last few.

    Args:
        model: policy exposing ``predict(obs, deterministic=True)``.
        failure_prob: fault probability of the evaluation environment.
        seed: seed for the environment and the fault wrapper.
        n_episodes: number of episodes to run.
        record_last: number of final episodes whose frames are captured;
            ``0`` disables rendering altogether.

    Returns:
        ``(metrics_list, frames)`` where ``frames`` is the concatenated frame
        list of the recorded episodes, or ``None`` if nothing was captured.
    """
    if record_last > 0:
        # Build the rendering env directly instead of creating a non-rendering
        # one only to close it straight away, and seed it the same way as
        # make_stochastic_lunar_lander does.
        env = gym.make('LunarLander-v2', render_mode='rgb_array')
        env = StochasticLunarLanderWrapper(env, failure_prob, seed)
        env.reset(seed=seed)
        if seed is not None:
            env.action_space.seed(seed)
            env.observation_space.seed(seed)
    else:
        env = make_stochastic_lunar_lander(failure_prob, seed)

    first_recorded = n_episodes - record_last
    metrics_list = []
    all_frames = []

    # Release the environment (and its renderer) even if an episode fails.
    try:
        for ep in range(n_episodes):
            should_record = record_last > 0 and ep >= first_recorded
            metrics, frames = evaluate_single_episode_exp2(
                model, env, ep, seed, failure_prob, should_record
            )
            metrics_list.append(metrics)
            if frames:
                all_frames.extend(frames)
    finally:
        env.close()

    return metrics_list, (all_frames if all_frames else None)


def aggregate_metrics_exp1(metrics_list):
    """Collapse per-episode Experiment 1 metrics into one summary record.

    ``failure_prob`` and ``seed`` are taken from the first entry, so the list
    must be non-empty and is assumed to come from a single evaluation run.
    """
    first = metrics_list[0]
    episode_returns = [m.total_return for m in metrics_list]

    return Exp1AggregatedMetrics(
        failure_prob=first.failure_prob,
        seed=first.seed,
        n_episodes=len(metrics_list),
        mean_return=np.mean(episode_returns),
        std_return=np.std(episode_returns),
        success_rate=np.mean([m.success for m in metrics_list]),
        crash_rate=np.mean([m.crash for m in metrics_list]),
    )


def aggregate_metrics_exp2(metrics_list):
    """Collapse per-episode Experiment 2 metrics into one summary record.

    ``failure_prob`` and ``seed`` are taken from the first entry, so the list
    must be non-empty.  The fault-related means run over every episode in the
    list, not only over episodes in which a fault was registered.
    """
    first = metrics_list[0]

    def column(field_name):
        # One list per metric, in episode order.
        return [getattr(m, field_name) for m in metrics_list]

    episode_returns = column('total_return')

    return Exp2AggregatedMetrics(
        failure_prob=first.failure_prob,
        seed=first.seed,
        n_episodes=len(metrics_list),
        mean_return=np.mean(episode_returns),
        std_return=np.std(episode_returns),
        success_rate=np.mean(column('success')),
        crash_rate=np.mean(column('crash')),
        mean_time_to_failure=np.mean(column('time_to_failure')),
        mean_pre_fault_return=np.mean(column('pre_fault_return')),
        mean_post_fault_return=np.mean(column('post_fault_return')),
    )


def evaluate_experiment1(models, n_episodes=100, output_dir=Path("results/exp1_ppo")):
    """Cross-evaluate every model at every test failure probability.

    Args:
        models: dict mapping ``(train_p, train_seed)`` to a trained model.
        n_episodes: evaluation episodes per (model, test probability) pair.
        output_dir: directory receiving ``exp1_ppo_results.csv``.

    Returns:
        DataFrame with one row per (model, test probability) pair.
    """
    banner = "="*70
    print("\n" + banner)
    print("EXPERIMENT 1 (PPO): Cross-evaluation")
    print(banner)

    rows = []
    for model_key, model in models.items():
        train_p, train_seed = model_key
        print(f"\nModel trained at p={train_p:.2f}, seed={train_seed}")
        for test_p in FAILURE_PROBS:
            print(f"  Testing at p={test_p:.2f}...", end=" ")
            episodes = evaluate_agent_exp1(model, test_p, train_seed, n_episodes)
            agg = aggregate_metrics_exp1(episodes)

            # Key order (and therefore CSV column order) matters: the three
            # leading keys first, then the aggregate fields.
            row = {'train_p': train_p, 'test_p': test_p, 'seed': train_seed}
            row.update(asdict(agg))
            rows.append(row)
            print(f"✓ (return={agg.mean_return:.1f})")

    df = pd.DataFrame(rows)
    output_dir.mkdir(parents=True, exist_ok=True)
    csv_path = output_dir / "exp1_ppo_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"\n✓ Saved: {csv_path}")
    return df


def evaluate_experiment2(models, n_episodes=100, record_gifs=True, output_dir=Path("results/exp2_ppo")):
    """Evaluate the fault-free models at every test failure probability.

    Args:
        models: dict keyed by ``(train_p, seed)``; only the seed is used.
        n_episodes: evaluation episodes per (model, test probability) pair.
        record_gifs: when true, the last 10 episodes of each pair are saved
            as one GIF under ``output_dir / "gifs"``.
        output_dir: directory receiving ``exp2_ppo_results.csv``.

    Returns:
        DataFrame with one row per (model, test probability) pair.
    """
    banner = "="*70
    print("\n" + banner)
    print("EXPERIMENT 2 (PPO): Zero-shot evaluation")
    print(banner)

    # Same for every pair, so decided once up front.
    episodes_to_record = 10 if record_gifs else 0

    rows = []
    for model_key, model in models.items():
        seed = model_key[1]
        print(f"\nModel seed={seed}")
        for test_p in FAILURE_PROBS:
            print(f"  Testing at p={test_p:.2f}...", end=" ")
            episodes, frames = evaluate_agent_exp2(
                model, test_p, seed, n_episodes, episodes_to_record
            )
            agg = aggregate_metrics_exp2(episodes)

            row = {'train_p': 0.0, 'test_p': test_p, 'seed': seed}
            row.update(asdict(agg))
            rows.append(row)

            if frames:
                gif_dir = output_dir / "gifs"
                gif_dir.mkdir(parents=True, exist_ok=True)
                gif_path = gif_dir / f"exp2_ppo_p{int(test_p*100):02d}_seed{seed}.gif"
                imageio.mimsave(str(gif_path), frames, fps=50, loop=0)
                print("✓ GIF saved (2× speed)", end=" ")

            print(f"(return={agg.mean_return:.1f})")

    df = pd.DataFrame(rows)
    output_dir.mkdir(parents=True, exist_ok=True)
    csv_path = output_dir / "exp2_ppo_results.csv"
    df.to_csv(csv_path, index=False)
    print(f"\n✓ Saved: {csv_path}")
    return df


print("✓ Evaluation module loaded")

### Visualization:

In [18]:
def plot_exp1_heatmap(df, metric, output_dir):
    """Save a train-probability x test-probability heatmap of ``metric``.

    Metrics whose name contains ``'rate'`` are shown in percent with the
    colour scale centred on 50; all others are centred on 150.
    The file is written to ``output_dir / f'exp1_ppo_{metric}.png'``.
    """
    is_rate = 'rate' in metric
    pretty_name = metric.replace('_', ' ').title()

    table = df.groupby(['train_p', 'test_p'])[metric].mean().unstack()
    if is_rate:
        table = table * 100

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        table,
        annot=True,
        fmt='.1f',
        cmap='RdYlGn',
        center=50 if is_rate else 150,
        ax=ax,
        linewidths=1,
        cbar_kws={'label': pretty_name},
    )

    ax.set_xlabel('Test Failure Probability')
    ax.set_ylabel('Training Failure Probability')
    ax.set_title(f'Experiment 1 (PPO): {pretty_name}')
    ax.set_xticklabels([f'{int(p*100)}%' for p in table.columns])
    ax.set_yticklabels([f'{int(p*100)}%' for p in table.index], rotation=0)

    output_dir.mkdir(parents=True, exist_ok=True)
    plt.tight_layout()
    plt.savefig(output_dir / f'exp1_ppo_{metric}.png', dpi=300, bbox_inches='tight')
    plt.close()


def plot_exp1_robustness_curves(df, output_dir):
    """Plot success rate against test failure probability, one curve per train p.

    Each curve is the mean success rate across seeds with a +/- 1.96 * SEM
    band.  The figure is written to
    ``output_dir / 'exp1_ppo_robustness_curves.png'``.

    Args:
        df: results frame with ``train_p``, ``test_p`` and ``success_rate``.
        output_dir: target directory (``Path`` or ``str``); created if missing.
    """
    # Create the directory here, as plot_exp1_heatmap does, so the function
    # also works when called on its own.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 7))

    for train_p in sorted(df['train_p'].unique()):
        subset = df[df['train_p'] == train_p]
        grouped = subset.groupby('test_p')['success_rate'].agg(['mean', 'sem'])

        # Everything is plotted in percent.
        test_ps = [p * 100 for p in grouped.index]
        means = grouped['mean'] * 100
        sems = grouped['sem'] * 100

        ax.plot(test_ps, means, marker='o', linewidth=2, markersize=8,
                label=f'Train p={int(train_p*100)}%')
        ax.fill_between(test_ps, means - 1.96*sems, means + 1.96*sems, alpha=0.2)

    ax.axhline(50, color='red', linestyle='--', linewidth=1.5, alpha=0.7, label='50% Threshold')
    ax.set_xlabel('Test Failure Probability (%)')
    ax.set_ylabel('Success Rate (%)')
    ax.set_title('Experiment 1 (PPO): Robustness Curves')
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    ax.set_ylim(-5, 105)

    plt.tight_layout()
    plt.savefig(output_dir / 'exp1_ppo_robustness_curves.png', dpi=300, bbox_inches='tight')
    plt.close()


def plot_exp2_performance(df, metric, output_dir):
    """Plot the across-seed mean of ``metric`` against test failure probability.

    The shaded band is +/- 1.96 * SEM.  Metrics whose name contains ``'rate'``
    are shown in percent, and ``success_rate`` additionally gets a 50 %
    reference line.  The figure is written to
    ``output_dir / f'exp2_ppo_{metric}.png'``.

    Args:
        df: results frame with ``test_p`` and the ``metric`` column.
        metric: column to plot.
        output_dir: target directory (``Path`` or ``str``); created if missing.
    """
    # Nothing else creates this directory, so savefig would fail on a fresh
    # results tree without this.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 7))

    grouped = df.groupby('test_p')[metric].agg(['mean', 'sem'])
    test_ps = [p * 100 for p in grouped.index]
    means = grouped['mean']
    sems = grouped['sem']

    if 'rate' in metric:
        # New objects rather than in-place scaling of the grouped columns.
        means = means * 100
        sems = sems * 100

    ax.plot(test_ps, means, marker='o', linewidth=3, markersize=10, color='steelblue')
    ax.fill_between(test_ps, means - 1.96*sems, means + 1.96*sems, alpha=0.3)

    if 'success_rate' in metric:
        ax.axhline(50, color='red', linestyle='--', linewidth=2, label='50% Threshold')

    ax.set_xlabel('Test Failure Probability (%)')
    ax.set_ylabel(metric.replace('_', ' ').title())
    ax.set_title(f'Experiment 2 (PPO): {metric.replace("_", " ").title()}')

    # Only the success-rate plot has a labelled artist; skip the legend
    # otherwise instead of asking matplotlib for an empty one.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(loc='best')
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_dir / f'exp2_ppo_{metric}.png', dpi=300, bbox_inches='tight')
    plt.close()


def create_all_exp1_plots(df, output_dir=Path("results/exp1_ppo/plots")):
    """Render the three Experiment 1 heatmaps, then the robustness curves."""
    print("\nGenerating Experiment 1 (PPO) plots...")
    for heatmap_metric in ('success_rate', 'mean_return', 'crash_rate'):
        plot_exp1_heatmap(df, heatmap_metric, output_dir)
    plot_exp1_robustness_curves(df, output_dir)
    print("✓ All Experiment 1 (PPO) plots generated")


def create_all_exp2_plots(df, output_dir=Path("results/exp2_ppo/plots")):
    """Render one performance curve per Experiment 2 metric, in fixed order."""
    print("\nGenerating Experiment 2 (PPO) plots...")
    exp2_metrics = (
        'success_rate',
        'crash_rate',
        'mean_time_to_failure',
        'mean_pre_fault_return',
        'mean_post_fault_return',
    )
    for curve_metric in exp2_metrics:
        plot_exp2_performance(df, curve_metric, output_dir)
    print("✓ All Experiment 2 (PPO) plots generated")
print("✓ Visualization module loaded")

### Analysis:

In [19]:
def analyze_exp1(df):
    """Print the best training probability per test level and the generalization gap.

    The gap is the mean success rate where ``train_p == test_p`` minus the
    mean where they differ, printed in percentage points.
    """
    banner = "="*70
    print("\n" + banner)
    print("EXPERIMENT 1 (PPO): Analysis")
    print(banner)

    print("\nBest training probability for each test level:")
    for test_p in FAILURE_PROBS:
        at_test = df['test_p'] == test_p
        by_train = df[at_test].groupby('train_p')['success_rate'].mean()
        best_train = by_train.idxmax()
        best_rate = df[at_test & (df['train_p'] == best_train)]['success_rate'].mean()
        print(f"  Test p={test_p:.2f}: Best train p={best_train:.2f} (success={best_rate*100:.1f}%)")

    matched = df['train_p'] == df['test_p']
    on_dist = df[matched]['success_rate'].mean()
    off_dist = df[~matched]['success_rate'].mean()
    print("\nGeneralization gap:")
    print(f"  On-distribution:  {on_dist*100:.1f}%")
    print(f"  Off-distribution: {off_dist*100:.1f}%")
    print(f"  Gap: {(on_dist - off_dist)*100:.1f} pp")


def analyze_exp2(df):
    """Print the critical failure threshold, degradation and time-to-failure.

    The critical threshold is the smallest ``test_p`` whose across-seed mean
    success rate is below 0.5.  The time-to-failure table has one line per
    ``test_p`` value present in ``df``.

    Args:
        df: Experiment 2 results with ``test_p``, ``success_rate`` and
            ``mean_time_to_failure`` columns.
    """
    print("\n" + "="*70)
    print("EXPERIMENT 2 (PPO): Analysis")
    print("="*70)

    # groupby sorts by test_p, so the first match is the smallest probability.
    success_by_p = df.groupby('test_p')['success_rate'].mean()
    critical_p = next((p for p, rate in success_by_p.items() if rate < 0.5), None)

    # Compare against None explicitly: a threshold at p = 0.0 is falsy and
    # would otherwise be reported as "no threshold".
    if critical_p is not None:
        print(f"\nCritical failure threshold: p={critical_p*100:.0f}%")
    else:
        print("\nNo critical failure threshold (robust up to 25%)")

    p0 = df[df['test_p'] == 0.0]['success_rate'].mean()
    p25 = df[df['test_p'] == 0.25]['success_rate'].mean()
    print(f"\nPerformance degradation:")
    print(f"  p=0%:  {p0*100:.1f}%")
    print(f"  p=25%: {p25*100:.1f}%")
    print(f"  Drop:  {(p0 - p25)*100:.1f} pp")

    print(f"\nTime-to-failure analysis:")
    # Report the probabilities actually present in the results.
    ttf_by_p = df.groupby('test_p')['mean_time_to_failure'].mean()
    for p, ttf in ttf_by_p.items():
        print(f"  p={p:.2f}: {ttf:.1f} steps")


def compare_experiments(df1, df2):
    """Print zero-shot (Experiment 2) versus matched-training (Experiment 1) success.

    For each test probability, the matched figure is the mean success rate of
    Experiment 1 rows with ``train_p == test_p == p``; the delta is matched
    minus zero-shot, in percentage points.
    """
    banner = "="*70
    print("\n" + banner)
    print("COMPARATIVE ANALYSIS (PPO)")
    print(banner)

    print("\nZero-shot vs Matched training:")
    for test_p in FAILURE_PROBS:
        zero_shot_rows = df2[df2['test_p'] == test_p]
        matched_rows = df1[(df1['train_p'] == test_p) & (df1['test_p'] == test_p)]

        exp2_rate = zero_shot_rows['success_rate'].mean()
        exp1_rate = matched_rows['success_rate'].mean()
        improvement = (exp1_rate - exp2_rate) * 100

        line = (f"  p={test_p:.2f}: Zero-shot={exp2_rate*100:5.1f}%, "
                f"Matched={exp1_rate*100:5.1f}%, Δ={improvement:+5.1f}pp")
        print(line)

print("✓ Analysis module loaded")


### Main Execution:

In [23]:
def _load_cached_results(csv_path):
    """Return the results CSV at ``csv_path`` as a DataFrame, or ``None`` if absent."""
    if csv_path.exists():
        return pd.read_csv(csv_path)
    return None


def _has_rows(df):
    """Return True when ``df`` is a DataFrame with at least one row."""
    return df is not None and not df.empty


def run_complete_pipeline():
    """Run complete PPO research pipeline with vectorized training.

    The stages are switched by the flags below.  Plotting and analysis use
    the DataFrames produced by the evaluation stages of this run; when an
    evaluation stage is switched off they fall back to the results CSV left
    by an earlier run, and are skipped if neither is available.
    """

    RUN_TRAINING_EXP1 = False
    RUN_TRAINING_EXP2 = True
    RUN_EVALUATION_EXP1 = False
    RUN_EVALUATION_EXP2 = True
    GENERATE_PLOTS = True
    RUN_ANALYSIS = True

    exp1_results_dir = Path("results/exp1_ppo")
    exp2_results_dir = Path("results/exp2_ppo")
    df_exp1 = None
    df_exp2 = None

    if RUN_TRAINING_EXP1:
        train_experiment1(Path("models/exp1_ppo"), N_ENVS, verbose=1)

    if RUN_EVALUATION_EXP1:
        models_exp1 = load_all_models_exp1(Path("models/exp1_ppo"))
        df_exp1 = evaluate_experiment1(models_exp1, N_EVAL_EPISODES, exp1_results_dir)

    if RUN_TRAINING_EXP2:
        train_experiment2(Path("models/exp2_ppo"), N_ENVS, verbose=1)

    if RUN_EVALUATION_EXP2:
        models_exp2 = load_all_models_exp2(Path("models/exp2_ppo"))
        df_exp2 = evaluate_experiment2(models_exp2, N_EVAL_EPISODES,
                                       record_gifs=True, output_dir=exp2_results_dir)

    if GENERATE_PLOTS or RUN_ANALYSIS:
        # Reuse results of an earlier run for stages that were not re-run.
        if df_exp1 is None:
            df_exp1 = _load_cached_results(exp1_results_dir / "exp1_ppo_results.csv")
        if df_exp2 is None:
            df_exp2 = _load_cached_results(exp2_results_dir / "exp2_ppo_results.csv")

    # These two flags were previously defined but never acted upon.
    if GENERATE_PLOTS:
        if _has_rows(df_exp1):
            exp1_plots_dir = exp1_results_dir / "plots"
            # Created here because not every plotting helper does so itself.
            exp1_plots_dir.mkdir(parents=True, exist_ok=True)
            create_all_exp1_plots(df_exp1, exp1_plots_dir)
        if _has_rows(df_exp2):
            exp2_plots_dir = exp2_results_dir / "plots"
            exp2_plots_dir.mkdir(parents=True, exist_ok=True)
            create_all_exp2_plots(df_exp2, exp2_plots_dir)

    if RUN_ANALYSIS:
        if _has_rows(df_exp1):
            analyze_exp1(df_exp1)
        if _has_rows(df_exp2):
            analyze_exp2(df_exp2)
        if _has_rows(df_exp1) and _has_rows(df_exp2):
            compare_experiments(df_exp1, df_exp2)

    print("\n" + "="*70)
    print("✓ PIPELINE COMPLETE")
    print("="*70)

In [22]:
# Banner, then the full pipeline run.
# `N_ENVS//1.5` is a floor division, so 16 environments print as "~10x".
# NOTE(review): this figure is a rule-of-thumb guess, not a measured speedup.
print("\n" + "="*70)
print("STARTING PPO WITH VECTORIZED TRAINING")
print(f"Using {N_ENVS} parallel environments for ~{N_ENVS//1.5:.0f}x speedup")
print("="*70)

run_complete_pipeline()


STARTING PPO WITH VECTORIZED TRAINING
Using 16 parallel environments for ~10x speedup

EXPERIMENT 1 (PPO): Training 30 agents (16 parallel envs)

Training PPO | p=0.00 | seed=0 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p00_seed0.zip

Training PPO | p=0.00 | seed=42 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p00_seed42.zip

Training PPO | p=0.00 | seed=1729 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p00_seed1729.zip

Training PPO | p=0.00 | seed=6174 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p00_seed6174.zip

Training PPO | p=0.00 | seed=196 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p00_seed196.zip

Training PPO | p=0.05 | seed=0 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p05_seed0.zip

Training PPO | p=0.05 | seed=42 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p05_seed42.zip

Training PPO | p=0.05 | seed=1729 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p05_seed1729.zip

Training PPO | p=0.05 | seed=6174 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p05_seed6174.zip

Training PPO | p=0.05 | seed=196 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p05_seed196.zip

Training PPO | p=0.10 | seed=0 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p10_seed0.zip

Training PPO | p=0.10 | seed=42 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p10_seed42.zip

Training PPO | p=0.10 | seed=1729 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p10_seed1729.zip

Training PPO | p=0.10 | seed=6174 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p10_seed6174.zip

Training PPO | p=0.10 | seed=196 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p10_seed196.zip

Training PPO | p=0.15 | seed=0 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p15_seed0.zip

Training PPO | p=0.15 | seed=42 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p15_seed42.zip

Training PPO | p=0.15 | seed=1729 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p15_seed1729.zip

Training PPO | p=0.15 | seed=6174 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p15_seed6174.zip

Training PPO | p=0.15 | seed=196 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p15_seed196.zip

Training PPO | p=0.20 | seed=0 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p20_seed0.zip

Training PPO | p=0.20 | seed=42 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p20_seed42.zip

Training PPO | p=0.20 | seed=1729 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p20_seed1729.zip

Training PPO | p=0.20 | seed=6174 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p20_seed6174.zip

Training PPO | p=0.20 | seed=196 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p20_seed196.zip

Training PPO | p=0.25 | seed=0 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p25_seed0.zip

Training PPO | p=0.25 | seed=42 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p25_seed42.zip

Training PPO | p=0.25 | seed=1729 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p25_seed1729.zip

Training PPO | p=0.25 | seed=6174 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p25_seed6174.zip

Training PPO | p=0.25 | seed=196 | 16 parallel envs


Output()

✓ Saved: models\exp1_ppo\ppo_p25_seed196.zip

✓ Experiment 1 training complete
✓ Loaded 30/30 models

EXPERIMENT 1 (PPO): Cross-evaluation

Model trained at p=0.00, seed=0
  Testing at p=0.00... ✓ (return=242.6)
  Testing at p=0.05... ✓ (return=239.5)
  Testing at p=0.10... ✓ (return=242.8)
  Testing at p=0.15... ✓ (return=235.9)
  Testing at p=0.20... ✓ (return=238.8)
  Testing at p=0.25... ✓ (return=216.0)

Model trained at p=0.00, seed=42
  Testing at p=0.00... ✓ (return=239.3)
  Testing at p=0.05... ✓ (return=250.1)
  Testing at p=0.10... ✓ (return=228.2)
  Testing at p=0.15... ✓ (return=227.7)
  Testing at p=0.20... ✓ (return=229.9)
  Testing at p=0.25... ✓ (return=222.5)

Model trained at p=0.00, seed=1729
  Testing at p=0.00... ✓ (return=256.6)
  Testing at p=0.05... ✓ (return=253.1)
  Testing at p=0.10... ✓ (return=242.6)
  Testing at p=0.15... ✓ (return=245.2)
  Testing at p=0.20... ✓ (return=238.8)
  Testing at p=0.25... ✓ (return=225.6)

Model trained at p=0.00, seed=6174
  

In [24]:
# Announce the vectorized-training setup, then launch the full pipeline.
_banner_rule = "=" * 70
# Speedup figure shown in the banner: env count floor-divided by 1.5.
_speedup_estimate = N_ENVS // 1.5

print(
    "\n" + _banner_rule,
    "STARTING PPO WITH VECTORIZED TRAINING",
    f"Using {N_ENVS} parallel environments for ~{_speedup_estimate:.0f}x speedup",
    _banner_rule,
    sep="\n",
)

run_complete_pipeline()


STARTING PPO WITH VECTORIZED TRAINING
Using 16 parallel environments for ~10x speedup

EXPERIMENT 2 (PPO): Training 5 agents (16 parallel envs)

Training PPO | p=0.00 | seed=0 | 16 parallel envs


Output()

✓ Saved: models\exp2_ppo\ppo_p00_seed0.zip

Training PPO | p=0.00 | seed=42 | 16 parallel envs


Output()

✓ Saved: models\exp2_ppo\ppo_p00_seed42.zip

Training PPO | p=0.00 | seed=1729 | 16 parallel envs


Output()

✓ Saved: models\exp2_ppo\ppo_p00_seed1729.zip

Training PPO | p=0.00 | seed=6174 | 16 parallel envs


Output()

✓ Saved: models\exp2_ppo\ppo_p00_seed6174.zip

Training PPO | p=0.00 | seed=196 | 16 parallel envs


Output()

✓ Saved: models\exp2_ppo\ppo_p00_seed196.zip

✓ Experiment 2 training complete
✓ Loaded 5/5 models

EXPERIMENT 2 (PPO): Zero-shot evaluation

Model seed=0
  Testing at p=0.00... ✓ GIF saved (2× speed) (return=242.6)
  Testing at p=0.05... ✓ GIF saved (2× speed) (return=239.5)
  Testing at p=0.10... ✓ GIF saved (2× speed) (return=242.8)
  Testing at p=0.15... ✓ GIF saved (2× speed) (return=235.9)
  Testing at p=0.20... ✓ GIF saved (2× speed) (return=238.8)
  Testing at p=0.25... ✓ GIF saved (2× speed) (return=216.0)

Model seed=42
  Testing at p=0.00... ✓ GIF saved (2× speed) (return=239.3)
  Testing at p=0.05... ✓ GIF saved (2× speed) (return=250.1)
  Testing at p=0.10... ✓ GIF saved (2× speed) (return=228.2)
  Testing at p=0.15... ✓ GIF saved (2× speed) (return=227.7)
  Testing at p=0.20... ✓ GIF saved (2× speed) (return=229.9)
  Testing at p=0.25... ✓ GIF saved (2× speed) (return=222.5)

Model seed=1729
  Testing at p=0.00... ✓ GIF saved (2× speed) (return=256.6)
  Testing at p=0.05.

In [26]:

# ============================================================================
# REBUILD ALL PLOTS AND ANALYSES FROM THE RESULT CSVs ALREADY ON DISK
# ============================================================================

_RULE = "=" * 70


def _section_break():
    """Print a blank line followed by a 70-character '=' rule."""
    print("\n" + _RULE)


_section_break()
print("GENERATING ALL PLOTS FROM RESULTS")
print(_RULE)

# --- Experiment 1 results table ---
print("\nLoading Experiment 1 results...")
_exp1_csv = Path("results/exp1_ppo/exp1_ppo_results.csv")
df_exp1 = pd.read_csv(_exp1_csv)
print(f"✓ Loaded exp1: {len(df_exp1)} rows")

# --- Experiment 2 results table ---
print("Loading Experiment 2 results...")
_exp2_csv = Path("results/exp2_ppo/exp2_ppo_results.csv")
df_exp2 = pd.read_csv(_exp2_csv)
print(f"✓ Loaded exp2: {len(df_exp2)} rows")

# Plot output folders; created (with parents) if they do not exist yet.
plots_dir_exp1 = Path("results/exp1_ppo/plots")
plots_dir_exp2 = Path("results/exp2_ppo/plots")
for _plots_dir in (plots_dir_exp1, plots_dir_exp2):
    _plots_dir.mkdir(parents=True, exist_ok=True)

# Plots for each experiment, each preceded by a separator rule.
_section_break()
create_all_exp1_plots(df_exp1, plots_dir_exp1)

_section_break()
create_all_exp2_plots(df_exp2, plots_dir_exp2)

# Per-experiment analysis, then the cross-experiment comparison.
_section_break()
analyze_exp1(df_exp1)

_section_break()
analyze_exp2(df_exp2)

_section_break()
compare_experiments(df_exp1, df_exp2)

# Closing banner and a reminder of where the plots were written.
_section_break()
print("✓✓✓ ALL PLOTS GENERATED AND SAVED ✓✓✓")
print(_RULE)
print(
    "\nPlot locations:",
    "  results/exp1_ppo/plots/ - Experiment 1 heatmaps and robustness curves",
    "  results/exp2_ppo/plots/ - Experiment 2 performance curves",
    sep="\n",
)



GENERATING ALL PLOTS FROM RESULTS

Loading Experiment 1 results...
✓ Loaded exp1: 180 rows
Loading Experiment 2 results...
✓ Loaded exp2: 30 rows


Generating Experiment 1 (PPO) plots...
✓ All Experiment 1 (PPO) plots generated


Generating Experiment 2 (PPO) plots...
✓ All Experiment 2 (PPO) plots generated


EXPERIMENT 1 (PPO): Analysis

Best training probability for each test level:
  Test p=0.00: Best train p=0.10 (success=92.6%)
  Test p=0.05: Best train p=0.10 (success=91.4%)
  Test p=0.10: Best train p=0.20 (success=89.6%)
  Test p=0.15: Best train p=0.10 (success=89.0%)
  Test p=0.20: Best train p=0.00 (success=86.4%)
  Test p=0.25: Best train p=0.20 (success=83.2%)

Generalization gap:
  On-distribution:  85.3%
  Off-distribution: 85.9%
  Gap: -0.6 pp


EXPERIMENT 2 (PPO): Analysis

No critical failure threshold (robust up to 25%)

Performance degradation:
  p=0%:  89.4%
  p=25%: 80.0%
  Drop:  9.4 pp

Time-to-failure analysis:
  p=0.00: 337.8 steps
  p=0.05: 317.2 steps
  p=