# MountainCarContinuous-v0 – PPO vs SAC Pipeline

End-to-end Pipeline:
1. Environment-Setup
2. Hyperparameter-Definition
3. Training von PPO und SAC
4. Evaluation und Vergleich
5. Video-Recording der Policies
6. Side-by-Side-Merge mit ffmpeg


In [None]:
# filename: mountaincar_continuous_ppo_sac_pipeline_v1.ipynb
# model: GPT-5.1 Thinking – prompt: PPO vs SAC MountainCarContinuous pipeline notebook

import os
import subprocess

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3 import PPO, SAC
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder


def make_env():
    """Build the environment used for training.

    Returns:
        A ``MountainCarContinuous-v0`` Gymnasium environment wrapped in
        Stable-Baselines3's ``Monitor``.
    """
    return Monitor(gym.make("MountainCarContinuous-v0"))


def record_video(model, video_folder, video_name, max_steps=999):
    """Record one rollout of a trained model to ``<video_folder>/<video_name>.mp4``.

    The rollout runs on a fresh ``MountainCarContinuous-v0`` environment with
    ``render_mode="rgb_array"`` and stops after ``max_steps`` steps or as soon
    as the episode is reported done, whichever comes first.

    Args:
        model: Trained agent exposing ``predict(obs, deterministic=True)``.
        video_folder: Directory the video is written to (created if missing).
        video_name: Base name of the resulting file, without extension.
        max_steps: Maximum number of environment steps to record.

    Returns:
        Path of the recorded MP4 file.

    Raises:
        FileNotFoundError: If the recorder did not produce any video file.
    """
    os.makedirs(video_folder, exist_ok=True)

    def _make_env():
        return gym.make("MountainCarContinuous-v0", render_mode="rgb_array")

    vec_env = DummyVecEnv([_make_env])
    vec_env = VecVideoRecorder(
        vec_env,
        video_folder=video_folder,
        # Start recording immediately, at the very first step.
        record_video_trigger=lambda step: step == 0,
        video_length=max_steps,
        name_prefix=video_name,
    )

    try:
        obs = vec_env.reset()
        for _ in range(max_steps):
            action, _state = model.predict(obs, deterministic=True)
            obs, _, dones, _ = vec_env.step(action)
            if dones[0]:
                break
    finally:
        # Closing the recorder flushes the video; do it even if the rollout
        # raised so the environment is not leaked.
        vec_env.close()

    # VecVideoRecorder appends a step-range suffix to ``name_prefix``
    # ("<prefix>-step-<a>-to-step-<b>.mp4"). Callers expect a plain
    # "<video_name>.mp4", so move the newest matching recording there.
    recorded_prefix = f"{video_name}-step-"
    candidates = [
        os.path.join(video_folder, entry)
        for entry in os.listdir(video_folder)
        if entry.startswith(recorded_prefix) and entry.endswith(".mp4")
    ]
    if not candidates:
        raise FileNotFoundError(
            f"No recorded video with prefix {video_name!r} found in {video_folder!r}"
        )

    final_path = os.path.join(video_folder, f"{video_name}.mp4")
    os.replace(max(candidates, key=os.path.getmtime), final_path)
    return final_path


def merge_videos_side_by_side(
    input_left,
    input_right,
    output_path,
    label_left="PPO",
    label_right="SAC",
):
    """Merge two videos side by side with ffmpeg and overlay a label on each.

    Args:
        input_left: Path of the video placed on the left.
        input_right: Path of the video placed on the right.
        output_path: Path of the merged video; overwritten if it exists (``-y``).
        label_left: Text drawn in the top-left corner of the left video.
        label_right: Text drawn in the top-left corner of the right video.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # NOTE: the labels are inserted verbatim into the filter graph, so they
    # must not contain characters that are special to ffmpeg filters (such
    # as single quotes).
    filter_graph = (
        f"[0:v]drawtext=text='{label_left}':x=20:y=20:fontsize=36:fontcolor=white[v0];"
        f"[1:v]drawtext=text='{label_right}':x=20:y=20:fontsize=36:fontcolor=white[v1];"
        "[v0][v1]hstack=inputs=2[v]"
    )
    cmd = [
        "ffmpeg",
        "-i", input_left,
        "-i", input_right,
        "-filter_complex", filter_graph,
        "-map", "[v]",
        "-y",
        output_path,
    ]
    # Argument list (no shell) so paths with spaces are passed through safely.
    subprocess.run(cmd, check=True)



In [None]:
# filename: mountaincar_continuous_ppo_sac_pipeline_v1.ipynb
# model: GPT-5.1 Thinking – prompt: PPO vs SAC MountainCarContinuous pipeline notebook

# Training budget shared by both algorithms, and the video output directory.
TOTAL_TIMESTEPS = 300_000
VIDEO_FOLDER = "videos"

# Keyword arguments forwarded to the PPO constructor.
ppo_kwargs = {
    "learning_rate": 3e-4,
    "n_steps": 2048,
    "batch_size": 64,
    "n_epochs": 10,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "ent_coef": 0.0,
    "vf_coef": 0.5,
    "max_grad_norm": 0.5,
    "policy_kwargs": {"net_arch": [64, 64]},
    "verbose": 1,
}

# Keyword arguments forwarded to the SAC constructor.
sac_kwargs = {
    "learning_rate": 3e-4,
    "buffer_size": 100_000,
    "batch_size": 256,
    "tau": 0.005,
    "gamma": 0.99,
    "train_freq": 1,
    "gradient_steps": 1,
    "ent_coef": "auto",
    "verbose": 1,
}



In [None]:
# filename: mountaincar_continuous_ppo_sac_pipeline_v1.ipynb
# model: GPT-5.1 Thinking – prompt: PPO vs SAC MountainCarContinuous pipeline notebook

# --- PPO: build the env, train, save the model, release the env ---
ppo_env = DummyVecEnv([make_env])
ppo_model = PPO(policy="MlpPolicy", env=ppo_env, **ppo_kwargs)
ppo_model.learn(total_timesteps=TOTAL_TIMESTEPS)
ppo_model.save("ppo_mountaincar_continuous")
ppo_env.close()

# --- SAC: same procedure on its own, separately created env ---
sac_env = DummyVecEnv([make_env])
sac_model = SAC(policy="MlpPolicy", env=sac_env, **sac_kwargs)
sac_model.learn(total_timesteps=TOTAL_TIMESTEPS)
sac_model.save("sac_mountaincar_continuous")
sac_env.close()



In [None]:
# filename: mountaincar_continuous_ppo_sac_pipeline_v1.ipynb
# model: GPT-5.1 Thinking – prompt: PPO vs SAC MountainCarContinuous pipeline notebook

# Number of evaluation episodes per algorithm; also shown in the plot label.
N_EVAL_EPISODES = 20

# Use the same Monitor-wrapped environment factory as training, so that
# evaluate_policy reads episode returns from the Monitor wrapper.
eval_env = make_env()

ppo_mean_reward, ppo_std_reward = evaluate_policy(
    ppo_model,
    eval_env,
    n_eval_episodes=N_EVAL_EPISODES,
    deterministic=True,
)

sac_mean_reward, sac_std_reward = evaluate_policy(
    sac_model,
    eval_env,
    n_eval_episodes=N_EVAL_EPISODES,
    deterministic=True,
)

eval_env.close()

print(f"PPO mean reward: {ppo_mean_reward:.2f} ± {ppo_std_reward:.2f}")
print(f"SAC mean reward: {sac_mean_reward:.2f} ± {sac_std_reward:.2f}")

algorithms = ["PPO", "SAC"]
mean_rewards = [ppo_mean_reward, sac_mean_reward]
std_rewards = [ppo_std_reward, sac_std_reward]

# Bar chart of the mean episode reward, with the standard deviation as error bars.
plt.figure()
x_positions = np.arange(len(algorithms))
plt.bar(x_positions, mean_rewards, yerr=std_rewards, capsize=5)
plt.xticks(x_positions, algorithms)
plt.ylabel(f"Mean reward ({N_EVAL_EPISODES} episodes)")
plt.title("MountainCarContinuous-v0: PPO vs SAC")
plt.show()



In [None]:
# filename: mountaincar_continuous_ppo_sac_pipeline_v1.ipynb
# model: GPT-5.1 Thinking – prompt: PPO vs SAC MountainCarContinuous pipeline notebook

# Paths the side-by-side merge cell reads its two inputs from.
ppo_video_path = os.path.join(VIDEO_FOLDER, "ppo_mountaincar_continuous.mp4")
sac_video_path = os.path.join(VIDEO_FOLDER, "sac_mountaincar_continuous.mp4")

os.makedirs(VIDEO_FOLDER, exist_ok=True)

# Record one rollout per trained policy: PPO first, then SAC.
record_video(ppo_model, VIDEO_FOLDER, "ppo_mountaincar_continuous", max_steps=999)
record_video(sac_model, VIDEO_FOLDER, "sac_mountaincar_continuous", max_steps=999)

print("Videos gespeichert in:", os.path.abspath(VIDEO_FOLDER))



In [None]:
# filename: mountaincar_continuous_ppo_sac_pipeline_v1.ipynb
# model: GPT-5.1 Thinking – prompt: PPO vs SAC MountainCarContinuous pipeline notebook

# Destination of the combined PPO-vs-SAC comparison video.
side_by_side_output = os.path.join(VIDEO_FOLDER, "mountaincar_ppo_vs_sac_side_by_side.mp4")

# PPO goes on the left, SAC on the right.
merge_videos_side_by_side(
    ppo_video_path,
    sac_video_path,
    side_by_side_output,
    label_left="PPO",
    label_right="SAC",
)

print("Side-by-side Video erstellt:", os.path.abspath(side_by_side_output))

