In [2]:
# Import necessary libraries and custom PPO implementation
import importlib
import os

import gymnasium as gym
import numpy as np
import qwop_gym
import torch
import torch.nn as nn

# Reload train_ppo module to pick up latest changes
import train_ppo
importlib.reload(train_ppo)

# Import custom PPO components from train_ppo.py
from train_ppo import (
    ActorCritic,
    RolloutBuffer,
    create_qwop_env,
    train_ppo,
    evaluate_policy,
    save_checkpoint,
    load_checkpoint
)

# Confirm the imports succeeded and report which device PyTorch will select.
# (The check mark was previously stored as mis-decoded UTF-8 bytes.)
print("✓ All modules imported successfully!")
print(f"Using PyTorch device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")

✓ All modules imported successfully!
Using PyTorch device: cpu


In [4]:
# Configuration for QWOP environment.
# The defaults below are Windows install locations. Set the QWOP_BROWSER_PATH /
# QWOP_DRIVER_PATH environment variables to run the notebook on another machine
# without editing this cell.
BROWSER_PATH = os.environ.get(
    "QWOP_BROWSER_PATH",
    r"C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe",
)
DRIVER_PATH = os.environ.get(
    "QWOP_DRIVER_PATH",
    r"C:\Program Files\chromedriver-win64\chromedriver-win64\chromedriver.exe",
)

# Fail fast with a readable message instead of an opaque webdriver error.
for _label, _path in (("browser", BROWSER_PATH), ("chromedriver", DRIVER_PATH)):
    if not os.path.isfile(_path):
        raise FileNotFoundError(
            f"QWOP {_label} executable not found: {_path!r}. "
            "Set QWOP_BROWSER_PATH / QWOP_DRIVER_PATH to the correct location."
        )

# Create QWOP environment using the helper function
env = create_qwop_env(
    browser_path=BROWSER_PATH,
    driver_path=DRIVER_PATH,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,
    max_episode_steps=500,
    text_in_browser="🤖 Training PPO Agent"
)

# Report the observation/action sizes so they can be checked against the
# network dimensions used later in the notebook.
print("✓ Environment created successfully!")
print(f"State dimension: {env.observation_space.shape[0]}")
print(f"Action dimension: {env.action_space.n}")
print("Check your Brave browser - the QWOP game should be visible!")

✓ Environment created successfully!
State dimension: 60
Action dimension: 16
Check your Brave browser - the QWOP game should be visible!


In [5]:
# Action space mapping: discrete action index -> key combination shown in logs
action_map = {
    0: "none", 1: "Q", 2: "W", 3: "O", 4: "P",
    5: "Q+W", 6: "Q+O", 7: "Q+P", 8: "W+O", 9: "W+P", 10: "O+P",
    11: "Q+W+O", 12: "Q+W+P", 13: "Q+O+P", 14: "W+O+P", 15: "Q+W+O+P"
}

print(f"Testing environment with {env.action_space.n} actions\n")

# Smoke test: play up to 50 random actions and log every 10th step.
# The training cell builds its own fresh environment, so this is the last use
# of the one created above. Close it when done (even on error) so its browser
# and driver are not left running. Re-run the environment-creation cell before
# re-running this one.
try:
    obs, info = env.reset()
    print(f"Initial observation shape: {obs.shape}")
    print(f"Initial info: {info}\n")

    # Take a few random actions
    total_reward = 0
    for step in range(50):
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward

        if step % 10 == 0:
            print(f"Step {step}: Action={action_map.get(action, 'UNKNOWN')}, "
                  f"Reward={reward:.2f}, Distance={info['distance']:.2f}m")

        if terminated or truncated:
            print(f"\nEpisode finished at step {step + 1}")
            print(f"Final distance: {info['distance']:.2f}m, Total reward: {total_reward:.2f}")
            break
finally:
    env.close()

print("\n✓ Environment test complete!")



Testing environment with 16 actions

Initial observation shape: (60,)
Initial info: {'time': np.float32(0.0053433334), 'distance': np.float32(0.25110978), 'avgspeed': np.float32(46.99497), 'is_success': False}

Step 0: Action=Q+W+P, Reward=0.08, Distance=0.24m
Step 10: Action=Q+W+P, Reward=0.02, Distance=0.61m
Step 20: Action=Q+W+O+P, Reward=-9.74, Distance=1.19m

Episode finished at step 21
Final distance: 1.19m, Total reward: -9.65

✓ Environment test complete!


## Train PPO Agent

In [6]:
# Training configuration
import os
from pathlib import Path

# Reproducibility: seed the global NumPy and PyTorch generators so network
# initialisation and action sampling repeat across runs.
# NOTE(review): the browser-driven environment may still be nondeterministic.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# PPO Hyperparameters
TOTAL_STEPS = 500000
ROLLOUT_STEPS = 2048
BATCH_SIZE = 256
PPO_EPOCHS = 4
GAMMA = 0.995
GAE_LAMBDA = 0.95
CLIP_COEF = 0.2
LEARNING_RATE = 3e-4
VALUE_COEF = 0.5
ENTROPY_COEF = 0.01
MAX_GRAD_NORM = 0.5
HIDDEN_SIZE = 256

# Output directory (checkpoints are written here via model_dir)
OUTPUT_DIR = "../data/PPO-notebook"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create fresh environment for training
env = create_qwop_env(
    browser_path=BROWSER_PATH,
    driver_path=DRIVER_PATH,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=8,
    max_episode_steps=500,
    text_in_browser="🤖 Training PPO Agent"
)

# Everything that uses the environment runs inside try/finally so the browser
# and driver are shut down even if training raises or is interrupted.
try:
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    print(f"State dimension: {state_dim}, Action dimension: {action_dim}")

    # Initialize policy and optimizer
    policy = ActorCritic(state_dim, action_dim, HIDDEN_SIZE).to(device)
    optimizer = torch.optim.Adam(policy.parameters(), lr=LEARNING_RATE, eps=1e-5)

    print("\nStarting PPO training...")
    print(f"Total steps: {TOTAL_STEPS:,}")
    print(f"Rollout steps: {ROLLOUT_STEPS:,}")
    print("Watch the browser to see the agent learning!\n")

    # Train the agent using the train_ppo function
    results = train_ppo(
        env=env,
        policy=policy,
        optimizer=optimizer,
        device=device,
        total_steps=TOTAL_STEPS,
        rollout_steps=ROLLOUT_STEPS,
        batch_size=BATCH_SIZE,
        ppo_epochs=PPO_EPOCHS,
        gamma=GAMMA,
        gae_lambda=GAE_LAMBDA,
        clip_coef=CLIP_COEF,
        value_coef=VALUE_COEF,
        entropy_coef=ENTROPY_COEF,
        max_grad_norm=MAX_GRAD_NORM,
        model_dir=OUTPUT_DIR,
        save_every=10000,
        verbose=True
    )

    # Print training summary
    print("\n" + "="*60)
    print("TRAINING SUMMARY")
    print("="*60)
    print(f"Total steps: {results['total_steps']:,}")
    print(f"Episodes completed: {results['completed_episodes']}")
    print(f"Average episode reward: {results['average_reward']:.2f}")
    print(f"Average episode length: {results['average_length']:.1f}")
    print(f"Final model saved to: {results['final_checkpoint']}")
    print("="*60)
finally:
    # Close environment
    env.close()

print("\n✓ Training complete!")

Using device: cpu
State dimension: 60, Action dimension: 16

Starting PPO training...
Total steps: 500,000
Rollout steps: 2,048
Watch the browser to see the agent learning!

episode 1: reward=-10.54 len=  32
episode 2: reward=-10.77 len=  18
episode 3: reward=-11.20 len=  18
episode 4: reward=-10.25 len=   7
episode 5: reward=-10.28 len=  16
episode 6: reward=-11.47 len=  38
episode 7: reward=-10.07 len=   8
episode 8: reward=-10.26 len=  12
episode 9: reward= -9.69 len=  17
episode 10: reward=-10.07 len=   8
episode 11: reward=-12.53 len=  72
episode 12: reward=-10.18 len=  14
episode 13: reward=-10.69 len=  32
episode 14: reward=-10.37 len=  15
episode 15: reward=-10.35 len=   8
episode 16: reward=-10.17 len=  10
episode 17: reward=-10.77 len=  31
episode 18: reward=-10.19 len=   7
episode 19: reward=-10.15 len=  30
episode 20: reward=-10.38 len=  11
episode 21: reward=-11.47 len=  45
episode 22: reward=-10.17 len=  12
episode 23: reward=-10.41 len=  12
episode 24: reward=-10.43 len=

  states = torch.as_tensor(self.states, dtype=torch.float32, device=self.device)


episode 113: reward=-10.21 len=  18
episode 114: reward=-10.53 len=  16
episode 115: reward=-14.24 len= 120
episode 116: reward=-10.18 len=   6
episode 117: reward=-10.03 len=   7
episode 118: reward=-10.08 len=   8
episode 119: reward=-10.21 len=   8
episode 120: reward=-11.13 len=  32
episode 121: reward=-10.10 len=   7
episode 122: reward= -9.98 len=  12
episode 123: reward=-10.63 len=  51
episode 124: reward= -9.94 len=   8
episode 125: reward=-10.04 len=  11
episode 126: reward=-10.36 len=  12
episode 127: reward=-10.12 len=   6
episode 128: reward= -9.88 len=   8
episode 129: reward=-10.26 len=   8
episode 130: reward=-11.16 len=  16
episode 131: reward=-11.70 len=  55
episode 132: reward=-10.24 len=   8
episode 133: reward=-10.26 len=  10
episode 134: reward=-10.16 len=  11
episode 135: reward=-10.28 len=  18
episode 136: reward=-10.57 len=  13
episode 137: reward=-11.24 len=  45
episode 138: reward=-11.51 len=  51
episode 139: reward=-10.14 len=   5
episode 140: reward=-10.17 l

## Evaluate Trained Agent

Load the trained model and watch it play QWOP:

In [8]:
# Load and evaluate the trained PPO agent
MODEL_PATH = "../data/PPO-notebook/ppo_final.pt"

# Check for the checkpoint before launching a browser session.
if not os.path.isfile(MODEL_PATH):
    raise FileNotFoundError(
        f"No checkpoint found at {MODEL_PATH!r}; run the training cell first."
    )

print(f"Loading model from: {MODEL_PATH}")

# Create evaluation environment
# NOTE(review): the training cell uses frames_per_step=8 while evaluation uses
# 4 - confirm that evaluating at a different frame skip is intentional.
eval_env = create_qwop_env(
    browser_path=BROWSER_PATH,
    driver_path=DRIVER_PATH,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,
    max_episode_steps=5000,
    text_in_browser="🤖 PPO Agent Evaluation"
)

# Run the evaluation inside try/finally so the browser and driver are shut
# down even if loading the checkpoint or a rollout fails.
try:
    # Initialize policy and load checkpoint.
    # HIDDEN_SIZE is defined in the training configuration cell and must match
    # the architecture stored in the checkpoint.
    state_dim = eval_env.observation_space.shape[0]
    action_dim = eval_env.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    eval_policy = ActorCritic(state_dim, action_dim, HIDDEN_SIZE).to(device)
    checkpoint = load_checkpoint(MODEL_PATH, eval_policy, device)

    print(f"✓ Model loaded (trained for {checkpoint['step']} steps)")
    print("\nEvaluating agent for 5 episodes...")
    print("Watch the browser to see your trained AI in action!\n")

    # Evaluate the policy
    results = evaluate_policy(
        env=eval_env,
        policy=eval_policy,
        device=device,
        num_episodes=5,
        verbose=True,
        render_delay=0.02
    )
finally:
    # Close environment
    eval_env.close()

print("\n✓ Evaluation complete!")

Loading model from: ../data/PPO-notebook/ppo_final.pt
✓ Model loaded (trained for 500000 steps)

Evaluating agent for 5 episodes...
Watch the browser to see your trained AI in action!

Episode 1:
  Steps: 1339
  Distance: 100.47m
  Total Reward: 80.69
  ✓ SUCCESS!

Episode 2:
  Steps: 32
  Distance: 0.43m
  Total Reward: -10.94
  ✗ Failed

Episode 3:
  Steps: 57
  Distance: 1.53m
  Total Reward: -11.01
  ✗ Failed

Episode 4:
  Steps: 1304
  Distance: 100.60m
  Total Reward: 81.81
  ✓ SUCCESS!

Episode 5:
  Steps: 1255
  Distance: 100.41m
  Total Reward: 83.31
  ✓ SUCCESS!


EVALUATION SUMMARY
Average Distance: 60.69m
Best Distance: 100.60m
Average Reward: 44.77
Success Rate: 60.0%

✓ Evaluation complete!
