In [1]:
import os
import importlib
import numpy as np
import torch
import torch.nn as nn
import gymnasium as gym
import qwop_gym
from pathlib import Path

# Reload train_ppo module to pick up latest changes
import train_ppo
importlib.reload(train_ppo)

# Import custom PPO components from train_ppo.py
from train_ppo import (
    ActorCritic,
    RolloutBuffer,
    create_qwop_env,
    train_ppo,
    evaluate_policy,
    save_checkpoint,
    load_checkpoint
)

print("âœ“ All modules imported successfully!")
print(f"Using PyTorch device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")


âœ“ All modules imported successfully!
Using PyTorch device: cpu


# Custom PPO Training with Custom Reward Function

Train a PPO agent with a custom reward wrapper that penalizes poor posture.


In [7]:
# Custom Reward Wrapper
class CustomRewardWrapper(gym.Wrapper):
    """
    Wrapper to customize the reward function for QWOP.
    
    Default reward is typically based on distance traveled.
    You can modify this to add penalties, bonuses, or completely change the reward structure.
    """
    
    def __init__(self, env, penalty_scale=0.1, y_threshold=1.5, verbose=False):
        super().__init__(env)
        self.prev_distance = 0
        self.steps_taken = 0
        self.torso_y_idx = 1  # Index for torso Y-level in observation
        self.y_threshold = y_threshold  # Minimum acceptable torso height
        self.penalty_scale = penalty_scale  # Scale for posture penalty
        self.verbose = verbose  # Print reward breakdown occasionally
        self.total_base_reward = 0
        self.total_penalty = 0
        self.episode_steps = 0
    
    def reset(self, **kwargs):
        if self.verbose and self.episode_steps > 0:
            print(f"\n[CustomReward] Episode Summary:")
            print(f"  Total base reward: {self.total_base_reward:.2f}")
            print(f"  Total penalty: {self.total_penalty:.2f}")
            print(f"  Penalty scale: {self.penalty_scale}")
            print(f"  Average penalty per step: {self.total_penalty/self.episode_steps:.4f}")
        
        self.prev_distance = 0
        self.steps_taken = 0
        self.total_base_reward = 0
        self.total_penalty = 0
        self.episode_steps = 0
        return self.env.reset(**kwargs)
    
    def step(self, action):
        obs, base_reward, terminated, truncated, info = self.env.step(action)
        self.steps_taken += 1
        
       # Get Torso Y-Level (Need to ensure this index is correct)
        y_torso = obs[self.torso_y_idx]

        posture_penalty = 0
        if y_torso > self.y_threshold:
            # Linear Penalty: -k * (y_threshold - y_torso)
            posture_penalty = -self.penalty_scale * (self.y_threshold - y_torso)

        custom_reward = base_reward + posture_penalty
        
        # Track cumulative rewards for debugging
        self.total_base_reward += base_reward
        self.total_penalty += posture_penalty
        
        # Verbose logging every 100 steps
        if self.verbose and self.steps_taken % 100 == 0:
            print(f"Step {self.steps_taken}: y_torso={y_torso:.3f}, "
                  f"base_reward={base_reward:.3f}, penalty={posture_penalty:.3f}, "
                  f"total_reward={custom_reward:.3f}")
        
        return obs, custom_reward, terminated, truncated, info

print("âœ“ CustomRewardWrapper defined!")
print("\nKey features:")
print("  - Penalizes low torso height (bad posture)")
print("  - Adjustable penalty_scale parameter")
print("  - Set verbose=True to see reward breakdown")
print("\nUsage:")
print("  env = CustomRewardWrapper(env, penalty_scale=0.5, y_threshold=1.5, verbose=True)")


âœ“ CustomRewardWrapper defined!

Key features:
  - Penalizes low torso height (bad posture)
  - Adjustable penalty_scale parameter
  - Set verbose=True to see reward breakdown

Usage:
  env = CustomRewardWrapper(env, penalty_scale=0.5, y_threshold=1.5, verbose=True)


## Custom Reward Wrapper

This wrapper modifies the reward function to penalize bad posture (low torso height).


In [5]:
# Configuration for QWOP environment
BROWSER_PATH = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
DRIVER_PATH = "C:\\Program Files\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"

# Create QWOP environment using the helper function
env = create_qwop_env(
    browser_path=BROWSER_PATH,
    driver_path=DRIVER_PATH,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,
    max_episode_steps=2000,
    text_in_browser="ðŸ¤– Training PPO Agent"
)

print(f"âœ“ Environment created successfully!")
print(f"State dimension: {env.observation_space.shape[0]}")
print(f"Action dimension: {env.action_space.n}")
print("Check your Brave browser - the QWOP game should be visible!")

âœ“ Environment created successfully!
State dimension: 60
Action dimension: 16
Check your Brave browser - the QWOP game should be visible!


## Train PPO Agent

In [9]:
# Training configuration

# PPO Hyperparameters
TOTAL_STEPS = 25000
ROLLOUT_STEPS = 2048
BATCH_SIZE = 256
PPO_EPOCHS = 4
GAMMA = 0.995
GAE_LAMBDA = 0.95
CLIP_COEF = 0.2
LEARNING_RATE = 3e-4
VALUE_COEF = 0.5
ENTROPY_COEF = 0.01
MAX_GRAD_NORM = 0.5
HIDDEN_SIZE = 256

# Custom Reward Parameters
PENALTY_SCALE = 0.5  # Increase this to make penalty more significant (try 0.5, 1.0, 2.0)
Y_THRESHOLD = 1.5    # Torso height threshold (lower = more strict)
REWARD_VERBOSE = True  # Set to True to see reward breakdown

# Output directory
OUTPUT_DIR = "../data/PPO-notebook"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create fresh environment for training
env = create_qwop_env(
    browser_path=BROWSER_PATH,
    driver_path=DRIVER_PATH,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=8,
    max_episode_steps=500,
    text_in_browser="ðŸ¤– Training PPO Agent"
)

# Apply custom reward wrapper with specified parameters
env = CustomRewardWrapper(env, penalty_scale=PENALTY_SCALE, y_threshold=Y_THRESHOLD, verbose=REWARD_VERBOSE)
print(f"âœ“ Custom reward wrapper applied!")
print(f"  Penalty scale: {PENALTY_SCALE}")
print(f"  Y threshold: {Y_THRESHOLD}")
print(f"  Verbose: {REWARD_VERBOSE}")

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print(f"State dimension: {state_dim}, Action dimension: {action_dim}")

# Initialize policy and optimizer
policy = ActorCritic(state_dim, action_dim, HIDDEN_SIZE).to(device)
optimizer = torch.optim.Adam(policy.parameters(), lr=LEARNING_RATE, eps=1e-5)

print("\nStarting PPO training...")
print(f"Total steps: {TOTAL_STEPS:,}")
print(f"Rollout steps: {ROLLOUT_STEPS:,}")
print(f"Watch the browser to see the agent learning!\n")

# Train the agent using the train_ppo function
results = train_ppo(
    env=env,
    policy=policy,
    optimizer=optimizer,
    device=device,
    total_steps=TOTAL_STEPS,
    rollout_steps=ROLLOUT_STEPS,
    batch_size=BATCH_SIZE,
    ppo_epochs=PPO_EPOCHS,
    gamma=GAMMA,
    gae_lambda=GAE_LAMBDA,
    clip_coef=CLIP_COEF,
    value_coef=VALUE_COEF,
    entropy_coef=ENTROPY_COEF,
    max_grad_norm=MAX_GRAD_NORM,
    model_dir=OUTPUT_DIR,
    save_every=10000,
    verbose=True
)

# Print training summary
print("\n" + "="*60)
print("TRAINING SUMMARY")
print("="*60)
print(f"Total steps: {results['total_steps']:,}")
print(f"Episodes completed: {results['completed_episodes']}")
print(f"Average episode reward: {results['average_reward']:.2f}")
print(f"Average episode length: {results['average_length']:.1f}")
print(f"Final model saved to: {results['final_checkpoint']}")
print("="*60)

# Close environment
env.close()
print("\nâœ“ Training complete!")


Using device: cpu


âœ“ Custom reward wrapper applied!
  Penalty scale: 0.5
  Y threshold: 1.5
  Verbose: True
State dimension: 60, Action dimension: 16

Starting PPO training...
Total steps: 25,000
Rollout steps: 2,048
Watch the browser to see the agent learning!

episode 1: reward=-10.12 len=  14
episode 2: reward=-10.47 len=  18
episode 3: reward=-11.56 len=  21
episode 4: reward=-10.85 len=  33
episode 5: reward=-10.46 len=  12
episode 6: reward=-10.14 len=  11
episode 7: reward=-10.43 len=  15





[info] KeyboardInterrupt received; saving final checkpoint...
Training interrupted. Latest policy saved to ..\data\PPO-notebook\ppo_final.pt

TRAINING SUMMARY
Total steps: 132
Episodes completed: 7
Average episode reward: -10.58
Average episode length: 17.7
Final model saved to: ..\data\PPO-notebook\ppo_final.pt

âœ“ Training complete!


## Evaluate Trained Agent

Load the trained model and watch it play QWOP:

In [10]:
# Load and evaluate the trained PPO agent
MODEL_PATH = "../data/PPO-notebook/custom_ppo_final.pt"  # Use custom trained model
HIDDEN_SIZE = 256

print(f"Loading model from: {MODEL_PATH}")

# Create evaluation environment
eval_env = create_qwop_env(
    browser_path=BROWSER_PATH,
    driver_path=DRIVER_PATH,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,
    max_episode_steps=5000,
    text_in_browser="ðŸ¤– PPO Agent Evaluation"
)

# Apply same custom reward wrapper (though rewards don't matter much during eval)
eval_env = CustomRewardWrapper(eval_env, penalty_scale=PENALTY_SCALE, y_threshold=Y_THRESHOLD, verbose=False)

# Initialize policy and load checkpoint
state_dim = eval_env.observation_space.shape[0]
action_dim = eval_env.action_space.n
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

eval_policy = ActorCritic(state_dim, action_dim, HIDDEN_SIZE).to(device)
checkpoint = load_checkpoint(MODEL_PATH, eval_policy, device)

print(f"âœ“ Model loaded (trained for {checkpoint['step']} steps)")
print("\nEvaluating agent for 5 episodes...")
print("Watch the browser to see your trained AI in action!\n")

# Evaluate the policy
results = evaluate_policy(
    env=eval_env,
    policy=eval_policy,
    device=device,
    num_episodes=5,
    verbose=True,
    render_delay=0.02
)

# Close environment
eval_env.close()
print("\nâœ“ Evaluation complete!")


Loading model from: ../data/PPO-notebook/custom_ppo_final.pt
âœ“ Model loaded (trained for 50000 steps)

Evaluating agent for 5 episodes...
Watch the browser to see your trained AI in action!

Episode 1:
âœ“ Model loaded (trained for 50000 steps)

Evaluating agent for 5 episodes...
Watch the browser to see your trained AI in action!

Episode 1:
  Steps: 2232
  Distance: 30.40m
  Total Reward: -61.66
  âœ— Failed

Episode 2:
  Steps: 2232
  Distance: 30.40m
  Total Reward: -61.66
  âœ— Failed

Episode 2:
  Steps: 2448
  Distance: 32.12m
  Total Reward: -67.71
  âœ— Failed

Episode 3:
  Steps: 2448
  Distance: 32.12m
  Total Reward: -67.71
  âœ— Failed

Episode 3:
  Steps: 1779
  Distance: 26.37m
  Total Reward: -49.71
  âœ— Failed

Episode 4:
  Steps: 1779
  Distance: 26.37m
  Total Reward: -49.71
  âœ— Failed

Episode 4:
  Steps: 1987
  Distance: 24.10m
  Total Reward: -58.36
  âœ— Failed

Episode 5:
  Steps: 1987
  Distance: 24.10m
  Total Reward: -58.36
  âœ— Failed

Episode 5:
  Ste