In [1]:
import os
import time

import gymnasium as gym
import qwop_gym  # This registers the QWOP-v1 environment
from stable_baselines3 import PPO

# Locations of the Chromium-based browser and the matching chromedriver binary
# that are handed to the QWOP environment constructors further down.
# The defaults are the original Windows install paths; set QWOP_BROWSER_PATH /
# QWOP_DRIVER_PATH in the environment to run the notebook on another machine
# without editing this cell.
browser_path = os.environ.get(
    "QWOP_BROWSER_PATH",
    "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe",
)
driver_path = os.environ.get(
    "QWOP_DRIVER_PATH",
    "C:\\Program Files\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe",
)

In [2]:
# Evaluate the Stable-Baselines3 PPO checkpoint: load it, play NUM_EPISODES
# episodes in a visible browser window, print per-episode stats and a summary.
# Relies on `browser_path` / `driver_path` from the setup cell.

# Load the trained model
model_path = "../data/Good Models/stable_baselines_ppo_1.zip"
print(f"Loading model from {model_path}...")
model = PPO.load(model_path)
print("Model loaded successfully!\n")

# Single source of truth for the episode count: the banner, the loop and the
# summary denominator all read this value.
NUM_EPISODES = 5

# Create environment for evaluation (with slower frameskip to watch better)
eval_env = gym.make(
    'QWOP-v1',
    browser=browser_path,
    driver=driver_path,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,  # Slower for better visualization
    max_episode_steps=3000,
    text_in_browser="🤖 Trained AI Playing"
)

episode_rewards = []
episode_distances = []
episode_successes = []  # one bool per finished episode, from info['is_success']

# The rollout runs under try/finally so the environment is always closed,
# even when a step raises or the cell is interrupted from the notebook UI.
try:
    print(f"Playing {NUM_EPISODES} episodes with the trained model...")
    print("Watch the browser to see your AI in action!\n")

    for episode in range(NUM_EPISODES):
        obs, info = eval_env.reset()
        episode_reward = 0
        steps = 0

        print(f"Episode {episode + 1}:")

        while True:
            # Use the trained model to predict actions (deterministic for consistency)
            action, _states = model.predict(obs, deterministic=True)

            obs, reward, terminated, truncated, info = eval_env.step(action)
            episode_reward += reward
            steps += 1

            if terminated or truncated:
                distance = info.get('distance', 0)
                # The same flag drives both the per-episode label and the
                # summary tally, so the two can never disagree.
                succeeded = bool(info.get('is_success'))
                episode_rewards.append(episode_reward)
                episode_distances.append(distance)
                episode_successes.append(succeeded)

                print(f"  Steps: {steps}")
                print(f"  Distance: {distance:.2f}m")
                print(f"  Total Reward: {episode_reward:.2f}")
                print(f"  {'✓ SUCCESS!' if succeeded else '✗ Failed'}\n")
                break

            time.sleep(0.02)  # Small delay to watch the game
finally:
    eval_env.close()

print("\n" + "="*50)
print("PERFORMANCE SUMMARY")
print("="*50)
if episode_distances:
    print(f"Average Distance: {sum(episode_distances)/len(episode_distances):.2f}m")
    print(f"Best Distance: {max(episode_distances):.2f}m")
    print(f"Average Reward: {sum(episode_rewards)/len(episode_rewards):.2f}")
    print(f"Successes: {sum(episode_successes)}/{len(episode_successes)}")
else:
    # Only reachable when NUM_EPISODES is 0; avoids dividing by zero.
    print("No episodes were played.")
print("="*50)

print("\n✓ Evaluation complete!")

Loading model from ../data/Good Models/stable_baselines_ppo_1.zip...
Model loaded successfully!

Model loaded successfully!

Playing 5 episodes with the trained model...
Watch the browser to see your AI in action!

Episode 1:
Playing 5 episodes with the trained model...
Watch the browser to see your AI in action!

Episode 1:
  Steps: 3000
  Distance: 72.57m
  Total Reward: -45.64
  âœ— Failed

Episode 2:
  Steps: 3000
  Distance: 72.57m
  Total Reward: -45.64
  âœ— Failed

Episode 2:
  Steps: 3000
  Distance: 71.07m
  Total Reward: -46.90
  âœ— Failed

Episode 3:
  Steps: 3000
  Distance: 71.07m
  Total Reward: -46.90
  âœ— Failed

Episode 3:
  Steps: 3000
  Distance: 72.57m
  Total Reward: -45.77
  âœ— Failed

Episode 4:
  Steps: 3000
  Distance: 72.57m
  Total Reward: -45.77
  âœ— Failed

Episode 4:
  Steps: 3000
  Distance: 71.07m
  Total Reward: -46.90
  âœ— Failed

Episode 5:
  Steps: 3000
  Distance: 71.07m
  Total Reward: -46.90
  âœ— Failed

Episode 5:
  Steps: 3000
  Distance:

In [3]:
# Import necessary libraries and custom PPO implementation
import numpy as np
import torch
import torch.nn as nn
import gymnasium as gym
import qwop_gym

# Reload the train_ppo module so that edits made to train_ppo.py after the
# kernel started are picked up by the from-import below.
# The module is bound to its own alias because the from-import rebinds the bare
# name `train_ppo` to the training function; keeping them apart means the
# reload target is always the module, whichever order the lines are read in.
import importlib
import train_ppo as train_ppo_module
importlib.reload(train_ppo_module)

# Import custom PPO components from train_ppo.py
from train_ppo import (
    ActorCritic,
    RolloutBuffer,
    create_qwop_env,
    train_ppo,
    evaluate_policy,
    save_checkpoint,
    load_checkpoint
)

print("✓ All modules imported successfully!")

# PPO hyperparameters.
# Of these, only HIDDEN_SIZE is read by the cells in this notebook (it sizes the
# ActorCritic network when a checkpoint is loaded). The remaining values are
# presumably consumed by the training code in train_ppo.py -- confirm there
# before changing them.

# --- network ---
HIDDEN_SIZE = 256

# --- schedule / batching ---
TOTAL_STEPS = 50_000
ROLLOUT_STEPS = 2_048
BATCH_SIZE = 256
PPO_EPOCHS = 4

# --- return / advantage estimation ---
GAMMA = 0.995
GAE_LAMBDA = 0.95

# --- optimisation and loss weighting ---
LEARNING_RATE = 3e-4
CLIP_COEF = 0.2
VALUE_COEF = 0.5
ENTROPY_COEF = 0.01
MAX_GRAD_NORM = 0.5

✓ All modules imported successfully!


In [4]:
# Load the custom-PPO checkpoint and evaluate it in a visible browser window.
# Relies on `browser_path` / `driver_path` from the setup cell and on
# HIDDEN_SIZE plus the train_ppo helpers from the import cell.
MODEL_PATH = "../data/Good Models/custom_ppo.pt"

# Episode count shared by the banner text and the evaluate_policy call.
NUM_EPISODES = 5

print(f"Loading model from: {MODEL_PATH}")

# Create evaluation environment
eval_env = create_qwop_env(
    browser_path=browser_path,
    driver_path=driver_path,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,
    max_episode_steps=5000,
    text_in_browser="🤖 PPO Agent Evaluation"
)

# Everything that needs the live environment runs under try/finally so the
# environment is closed even if checkpoint loading or evaluation raises, or
# the cell is interrupted from the notebook UI.
try:
    # Initialize policy and load checkpoint
    state_dim = eval_env.observation_space.shape[0]
    action_dim = eval_env.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    eval_policy = ActorCritic(state_dim, action_dim, HIDDEN_SIZE).to(device)
    checkpoint = load_checkpoint(MODEL_PATH, eval_policy, device)

    print(f"✓ Model loaded (trained for {checkpoint['step']} steps)")
    print(f"\nEvaluating agent for {NUM_EPISODES} episodes...")
    print("Watch the browser to see your trained AI in action!\n")

    # Evaluate the policy
    results = evaluate_policy(
        env=eval_env,
        policy=eval_policy,
        device=device,
        num_episodes=NUM_EPISODES,
        verbose=True,
        render_delay=0.02
    )
finally:
    # Close environment
    eval_env.close()

print("\n✓ Evaluation complete!")

Loading model from: ../data/Good Models/custom_ppo.pt
âœ“ Model loaded (trained for 500000 steps)

Evaluating agent for 5 episodes...
Watch the browser to see your trained AI in action!

Episode 1:
âœ“ Model loaded (trained for 500000 steps)

Evaluating agent for 5 episodes...
Watch the browser to see your trained AI in action!

Episode 1:
  Steps: 1260
  Distance: 100.59m
  Total Reward: 83.37
  âœ“ SUCCESS!

Episode 2:
  Steps: 1260
  Distance: 100.59m
  Total Reward: 83.37
  âœ“ SUCCESS!

Episode 2:
  Steps: 1286
  Distance: 100.53m
  Total Reward: 82.36
  âœ“ SUCCESS!

Episode 3:
  Steps: 1286
  Distance: 100.53m
  Total Reward: 82.36
  âœ“ SUCCESS!

Episode 3:
  Steps: 28
  Distance: 0.36m
  Total Reward: -10.85
  âœ— Failed

Episode 4:
  Steps: 28
  Distance: 0.36m
  Total Reward: -10.85
  âœ— Failed

Episode 4:
  Steps: 1260
  Distance: 100.70m
  Total Reward: 83.35
  âœ“ SUCCESS!

Episode 5:
  Steps: 1260
  Distance: 100.70m
  Total Reward: 83.35
  âœ“ SUCCESS!

Episode 5:
  St

In [5]:
# Load the custom-PPO checkpoint saved as custom_ppo_and_reward.pt and evaluate
# it in a visible browser window.
# Relies on `browser_path` / `driver_path` from the setup cell and on
# HIDDEN_SIZE plus the train_ppo helpers from the import cell.
MODEL_PATH = "../data/Good Models/custom_ppo_and_reward.pt"

# Episode count shared by the banner text and the evaluate_policy call.
NUM_EPISODES = 5

print(f"Loading model from: {MODEL_PATH}")

# Create evaluation environment
eval_env = create_qwop_env(
    browser_path=browser_path,
    driver_path=driver_path,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,
    max_episode_steps=5000,
    text_in_browser="🤖 PPO Agent Evaluation"
)

# Everything that needs the live environment runs under try/finally so the
# environment is closed even if checkpoint loading or evaluation raises, or
# the cell is interrupted from the notebook UI.
try:
    # Initialize policy and load checkpoint
    state_dim = eval_env.observation_space.shape[0]
    action_dim = eval_env.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    eval_policy = ActorCritic(state_dim, action_dim, HIDDEN_SIZE).to(device)
    checkpoint = load_checkpoint(MODEL_PATH, eval_policy, device)

    print(f"✓ Model loaded (trained for {checkpoint['step']} steps)")
    print(f"\nEvaluating agent for {NUM_EPISODES} episodes...")
    print("Watch the browser to see your trained AI in action!\n")

    # Evaluate the policy
    results = evaluate_policy(
        env=eval_env,
        policy=eval_policy,
        device=device,
        num_episodes=NUM_EPISODES,
        verbose=True,
        render_delay=0.02
    )
finally:
    # Close environment
    eval_env.close()

print("\n✓ Evaluation complete!")

Loading model from: ../data/Good Models/custom_ppo_and_reward.pt
âœ“ Model loaded (trained for 25000 steps)

Evaluating agent for 5 episodes...
Watch the browser to see your trained AI in action!

Episode 1:
âœ“ Model loaded (trained for 25000 steps)

Evaluating agent for 5 episodes...
Watch the browser to see your trained AI in action!

Episode 1:
  Steps: 1743
  Distance: 36.02m
  Total Reward: -41.15
  âœ— Failed

Episode 2:
  Steps: 1743
  Distance: 36.02m
  Total Reward: -41.15
  âœ— Failed

Episode 2:
  Steps: 15
  Distance: 0.63m
  Total Reward: -10.21
  âœ— Failed

Episode 3:
  Steps: 15
  Distance: 0.63m
  Total Reward: -10.21
  âœ— Failed

Episode 3:
  Steps: 21
  Distance: -0.32m
  Total Reward: -11.15
  âœ— Failed

Episode 4:
  Steps: 21
  Distance: -0.32m
  Total Reward: -11.15
  âœ— Failed

Episode 4:
  Steps: 26
  Distance: 0.70m
  Total Reward: -10.51
  âœ— Failed

Episode 5:
  Steps: 26
  Distance: 0.70m
  Total Reward: -10.51
  âœ— Failed

Episode 5:
  Steps: 4712
  D