In [5]:
import gymnasium as gym
import qwop_gym  # This registers the environment
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback


In [6]:
print(dir(qwop_gym))
# Check available environments

print(gym.envs.registry.keys())  # Look for qwop entries

['QwopEnv', 'RecordWrapper', 'VerboseWrapper', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'all', 'envs', 'gymnasium', 'wrappers']
dict_keys(['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'phys2d/CartPole-v0', 'phys2d/CartPole-v1', 'phys2d/Pendulum-v0', 'LunarLander-v2', 'LunarLanderContinuous-v2', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v2', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v0', 'Taxi-v3', 'tabular/Blackjack-v0', 'tabular/CliffWalking-v0', 'Reacher-v2', 'Reacher-v4', 'Pusher-v2', 'Pusher-v4', 'InvertedPendulum-v2', 'InvertedPendulum-v4', 'InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v4', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HalfCheetah-v4', 'Hopper-v2', 'Hopper-v3', 'Hopper-v4', 'Swimmer-v2', 'Swimmer-v3', 'Swimmer-v4', 'Walker2d-v2', 'Walker2d-v3', 'Walker2d-v4', 'Ant-v2', 'Ant-v3', 'Ant-v4', '

In [7]:
# Create/recreate the QWOP environment
browser_path = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
driver_path = "C:\\Program Files\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"

# Close existing environment if it exists
try:
    env.close()
    print("Closed existing environment")
except:
    pass

# Create new environment with visualization options
env = qwop_gym.QwopEnv(
    browser=browser_path, 
    driver=driver_path,
    stat_in_browser=True,  # Show statistics in browser
    game_in_browser=True,  # Show the game (default)
    auto_draw=True         # Automatically render each frame
)
print("Environment created successfully!")
print("Check your Brave browser - the QWOP game should be visible!")

Environment created successfully!
Check your Brave browser - the QWOP game should be visible!


In [8]:
# Test environment interaction with proper qwop-gym API
import time

# Action space mapping (for 16 actions - full action set)
action_map = {
    0: "none",
    1: "Q",
    2: "W", 
    3: "O",
    4: "P",
    5: "Q+W",
    6: "Q+O",
    7: "Q+P",
    8: "W+O",
    9: "W+P",
    10: "O+P",
    11: "Q+W+O",
    12: "Q+W+P",
    13: "Q+O+P",
    14: "W+O+P",
    15: "Q+W+O+P"
}

print(f"Environment action space size: {env.action_space.n}")
print(f"Action mapping: {action_map}\n")

# Reset environment - returns (observation, info) tuple
observation, info = env.reset()
print(f"Initial observation shape: {observation.shape}")
print(f"Initial observation (first 10 values): {observation[:10]}")
print(f"Initial info: {info}\n")

# Take some random actions
total_reward = 0
for step in range(300):
    # Random action (integer from 0 to action_space.n-1)
    action = env.action_space.sample()
    
    print(f"\nStep {step + 1}: Action {action} = '{action_map.get(action, 'UNKNOWN')}'")
    
    # Step environment - returns (obs, reward, terminated, truncated, info)
    observation, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    
    print(f"  Reward: {reward:.4f}, Total: {total_reward:.4f}")
    print(f"  Distance: {info['distance']:.2f}m, Time: {info['time']:.2f}s")
    print(f"  Terminated: {terminated}, Truncated: {truncated}")
    
    if terminated or truncated:
        print(f"\n{'SUCCESS!' if info.get('is_success') else 'FAILED!'} Episode finished after {step + 1} steps!")
        print(f"Final distance: {info['distance']:.2f}m")
        break
    
    time.sleep(0.05)  # Small delay to watch

print(f"\nFinal total reward: {total_reward:.4f}")
env.close()

Environment action space size: 16
Action mapping: {0: 'none', 1: 'Q', 2: 'W', 3: 'O', 4: 'P', 5: 'Q+W', 6: 'Q+O', 7: 'Q+P', 8: 'W+O', 9: 'W+P', 10: 'O+P', 11: 'Q+W+O', 12: 'Q+W+P', 13: 'Q+O+P', 14: 'W+O+P', 15: 'Q+W+O+P'}

Initial observation shape: (60,)
Initial observation (first 10 values): [-0.9763941  -0.1822861  -0.20857537 -0.49996138 -0.3929341  -0.9737969
 -0.55740976  0.01065133 -0.50024533 -0.39300483]
Initial info: {'time': np.float32(0.0065033333), 'distance': np.float32(0.25110978), 'avgspeed': np.float32(38.612473), 'is_success': False}


Step 1: Action 5 = 'Q+W'
  Reward: 0.1561, Total: 0.1561
  Distance: 0.25m, Time: 0.01s
  Terminated: False, Truncated: False

Step 2: Action 12 = 'Q+W+P'
  Reward: -0.0370, Total: 0.1191
  Distance: 0.25m, Time: 0.01s
  Terminated: False, Truncated: False

Step 3: Action 7 = 'Q+P'
  Reward: -0.0392, Total: 0.0799
  Distance: 0.25m, Time: 0.02s
  Terminated: False, Truncated: False

Step 4: Action 10 = 'O+P'
  Reward: -0.0372, Total: 0.

In [9]:
# Diagnostic: Test if environment can complete multiple episodes
import gymnasium as gym
import qwop_gym

browser_path = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
driver_path = "C:\\Program Files\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"

print("Creating environment...")
test_env = gym.make(
    'QWOP-v1',  # Correct environment ID
    browser=browser_path,
    driver=driver_path,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    max_episode_steps=500,
    frames_per_step=8
)

print("Testing 3 quick episodes...")
for episode in range(3):
    obs, info = test_env.reset()
    print(f"\nEpisode {episode + 1}:")
    print(f"  Initial obs shape: {obs.shape}")
    
    total_reward = 0
    for step in range(50):  # Just 50 steps per episode
        action = test_env.action_space.sample()
        obs, reward, terminated, truncated, info = test_env.step(action)
        total_reward += reward
        
        if terminated or truncated:
            print(f"  Episode ended at step {step + 1}")
            print(f"  Distance: {info.get('distance', 'N/A')}")
            print(f"  Total reward: {total_reward:.2f}")
            break
    else:
        print(f"  Completed 50 steps without terminating")
        print(f"  Distance: {info.get('distance', 'N/A')}")
        print(f"  Total reward: {total_reward:.2f}")

print("\n✓ Environment test complete!")
test_env.close()

Creating environment...
Testing 3 quick episodes...

Episode 1:
  Initial obs shape: (60,)
  Episode ended at step 9
  Distance: 1.3011192083358765
  Total reward: -9.80

Episode 2:
  Initial obs shape: (60,)
  Episode ended at step 44
  Distance: 1.263525128364563
  Total reward: -11.02

Episode 3:
  Initial obs shape: (60,)
  Episode ended at step 15
  Distance: 1.6054109334945679
  Total reward: -9.94

✓ Environment test complete!


## Debug Training Issues

If training shuts down after 1 step, try these diagnostics:

In [10]:
# Train PPO directly in notebook with better error handling
import gymnasium as gym
import qwop_gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
import os

# Create output directory
output_dir = "../data/PPO-notebook"
os.makedirs(output_dir, exist_ok=True)

browser_path = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
driver_path = "C:\\Program Files\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"

print("Creating training environment...")
# Create environment with gym.make for proper integration
train_env = gym.make(
    'QWOP-v1',  # Correct environment ID
    browser=browser_path,
    driver=driver_path,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=8,  # Frameskip for faster training
    max_episode_steps=500,
    text_in_browser="Training in progress..."
)

# Wrap with Monitor for better logging
train_env = Monitor(train_env, output_dir)

print("Creating PPO model...")
model = PPO(
    "MlpPolicy",
    train_env,
    verbose=1,
    learning_rate=0.001,
    n_steps=256,  # Small batch for testing
    batch_size=32,
    n_epochs=10,
    gamma=0.9,
    gae_lambda=0.98,
    clip_range=0.4,
    ent_coef=0.001,
    tensorboard_log=output_dir
)

# Create checkpoint callback
checkpoint_callback = CheckpointCallback(
    save_freq=10000,
    save_path=output_dir,
    name_prefix="ppo_qwop"
)

print("\nStarting training for 50,000 timesteps...")
print("Watch the browser window to see the agent learning!")
print("Press Ctrl+C to stop training early\n")

try:
    model.learn(
        total_timesteps=50000,  # Start with smaller number
        callback=checkpoint_callback,
        progress_bar=True
    )
    print("\n✓ Training completed successfully!")
    
    # Save final model
    model.save(f"{output_dir}/ppo_qwop_final")
    print(f"Model saved to {output_dir}/ppo_qwop_final.zip")
    
except KeyboardInterrupt:
    print("\n\n⚠ Training interrupted by user")
    model.save(f"{output_dir}/ppo_qwop_interrupted")
    print(f"Model saved to {output_dir}/ppo_qwop_interrupted.zip")
    
except Exception as e:
    print(f"\n\n❌ Training failed with error: {e}")
    import traceback
    traceback.print_exc()
    
finally:
    train_env.close()
    print("Environment closed.")

Creating training environment...
Creating PPO model...
Using cpu device
Wrapping the env in a DummyVecEnv.

Starting training for 50,000 timesteps...
Watch the browser window to see the agent learning!
Press Ctrl+C to stop training early

Logging to ../data/PPO-notebook\PPO_10
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.2     |
|    ep_rew_mean     | -10.6    |
|    success_rate    | 0        |
| time/              |          |
|    fps             | 70       |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 256      |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 20.2       |
|    ep_rew_mean          | -10.6      |
|    success_rate         | 0          |
| time/                   |            |
|    fps                  | 63         |
|    iterations           | 2          |
|    time_elapsed   


✓ Training completed successfully!
Model saved to ../data/PPO-notebook/ppo_qwop_final.zip
Environment closed.


## Watch Trained Model Play

Load the trained model and watch it play QWOP:

In [13]:
# Load and watch the trained model
import gymnasium as gym
import qwop_gym
from stable_baselines3 import PPO
import time

browser_path = "C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"
driver_path = "C:\\Program Files\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe"

# Load the trained model
model_path = "../data/PPO-notebook/ppo_qwop_final.zip"
print(f"Loading model from {model_path}...")
model = PPO.load(model_path)
print("✓ Model loaded successfully!\n")

# Create environment for evaluation (with slower frameskip to watch better)
eval_env = gym.make(
    'QWOP-v1',
    browser=browser_path,
    driver=driver_path,
    stat_in_browser=True,
    game_in_browser=True,
    auto_draw=True,
    frames_per_step=4,  # Slower for better visualization
    max_episode_steps=5000,
    text_in_browser="🤖 Trained AI Playing"
)

print("Playing 5 episodes with the trained model...")
print("Watch the browser to see your AI in action!\n")

episode_rewards = []
episode_distances = []

for episode in range(5):
    obs, info = eval_env.reset()
    episode_reward = 0
    steps = 0
    
    print(f"Episode {episode + 1}:")
    
    while True:
        # Use the trained model to predict actions (deterministic for consistency)
        action, _states = model.predict(obs, deterministic=True)
        
        obs, reward, terminated, truncated, info = eval_env.step(action)
        episode_reward += reward
        steps += 1
        
        if terminated or truncated:
            distance = info.get('distance', 0)
            episode_rewards.append(episode_reward)
            episode_distances.append(distance)
            
            print(f"  Steps: {steps}")
            print(f"  Distance: {distance:.2f}m")
            print(f"  Total Reward: {episode_reward:.2f}")
            print(f"  {'✓ SUCCESS!' if info.get('is_success') else '✗ Failed'}\n")
            break
        
        time.sleep(0.02)  # Small delay to watch the game

print("\n" + "="*50)
print("PERFORMANCE SUMMARY")
print("="*50)
print(f"Average Distance: {sum(episode_distances)/len(episode_distances):.2f}m")
print(f"Best Distance: {max(episode_distances):.2f}m")
print(f"Average Reward: {sum(episode_rewards)/len(episode_rewards):.2f}")
print(f"Successes: {sum(1 for d in episode_distances if d >= 100)}/5")
print("="*50)

eval_env.close()
print("\n✓ Evaluation complete!")

Loading model from ../data/PPO-notebook/ppo_qwop_final.zip...
✓ Model loaded successfully!

Playing 5 episodes with the trained model...
Watch the browser to see your AI in action!

Episode 1:
  Steps: 5000
  Distance: 72.57m
  Total Reward: -112.32
  ✗ Failed

Episode 2:




  Steps: 642
  Distance: 0.54m
  Total Reward: 13.11
  ✗ Failed

Episode 3:


KeyboardInterrupt: 