In [None]:
import gymnasium as gym
import numpy as np
from stable_baselines3 import DQN

# Training environment: the standard CartPole-v1 task, created without rendering.
env = gym.make("CartPole-v1")

# DQN agent using the built-in MLP policy on the environment above.
# The exploration rate starts at 1.0 and ends at 0.01; the fixed seed keeps
# runs repeatable, and verbose=1 prints the training log tables.
model = DQN(
    policy="MlpPolicy",
    env=env,
    learning_rate=1e-3,
    gamma=0.99,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.01,
    seed=42,
    verbose=1,
)

def evaluate(model, n_episodes=100, eval_env=None):
    """Estimate the mean episode return of the model's deterministic policy.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that returns
            an ``(action, state)`` pair.
        n_episodes: Number of complete episodes to roll out.
        eval_env: Optional environment to evaluate on. It is reset before every
            episode and is NOT closed by this function. When omitted, a separate
            environment is built from the spec of the module-level ``env`` and
            closed before returning.

    Returns:
        ``np.mean`` of the per-episode reward sums.
    """
    owns_env = eval_env is None
    if owns_env:
        # The module-level ``env`` is the instance handed to the agent for
        # training. Rolling out evaluation episodes on it would change its
        # state between ``learn`` calls, so evaluate on an independent copy.
        eval_env = gym.make(env.spec)

    rewards = []
    try:
        for _ in range(n_episodes):
            obs, _ = eval_env.reset()
            done = False
            total_reward = 0
            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, terminated, truncated, _ = eval_env.step(action)
                total_reward += reward
                # An episode ends on either a terminal state or a truncation.
                done = terminated or truncated
            rewards.append(total_reward)
    finally:
        # Only release the environment this function created itself.
        if owns_env:
            eval_env.close()
    return np.mean(rewards)

# Stop training once the mean evaluation return reaches this score.
target_score = 475
# Environment steps trained between two consecutive evaluations.
timesteps_per_round = 10_000
# Hard budget so the cell always terminates, even if the agent never reaches
# the target (at most max_rounds * timesteps_per_round training steps).
max_rounds = 100
average_reward = 0

for _ in range(max_rounds):
    # reset_num_timesteps=False continues the same run instead of restarting
    # the timestep counter on every round.
    model.learn(total_timesteps=timesteps_per_round, reset_num_timesteps=False)
    average_reward = evaluate(model)
    print(f"Average reward over 100 episodes: {average_reward:.2f}")
    if not average_reward < target_score:
        break
else:
    # Only reached when every round ran without hitting the target.
    print(
        f"Stopped after {max_rounds} rounds without reaching "
        f"an average reward of {target_score}."
    )


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23       |
|    ep_rew_mean      | 23       |
|    exploration_rate | 0.909    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11501    |
|    time_elapsed     | 0        |
|    total_timesteps  | 92       |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 27.6     |
|    ep_rew_mean      | 27.6     |
|    exploration_rate | 0.781    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1475     |
|    time_elapsed     | 0        |
|    total_timesteps  | 221      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.116    |
|    n_updates        | 30       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 26.5     |
|    ep_rew_mean      | 26.5     |
|    exploration_rate | 0.685    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 1360     |
|    time_elapsed     | 0        |
|    total_timesteps  | 318      |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.0092   |
|    n_updates      

In [None]:
n_eval_episodes = 100


def _run_episode(model, episode_env):
    """Play one episode with deterministic actions and return its reward sum."""
    obs, _ = episode_env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = episode_env.step(action)
        total_reward += reward
        # An episode ends on either a terminal state or a truncation.
        done = terminated or truncated
    return total_reward


episode_rewards = []

# All but the last episode run without rendering, so a single environment is
# reused for them instead of building a new one per episode.
eval_env = gym.make("CartPole-v1")
try:
    for _ in range(max(n_eval_episodes - 1, 0)):
        episode_rewards.append(_run_episode(model, eval_env))
finally:
    # Release the environment even if a rollout raises.
    eval_env.close()

# Only the final episode is rendered on screen.
if n_eval_episodes > 0:
    eval_env = gym.make("CartPole-v1", render_mode="human")
    try:
        episode_rewards.append(_run_episode(model, eval_env))
    finally:
        eval_env.close()

# Show evaluation stats
print(f"Ran {n_eval_episodes} test episodes.")
print(f"Mean reward: {np.mean(episode_rewards):.2f}")
print(f"Std reward:  {np.std(episode_rewards):.2f}")
print(f"Min reward:  {np.min(episode_rewards):.2f}")
print(f"Max reward:  {np.max(episode_rewards):.2f}")



Ran 100 test episodes.
Mean reward: 500.00
Std reward:  0.00
Min reward:  500.00
Max reward:  500.00
