In [25]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Fixed seed so that training and the evaluation numbers printed below can be
# reproduced on a fresh kernel. NOTE(review): runs on a CUDA device may still
# show small nondeterminism — confirm if exact reproducibility matters.
SEED = 0

# 1. Create the Acrobot environment
env = gym.make("Acrobot-v1")

# 2. Instantiate the PPO agent
model = PPO(
    policy="MlpPolicy",  # Multi-layer Perceptron policy
    env=env,
    learning_rate=0.0003,
    n_steps=2048,  # environment steps collected per rollout before each update
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.0,
    verbose=1,
    seed=SEED,
)

# 3. Train the PPO agent
print("Training the PPO agent...")
# Rollouts are collected in chunks of `n_steps`, so the logged total_timesteps
# advances in multiples of 2048 rather than stopping at exactly 5000.
model.learn(total_timesteps=5000)  # Adjust timesteps as needed

# 4. Save the trained model
# The file name still says "cartpole" although the agent is trained on Acrobot;
# it is kept unchanged because the playback cell loads the model by this name.
model.save("ppo_cartpole")
print("Model saved!")

# 5. Evaluate the trained policy over a few episodes on the training env
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=3)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Training the PPO agent...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -500     |
| time/              |          |
|    fps             | 391      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 478          |
|    ep_rew_mean          | -478         |
| time/                   |              |
|    fps                  | 325          |
|    iterations           | 2            |
|    time_elapsed         | 12           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0062476033 |
|    clip_fraction        | 0.0356       |
|    clip_range        

In [26]:

# Replay the saved agent in a window: load the model, then step the
# environment for a fixed number of timesteps, resetting whenever an
# episode ends.
env = gym.make("Acrobot-v1", render_mode='human')
try:
    # 5. Load the trained model (optional)
    model = PPO.load("ppo_cartpole", env=env)

    # 6. Run the trained agent
    obs, _ = env.reset()
    for i in range(1000):  # Run for a fixed number of timesteps
        action, _states = model.predict(obs, deterministic=True)
        # With render_mode='human' the environment draws itself on every
        # step()/reset(), so no explicit env.render() call is needed here;
        # calling it as well would draw each frame twice.
        obs, reward, terminated, truncated, _ = env.step(action)
        if terminated or truncated:
            obs, _ = env.reset()
            # `i` is the global step counter of this loop, not the length of
            # the episode that just finished.
            print("Done ", i)
finally:
    # Always release the render window, even if the loop raises or the
    # kernel is interrupted mid-rollout.
    env.close()


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Done  90
Done  279
Done  370
Done  461
Done  540
Done  653
Done  744
