In [None]:
# Install Stable-Baselines3 together with its optional "extra" dependencies;
# %pip targets the running kernel's environment and -q keeps pip's output quiet.
# TODO(review): pin an explicit version so the notebook stays reproducible.
%pip install 'stable-baselines3[extra]' -q

## Playing with Gymnasium (the maintained fork of OpenAI Gym)

In [None]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline

In [None]:
# Environment id reused by every cell that builds an environment.
playground_config = "CartPole-v1"

# Single (non-vectorised) environment; its render() output is fed to
# plt.imshow() by the random-play cell.
playground = gym.make(
    playground_config,
    render_mode="rgb_array",
)

In [None]:
# Random-policy rollout: play a few episodes with uniformly sampled actions
# and animate the rendered frames inline.
episodes = 5

# Gymnasium's reset() returns an (observation, info) pair; the environment must
# be reset once before the first render() call.
state, info = playground.reset()

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
img = ax.imshow(playground.render())

try:
    for episode in range(episodes):
        state, info = playground.reset()
        done = False
        score = 0
        while not done:
            action = playground.action_space.sample()
            obs, reward, terminated, truncated, info = playground.step(action)
            done = terminated or truncated
            score += reward

            # Render AFTER stepping so the frame matches the score in the title
            # and the terminal frame of each episode is shown as well.
            img.set_data(playground.render())
            ax.set_title(f"Episode: {episode + 1}, Score: {score}")

            # wait=True defers the clear until the next frame arrives, which
            # avoids flicker between frames.
            display.clear_output(wait=True)
            display.display(fig)
finally:
    # Release the environment even if the cell is interrupted, and close the
    # figure so the inline backend does not draw a duplicate final frame.
    playground.close()
    plt.close(fig)

In [None]:
# NOTE(review): torch is normally pulled in already as a dependency of
# stable-baselines3 (installed in the first cell), so this install is likely
# redundant — confirm before removing.
%pip install torch

## Training the Playground Model

In [None]:

# Folder handed to the models below as their `tensorboard_log` location.
logpath = "training/logs"
os.makedirs(name=logpath, exist_ok=True)  # harmless when the folder already exists

In [None]:
# Pick the PyTorch device to use: Apple MPS first, then Nvidia CUDA, then CPU.
#
# The previous version built a torch.device object and immediately discarded
# it, so nothing was actually "set". The choice is now kept in `device` so it
# can be passed on explicitly, e.g. PPO(..., device=device).

import torch

# torch.backends.mps is missing on older PyTorch builds, hence the getattr guard.
_mps_backend = getattr(torch.backends, "mps", None)

if _mps_backend is not None and _mps_backend.is_available():
    print("MPS backend is available. Setting to Apple MPS.")
    device = torch.device("mps")
elif torch.cuda.is_available():
    print("Nvidia CUDA is available. Setting to CUDA.")
    device = torch.device("cuda")
else:
    print("MPS backend or CUDA is not available.")
    device = torch.device("cpu")

print(f"Selected device: {device}")

In [None]:
# Build the vectorised training environment and the PPO agent.
#
# The factory creates its own environment instead of closing over the global
# `playground`: that name is rebound to the DummyVecEnv by this very statement,
# so any later call of the old lambda would have returned the wrapper itself.
playground = DummyVecEnv([lambda: gym.make(playground_config)])
model = PPO("MlpPolicy", playground, verbose=1, tensorboard_log=logpath)
# NOTE: with the default device="auto", Stable-Baselines3 selects CUDA when it is
# available and otherwise the CPU, which is why "cpu" is reported on Apple
# M-series machines. MPS is only used when requested explicitly via
# PPO(..., device="mps").

In [None]:
# Step 1: optimise the agent for a fixed budget of environment steps.
model.learn(total_timesteps=50_000)

# Step 2: make sure the checkpoint folder exists, then write the model to it.
os.makedirs(name="training/models", exist_ok=True)
model.save(path="training/models/ppo_cartpole")

print("Model has been saved.")

In [None]:
# Restore the checkpoint written by the training cell and attach the current
# vectorised environment to it.
loaded_model = PPO.load(
    "training/models/ppo_cartpole",
    env=playground,
)

In [None]:
# Score the restored agent over ten evaluation episodes, without rendering.
eval_stats = evaluate_policy(
    loaded_model,
    playground,
    n_eval_episodes=10,
    render=False,
)
mean_reward, std_reward = eval_stats
print(f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
# Test the Model: roll the loaded policy out for a few episodes and print each
# episode's total reward.
episodes = 5

# A dedicated, separately named environment is used so that `playground` (the
# training environment that later cells keep using) is neither rebound nor
# closed here. The factory builds its own env rather than closing over a
# global name.
test_env = DummyVecEnv([lambda: gym.make(playground_config, render_mode="rgb_array")])

try:
    for episode in range(episodes):
        state = test_env.reset()
        done = False
        score = 0.0
        while not done:
            # Deterministic actions, consistent with evaluate_policy's default.
            action, _ = loaded_model.predict(state, deterministic=True)
            # A VecEnv returns batched arrays with one entry per sub-environment;
            # this env has exactly one, so index 0 is unpacked to plain scalars.
            state, rewards, dones, infos = test_env.step(action)
            score += float(rewards[0])
            done = bool(dones[0])
        print(f"Episode: {episode + 1}, Score: {score}")
finally:
    # Always release the test environment, even if the cell is interrupted.
    test_env.close()

In [None]:
# Inspect the last value assigned to `state` by the test loop above
# (the cell's final expression is echoed as its output).
state

In [None]:
"Tensorboard can be used to visualize the training process."
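# Example: uncomment the two lines below to launch TensorBoard on one run's log directory.
# NOTE(review): the variable is named `training_ppo5` but points at "PPO_6" —
# confirm which run directory under `logpath` is intended.
# training_ppo5 = os.path.join(logpath, "PPO_6")
# !tensorboard --logdir={training_ppo5}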

## Change the Architectures of the Model

In [None]:
# Extend the Training: evaluate the agent periodically and stop early once the
# reward target is reached.
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.monitor import Monitor

# Evaluate on a dedicated environment. Evaluating on the training environment
# resets it in the middle of a rollout and disturbs data collection. The
# Monitor wrapper lets the evaluation report true episode rewards and lengths.
eval_env = DummyVecEnv([lambda: Monitor(gym.make(playground_config))])

# Ends training as soon as a new best mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)

# eval_freq was 100000, so with the 100k-200k step budgets used below the stop
# condition was checked at most once or twice. 10000 is the library default and
# makes early stopping effective.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path="training/models/ppo_cartpole_best",
    verbose=1,
)

In [None]:
# Start from a fresh PPO agent and train with the evaluation callback attached.
model = PPO(
    policy="MlpPolicy",
    env=playground,
    verbose=1,
    tensorboard_log=logpath,
)
model.learn(total_timesteps=200_000, callback=eval_callback)

In [None]:
# Change Policy Network Architecture
# Four hidden layers of 128 units each, configured separately for the policy
# (pi) and value (vf) networks.
#
# net_arch is passed as a plain dict. The list-wrapped form
# [dict(pi=..., vf=...)] is the legacy shared-layer syntax, which
# Stable-Baselines3 deprecated in v1.8 and now only accepts with a warning.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])
model = PPO(
    "MlpPolicy",
    playground,
    policy_kwargs=dict(net_arch=net_arch),
    verbose=1,
    tensorboard_log=logpath,
)
# NOTE(review): `eval_callback` is the same object used for the previous run, so
# it presumably still carries that run's best mean reward — confirm whether a
# fresh callback should be created for this model.
model.learn(total_timesteps=100000, callback=eval_callback)

In [None]:
# Use Alternate Algorithms: swap PPO for DQN on the same environment.
from stable_baselines3 import DQN

model = DQN(
    policy="MlpPolicy",
    env=playground,
    verbose=1,
    tensorboard_log=logpath,
)
model.learn(total_timesteps=100_000)