In [5]:
import numpy as np
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor, ResultsWriter

In [6]:
# Create the environment
env = gym.make("ALE/MsPacman-ram-v5", render_mode='rgb_array')  
train_monitor = Monitor(env, filename="monitors/ppo_train")
test_monitor = Monitor(env, filename="monitors/ppo_test")

In [7]:
env.reset()
train_monitor.reset()

# sample action:
print("sample action:", train_monitor.action_space.sample())

# observation space shape:
print("observation space shape:", train_monitor.observation_space.shape)

# sample observation:
print("sample observation:", train_monitor.observation_space.sample())


sample action: 8
observation space shape: (128,)
sample observation: [191  38  17 128  46   6 247  25 208  68   9 103 244 250  64 218  81   5
  31   7 251   6 120 108 171 197 190 151  50  39  97  57 101 137 164  42
  43  78  96 241  69 190  93  22 168  39 108  25 202 217 184 178 229 222
 235   5  75  67  96  72 161  19   9  23  95  81 233 185  59 255 147 196
 135  36 104   1 100  27 213  29   1 189  27 238  56 202 230 164  25 117
 195  59  88 137 187 212 223  89 151 218 239  43 219  15 182 117  19 253
  59 194 169 147 123 162 102 168  18 218  41 113 221  16  86 123  75  30
 176 148]


In [8]:
# Model creation and learning
eval_callback = EvalCallback(train_monitor, best_model_save_path="./logs/logsPPO/",
                             log_path="./logs/logsPPO", eval_freq=100,
                             deterministic=True, render=False)

# logge:r https://stable-baselines3.readthedocs.io/en/master/common/logger.html

model = PPO('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=100000, log_interval=4, callback=eval_callback)        


  if not isinstance(terminated, (bool, np.bool8)):


Eval num_timesteps=100, episode_reward=374.00 +/- 114.82
Episode length: 672.60 +/- 104.11
New best mean reward!
Eval num_timesteps=200, episode_reward=266.00 +/- 112.18
Episode length: 567.00 +/- 76.97
Eval num_timesteps=300, episode_reward=206.00 +/- 18.55
Episode length: 615.00 +/- 128.19
Eval num_timesteps=400, episode_reward=228.00 +/- 105.53
Episode length: 673.00 +/- 105.37
Eval num_timesteps=500, episode_reward=596.00 +/- 681.78
Episode length: 776.60 +/- 291.71
New best mean reward!
Eval num_timesteps=600, episode_reward=240.00 +/- 81.73
Episode length: 601.80 +/- 97.18
Eval num_timesteps=700, episode_reward=284.00 +/- 124.52
Episode length: 625.40 +/- 102.41
Eval num_timesteps=800, episode_reward=178.00 +/- 69.40
Episode length: 579.00 +/- 109.02
Eval num_timesteps=900, episode_reward=288.00 +/- 138.33
Episode length: 647.00 +/- 106.55
Eval num_timesteps=1000, episode_reward=284.00 +/- 55.71
Episode length: 709.00 +/- 143.79
Eval num_timesteps=1100, episode_reward=270.00 +/- 

KeyboardInterrupt: 

In [None]:
model = PPO.load("logs/logsPPO/best_model")

In [None]:
# Testing the results with 50 episodes
episodes = 3
frames = []
for ep in range(episodes):
    obs = test_monitor.reset()
    obs = obs[0]
    done = False
    rew = 0
    steps = 0
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, tr, info = test_monitor.step(action)
        img = test_monitor.render()
        frames.append(img)

In [None]:
# Now show the animation:
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rcParams['animation.embed_limit'] = 80  # For example, set it to 30 MB

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')
def update_scene(num, frames, patch):
    # print(num)
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.close()
    return anim

In [None]:
plot_animation(frames)

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_monitor.get_episode_rewards())
plt.xlabel("Episode")
plt.ylabel("Rewards per episode")
plt.title("Train Rewards")
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt

plt.plot(test_monitor.get_episode_rewards())
plt.xlabel("Episode")
plt.ylabel("Rewards per episode")
plt.title("Test Rewards")
plt.grid()
plt.show()