In [9]:
import numpy as np
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor, ResultsWriter

In [10]:
# Create the environment
env = gym.make("ALE/MsPacman-ram-v5", render_mode='rgb_array')  
train_monitor = Monitor(env, filename="monitors/ppo_train")
test_monitor = Monitor(env, filename="monitors/ppo_test")

In [11]:
train_monitor.reset()

# sample action:
print("sample action:", train_monitor.action_space.sample())

# observation space shape:
print("observation space shape:", train_monitor.observation_space.shape)

# sample observation:
print("sample observation:", train_monitor.observation_space.sample())


sample action: 8
observation space shape: (128,)
sample observation: [159 140   0 101 241 163 201  87  48 238  73 179  81  66 150 126  92   4
 119  73  82 237  62 104  11  69 115 118  53 236 203 199 232  53 103 150
   4 248 146 245  89 216  86  45 243  59 169  63   2 214  89 150 138 248
   0   4  15  53 234  77 240 201 167 135  72 203 203  76 172 239  32 221
  75  38  36 157 241  54 230   5 246 231 191 219  35   6 139 118 108 255
 234 102 206 181 242 148  32 129 172 167 248 187 157 248  78 127 111  89
 123 116 123 225 229  29  51  63  84 205 133 187 151 249 175 177 206  44
 251  12]


In [12]:
# Model creation and learning
eval_callback = EvalCallback(train_monitor, best_model_save_path="./logs/logsPPO/",
                             log_path="./logs/logsPPO", eval_freq=100,
                             deterministic=True, render=False)

# logge:r https://stable-baselines3.readthedocs.io/en/master/common/logger.html

model = PPO('MlpPolicy', env, verbose=0)
model.learn(total_timesteps=100000, log_interval=4, callback=eval_callback)        




Eval num_timesteps=100, episode_reward=520.00 +/- 160.00
Episode length: 686.20 +/- 38.40
New best mean reward!
Eval num_timesteps=200, episode_reward=532.00 +/- 155.10
Episode length: 658.60 +/- 65.68
New best mean reward!
Eval num_timesteps=300, episode_reward=534.00 +/- 178.06
Episode length: 654.20 +/- 73.87
New best mean reward!
Eval num_timesteps=400, episode_reward=622.00 +/- 218.85
Episode length: 724.60 +/- 96.58
New best mean reward!
Eval num_timesteps=500, episode_reward=536.00 +/- 152.92
Episode length: 720.20 +/- 85.46
Eval num_timesteps=600, episode_reward=450.00 +/- 20.00
Episode length: 669.40 +/- 32.06
Eval num_timesteps=700, episode_reward=440.00 +/- 0.00
Episode length: 663.40 +/- 26.30
Eval num_timesteps=800, episode_reward=600.00 +/- 195.96
Episode length: 706.20 +/- 48.03
Eval num_timesteps=900, episode_reward=444.00 +/- 4.90
Episode length: 678.60 +/- 31.68
Eval num_timesteps=1000, episode_reward=440.00 +/- 0.00
Episode length: 655.00 +/- 19.39
Eval num_timesteps

KeyboardInterrupt: 

In [None]:
model = PPO.load("logs/logsPPO/best_model")

In [None]:
# Testing the results with 50 episodes
episodes = 3
frames = []
for ep in range(episodes):
    obs = test_monitor.reset()
    obs = obs[0]
    done = False
    rew = 0
    steps = 0
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, tr, info = test_monitor.step(action)
        img = test_monitor.render()
        frames.append(img)

In [None]:
# Now show the animation:
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rcParams['animation.embed_limit'] = 80  # For example, set it to 30 MB

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')
def update_scene(num, frames, patch):
    # print(num)
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.close()
    return anim

In [None]:
plot_animation(frames)

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_monitor.get_episode_rewards())
plt.xlabel("Episode")
plt.ylabel("Rewards per episode")
plt.title("Train Rewards")
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt

plt.plot(test_monitor.get_episode_rewards())
plt.xlabel("Episode")
plt.ylabel("Rewards per episode")
plt.title("Test Rewards")
plt.grid()
plt.show()