In [16]:
!pip3 install atari_py
!pip3 install Box2D
!pip3 install box2d-py
!pip3 install "stable-baselines3[extra]>=2.0.0a4"
!pip3 install gym
!pip3 install opencv-python

zsh:1: command not found: apt-get


In [5]:
import torch as th
import cv2
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import StopTrainingOnRewardThreshold, EvalCallback
import random
import warnings
warnings.filterwarnings('ignore')

In [17]:
class AtariGames:
    """Train, save and record PPO agents on a fixed list of Gymnasium environments.

    The constructor receives parallel sequences (environment ids, best-model
    directories, model files, video files). ``make_env`` selects one index and
    every later call uses the entries stored at that index.
    """

    def __init__(self, env_path, render_mode, path_save_best, path_save, video_path):
        """Store the per-environment settings.

        Args:
            env_path: Sequence of environment ids passed to ``gym.make``.
            render_mode: Render mode passed to ``gym.make``. ``test_model``
                needs a mode whose ``render()`` returns image arrays.
            path_save_best: Sequence of directories in which the evaluation
                callback stores the best model, one per environment.
            path_save: Sequence of file paths for the final model, one per
                environment.
            video_path: Sequence of output video paths, one per environment.
        """
        self.env_path = env_path
        self.render_mode = render_mode
        self.path_save_best = path_save_best
        self.path_save = path_save
        self.video_path = video_path

    def _build_vec_env(self):
        """Return a new single-environment ``DummyVecEnv`` for the selected game."""
        env_id = self.env_path[self.which_env]
        mode = self.render_mode
        return DummyVecEnv([lambda: gym.make(env_id, render_mode=mode)])

    def make_env(self, which_env):
        """Select environment ``which_env`` and build its vectorized env.

        Args:
            which_env: Index into the sequences given to the constructor.
        """
        # Release the previously selected game's resources before replacing it.
        previous_env = getattr(self, "env", None)
        if previous_env is not None:
            previous_env.close()

        self.which_env = which_env
        self.env = self._build_vec_env()

    def train_model(self, total_timesteps, episodes, policy_kwargs, reward_threshold,
                    eval_freq, policy="MlpPolicy"):
        """Train a PPO agent on the selected environment and save it.

        Args:
            total_timesteps: Number of environment steps passed to ``learn``.
            episodes: Number of episodes for the evaluation run after training.
            policy_kwargs: Keyword arguments forwarded to the PPO policy.
            reward_threshold: Threshold given to ``StopTrainingOnRewardThreshold``.
            eval_freq: Evaluation frequency given to ``EvalCallback``.
            policy: Policy identifier forwarded to ``PPO``. Defaults to
                ``"MlpPolicy"``, the value that used to be hard-coded.
        """
        stop_callback_func = StopTrainingOnRewardThreshold(reward_threshold=reward_threshold, verbose=1)

        # Periodic evaluation runs on its own environment: evaluating on the
        # training env would reset and step it in the middle of a rollout.
        eval_env = self._build_vec_env()
        try:
            eval_callback = EvalCallback(eval_env,
                                         callback_on_new_best=stop_callback_func,
                                         eval_freq=eval_freq,
                                         # Best model goes to the directory of the
                                         # selected game, not always the first one.
                                         best_model_save_path=self.path_save_best[self.which_env],
                                         verbose=1)

            self.model = PPO(policy, self.env, verbose=1, policy_kwargs=policy_kwargs)
            self.model.learn(total_timesteps=total_timesteps, callback=eval_callback)
        finally:
            eval_env.close()

        mean_reward, std_reward = evaluate_policy(self.model, self.env, n_eval_episodes=episodes, render=True)
        print(f"Mean reward over {episodes} episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

        self.model.save(self.path_save[self.which_env])
        # The trained model is reloaded from disk by test_model.
        del self.model

    def test_model(self, episodes, fps):
        """Run the saved model and record every rendered frame to a video file.

        Args:
            episodes: Number of episodes to play.
            fps: Frame rate of the written video.

        Raises:
            ValueError: If ``render()`` returns no frame (the render mode does
                not produce image arrays).
        """
        model = PPO.load(self.path_save[self.which_env], env=self.env)

        # The environment has to be reset before it can be rendered.
        self.env.reset()
        first_frame = self.env.render()
        if first_frame is None:
            raise ValueError("render() returned no frame; use a render mode that returns image arrays")
        height, width = first_frame.shape[:2]

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        # OpenCV expects the frame size as (width, height).
        video_writer = cv2.VideoWriter(self.video_path[self.which_env], fourcc, fps, (width, height))
        try:
            for episode in range(1, episodes + 1):
                observation = self.env.reset()
                score = 0.0
                done = False

                while not done:
                    frame = self.env.render()
                    action, _ = model.predict(observation)
                    observation, rewards, dones, _ = self.env.step(action)
                    video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                    # The vectorized env holds a single environment, so index 0
                    # carries its reward and done flag.
                    score += float(rewards[0])
                    done = bool(dones[0])

                print(f"Episode: {episode}, score: {score}")
        finally:
            # Always finalize the video file, even if an episode fails midway.
            video_writer.release()
        

    
# List of environments; every tuple below is indexed in the same order
env = ('CartPole-v1', "LunarLander-v2", "SpaceInvaders-v4")

# Render mode used when the environments are created
render_mode = "rgb_array"

# Directories for the best model found during evaluation, one per environment
path_save_best = ("CartPole-v1-BEST", "LunarLander-v2-BEST", "SpaceInvaders-v4-BEST")

# Paths of the saved final models
path_save = ("cart_pole_model.zip", "lunar_landler.zip", "space_invaders.zip")

# Paths of the recorded videos
video_path = ("CartPole-video.mp4", "LunarLander-video.mp4", "SpaceInvaders-video.mp4")

# Layer sizes for the `pi` and `vf` entries of `net_arch`, plus the activation
# function; both are handed to PPO through `custom_policy_kwargs`.
policy_kwargs = dict(pi=[64, 128, 128, 64], vf=[64, 128, 128, 64])
act_fn=th.nn.ReLU
custom_policy_kwargs = dict(net_arch=policy_kwargs, activation_fn=act_fn)

games = AtariGames(env, render_mode, path_save_best, path_save, video_path)

# CartPole game: select index 0, train, then record the test episodes
games.make_env(0)
games.train_model(total_timesteps=100000, episodes=10, policy_kwargs=custom_policy_kwargs, reward_threshold=500, eval_freq= 20000)
games.test_model(episodes=10, fps=35)

# 

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1987 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1365        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008325238 |
|    clip_fraction        | 0.0239      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.69       |
|    explained_variance   | -0.0223     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.99        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00451    |
|    value_loss           | 36.8        |
-----------------------------------------
-----------------

-----------------------------------------
| time/                   |             |
|    fps                  | 1022        |
|    iterations           | 13          |
|    time_elapsed         | 26          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.015050242 |
|    clip_fraction        | 0.164       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.474      |
|    explained_variance   | 0.995       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.198       |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0101     |
|    value_loss           | 0.978       |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1019        |
|    iterations           | 14          |
|    time_elapsed         | 28          |
|    total_timesteps      | 28672 

In [13]:
# LunarLander game: select index 1, train the agent, then record test episodes
games.make_env(1)
games.train_model(
    total_timesteps=100000,
    episodes=10,
    policy_kwargs=custom_policy_kwargs,
    reward_threshold=300,
    eval_freq=18000,
)
games.test_model(episodes=10, fps=35)

Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1545 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1113        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.000422131 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.39       |
|    explained_variance   | 0.00101     |
|    learning_rate        | 0.0003      |
|    loss                 | 213         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.000977   |
|    value_loss           | 1.01e+03    |
-----------------------------------------
-----------------

-----------------------------------------
| time/                   |             |
|    fps                  | 848         |
|    iterations           | 13          |
|    time_elapsed         | 31          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.014570181 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.15       |
|    explained_variance   | 0.705       |
|    learning_rate        | 0.0003      |
|    loss                 | 22.2        |
|    n_updates            | 120         |
|    policy_gradient_loss | -0.0145     |
|    value_loss           | 74.8        |
-----------------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 826        |
|    iterations           | 14         |
|    time_elapsed         | 34         |
|    total_timesteps      | 28672      

-----------------------------------------
| time/                   |             |
|    fps                  | 603         |
|    iterations           | 24          |
|    time_elapsed         | 81          |
|    total_timesteps      | 49152       |
| train/                  |             |
|    approx_kl            | 0.013472773 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.07       |
|    explained_variance   | 0.966       |
|    learning_rate        | 0.0003      |
|    loss                 | 2.56        |
|    n_updates            | 230         |
|    policy_gradient_loss | -0.0106     |
|    value_loss           | 7.34        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 598         |
|    iterations           | 25          |
|    time_elapsed         | 85          |
|    total_timesteps      | 51200 

------------------------------------------
| time/                   |              |
|    fps                  | 544          |
|    iterations           | 35           |
|    time_elapsed         | 131          |
|    total_timesteps      | 71680        |
| train/                  |              |
|    approx_kl            | 0.0022176118 |
|    clip_fraction        | 0.072        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.91        |
|    explained_variance   | 0.817        |
|    learning_rate        | 0.0003       |
|    loss                 | 17.4         |
|    n_updates            | 340          |
|    policy_gradient_loss | 0.000381     |
|    value_loss           | 54.5         |
------------------------------------------
Eval num_timesteps=72000, episode_reward=94.32 +/- 123.21
Episode length: 195.00 +/- 50.13
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 195          |
|    m

-----------------------------------------
| time/                   |             |
|    fps                  | 546         |
|    iterations           | 45          |
|    time_elapsed         | 168         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005699401 |
|    clip_fraction        | 0.0598      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.963      |
|    explained_variance   | 0.933       |
|    learning_rate        | 0.0003      |
|    loss                 | 6.54        |
|    n_updates            | 440         |
|    policy_gradient_loss | -0.00162    |
|    value_loss           | 19.7        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 546         |
|    iterations           | 46          |
|    time_elapsed         | 172         |
|    total_timesteps      | 94208 

In [15]:
# SpaceInvaders game: select index 2, train the agent, then record test episodes
games.make_env(2)
games.train_model(
    total_timesteps=20000,
    episodes=10,
    policy_kwargs=custom_policy_kwargs,
    reward_threshold=300,
    eval_freq=10000,
)
games.test_model(episodes=10, fps=35)

Using cpu device
Wrapping the env in a VecTransposeImage.
-----------------------------
| time/              |      |
|    fps             | 258  |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 70           |
|    iterations           | 2            |
|    time_elapsed         | 57           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0045278557 |
|    clip_fraction        | 0.041        |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.79        |
|    explained_variance   | 0.00282      |
|    learning_rate        | 0.0003       |
|    loss                 | 1.19         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00399     |
|    value_loss           | 8.67         |
-