In [1]:
!pip install torch
!pip install stable-baselines3[extra]
!pip install gymnasium
!pip install 'gymnasium[atari]'
!pip install ale-py

Collecting shimmy~=1.1.0 (from shimmy[atari]~=1.1.0; extra == "extra"->stable-baselines3[extra])
  Downloading Shimmy-1.1.0-py3-none-any.whl.metadata (3.3 kB)
Collecting autorom~=0.6.1 (from autorom[accept-rom-license]~=0.6.1; extra == "extra"->stable-baselines3[extra])
  Downloading AutoROM-0.6.1-py3-none-any.whl.metadata (2.4 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.6.1; extra == "extra"->stable-baselines3[extra])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ale-py~=0.8.1 (from shimmy[atari]~=1.1.0; extra == "extra"->stable-baselines3[extra])
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux20

In [2]:
import os
import gymnasium as gym

import ale_py 

import os

import torch

from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.evaluation import evaluate_policy

  from jax import xla_computation as _xla_computation


In [3]:
# Register the environments exposed by ale_py with Gymnasium
# (presumably what makes the Atari ids used below resolvable by gym.make — confirm).
gym.register_envs(ale_py)

  and should_run_async(code)


In [4]:
# Gymnasium id of the game used by the random-play demo below.
environment_name = 'MsPacman-v4'

In [5]:
# Create a single environment for the demo, requesting the 'rgb_array' render mode.
env = gym.make(environment_name, render_mode='rgb_array')

In [6]:
env.metadata['render_fps'] = 30  # Set the rendering FPS stored in the env metadata

In [7]:
# Sanity-check the environment by playing `episodes` episodes with a uniformly random policy.
episodes = 1
for episode in range(1, episodes+1):
    # Gymnasium's reset() returns an (observation, info) pair.
    state, info = env.reset()
    terminated = False
    truncated = False
    score = 0

    # An episode is over on a terminal state OR on truncation (e.g. a step limit);
    # looping on `terminated` alone would keep stepping an already-truncated episode.
    # No explicit env.render() here: its return value was never used.
    while not (terminated or truncated):
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
    print(f'Episode: {episode}, Score: {score}')
env.close()

Episode: 1, Score: 280.0


In [8]:
# Inspect the action space; as the last expression it is shown as the cell output.
env.action_space

Discrete(9)

In [9]:
# Inspect the observation space; as the last expression it is shown as the cell output.
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

# Training DQN

In [10]:
# Environment id used for training (same value as the one assigned earlier in the notebook).
environment_name = 'MsPacman-v4'

In [11]:
# Build the training environment with one copy (n_envs=1) and a fixed seed of 0.
# Note: this rebinds `env`, which until now held the environment created with gym.make().
env = make_atari_env(environment_name, n_envs=1, seed=0)

In [12]:
# Stack 4 frames so that each observation carries some history.
# Gotcha: `env` is rebound to a wrapper around itself, so re-running this cell wraps it again.
env = VecFrameStack(env, n_stack=4)


In [13]:
# Relative directory that receives the TensorBoard logs, assembled from its path components.
log_dir_parts = ['Training', 'Logs', 'dqn_pacman_tensorboard']
log_path = os.path.join(*log_dir_parts)

In [14]:
# Report whether PyTorch can see a CUDA GPU, and which one.
gpu_available = torch.cuda.is_available()
print(f"GPU disponible : {gpu_available}")
# get_device_name(0) raises when no CUDA device is present, so only query it
# when a GPU is actually available; otherwise report the CPU fallback.
gpu_name = torch.cuda.get_device_name(0) if gpu_available else "aucun (CPU)"
print(f"Nom du GPU : {gpu_name}")

GPU disponible : True
Nom du GPU : Tesla P100-PCIE-16GB


In [15]:
# Build the DQN agent. The hyperparameters are gathered in one mapping so they
# can be read (and tuned) at a glance, then forwarded as keyword arguments.
dqn_hyperparams = dict(
    verbose=1,
    learning_rate=1e-4,
    buffer_size=100000,  # size of the replay memory
    learning_starts=10000,  # number of steps before learning starts
    batch_size=32,
    tau=1.0,  # parameter for the target-network update
    gamma=0.99,  # discount factor
    train_freq=4,  # network update frequency
    target_update_interval=1000,  # target-network update frequency
    exploration_fraction=0.1,  # exploration fraction
    exploration_initial_eps=1.0,  # initial exploration
    exploration_final_eps=0.01,  # final exploration
    tensorboard_log=log_path,  # directory for the TensorBoard logs
)
# 'CnnPolicy': a CNN is used to process the image observations.
model = DQN('CnnPolicy', env, **dqn_hyperparams)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [16]:
# Train the model for 1,000,000 timesteps.
# learn() is the cell's last expression, so its return value (the DQN object) is displayed.
model.learn(total_timesteps=1000000, log_interval=100)

Logging to Training/Logs/dqn_pacman_tensorboard/DQN_1


  and should_run_async(code)


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 674      |
|    ep_rew_mean      | 365      |
|    exploration_rate | 0.946    |
| time/               |          |
|    episodes         | 100      |
|    fps              | 311      |
|    time_elapsed     | 17       |
|    total_timesteps  | 5417     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 691      |
|    ep_rew_mean      | 385      |
|    exploration_rate | 0.89     |
| time/               |          |
|    episodes         | 200      |
|    fps              | 288      |
|    time_elapsed     | 38       |
|    total_timesteps  | 11099    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0552   |
|    n_updates        | 274      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean    

<stable_baselines3.dqn.dqn.DQN at 0x79917289db10>

# Save & Evaluate Model

In [17]:
# Relative path under which the trained model is saved, assembled from its path components.
model_path_parts = ['Training', 'Saved Models', 'dqn_pacman']
dqn_path = os.path.join(*model_path_parts)

In [18]:
# Save the trained model to dqn_path.
model.save(dqn_path)



In [19]:
# Evaluate the trained agent on the environment attached to the model
# and report the mean and standard deviation of the episode rewards.
eval_env = model.get_env()
n_eval_episodes = 10
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=n_eval_episodes)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

Mean reward: 1833.0 +/- 423.1796308897677


In [20]:
# Close the environment.
env.close()