In [13]:
import sys
import os
import gym
from gym import spaces
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

# Ajouter le chemin du répertoire contenant le module 'qoridor'
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from qoridor.game import QoridorGame, Move, MoveType, WallOrientation

In [None]:
# We need to use the Gym convention for our environment

class QoridorEnv :

In [19]:
class QuoridorEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self, board_size=5, num_walls=3):
        super(QuoridorEnv, self).__init__()
        self.board_size = board_size
        self.num_walls = num_walls
        self.game = QoridorGame(board_size, num_walls)
        
        # Define action and observation space
        # Actions: move (row, col) or place wall (row, col, orientation)
        self.action_space = spaces.Discrete(board_size * board_size + 2 * (board_size - 1) * (board_size - 1))
        
        # Observation: board state, player positions, remaining walls
        self.observation_space = spaces.Box(low=0, high=1, shape=(board_size, board_size, 3), dtype=np.float32)
        
    def reset(self):
        self.game = QoridorGame(self.board_size, self.num_walls)
        return self._get_observation()
    
    def step(self, action):
        # Decode action
        if action < self.board_size * self.board_size:
            row, col = divmod(action, self.board_size)
            move = Move(MoveType.MOVE, (row, col))
        else:
            action -= self.board_size * self.board_size
            if action < (self.board_size - 1) * (self.board_size - 1):
                row, col = divmod(action, self.board_size - 1)
                move = Move(MoveType.WALL, (row, col), WallOrientation.HORIZONTAL)
            else:
                action -= (self.board_size - 1) * (self.board_size - 1)
                row, col = divmod(action, self.board_size - 1)
                move = Move(MoveType.WALL, (row, col), WallOrientation.VERTICAL)
        
        # Apply action
        reward = 0
        done = False
        if self.game.make_move(move):
            reward = 1
            if self.game.is_game_over():
                done = True
                reward = 10
        else:
            reward = -1
        
        return self._get_observation(), reward, done, {}
    
    def render(self, mode='human'):
        # Optionally implement rendering logic
        self.game.render()
    
    def close(self):
        # Optionally implement cleanup logic
        pass
    
    def _get_observation(self):
        board = np.zeros((self.board_size, self.board_size, 3), dtype=np.float32)
        for row in range(self.board_size):
            for col in range(self.board_size):
                if self.game.state.board.horizontal_walls[row, col]:
                    board[row, col, 0] = 1
                if self.game.state.board.vertical_walls[row, col]:
                    board[row, col, 1] = 1
        board[self.game.state.player1_pos[0], self.game.state.player1_pos[1], 2] = 1
        board[self.game.state.player2_pos[0], self.game.state.player2_pos[1], 2] = 1
        return board


In [20]:
class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._get_conv_output(input_shape), 128)
        self.fc2 = nn.Linear(128, num_actions)
    
    def _get_conv_output(self, shape):
        o = torch.zeros(1, *shape)
        o = self.conv1(o)
        o = self.conv2(o)
        return int(np.prod(o.size()))
    
    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

class DQNAgent:
    def __init__(self, state_shape, num_actions):
        self.state_shape = state_shape
        self.num_actions = num_actions
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQN(state_shape, num_actions).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.num_actions)
        state = torch.FloatTensor(state).to(device)
        with torch.no_grad():
            act_values = self.model(state)
        return np.argmax(act_values.cpu().data.numpy())
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state).to(device)
            next_state = torch.FloatTensor(next_state).to(device)
            reward = torch.FloatTensor([reward]).to(device)
            done = torch.FloatTensor([done]).to(device)
            
            target = reward
            if not done:
                target = reward + self.gamma * torch.max(self.model(next_state))
            target_f = self.model(state)
            target_f = target_f.clone()
            target_f[action] = target
            
            self.optimizer.zero_grad()
            loss = self.criterion(self.model(state), target_f)
            loss.backward()
            self.optimizer.step()
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [22]:
# Entraînement du modèle DQN
env = QuoridorEnv()
state_shape = (3, env.board_size, env.board_size)  # Channels first for PyTorch
num_actions = env.action_space.n
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = DQNAgent(state_shape, num_actions)
episodes = 100
batch_size = 32

for e in range(episodes):
    state = env.reset()
    state = np.transpose(state, (2, 0, 1))  # Convert to channels first
    state = np.expand_dims(state, axis=0)  # Add batch dimension
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.transpose(next_state, (2, 0, 1))  # Convert to channels first
        next_state = np.expand_dims(next_state, axis=0)  # Add batch dimension
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print(f"Episode: {e}/{episodes}, score: {time}, e: {agent.epsilon:.2}")
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

IndexError: index 4 is out of bounds for axis 1 with size 4