**Running the Code**
Following note, includes the training process of the Deep RL agent.
1.   first execute the next note (Game class)

In [2]:
from abc import ABC, abstractmethod
from copy import deepcopy
from enum import Enum
import numpy as np

# Rules on PDF and https://cdn.1j1ju.com/medias/a8/5e/26-quixo-rulebook.pdf


class Move(Enum):
    '''
    Selects where you want to place the taken piece. The rest of the pieces are shifted
    '''
    TOP = 0
    BOTTOM = 1
    LEFT = 2
    RIGHT = 3


class Player(ABC):
    def __init__(self) -> None:
        '''You can change this for your player if you need to handle state/have memory'''
        pass

    @abstractmethod
    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        '''
        The game accepts coordinates of the type (X, Y). X goes from left to right, while Y goes from top to bottom, as in 2D graphics.
        Thus, the coordinates that this method returns shall be in the (X, Y) format.

        game: the Quixo game. You can use it to override the current game with yours, but everything is evaluated by the main game
        return values: this method shall return a tuple of X,Y positions and a move among TOP, BOTTOM, LEFT and RIGHT
        '''
        pass


class Game(object):
    def __init__(self) -> None:
        self._board = np.ones((5, 5), dtype=np.uint8) * -1
        self.current_player_idx = 1
    # def get_current_player(self):
    #     return self.current_player_idx

    def get_board(self) -> np.ndarray:
        '''
        Returns the board
        '''
        return deepcopy(self._board)

    def get_current_player(self) -> int:
        '''
        Returns the current player
        '''
        return deepcopy(self.current_player_idx)

    def print(self):
        '''Prints the board. -1 are neutral pieces, 0 are pieces of player 0, 1 pieces of player 1'''
        print(self._board)

    def check_winner(self) -> int:
        '''Check the winner. Returns the player ID of the winner if any, otherwise returns -1'''
        # for each row
        player = self.get_current_player()
        winner = -1
        for x in range(self._board.shape[0]):
            # if a player has completed an entire row
            if self._board[x, 0] != -1 and all(self._board[x, :] == self._board[x, 0]):
                # return winner is this guy
                winner = self._board[x, 0]
        if winner > -1 and winner != self.get_current_player():
            return winner
        # for each column
        for y in range(self._board.shape[1]):
            # if a player has completed an entire column
            if self._board[0, y] != -1 and all(self._board[:, y] == self._board[0, y]):
                # return the relative id
                winner = self._board[0, y]
        if winner > -1 and winner != self.get_current_player():
            return winner
        # if a player has completed the principal diagonal
        if self._board[0, 0] != -1 and all(
            [self._board[x, x]
                for x in range(self._board.shape[0])] == self._board[0, 0]
        ):
            # return the relative id
            winner = self._board[0, 0]
        if winner > -1 and winner != self.get_current_player():
            return winner
        # if a player has completed the secondary diagonal
        if self._board[0, -1] != -1 and all(
            [self._board[x, -(x + 1)]
             for x in range(self._board.shape[0])] == self._board[0, -1]
        ):
            # return the relative id
            winner = self._board[0, -1]
        return winner

    def play(self, player1: Player, player2: Player) -> int:
        '''Play the game. Returns the winning player'''
        players = [player1, player2]
        winner = -1
        while winner < 0:
            self.current_player_idx += 1
            self.current_player_idx %= len(players)
            ok = False
            while not ok:
                from_pos, slide = players[self.current_player_idx].make_move(
                    self)
                ok = self.__move(from_pos, slide, self.current_player_idx)
            winner = self.check_winner()
        return winner

    def __move(self, from_pos: tuple[int, int], slide: Move, player_id: int) -> bool:
        '''Perform a move'''
        if player_id > 2:
            return False
        # Oh God, Numpy arrays
        prev_value = deepcopy(self._board[(from_pos[1], from_pos[0])])
        acceptable = self.__take((from_pos[1], from_pos[0]), player_id)
        if acceptable:
            acceptable = self.__slide((from_pos[1], from_pos[0]), slide)
            if not acceptable:
                self._board[(from_pos[1], from_pos[0])] = deepcopy(prev_value)
        return acceptable

    def __take(self, from_pos: tuple[int, int], player_id: int) -> bool:
        '''Take piece'''
        # acceptable only if in border
        acceptable: bool = (
            # check if it is in the first row
            (from_pos[0] == 0 and from_pos[1] < 5)
            # check if it is in the last row
            or (from_pos[0] == 4 and from_pos[1] < 5)
            # check if it is in the first column
            or (from_pos[1] == 0 and from_pos[0] < 5)
            # check if it is in the last column
            or (from_pos[1] == 4 and from_pos[0] < 5)
            # and check if the piece can be moved by the current player
        ) and (self._board[from_pos] < 0 or self._board[from_pos] == player_id)
        if acceptable:
            self._board[from_pos] = player_id
        return acceptable

    def __slide(self, from_pos: tuple[int, int], slide: Move) -> bool:
        '''Slide the other pieces'''
        # define the corners
        SIDES = [(0, 0), (0, 4), (4, 0), (4, 4)]
        # if the piece position is not in a corner
        if from_pos not in SIDES:
            # if it is at the TOP, it can be moved down, left or right
            acceptable_top: bool = from_pos[0] == 0 and (
                slide == Move.BOTTOM or slide == Move.LEFT or slide == Move.RIGHT
            )
            # if it is at the BOTTOM, it can be moved up, left or right
            acceptable_bottom: bool = from_pos[0] == 4 and (
                slide == Move.TOP or slide == Move.LEFT or slide == Move.RIGHT
            )
            # if it is on the LEFT, it can be moved up, down or right
            acceptable_left: bool = from_pos[1] == 0 and (
                slide == Move.BOTTOM or slide == Move.TOP or slide == Move.RIGHT
            )
            # if it is on the RIGHT, it can be moved up, down or left
            acceptable_right: bool = from_pos[1] == 4 and (
                slide == Move.BOTTOM or slide == Move.TOP or slide == Move.LEFT
            )
        # if the piece position is in a corner
        else:
            # if it is in the upper left corner, it can be moved to the right and down
            acceptable_top: bool = from_pos == (0, 0) and (
                slide == Move.BOTTOM or slide == Move.RIGHT)
            # if it is in the lower left corner, it can be moved to the right and up
            acceptable_left: bool = from_pos == (4, 0) and (
                slide == Move.TOP or slide == Move.RIGHT)
            # if it is in the upper right corner, it can be moved to the left and down
            acceptable_right: bool = from_pos == (0, 4) and (
                slide == Move.BOTTOM or slide == Move.LEFT)
            # if it is in the lower right corner, it can be moved to the left and up
            acceptable_bottom: bool = from_pos == (4, 4) and (
                slide == Move.TOP or slide == Move.LEFT)
        # check if the move is acceptable
        acceptable: bool = acceptable_top or acceptable_bottom or acceptable_left or acceptable_right
        # if it is
        if acceptable:
            # take the piece
            piece = self._board[from_pos]
            # if the player wants to slide it to the left
            if slide == Move.LEFT:
                # for each column starting from the column of the piece and moving to the left
                for i in range(from_pos[1], 0, -1):
                    # copy the value contained in the same row and the previous column
                    self._board[(from_pos[0], i)] = self._board[(
                        from_pos[0], i - 1)]
                # move the piece to the left
                self._board[(from_pos[0], 0)] = piece
            # if the player wants to slide it to the right
            elif slide == Move.RIGHT:
                # for each column starting from the column of the piece and moving to the right
                for i in range(from_pos[1], self._board.shape[1] - 1, 1):
                    # copy the value contained in the same row and the following column
                    self._board[(from_pos[0], i)] = self._board[(
                        from_pos[0], i + 1)]
                # move the piece to the right
                self._board[(from_pos[0], self._board.shape[1] - 1)] = piece
            # if the player wants to slide it upward
            elif slide == Move.TOP:
                # for each row starting from the row of the piece and going upward
                for i in range(from_pos[0], 0, -1):
                    # copy the value contained in the same column and the previous row
                    self._board[(i, from_pos[1])] = self._board[(
                        i - 1, from_pos[1])]
                # move the piece up
                self._board[(0, from_pos[1])] = piece
            # if the player wants to slide it downward
            elif slide == Move.BOTTOM:
                # for each row starting from the row of the piece and going downward
                for i in range(from_pos[0], self._board.shape[0] - 1, 1):
                    # copy the value contained in the same column and the following row
                    self._board[(i, from_pos[1])] = self._board[(
                        i + 1, from_pos[1])]
                # move the piece down
                self._board[(self._board.shape[0] - 1, from_pos[1])] = piece
        return acceptable

2.   following note generates the possible actions, given the current game board (excluding already occupied elements)

In [26]:
def get_possible_moves(game: 'Game') -> list[tuple[tuple[int, int], Move]]:

    # possible moves:
    # - take border empty and fill the hole by moving in the 3 directions
    # - take one of your blocks on the border and fill the hole by moving in the 3 directions
    # 44 at start possible moves
    pos = set()
    for r in [0, 4]:
        for c in range(5):
            if game.get_board()[r, c] == -1:
                if r == 0 and c == 0:  # OK
                    pos.add(((c, r), Move.BOTTOM))
                    pos.add(((c, r), Move.RIGHT))
                elif r == 0 and c == 4:  # OK
                    pos.add(((c, r), Move.BOTTOM))
                    pos.add(((c, r), Move.LEFT))
                elif r == 4 and c == 0:  # OK
                    pos.add(((c, r), Move.TOP))
                    pos.add(((c, r), Move.RIGHT))
                elif r == 4 and c == 4:  # OK
                    pos.add(((c, r), Move.TOP))
                    pos.add(((c, r), Move.LEFT))
                elif r == 0:  # OK
                    pos.add(((c, r), Move.BOTTOM))
                    pos.add(((c, r), Move.LEFT))
                    pos.add(((c, r), Move.RIGHT))
                elif r == 4:  # OK
                    pos.add(((c, r), Move.TOP))
                    pos.add(((c, r), Move.LEFT))
                    pos.add(((c, r), Move.RIGHT))
    for c in [0, 4]:
        for r in range(5):
            if game.get_board()[r, c] == -1 or game.get_board()[r, c] == player:
                if r == 0 and c == 0:  # OK
                    pos.add(((c, r), Move.BOTTOM))
                    pos.add(((c, r), Move.RIGHT))
                elif r == 0 and c == 4:  # OK
                    pos.add(((c, r), Move.BOTTOM))
                    pos.add(((c, r), Move.LEFT))
                elif r == 4 and c == 0:  # OK
                    pos.add(((c, r), Move.TOP))
                    pos.add(((c, r), Move.RIGHT))
                elif r == 4 and c == 4:  # OK
                    pos.add(((c, r), Move.TOP))
                    pos.add(((c, r), Move.LEFT))
                elif c == 0:
                    pos.add(((c, r), Move.TOP))
                    pos.add(((c, r), Move.RIGHT))
                    pos.add(((c, r), Move.BOTTOM))
                elif c == 4:
                    pos.add(((c, r), Move.TOP))
                    pos.add(((c, r), Move.LEFT))
                    pos.add(((c, r), Move.BOTTOM))
    return list(pos)

g=Game()
print(get_possible_moves(g))
print(len(get_possible_moves(g)))

[((2, 0), <Move.LEFT: 2>), ((0, 3), <Move.TOP: 0>), ((3, 0), <Move.BOTTOM: 1>), ((3, 4), <Move.TOP: 0>), ((4, 1), <Move.BOTTOM: 1>), ((0, 4), <Move.TOP: 0>), ((1, 0), <Move.LEFT: 2>), ((4, 2), <Move.BOTTOM: 1>), ((4, 1), <Move.TOP: 0>), ((4, 3), <Move.BOTTOM: 1>), ((4, 2), <Move.TOP: 0>), ((3, 4), <Move.LEFT: 2>), ((2, 4), <Move.RIGHT: 3>), ((1, 4), <Move.RIGHT: 3>), ((4, 0), <Move.LEFT: 2>), ((0, 0), <Move.RIGHT: 3>), ((4, 3), <Move.TOP: 0>), ((2, 0), <Move.RIGHT: 3>), ((0, 2), <Move.RIGHT: 3>), ((4, 1), <Move.LEFT: 2>), ((3, 0), <Move.LEFT: 2>), ((4, 4), <Move.TOP: 0>), ((0, 0), <Move.BOTTOM: 1>), ((0, 1), <Move.RIGHT: 3>), ((1, 4), <Move.TOP: 0>), ((4, 2), <Move.LEFT: 2>), ((2, 0), <Move.BOTTOM: 1>), ((2, 4), <Move.TOP: 0>), ((1, 0), <Move.RIGHT: 3>), ((0, 2), <Move.BOTTOM: 1>), ((0, 1), <Move.BOTTOM: 1>), ((4, 3), <Move.LEFT: 2>), ((0, 2), <Move.TOP: 0>), ((1, 0), <Move.BOTTOM: 1>), ((0, 1), <Move.TOP: 0>), ((0, 3), <Move.RIGHT: 3>), ((3, 4), <Move.RIGHT: 3>), ((4, 4), <Move.LEFT: 

3.   Here we generate any possible position in the game along with any possible action, these are infact the output classes of our deep neural network.

In [25]:
classes = [
    ((0, 3), Move.RIGHT),
    ((3, 0), Move.BOTTOM),
    ((3, 4), Move.RIGHT),
    ((4, 2), Move.BOTTOM),
    ((0, 4), Move.RIGHT),
    ((3, 4), Move.LEFT),
    ((0, 3), Move.TOP),
    ((4, 0), Move.LEFT),
    ((3, 0), Move.RIGHT),
    ((3, 4), Move.TOP),
    ((0, 4), Move.TOP),
    ((4, 1), Move.LEFT),
    ((4, 3), Move.BOTTOM),
    ((4, 1), Move.TOP),
    ((3, 0), Move.LEFT),
    ((4, 2), Move.LEFT),
    ((4, 2), Move.TOP),
    ((4, 3), Move.LEFT),
    ((0, 0), Move.BOTTOM),
    ((1, 4), Move.RIGHT),
    ((4, 3), Move.TOP),
    ((2, 0), Move.BOTTOM),
    ((2, 4), Move.RIGHT),
    ((0, 2), Move.BOTTOM),
    ((0, 1), Move.BOTTOM),
    ((0, 0), Move.RIGHT),
    ((4, 4), Move.LEFT),
    ((4, 4), Move.TOP),
    ((1, 0), Move.BOTTOM),
    ((2, 4), Move.LEFT),
    ((1, 4), Move.LEFT),
    ((2, 0), Move.RIGHT),
    ((2, 4), Move.TOP),
    ((1, 4), Move.TOP),
    ((0, 2), Move.RIGHT),
    ((0, 1), Move.RIGHT),
    ((0, 3), Move.BOTTOM),
    ((1, 0), Move.RIGHT),
    ((2, 0), Move.LEFT),
    ((4, 0), Move.BOTTOM),
    ((0, 2), Move.TOP),
    ((0, 1), Move.TOP),
    ((1, 0), Move.LEFT),
    ((4, 1), Move.BOTTOM)
]


4.   Following is the definition of our agent, including our model architecture, make_move and train functions.

In [65]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

# Defines the neural network architecture
class ComplexDQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(ComplexDQN, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, output_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        return self.fc5(x)


# Defines constants
input_size = 5 * 5  # Size of the flattened game board
output_size = 44     # Number of possible moves (from_pos, move)
hidden_size = 64    # Size of the hidden layer in the neural network
learning_rate = 0.001
epsilon = 0.01       # Epsilon for epsilon-greedy policy
batch_size = 32     # Batch size for training the neural network
memory_size = 10000 # Size of the experience replay buffer

# Let's define Deep RL player
class DeepRLPlayer(Player):
    def __init__(self) -> None:
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = ComplexDQN(input_size, output_size).to(self.device)
        self.target_model = ComplexDQN(input_size, output_size).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.memory = deque(maxlen=memory_size)
        self.steps_done = 0
        self.gamma = 0.99  # Discount factor for future rewards

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        state = torch.tensor(game.get_board().flatten(), dtype=torch.float32).unsqueeze(0).to(self.device)

        # Choose action using epsilon-greedy policy
        if random.random() < epsilon:
            action = random.choice([Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT])
            from_pos = (random.randint(0, 4), random.randint(0, 4))
        else:
            with torch.no_grad():
                q_values = self.model(state)
                from_pos = classes[torch.argmax(q_values).item()][0]
                action = classes[torch.argmax(q_values).item()][1]
        return from_pos, action

    def train(self):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = torch.tensor(states, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions, dtype=torch.int64).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float32).to(self.device)

        current_q_values = self.model(states).gather(1, actions.unsqueeze(1))
        next_q_values = self.target_model(next_states).max(1)[0].detach()
        target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        loss = F.smooth_l1_loss(current_q_values, target_q_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        if self.steps_done % target_update == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        self.steps_done += 1

    def store_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

# Define constants
target_update = 1000  # Frequency of updating the target network

This one will be used in testing mode, skip it.

In [None]:
import os
# Define a directory to save the trained models
MODEL_DIR = 'trained_models'
os.makedirs(MODEL_DIR, exist_ok=True)

This one will be used in testing mode as well, skip it.

In [None]:
from tqdm import tqdm
model_filename = "deep_rl_player_10.pth, TrainWinRate_82.53,roundNum_ 10000, lr_, 0.001, epsilon_0.01, structure_ 256_512"

# Function to load model parameters
def load_model(model, filename):
    filepath = os.path.join(MODEL_DIR, filename)
    model.load_state_dict(torch.load(filepath))
    print(f"Model loaded from '{filepath}'")

g = Game()
player1 = RandomPlayer()
player2 = DeepRLPlayer()

# Load the trained model
load_model(player2.model, model_filename)
print("Model loaded.")

5.   Run to create the random player.

In [16]:
import random
class RandomPlayer(Player):
    def __init__(self) -> None:
        super().__init__()

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        from_pos = (random.randint(0, 4), random.randint(0, 4))
        move = random.choice([Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT])
        return from_pos, move

6.   Following code includes the training process.

In [66]:
import torch
import os
from tqdm import tqdm

# Define a directory to save the trained models
MODEL_DIR = 'trained_models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Function to save model parameters
def save_model(model, filename):
    filepath = os.path.join(MODEL_DIR, filename)
    torch.save(model.state_dict(), filepath)
    print(f"Model saved to '{filepath}'")

# Function to load model parameters
def load_model(model, filename):
    filepath = os.path.join(MODEL_DIR, filename)
    model.load_state_dict(torch.load(filepath))
    print(f"Model loaded from '{filepath}'")

# Training and saving the model
num_episodes = 10000  # Number of episodes for training

if __name__ == '__main__':
    g = Game()
    player1 = RandomPlayer()
    player2 = DeepRLPlayer()
    countWin = 0
    countLoss = 0

    for _ in tqdm(range(num_episodes), desc="Training"):
        winner = g.play(player1, player2)
        player2.train()  # Train the agent after each episode
        if winner == 1:
            countWin += 1
        else:
            countLoss += 1

    trainWinRate = (countWin/num_episodes)*100
    trainLossRate = (countLoss/num_episodes)*100
    model_filename = f"deep_rl_player_10.pth, TrainWinRate:{trainWinRate},roundNum: {num_episodes}, lr:, {learning_rate}, epsilon:{epsilon}, structure: 256_512"  # Filename to save the model
    # Save the trained model
    save_model(player2.model, model_filename)
    print("Training completed. Model saved.")

    print("TrainWinRate: ", trainWinRate, "TrainLossRate: ", trainLossRate)


Training: 100%|██████████| 10000/10000 [18:19<00:00,  9.10it/s]

Model saved to 'trained_models/deep_rl_player_10.pth, TrainWinRate:82.53,roundNum: 10000, lr:, 0.001, epsilon:0.01, structure: 256_512'
Training completed. Model saved.
TrainWinRate:  82.53 TrainLossRate:  17.47





7.  We have tested the trained model in 100 games and the result was 87% win rate.

In [77]:
g = Game()
player1 = RandomPlayer()
player2 = DeepRLPlayer()

# Load the trained model
load_model(player2.model, model_filename)
print("Model loaded.")
totalGames = 100
RgWin = 0
RgLoss = 0
for _ in tqdm(range(totalGames), desc = "Matching"):
  winner = g.play(player1, player2)
  if winner == 1:
    RgWin += 1
  elif winner == 0:
    RgLoss += 1

print("\n WinRate: ", (RgWin/totalGames)*100,"%", "LossRate: ", (RgLoss/totalGames)*100,"%")

Model loaded from 'trained_models/deep_rl_player_10.pth, TrainWinRate:82.53,roundNum: 10000, lr:, 0.001, epsilon:0.01, structure: 256_512'
Model loaded.


Matching: 100%|██████████| 100/100 [00:10<00:00,  9.36it/s]


 WinRate:  87.0 % LossRate:  13.0 %



