<a href="https://colab.research.google.com/github/IGieckI/TrAIs/blob/main/TrAIs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import numpy as np
import copy

# Tris environment
class Tris:
    def __init__(self):
        self.WIN_REWARD = 10
        self.LOSE_REWARD = -10
        self.TIE_REWARD = 0
        self.EMPTY_CELL = ' '
        self.board = [[self.EMPTY_CELL, self.EMPTY_CELL, self.EMPTY_CELL], [self.EMPTY_CELL, self.EMPTY_CELL, self.EMPTY_CELL], [self.EMPTY_CELL, self.EMPTY_CELL, self.EMPTY_CELL]]

    def __str__(self):
        return "  A B C\n" + \
            "1 {}|{}|{}\n".format(self.board[0][0], self.board[0][1], self.board[0][2]) + \
            "  -+-+-\n" + \
            "2 {}|{}|{}\n".format(self.board[1][0], self.board[1][1], self.board[1][2]) + \
            "  -+-+-\n" + \
            "3 {}|{}|{}".format(self.board[2][0], self.board[2][1], self.board[2][2])

    def get_cell_by_id(self, cell_id : int) -> str:
        return self.board[cell_id//3][cell_id%3]

    def set_cell_by_id(self, cell_id : int, sign : str):
        self.board[cell_id//3][cell_id%3] = sign

    # note: cell_ids are going from top-left(id:0) to bottom-right(id:8) goind row by row
    def check_winner(self) -> str:
        """Check if there is a winner

        Returns:
            str: return the sign of a winner, EMPTY_CELL if no one wins
        """
        for i in range(3):
            if self.board[i][0] == self.board[i][1] and self.board[i][1] == self.board[i][2] and self.board[i][0] != self.EMPTY_CELL:
                return self.board[i][0]

            if self.board[0][i] == self.board[1][i] and self.board[1][i] == self.board[2][i] and self.board[0][i] != self.EMPTY_CELL:
                return self.board[0][i]

        if (self.board[0][0] == self.board[1][1] and self.board[1][1] == self.board[2][2]) or (self.board[0][2] == self.board[1][1] and self.board[1][1] == self.board[2][0]) and self.board[1][1] != self.EMPTY_CELL:
            return self.board[1][1]
        return self.EMPTY_CELL

    def available_actions(self) -> list[int]:
        """Getter of the available actions (cells without a sign already)

        Returns:
            list[int]: a list of the available actions (cell_ids are going from top-left(id:0) to bottom-right(id:8) increasing from left to right and from top to bottom)
        """
        return [i for i in range(9) if self.get_cell_by_id(i) == self.EMPTY_CELL]

    def deep_copy(self) -> 'Tris':
        new_tris = Tris()
        new_tris.board = [row[:] for row in new_tris.board]
        return new_tris

    def step(self, action : int, sign : str) -> 'Tris, int':
        """Update the board with the given action and sign, calculating the new state and the reward

        Args:
            action (int): action took by an agent
            sign (str): sign of that agent

        Returns:
            Tris, int: The new board and the reward of the action
        """
        new_board = copy.deepcopy(self)

        new_board.set_cell_by_id(action, sign)

        winner = new_board.check_winner()

        if winner == self.EMPTY_CELL:
            reward = self.TIE_REWARD
        elif winner == sign:
            reward = self.WIN_REWARD
        else:
            reward = -self.LOSE_REWARD

        return new_board, reward

    def get_hash(self) -> str:
        """Hand made hash function for the board

        Returns:
            str: A string composed by the board state (signs on the board, following cells order)
        """
        hash_string = ''
        for c in [cell for row in self.board for cell in row]:
            hash_string += c
        return hash_string

    def is_playing(self) -> bool:
        return self.check_winner() == self.EMPTY_CELL and len(self.available_actions()) > 0

In [23]:
# The AI model itself
class TrAIs:
    def __init__(self, learning_rate : float, learning_rate_decay_rate : float, min_learning_rate : float, gamma : float, sign : str):
        self.sign = sign
        self.learning_rate = learning_rate
        self.learning_rate_decay_rate = learning_rate_decay_rate
        self.min_learning_rate = min_learning_rate
        self.gamma = gamma
        self.DEAFULT_Q_VALUE = 0.6
        self.INVALID_ACTION_VALUE = -100000

        self.qTable = {}
        self.history = []

    def get_qRow(self, board : Tris) -> list[float]:
        board_hash = board.get_hash()
        if board_hash in self.qTable:
            row = self.qTable[board_hash]
        else:
            row = np.full(9, self.DEAFULT_Q_VALUE)
            self.qTable[board_hash] = row

        return row

    def set_qElem(self, board : Tris, action : int, new_value : float):
        board_hash = board.get_hash()
        self.qTable[board_hash][action] = new_value

    def get_action(self, board : Tris) -> int:
        """Compute the best possible action for the given board

        Args:
            board (Tris): Current board state

        Returns:
            int: the action to take
        """
        board_hash = board.get_hash()
        while True:
            qRow = self.get_qRow(board)
            index = np.random.choice(np.where(qRow == qRow.max())[0])
            if index in board.available_actions():
                return index
            else:
                self.set_qElem(board, index, self.INVALID_ACTION_VALUE)

    def train(self, reward : int):
        """Update the qTable with the given reward

        Args:
            final_reward (int): reward given to the agent
        """
        for memory in self.history:
            curr_action_value = self.get_qRow(memory[0])[memory[1]]
            new_state_row = self.get_qRow(memory[2])
            new_qValue = curr_action_value + self.learning_rate * (reward + self.gamma * max(new_state_row) - curr_action_value)
            self.set_qElem(memory[0], memory[1], new_qValue)
        self.history = []

        self.learning_rate -= self.learning_rate_decay_rate
        if self.learning_rate < self.min_learning_rate:
            self.learning_rate = self.min_learning_rate

In [24]:
# Training parameters
NUM_EPISODES = 100000

GAMMA = 0.9
LEARNING_RATE = 1
LEARNING_RATE_DECAY_RATE = 0.00001
MIN_LEARNING_RATE = 0.01
EPSILON = 1.0
MIN_EPSILON = 0.005
EPSILON_DECAY_RATE = 0.00001

In [25]:
import random
import copy

# TrAIs models to train against eachother
model1 = TrAIs(LEARNING_RATE, LEARNING_RATE_DECAY_RATE, MIN_LEARNING_RATE, GAMMA, 'X')
model2 = TrAIs(LEARNING_RATE, LEARNING_RATE_DECAY_RATE, MIN_LEARNING_RATE, GAMMA, 'O')

# Local variables to compute agent learning
current_episode = 0
current_epsilon = EPSILON

while current_episode < NUM_EPISODES:
    # Initialize a new game
    board = Tris()
    total_reward1, total_reward2 = 0, 0
    turn = True if random.uniform(0, 1) < 0.5 else False

    # Start the game
    while board.is_playing():

        # Chance of taking a random choice, useful keeping this high at the beginning of the training to help the agent explore different choices
        if random.uniform(0, 1) < current_epsilon:
            action = random.choice(board.available_actions())
        else:
            action = model1.get_action(board) if turn else model2.get_action(board)

        # Apply the chosen action on the game board, taking the reward and initializing the new board
        if turn:
            new_board, reward = board.step(action, model1.sign)
            model1.history.append([copy.deepcopy(board), action, copy.deepcopy(new_board)])
            total_reward1 += reward
        else:
            new_board, reward = board.step(action, model2.sign)
            model2.history.append([copy.deepcopy(board), action, copy.deepcopy(new_board)])
            total_reward2 += reward

        board = new_board
        turn = not turn

    # Update models using the given rewards and taken actions (saved in model.history)
    model1.train(total_reward1 if total_reward2 == 0 else -total_reward2)
    model2.train(total_reward2 if total_reward1 == 0 else -total_reward1)
    winner = board.check_winner()

    current_episode += 1
    current_epsilon -= EPSILON_DECAY_RATE

    if current_epsilon < MIN_EPSILON:
        current_epsilon = MIN_EPSILON

    print(round(current_episode / NUM_EPISODES * 100, 2), "%")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
95.0 %
95.0 %
95.0 %
95.0 %
95.0 %
95.01 %
95.01 %
95.01 %
95.01 %
95.01 %
95.01 %
95.01 %
95.01 %
95.01 %
95.02 %
95.02 %
95.02 %
95.02 %
95.02 %
95.02 %
95.02 %
95.02 %
95.02 %
95.02 %
95.03 %
95.03 %
95.03 %
95.03 %
95.03 %
95.03 %
95.03 %
95.03 %
95.03 %
95.03 %
95.03 %
95.04 %
95.04 %
95.04 %
95.04 %
95.04 %
95.04 %
95.04 %
95.04 %
95.04 %
95.05 %
95.05 %
95.05 %
95.05 %
95.05 %
95.05 %
95.05 %
95.05 %
95.05 %
95.05 %
95.06 %
95.06 %
95.06 %
95.06 %
95.06 %
95.06 %
95.06 %
95.06 %
95.06 %
95.06 %
95.06 %
95.07 %
95.07 %
95.07 %
95.07 %
95.07 %
95.07 %
95.07 %
95.07 %
95.07 %
95.08 %
95.08 %
95.08 %
95.08 %
95.08 %
95.08 %
95.08 %
95.08 %
95.08 %
95.08 %
95.08 %
95.09 %
95.09 %
95.09 %
95.09 %
95.09 %
95.09 %
95.09 %
95.09 %
95.09 %
95.09 %
95.1 %
95.1 %
95.1 %
95.1 %
95.1 %
95.1 %
95.1 %
95.1 %
95.1 %
95.1 %
95.11 %
95.11 %
95.11 %
95.11 %
95.11 %
95.11 %
95.11 %
95.11 %
95.11 %
95.12 %
95.12 %
95.12 %
95.12 %
95.12 

In [28]:
# To play against the bot

# convert player input into board cell
def convert_input(input : str) -> int:
    cell = 0
    if input[0] == 'a':
        cell += 0
    elif input[0] == 'b':
        cell += 1
    elif input[0] == 'c':
        cell += 2
    else:
        return -1

    if input[1] == '1':
        cell += 0
    elif input[1] == '2':
        cell += 3
    elif input[1] == '3':
        cell += 6
    else:
        return -1
    return cell

# generate a new board
board = Tris()

# choose who play first
if random.uniform(0, 1) < 0.5:
    player_turn = True
else:
    player_turn = False

# start game
while board.is_playing():
    print(board)
    if player_turn:
        print("Player turn:")
        print("Seleziona la casella(ex:\"a1\"):")
        action = convert_input(input())
        if action == -1 or action not in board.available_actions():
            continue
        new_board, reward = board.step(action, "O")
    else:
        print("TrAIs turn:")
        action = model1.get_action(board)
        new_board, reward = board.step(action, model1.sign)

    board = new_board
    player_turn = not player_turn

print(board)

# declare winner
winner = board.check_winner()

if winner == board.EMPTY_CELL:
    print("It's a draw!")
elif winner == model1.sign:
    print("TrAIs wins!")
else:
    print("Player wins!")

  A B C
1  | | 
  -+-+-
2  | | 
  -+-+-
3  | | 
TrAIs turn:
  A B C
1  |X| 
  -+-+-
2  | | 
  -+-+-
3  | | 
Player turn:
Seleziona la casella(ex:"a1"):
b1
  A B C
1  |X| 
  -+-+-
2  | | 
  -+-+-
3  | | 
Player turn:
Seleziona la casella(ex:"a1"):
b1
  A B C
1  |X| 
  -+-+-
2  | | 
  -+-+-
3  | | 
Player turn:
Seleziona la casella(ex:"a1"):
a1
  A B C
1 O|X| 
  -+-+-
2  | | 
  -+-+-
3  | | 
TrAIs turn:
  A B C
1 O|X| 
  -+-+-
2  |X| 
  -+-+-
3  | | 
Player turn:
Seleziona la casella(ex:"a1"):
b3
  A B C
1 O|X| 
  -+-+-
2  |X| 
  -+-+-
3  |O| 
TrAIs turn:
  A B C
1 O|X| 
  -+-+-
2  |X| 
  -+-+-
3 X|O| 
Player turn:
Seleziona la casella(ex:"a1"):
c1
  A B C
1 O|X|O
  -+-+-
2  |X| 
  -+-+-
3 X|O| 
TrAIs turn:
  A B C
1 O|X|O
  -+-+-
2  |X|X
  -+-+-
3 X|O| 
Player turn:
Seleziona la casella(ex:"a1"):
a2
  A B C
1 O|X|O
  -+-+-
2 O|X|X
  -+-+-
3 X|O| 
TrAIs turn:
  A B C
1 O|X|O
  -+-+-
2 O|X|X
  -+-+-
3 X|O|X
It's a draw!
