<a href="https://colab.research.google.com/github/IGieckI/TrAIs/blob/main/TrAIs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
import numpy as np

# Tris environment
class Tris:
    """Tic-tac-toe ("tris") board used as the training environment.

    The board is a 3x3 list of one-character strings. An empty cell holds
    ``' '``; any other value is the sign of the player occupying it. Cells
    are addressed by a flat id in ``0..8`` in row-major order
    (``row = cell_id // 3``, ``col = cell_id % 3``).
    """

    def __init__(self):
        # Build three independent row lists so rows never alias each other.
        self.board = [[' '] * 3 for _ in range(3)]

    def get_cell_by_id(self, cell_id):
        """Return the content of the cell with flat index ``cell_id``."""
        return self.board[cell_id // 3][cell_id % 3]

    def set_cell_by_id(self, cell_id, new_value):
        """Overwrite the cell with flat index ``cell_id`` with ``new_value``."""
        self.board[cell_id // 3][cell_id % 3] = new_value

    def check_winner(self):
        """Return the sign that owns a complete line, or ``' '`` if none does.

        Empty lines are skipped explicitly: three blank cells compare equal
        to each other, and treating that as a "match" would end the scan
        early and hide a real win on a later row, column or diagonal.
        """
        grid = self.board
        lines = []
        for i in range(3):
            lines.append((grid[i][0], grid[i][1], grid[i][2]))  # row i
            lines.append((grid[0][i], grid[1][i], grid[2][i]))  # column i
        lines.append((grid[0][0], grid[1][1], grid[2][2]))      # main diagonal
        lines.append((grid[0][2], grid[1][1], grid[2][0]))      # anti-diagonal

        for first, second, third in lines:
            if first != ' ' and first == second == third:
                return first
        return ' '

    def available_actions(self):
        """Return the flat ids of all empty cells, in ascending order."""
        return [i for i in range(9) if self.get_cell_by_id(i) == ' ']

    def step(self, action, sign):
        """Place ``sign`` on cell ``action`` and score the resulting position.

        Returns:
            A ``(board, reward)`` tuple. ``board`` is this very instance
            (mutated in place, not a copy). ``reward`` is ``1`` if ``sign``
            now owns a complete line, ``-1`` if another sign does, and ``0``
            when there is no winner.
        """
        self.set_cell_by_id(action, sign)

        winner = self.check_winner()

        if winner == ' ':
            reward = 0
        elif winner == sign:
            reward = 1
        else:
            reward = -1

        return self, reward

    def get_hash(self):
        """Return the nine cells concatenated row by row into one string."""
        return ''.join(cell for row in self.board for cell in row)

    def is_playing(self):
        """Return True while nobody has won and at least one cell is empty."""
        return self.check_winner() == ' ' and len(self.available_actions()) > 0

In [78]:
# The AI model itself
class TrAIs:
    """Tabular Q-learning agent that plays one sign on a board.

    Attributes are documented inline in ``__init__``. Board objects passed
    to this class only need ``get_hash()`` and ``available_actions()``.
    """

    def __init__(self, learning_rate, gamma, sign):
        self.sign = sign                    # sign this agent places on the board
        self.learning_rate = learning_rate  # step size of each Q-value update
        self.gamma = gamma                  # weight of the next state's best Q-value

        # Maps a board hash to an array of 9 Q-values, one per cell id.
        self.qTable = {}
        # Moves of the current episode as [state_before, action, state_after].
        self.history = []

    def get_qRow(self, board_hash):
        """Return the Q-value row for ``board_hash``, creating it if missing.

        New rows start with every entry at 0.6 and are stored in the table,
        so the returned array is the live row (writes to it persist).
        """
        if board_hash in self.qTable:
            row = self.qTable[board_hash]
        else:
            row = np.full(9, 0.6)
            self.qTable[board_hash] = row

        return row

    def get_action(self, board) -> int:
        """Return the available action with the highest Q-value.

        Only the actions reported by ``board.available_actions()`` are
        considered, and the Q-table is left untouched. Ties go to the first
        such action in the order the board reports them.

        Raises:
            ValueError: if the board has no available action.
        """
        legal_actions = board.available_actions()
        if not legal_actions:
            raise ValueError("no available actions on this board")

        q_row = self.get_qRow(board.get_hash())
        # Restricting the search to legal moves avoids penalising illegal
        # cells in the table, which could loop forever once every legal
        # value dropped to or below that penalty.
        return max(legal_actions, key=lambda action: q_row[action])

    # TODO: try running the training over the history in reverse order
    def train(self, final_reward):
        """Apply one Q-learning update per recorded move, then reset history.

        Every move of the episode is updated towards
        ``final_reward + gamma * max(Q[state_after])``.

        Args:
            final_reward: reward credited to every move of the episode.
        """
        for state_before, action, state_after in self.history:
            q_row = self.get_qRow(state_before.get_hash())
            best_next_value = max(self.get_qRow(state_after.get_hash()))
            current_value = q_row[action]
            q_row[action] = current_value + self.learning_rate * (
                final_reward + self.gamma * best_next_value - current_value
            )
        # Must rebind the attribute: a bare ``history = []`` would only create
        # a local and old episodes would be replayed on every later call.
        self.history = []


In [79]:
# Hyper-parameters of the training run
gamma = 0.99            # discount passed to each TrAIs agent
learning_rate = 0.1     # Q-value update step passed to each TrAIs agent
num_episodes = 100_000  # number of games played by the training loop
max_steps = 100         # NOTE(review): not referenced by the code visible here

# Exploration settings: a move is random when a uniform draw is below epsilon
decay_rate = 0.001      # amount subtracted from epsilon after every episode
min_epsilon = 0.01      # NOTE(review): not referenced by the code visible here
epsilon = 1.0           # starting probability of a random move

In [None]:
import copy
import random

# Self-play training: two agents share one board and alternate moves.
model1 = TrAIs(learning_rate, gamma, 'X')
model2 = TrAIs(learning_rate, gamma, 'O')

current_episode = 0
m1_wins = 0
m2_wins = 0
ties = 0

while current_episode < num_episodes:
    print(current_episode)
    board = Tris()
    total_reward1, total_reward2 = 0, 0
    turn = True  # True: model1 moves, False: model2 moves
    while board.is_playing():
        # Epsilon-greedy: random legal move with probability epsilon.
        if random.uniform(0, 1) < epsilon:
            action = random.choice(board.available_actions())
        else:
            action = model1.get_action(board) if turn else model2.get_action(board)

        #print("ACTION: ", turn, " ", action)

        mover = model1 if turn else model2

        # step() mutates the board in place and returns that same object, so
        # both states are snapshotted; otherwise every history entry would
        # end up pointing at the final position of the game.
        state_before = copy.deepcopy(board)
        new_board, reward = board.step(action, mover.sign)
        mover.history.append([state_before, action, copy.deepcopy(new_board)])

        # step() scores the move from the mover's side only; mirror it so the
        # player who loses is trained with a negative reward.
        if turn:
            total_reward1 += reward
            total_reward2 -= reward
        else:
            total_reward2 += reward
            total_reward1 -= reward

        board = new_board
        turn = not turn

    model1.train(total_reward1)
    model2.train(total_reward2)
    #print(board.board)
    #print(total_reward1, " <- rewards -> ", total_reward2)

    winner = board.check_winner()

    # check_winner() reports "no winner" as a single space, not an empty string.
    if winner == ' ':
        ties += 1
    elif winner == model1.sign:
        m1_wins += 1
    else:
        m2_wins += 1

    print(m1_wins, " - ", m2_wins, " - ", ties)

    current_episode += 1
    # Decay exploration but never below the configured floor (a plain
    # subtraction drives epsilon negative after 1 / decay_rate episodes).
    epsilon = max(min_epsilon, epsilon - decay_rate)
