<a href="https://colab.research.google.com/github/IGieckI/TrAIs/blob/main/TrAIs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import copy

# Tris environment
class Tris:
    """Tic-tac-toe ("tris") board.

    The board is a 3x3 list of single-character strings: a player's sign
    (e.g. 'X' or 'O') or ' ' for an empty cell. Cells can also be addressed
    by a flat id in 0..8, laid out row by row (id = row * 3 + column).
    """

    # Every winning line as a triple of flat cell ids, in the order they are
    # examined: row i and column i for i = 0..2, then the two diagonals.
    _WIN_LINES = (
        (0, 1, 2), (0, 3, 6),
        (3, 4, 5), (1, 4, 7),
        (6, 7, 8), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        self.board = [[' ', ' ', ' '], [' ', ' ', ' '], [' ', ' ', ' ']]

    def __str__(self):
        """Render the grid with an 'A B C' column header and 1-3 row labels."""
        rendered_rows = [
            "{} {}|{}|{}".format(label, row[0], row[1], row[2])
            for label, row in enumerate(self.board, start=1)
        ]
        return "  A B C\n" + "\n  -+-+-\n".join(rendered_rows)

    def get_cell_by_id(self, cell_id):
        """Return the content of the cell with flat id `cell_id` (0..8)."""
        return self.board[cell_id // 3][cell_id % 3]

    def set_cell_by_id(self, cell_id, new_value):
        """Overwrite the cell with flat id `cell_id` (0..8) with `new_value`."""
        self.board[cell_id // 3][cell_id % 3] = new_value

    def check_winner(self):
        """Return the sign that completed a line, or ' ' if nobody has.

        The "line is not empty" guard is applied uniformly to every line.
        The previous diagonal test was written as `(A) or (B) and C`, which
        Python parses as `A or (B and C)`, so the guard only covered the
        anti-diagonal.
        """
        for first, second, third in self._WIN_LINES:
            sign = self.get_cell_by_id(first)
            if sign == ' ':
                continue
            if sign == self.get_cell_by_id(second) == self.get_cell_by_id(third):
                return sign
        return ' '

    def available_actions(self):
        """Return the flat ids of all empty cells, in ascending order."""
        return [i for i in range(9) if self.get_cell_by_id(i) == ' ']

    def step(self, action, sign):
        """Play `sign` on cell `action` and return `(new_board, reward)`.

        This board is left untouched; the move is applied to a deep copy.
        The reward is 10 when the resulting board's winner is `sign`, -10
        when the winner is a different sign, and 0 when there is no winner.

        The cell is not checked for emptiness: callers must pick `action`
        from `available_actions()` or an existing mark is overwritten.
        """
        new_board = copy.deepcopy(self)
        new_board.set_cell_by_id(action, sign)

        winner = new_board.check_winner()
        if winner == ' ':
            reward = 0
        elif winner == sign:
            reward = 10
        else:
            reward = -10

        return new_board, reward

    def get_hash(self):
        """Return the nine cells concatenated row by row, as a dict key."""
        return ''.join(cell for row in self.board for cell in row)

    def is_playing(self):
        """Return True while nobody has won and at least one cell is empty."""
        return self.check_winner() == ' ' and len(self.available_actions()) > 0

In [2]:
# The AI model itself
class TrAIs:
    """Tabular Q-learning player for the Tris board.

    The Q-table maps a board hash (`board.get_hash()`) to a numpy array of
    nine action values, one per cell id. Rows are created on first access
    and initialised to 0.6.

    Args:
        learning_rate: initial step size of the Q-value update.
        learning_rate_decay_rate: amount subtracted from the learning rate
            after every call to `train`.
        min_learning_rate: lower bound the learning rate is clamped to.
        gamma: discount applied to the best value of the next state.
        sign: the mark this player puts on the board.
    """

    def __init__(self, learning_rate, learning_rate_decay_rate, min_learning_rate, gamma, sign):
        self.sign = sign
        self.learning_rate = learning_rate
        self.learning_rate_decay_rate = learning_rate_decay_rate
        self.min_learning_rate = min_learning_rate
        self.gamma = gamma

        # board hash -> np.ndarray of 9 action values
        self.qTable = {}
        # [state, action, next_state] triples of the episode in progress
        self.history = []

    def get_qRow(self, board):
        """Return the Q-row stored for `board`, creating it if missing.

        The returned array is the stored one, not a copy, so writing to it
        updates the table.
        """
        board_hash = board.get_hash()
        row = self.qTable.get(board_hash)
        if row is None:
            row = np.full(9, 0.6)
            self.qTable[board_hash] = row
        return row

    def set_qElem(self, board, action, new_value):
        """Set the value of `action` in the Q-row of `board`.

        Going through `get_qRow` means a not-yet-seen board gets a fresh row
        instead of raising KeyError.
        """
        self.get_qRow(board)[action] = new_value

    def get_action(self, board) -> int:
        """Return the best-valued legal cell id for `board`.

        Ties between equally valued cells are broken uniformly at random.
        When the current maximum belongs to an occupied cell, that entry is
        overwritten with -100000 in the table and the draw is repeated.

        Raises:
            ValueError: if `board` has no available action. Without this
                check the retry loop below would never terminate.
        """
        available = board.available_actions()
        if not available:
            raise ValueError("get_action called on a board with no available actions")

        # get_qRow hands back the stored array, so penalties written here
        # persist in the Q-table.
        qRow = self.get_qRow(board)
        while True:
            index = np.random.choice(np.where(qRow == qRow.max())[0])
            if index in available:
                return int(index)
            qRow[index] = -100000

    # TODO: try replaying the history in reverse order
    # TODO: try using the expected value  <--- useful optimization
    def train(self, final_reward):
        """Update the Q-table from this episode's history, then reset it.

        Every recorded [state, action, next_state] triple is updated with

            Q(s, a) += lr * (final_reward + gamma * max Q(s') - Q(s, a))

        and the same `final_reward` is used for every step. When
        `next_state` is no longer playing (win or full board) there is no
        future move, so its contribution is 0 rather than the untrained 0.6
        default of a freshly created row.

        Afterwards the history is cleared and the learning rate is decayed
        by `learning_rate_decay_rate`, never dropping below
        `min_learning_rate`.
        """
        for state, action, next_state in self.history:
            current_value = self.get_qRow(state)[action]
            if next_state.is_playing():
                best_next_value = self.get_qRow(next_state).max()
            else:
                best_next_value = 0.0
            target = final_reward + self.gamma * best_next_value
            new_qValue = current_value + self.learning_rate * (target - current_value)
            self.set_qElem(state, action, new_qValue)
        self.history = []

        self.learning_rate = max(
            self.min_learning_rate,
            self.learning_rate - self.learning_rate_decay_rate,
        )

In [3]:
# Training parameters
# Number of self-play games run by the training loop.
NUM_EPISODES = 100000

# Q-learning step size: starting value, amount subtracted after every
# training call, and the floor it is clamped to.
LEARNING_RATE = 1
LEARNING_RATE_DECAY_RATE = 1e-5
MIN_LEARNING_RATE = 0.01

# Discount applied to the best value of the next state.
GAMMA = 0.9

# Probability of playing a random move: starting value, floor, and the
# amount subtracted after every episode.
EPSILON = 1.0
MIN_EPSILON = 0.005
EPSILON_DECAY_RATE = 1e-5

In [4]:
import random
import copy

# Self-play training: two agents share one board and alternate moves.
model1 = TrAIs(LEARNING_RATE, LEARNING_RATE_DECAY_RATE, MIN_LEARNING_RATE, GAMMA, 'X')
model2 = TrAIs(LEARNING_RATE, LEARNING_RATE_DECAY_RATE, MIN_LEARNING_RATE, GAMMA, 'O')

current_episode = 0
current_epsilon = EPSILON

m1_wins = 0
m2_wins = 0
ties = 0

# Counters of individual moves (not episodes) chosen at random / by a model.
random_takes, non_random_takes = 0, 0

while current_episode < NUM_EPISODES:
    board = Tris()
    total_reward1, total_reward2 = 0, 0
    # Coin flip: True means model1 moves first.
    turn = random.uniform(0, 1) < 0.5
    while board.is_playing():
        mover = model1 if turn else model2

        # Epsilon-greedy: explore with a random legal move, otherwise ask
        # the model whose turn it is.
        if random.uniform(0, 1) < current_epsilon:
            action = random.choice(board.available_actions())
            random_takes += 1
        else:
            action = mover.get_action(board)
            non_random_takes += 1

        new_board, reward = board.step(action, mover.sign)
        # step() already returns a fresh copy and boards are never mutated
        # afterwards, so the history can hold them directly.
        mover.history.append([board, action, new_board])
        if turn:
            total_reward1 += reward
        else:
            total_reward2 += reward

        board = new_board
        turn = not turn

    # Each model learns from its own reward, or from the negated reward of
    # the opponent when the opponent scored.
    model1.train(total_reward1 if total_reward2 == 0 else -total_reward2)
    model2.train(total_reward2 if total_reward1 == 0 else -total_reward1)
    winner = board.check_winner()

    if winner == ' ':
        ties += 1
    elif winner == model1.sign:
        m1_wins += 1
    else:
        m2_wins += 1

    print("N.", current_episode, " | ", m1_wins, " - ", ties, " - ", m2_wins)
    current_episode += 1
    current_epsilon -= EPSILON_DECAY_RATE
    if current_epsilon < MIN_EPSILON:
        current_epsilon = MIN_EPSILON
    print("Curr. eps: ", current_epsilon, " Curr. lr: ", model1.learning_rate)

# Share of moves that were random. The counters count moves, so the
# denominator must be the total number of moves, not the number of episodes.
total_takes = random_takes + non_random_takes
randomness = random_takes / total_takes * 100 if total_takes else 0.0
print("randomness: ", randomness, "%")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Curr. eps:  0.024990000001915397  Curr. lr:  0.024990000001915397
N. 97501  |  39201  -  20820  -  37481
Curr. eps:  0.024980000001915397  Curr. lr:  0.024980000001915397
N. 97502  |  39201  -  20821  -  37481
Curr. eps:  0.024970000001915398  Curr. lr:  0.024970000001915398
N. 97503  |  39201  -  20822  -  37481
Curr. eps:  0.024960000001915398  Curr. lr:  0.024960000001915398
N. 97504  |  39201  -  20823  -  37481
Curr. eps:  0.0249500000019154  Curr. lr:  0.0249500000019154
N. 97505  |  39201  -  20824  -  37481
Curr. eps:  0.0249400000019154  Curr. lr:  0.0249400000019154
N. 97506  |  39201  -  20825  -  37481
Curr. eps:  0.0249300000019154  Curr. lr:  0.0249300000019154
N. 97507  |  39201  -  20826  -  37481
Curr. eps:  0.0249200000019154  Curr. lr:  0.0249200000019154
N. 97508  |  39201  -  20827  -  37481
Curr. eps:  0.0249100000019154  Curr. lr:  0.0249100000019154
N. 97509  |  39201  -  20828  -  37481
Curr. eps:

In [None]:
# To play against the bot

# convert player input into board cell
def convert_input(input:str) -> int:
    """Convert a typed coordinate such as "a1" into a flat cell id.

    The first character selects the column ('a', 'b', 'c' -> 0, 1, 2) and
    the second selects the row ('1', '2', '3' -> offsets 0, 3, 6); the
    result is their sum, i.e. row * 3 + column. Surrounding whitespace is
    ignored and letters are accepted in either case, matching the upper-case
    "A B C" header printed with the board. Characters after the second one
    are ignored.

    Returns:
        The cell id in 0..8, or -1 when the text is shorter than two
        characters or names an unknown column/row.
    """
    # The parameter keeps its original name `input` (which shadows the
    # builtin inside this function) so existing callers are unaffected.
    text = input.strip().lower()
    if len(text) < 2:
        return -1

    column = {'a': 0, 'b': 1, 'c': 2}.get(text[0])
    row_offset = {'1': 0, '2': 3, '3': 6}.get(text[1])
    if column is None or row_offset is None:
        return -1
    return column + row_offset

board = Tris()
# Coin flip to choose who plays first: True means the human starts.
player_turn = random.uniform(0, 1) < 0.5

# Game loop: the human plays "O", the trained model plays its own sign.
while board.is_playing():
    print(board)
    if player_turn:
        print("Player turn:")
        print("Seleziona la casella(ex:\"a1\"):")
        action = convert_input(input())
        # Ask again on unparsable input (-1) and on occupied cells;
        # step() does not check occupancy and would overwrite the mark.
        if action not in board.available_actions():
            continue
        new_board, reward = board.step(action, "O")
    else:
        print("TrAIs turn:")
        action = model1.get_action(board)
        new_board, reward = board.step(action, model1.sign)
        # Show the action values the model used for this position.
        print(model1.qTable[board.get_hash()])

    board = new_board
    player_turn = not player_turn

print(board)
winner = board.check_winner()

if winner == ' ':
    print("It's a draw!")
elif winner == model1.sign:
    print("TrAIs wins!")
else:
    print("Player wins!")

  A B C
1  | | 
  -+-+-
2  | | 
  -+-+-
3  | | 
TrAIs turn:
[-0.03697441 -0.35450383  0.33868253 -0.48470486  0.59108285 -0.19923685
  0.18682854 -0.28129871 -0.04369122]
  A B C
1  | | 
  -+-+-
2  |X| 
  -+-+-
3  | | 
Player turn:
Seleziona la casella(ex:"a1"):
