Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

# Tic Tac Toe Game
Here the game definition

In [1]:
from dataclasses import dataclass

import numpy as np


@dataclass
class Tic_Tac_Toe:
    
    board: np.ndarray
    actual_player: int
    state: int
    
    def __init__(self):
        self.board = np.ones((3, 3), dtype=np.int8) * -1
        self.actual_player = 0
        self.state = -1
        
    def possible_moves(self):
        return np.argwhere(self.board == -1)
    
    def reset(self):
        self.board = np.ones((3, 3), dtype=np.int8) * -1
        self.state = -1
        self.actual_player = 0
    
    def move(self, position, player):
        if player != self.actual_player:
            raise ValueError(f"Wrong player making the move")
        reward = .1
        self.actual_player = 1-player
        self.board[position] = player
        self.state = self.check_winner()
        if len(np.argwhere(self.board == -1)) == 0:
            self.state = 2
        return reward, self.state
        
    def check_winner(self) -> int:
        for i in range(3):
            if self.board[i][0] == self.board[i][1] == self.board[i][2] != -1:
                return self.board[i][0]
            
            if self.board[0][i] == self.board[1][i] == self.board[2][i] != -1:
                return self.board[0][i]

        if self.board[0][0] == self.board[1][1] == self.board[2][2] != -1:
            return self.board[0][0]

        if self.board[0][2] == self.board[1][1] == self.board[2][0] != -1:
            return self.board[0][2]

        return -1
    
    @staticmethod
    def convert_number(number):
        if number == -1:
            return "-"
        elif number == 0:
            return "0"
        else:
            return "X"
    
    def print_board(self):
        for i in range(3): 
            print(f"{self.convert_number(self.board[i][0])}  {self.convert_number(self.board[i][1])}  {self.convert_number(self.board[i][2])}")
            print()
        print()
    
    def print_state(self):
        if self.state == 2:
            print("Draw")
        elif self.state == 0:
            print("Player 0 wins")
        elif self.state == 1:
            print("Player 1 wins")
        else:   
            print("Play in progress")

# Random Agent
A simple random agent that makes a random move considering the possible ones

In [2]:
from random import choice


class RandomAgent:
    
    player_number: int

    def __init__(self, player_number: int):
        self.player_number = player_number
    
    def move(self, state, possible_moves):
        return tuple(choice(possible_moves))

# Q Agent
The agent use a q learning approach
It uses some utility function to convert the states of the game in a string to memorize in the q table dictionary
It does the same for the action to convert them from 2 dimensional to 1 dimensional ((2, 1) -> 8)
It can use 3 different strategies to performa a move
- An epsilon greedy policy that use the exploration rate to decide if take a random action or to choose the best action based on the actual data in
- A UCB policy that computes a value for each action based on the number of times the action were used on the total number of action performed, and then take the max values
- A Boltzmann or softmax policy that computes a softmax vector representing a probability distribution of the q values for the actual state and perform a choice based on the probabilities
For every move at training time, it uses a random variable generation to choose if take a random move or to choose the best action based on the actual data in the q table
After each episode, it adopts an exponential decrementation of the exploration rate
The update in the Q table is based on this formula:
$$
Q(s_t, a_t) \leftarrow (1 - \alpha) * Q(s_t, a_t) + \alpha * ( R_{t+1} + \gamma * ( - \max_a Q(s_{t+1}, a) ) )
$$
The minus sign in the above formula is used because the next state is the opponent state
Note: I take this idea from [Davide Vitabile](https://github.com/Vitabile/Computational-Intelligence/tree/main)


In [3]:
from copy import deepcopy
import base64
from typing import Dict, Tuple
from random import random

REWARD_WIN = 1
REWARD_DRAW = 0
REWARD_LOSE = -1


class Q_Agent:
    player_number: int
    learning_rate: float
    discount_rate: float
    exploration_rate: float
    min_exploration_rate: float
    exploration_decay: float
    q_table: Dict[Tuple, float]
    exploration_strategy: int
    
    
    def __init__(self, learning_rate: float, discount_rate: float, exploration_rate: float, min_exploration_rate: float, exploration_decay: float, opponent, exploration_strategy: int):
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.exploration_rate = exploration_rate
        self.min_exploration_rate = min_exploration_rate
        self.exploration_decay = exploration_decay
        self.q_table = {}
        self.opponent = opponent
        self.exploration_strategy = exploration_strategy
    
    def convert_state(self, state):
        return "".join(str(_) for _ in state.flatten())
    
    def convert_action(self, action):
        return action[0]*3+action[1]
        
    def move(self, state, possible_moves):
        converted_state = self.convert_state(state)
        if converted_state not in self.q_table:
                self.q_table[converted_state] = np.zeros((9,))
        if self.exploration_strategy == 0:
            if random() < self.exploration_rate:
                return tuple(choice(possible_moves))
            else:
                possible_moves = [self.convert_action(action) for action in possible_moves] 
                possible_values = [self.q_table[converted_state][action]  for action in possible_moves]
                max_value = max(possible_values)
                best_moves = [action for action, value in zip(possible_moves, possible_values) if value == max_value]
                move = choice(best_moves)
                return move // 3, move % 3    
        elif self.exploration_strategy == 1:
            if random() < self.exploration_rate:
                return tuple(choice(possible_moves))
            else:
                possible_moves = [self.convert_action(action) for action in possible_moves] 
                ucb_values = [self.q_table[converted_state][action] + np.sqrt(2 * np.log(len(possible_moves)) /
                                  max(1, np.sum(self.q_table[converted_state][action])))
                                for action in possible_moves]
                move = possible_moves[np.argmax(ucb_values)]
                return move // 3, move % 3
        else:
            possible_moves = [self.convert_action(action) for action in possible_moves]
            possible_values = [self.q_table[converted_state][action]  for action in possible_moves]
            max_value = np.max(possible_values)
            scaled_values = [val - max_value for val in possible_values]
            exp_values = np.exp(np.array(scaled_values) / self.exploration_rate)
            boltzmann_probs = exp_values / np.sum(exp_values)
            chosen_action_index = np.random.choice(len(possible_moves), p=boltzmann_probs)
            move = possible_moves[chosen_action_index]
            return move // 3, move % 3
        
    def get_game_reward(self, winner):
        if winner == self.player_number:
            return REWARD_WIN
        elif winner == 2:
            return REWARD_DRAW
        else:
            return REWARD_LOSE
    
    def update_q_table(self, prev_state, action, reward, next_state):
        action = self.convert_action(action)
        if next_state not in self.q_table:
            self.q_table[next_state] = np.zeros((9,))
        if prev_state not in self.q_table:
            self.q_table[prev_state] = np.zeros((9,))
        self.q_table[prev_state][action] = ((1-self.learning_rate) * self.q_table[prev_state][action] + 
                                            self.learning_rate * (reward + self.discount_rate * (-np.max(self.q_table[next_state]))))
        
    def train(self, n_episodes, first_Player=True):
        game = Tic_Tac_Toe()
        self.player_number = 0 if first_Player else 1
        self.opponent.player_number = 1-self.player_number 
        players = [self, self.opponent]
        turn = 0 if first_Player else 1
        for episode in range(n_episodes):
            while game.state == -1:
                possible_moves = game.possible_moves()
                actual_state = self.convert_state(game.board)
                if game.actual_player == self.player_number:
                    action = players[turn].move(game.board, possible_moves)
                    reward, game_state = game.move(action, players[turn].player_number)
                    next_state = self.convert_state(game.board)
                    self.update_q_table(actual_state, action, reward, next_state)
                else:
                    game.move(players[turn].move(game.board, possible_moves), players[turn].player_number)
                turn = 1-turn
            turn = 0 if first_Player else 1
            game_reward = self.get_game_reward(game_state)
            self.update_q_table(actual_state, action, game_reward, next_state)
            game.reset()  
            self.exploration_rate = np.clip(
                np.exp(-self.exploration_decay * episode), self.min_exploration_rate, 1
            )
            
    def train_backprop(self, n_episodes, first_Player=True):
        game = Tic_Tac_Toe()
        self.player_number = 0 if first_Player else 1
        self.opponent.player_number = 1-self.player_number 
        players = [self, self.opponent]
        turn = 0 if first_Player else 1
        for episode in range(n_episodes):
            states_action_traversed = []
            while game.state == -1:
                possible_moves = game.possible_moves()
                actual_state = self.convert_state(game.board)
                if game.actual_player == self.player_number:
                    action = players[turn].move(game.board, possible_moves)
                    states_action_traversed.append((actual_state, action))
                    _, game_state = game.move(action, players[turn].player_number)
                    next_state = self.convert_state(game.board)
                    #self.update_q_table(actual_state, action, reward, next_state)
                else:
                    game.move(players[turn].move(game.board, possible_moves), players[turn].player_number)
                turn = 1-turn
            turn = 0 if first_Player else 1
            game_reward = self.get_game_reward(game_state)
            self.update_q_table(actual_state, action, game_reward, next_state)
            for state, action in states_action_traversed[::-1]:
                self.update_q_table(state, action, game_reward, next_state)
            game.reset()  
            self.exploration_rate = np.clip(
                np.exp(-self.exploration_decay * episode), self.min_exploration_rate, 1
            )
            
    def train_backprop_incremental(self, n_episodes, first_Player=True):
        game = Tic_Tac_Toe()
        self.player_number = 0 if first_Player else 1
        self.opponent.player_number = 1-self.player_number 
        players = [self, self.opponent]
        turn = 0 if first_Player else 1
        for episode in range(n_episodes):
            states_action_traversed = []
            while game.state == -1:
                possible_moves = game.possible_moves()
                actual_state = self.convert_state(game.board)
                if game.actual_player == self.player_number:
                    action = players[turn].move(game.board, possible_moves)
                    states_action_traversed.append((actual_state, action))
                    _, game_state = game.move(action, players[turn].player_number)
                    next_state = self.convert_state(game.board)
                    #self.update_q_table(actual_state, action, reward, next_state)
                else:
                    game.move(players[turn].move(game.board, possible_moves), players[turn].player_number)
                turn = 1-turn
            turn = 0 if first_Player else 1
            game_reward = self.get_game_reward(game_state)
            self.update_q_table(actual_state, action, game_reward, next_state)
            reward_step_decrement = game_reward / len(states_action_traversed)
            for state, action in states_action_traversed[::-1]:
                self.update_q_table(state, action, game_reward, next_state)
                game_reward -= reward_step_decrement if game_reward > 0 else -reward_step_decrement
            game.reset()  
            self.exploration_rate = np.clip(
                np.exp(-self.exploration_decay * episode), self.min_exploration_rate, 1
            )

In [4]:
def match(a1, a2):
    game = Tic_Tac_Toe()
    players = [a1, a2]
    i = 1
    while game.state == -1:
        i = 1-i
        action = players[i].move(game.board, game.possible_moves())
        game.move(action, players[i].player_number)
    return game.state

def test_agent(agent, opponent, n_match, first_Player=True):
    victories = 0
    draws = 0
    players = [agent, opponent]
    turn = 0 if first_Player else 1
    agent.player_number = 0 if first_Player else 1
    opponent.player_number = 1 - agent.player_number
    for _ in range(n_match):
        result = match(players[turn], players[1-turn])
        if (result == 0 and first_Player) or (result == 1 and not first_Player):
            victories += 1
        if result == 2:
            draws += 1
    print(f"Wins: {victories}, draws: {draws} over {N_MATCHES}")
    print(f"Explored states: {len(agent.q_table)}")
    return victories, draws

Here i trained 3 agents as first player, each one with a different strategies to perform an action 

In [1228]:
agent_greedy = Q_Agent(learning_rate=0.2, 
                  discount_rate=0.8, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=0)
agent_greedy.train(500000)
print(f"Exploration rate: {agent_greedy.exploration_rate}")
N_MATCHES = 10000
agent_greedy.exploration_rate = 0
test_agent(agent_greedy, RandomAgent(1), N_MATCHES)

Exploration rate: 0.22313082953991437
Wins: 7554, draws: 1307 over 10000
Explored states: 5162


(7554, 1307)

In [1216]:
agent_ucb = Q_Agent(learning_rate=0.1, 
                  discount_rate=0.99, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(0),
                  exploration_strategy=1)
agent_ucb.train(2000000)
N_MATCHES = 10000
agent_ucb.exploration_rate = 0
test_agent(agent_ucb, RandomAgent(1), N_MATCHES)

Wins: 8383, draws: 547 over 10000
Explored states: 5162


(8383, 547)

In [5]:
agent_boltzmann = Q_Agent(learning_rate=0.1, 
                  discount_rate=0.99, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(0),
                  exploration_strategy=2)
agent_boltzmann.train(1000000)

In [9]:
N_MATCHES = 10000
agent_boltzmann.exploration_rate = 0.00001
test_agent(agent_boltzmann, RandomAgent(1), N_MATCHES)

Wins: 7587, draws: 1242 over 10000
Explored states: 5162


(7587, 1242)

For the victories as first player the best agent is the ucb but not for the loss since it does fewer draws and more losses

In [1207]:
N_MATCHES = 10000
agent_greedy.exploration_rate = 0
test_agent(agent_greedy, RandomAgent(1), N_MATCHES)

Wins: 7256, draws: 1468 over 10000
Explored states: 5478


(7256, 1468)

In [1146]:
N_MATCHES = 10000
agent_ucb.exploration_rate = 0
test_agent(agent_ucb, RandomAgent(1), N_MATCHES)

Wins: 8473, draws: 484 over 10000
Explored states: 5478


(8473, 484)

In [1148]:
N_MATCHES = 10000
agent_boltzmann.exploration_rate = 0.0001
wins, draws = test_agent(agent_boltzmann, RandomAgent(1), N_MATCHES)

Wins: 7532, draws: 1253 over 10000
Explored states: 5474


Here i trained 3 agents as second player, each one with a different strategies to perform an action 

In [1199]:
agent_greedy_2_player = Q_Agent(learning_rate=0.1,
                       discount_rate=0.99,
                       exploration_rate=1,
                       min_exploration_rate=0.01,
                       exploration_decay=3e-6,
                       opponent=RandomAgent(1),
                       exploration_strategy=0)
agent_greedy_2_player.train(1000000, first_Player=False)
N_MATCHES = 10000
agent_greedy_2_player.exploration_rate = 0
test_agent(agent_greedy_2_player, RandomAgent(1), N_MATCHES, first_Player=False)

Wins: 5300, draws: 1976 over 10000
Explored states: 5477


(5300, 1976)

In [None]:
agent_ucb_2_player = Q_Agent(learning_rate=0.1,
                    discount_rate=0.99,
                    exploration_rate=1,
                    min_exploration_rate=0.01,
                    exploration_decay=3e-6,
                    opponent=RandomAgent(0),
                    exploration_strategy=1)
agent_ucb_2_player.train(1000000,  first_Player=False)
N_MATCHES = 10000
agent_ucb_2_player.exploration_rate = 0
test_agent(agent_ucb_2_player, RandomAgent(1), N_MATCHES)

In [None]:
agent_boltzmann_2_player = Q_Agent(learning_rate=0.1,
                          discount_rate=0.99,
                          exploration_rate=1,
                          min_exploration_rate=0.01,
                          exploration_decay=3e-6,
                          opponent=RandomAgent(0),
                          exploration_strategy=2)
agent_boltzmann_2_player.train(1000000, first_Player=False)
N_MATCHES = 10000
agent_boltzmann_2_player.exploration_rate = 0.00001
test_agent(agent_boltzmann_2_player, RandomAgent(1), N_MATCHES)

In [None]:
agent_greedy = Q_Agent(learning_rate=0.1, 
                  discount_rate=0.99, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=0)
agent_greedy.train(10)

In [1198]:
agent_greedy.exploration_rate = 0
test_agent(agent_greedy, RandomAgent(1), N_MATCHES)

Wins: 7407, draws: 1397 over 10000
Explored states: 5478


(7407, 1397)

In [7]:
agent_greedy = Q_Agent(learning_rate=0.2, 
                  discount_rate=0.8, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=0)
agent_greedy.train_backprop(500000)
N_MATCHES = 10000
agent_greedy.exploration_rate = 0
test_agent(agent_greedy, RandomAgent(1), N_MATCHES)

Wins: 9682, draws: 231 over 10000
Explored states: 3925


(9682, 231)

In [8]:
agent_greedy = Q_Agent(learning_rate=0.2, 
                  discount_rate=0.8, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=1)
agent_greedy.train_backprop(500000)
N_MATCHES = 10000
agent_greedy.exploration_rate = 0
test_agent(agent_greedy, RandomAgent(1), N_MATCHES)

Wins: 9283, draws: 621 over 10000
Explored states: 3925


(9283, 621)

In [10]:
agent_greedy = Q_Agent(learning_rate=0.2, 
                  discount_rate=0.8, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=2)
agent_greedy.train_backprop(1000000)
N_MATCHES = 10000
agent_greedy.exploration_rate = 0.00001
test_agent(agent_greedy, RandomAgent(1), N_MATCHES)

Wins: 9846, draws: 154 over 10000
Explored states: 3925


(9846, 154)

In [13]:
agent_greedy = Q_Agent(learning_rate=0.2, 
                  discount_rate=0.8, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=2)
agent_greedy.train_backprop(1000000, first_Player=False)
N_MATCHES = 10000
agent_greedy.exploration_rate = 0.00001
test_agent(agent_greedy, RandomAgent(1), N_MATCHES, first_Player=False)

Wins: 9223, draws: 279 over 10000
Explored states: 3991


(9223, 279)

In [9]:
agent_greedy = Q_Agent(learning_rate=0.2, 
                  discount_rate=0.8, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=2)
agent_greedy.train_backprop_incremental(1000000)
N_MATCHES = 10000
agent_greedy.exploration_rate = 0.00001
test_agent(agent_greedy, RandomAgent(1), N_MATCHES)

Wins: 9955, draws: 45 over 10000
Explored states: 3925


(9955, 45)

In [10]:
agent_greedy = Q_Agent(learning_rate=0.2, 
                  discount_rate=0.8, 
                  exploration_rate= 1, 
                  min_exploration_rate=0.01, 
                  exploration_decay= 3e-6,
                  opponent=RandomAgent(1),
                  exploration_strategy=2)
agent_greedy.train_backprop_incremental(1000000, first_Player=False)
N_MATCHES = 10000
agent_greedy.exploration_rate = 0.00001
test_agent(agent_greedy, RandomAgent(1), N_MATCHES, first_Player=False)

Wins: 9180, draws: 160 over 10000
Explored states: 3991


(9180, 160)