Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

# Tic Tac Toe Game

In [770]:
from dataclasses import dataclass

import numpy as np


@dataclass
class Tic_Tac_Toe:
    """Minimal 3x3 tic-tac-toe engine for two players numbered 0 and 1.

    Board cells hold -1 (empty), 0 or 1 (the player owning the cell).
    ``state`` is -1 while the game is in progress, 0 or 1 once that player
    has won, and 2 when the board is full without a winner (draw).
    """

    board: np.ndarray
    actual_player: int
    state: int

    def __init__(self, first_player=0):
        self.board = np.ones((3, 3), dtype=np.int8) * -1
        self.actual_player = first_player
        self.state = -1

    def possible_moves(self):
        """Return an (n, 2) array with the (row, col) of every empty cell."""
        return np.argwhere(self.board == -1)

    def reset(self):
        """Clear the board and restart the game with player 0 to move."""
        self.board = np.ones((3, 3), dtype=np.int8) * -1
        self.actual_player = 0
        # Was assigned to a stray `results` attribute, which left a finished
        # game marked as finished after a reset.
        self.state = -1

    def move(self, position, player):
        """Place ``player``'s mark at ``position`` and update the game state.

        Args:
            position: ``(row, col)`` pair addressing an empty cell.
            player: number (0 or 1) of the player making the move; it must
                be the player whose turn it is.

        Returns:
            The pair ``(1, state)`` where ``state`` is the game state after
            the move (-1 in progress, 0/1 winner, 2 draw).

        Raises:
            ValueError: if the position is malformed, off the board or
                already occupied, or if it is not ``player``'s turn.
        """
        try:
            row, col = position
        except (TypeError, ValueError):
            raise ValueError(f"Position {position} not valid") from None
        # Explicit bounds + emptiness check: `position in ndarray` is an
        # element-wise test and does not compare whole (row, col) pairs.
        if not (0 <= row < 3 and 0 <= col < 3) or self.board[row, col] != -1:
            raise ValueError(f"Position {position} not valid")
        if player != self.actual_player:
            raise ValueError("Wrong player making the move")

        self.board[row, col] = player
        self.actual_player = 1 - player

        winner = self.check_winner()
        if winner != -1:
            # A win takes precedence over a full board: the ninth move can
            # complete a line and must not be reported as a draw.
            self.state = winner
        elif not (self.board == -1).any():
            self.state = 2
        else:
            self.state = -1
        return 1, self.state

    def check_winner(self) -> int:
        """Return the player owning a complete line, or -1 if there is none."""
        for i in range(3):
            if self.board[i][0] == self.board[i][1] == self.board[i][2] != -1:
                return int(self.board[i][0])

            if self.board[0][i] == self.board[1][i] == self.board[2][i] != -1:
                return int(self.board[0][i])

        if self.board[0][0] == self.board[1][1] == self.board[2][2] != -1:
            return int(self.board[0][0])

        if self.board[0][2] == self.board[1][1] == self.board[2][0] != -1:
            return int(self.board[0][2])

        return -1

    def get_board(self) -> np.ndarray:
        """Return the live board array (not a copy)."""
        return self.board

    @staticmethod
    def convert_number(number):
        """Map a cell value to its printable symbol: -1 -> "-", 0 -> "0", else "X"."""
        if number == -1:
            return "-"
        elif number == 0:
            return "0"
        else:
            return "X"

    def print_board(self):
        """Print the board, one row per line with a blank line after each row."""
        for i in range(3):
            print(f"{self.convert_number(self.board[i][0])}  {self.convert_number(self.board[i][1])}  {self.convert_number(self.board[i][2])}")
            print()
        print()

    def print_state(self):
        """Print a human-readable description of the current game state."""
        if self.state == 2:
            print("Draw")
        elif self.state == 0:
            print("Player 0 wins")
        elif self.state == 1:
            print("Player 1 wins")
        else:
            print("Play in progress")

In [771]:
from random import choice


class RandomAgent:
    """Baseline player that ignores the board and picks one of the offered moves at random."""

    player_number: int

    def __init__(self, player_number: int):
        self.player_number = player_number

    def move(self, state, possible_moves):
        """Return one entry of ``possible_moves`` as a tuple; ``state`` is unused."""
        picked = choice(possible_moves)
        return tuple(picked)

In [772]:
def match(agent1, agent2):
    """Play one full game between two agents and return the final game state.

    ``agent1`` moves first. Each agent is asked for a move via
    ``move(board, possible_moves)`` and plays under its own ``player_number``.
    """
    game = Tic_Tac_Toe()
    contenders = (agent1, agent2)
    turn = 0
    while game.state == -1:
        mover = contenders[turn]
        chosen = mover.move(game.board, game.possible_moves())
        game.move(chosen, mover.player_number)
        turn = 1 - turn  # hand the move to the other agent
    return game.state

In [773]:
# Smoke run: play 20 games between two random agents; the results are discarded.
for i in range(20):
    match(agent1=RandomAgent(0), agent2=RandomAgent(1))

In [774]:
from copy import deepcopy
import base64
from typing import Dict, Tuple
from random import random

# Rewards for the three possible game outcomes; losing mirrors winning.
REWARD_WIN = 10
REWARD_LOSE = -REWARD_WIN
REWARD_DRAW = 0


class Q_Agent:
    """Tabular Q-learning tic-tac-toe player.

    The Q-table maps a board encoded as a string (see ``convert_state``) to
    a length-9 array holding one value per cell, indexed ``row * 3 + col``.
    """

    player_number: int
    learning_rate: float
    discount_rate: float
    exploration_rate: float
    min_exploration_rate: float
    exploration_decay: float
    q_table: Dict[str, np.ndarray]

    def __init__(self, player_number: int, learning_rate: float, discount_rate: float, exploration_rate: float, min_exploration_rate: float, exploration_decay: float):
        self.player_number = player_number
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.exploration_rate = exploration_rate
        self.min_exploration_rate = min_exploration_rate
        self.exploration_decay = exploration_decay
        self.q_table = {}

    def convert_state(self, state):
        """Encode a board array as a string usable as a Q-table key."""
        return "".join(str(cell) for cell in state.flatten())

    def convert_action(self, action):
        """Flatten a ``(row, col)`` action into a cell index in ``range(9)``."""
        return action[0] * 3 + action[1]

    def _q_row(self, state_key):
        """Return the Q-values of ``state_key``, creating a zero row on first use."""
        if state_key not in self.q_table:
            self.q_table[state_key] = np.zeros((9,))
        return self.q_table[state_key]

    def move(self, state, possible_moves):
        """Choose a move epsilon-greedily.

        With probability ``exploration_rate`` a random entry of
        ``possible_moves`` is returned; otherwise the possible move with the
        highest Q-value for ``state`` (first one on ties).

        Returns:
            The chosen move as a ``(row, col)`` tuple.
        """
        q_row = self._q_row(self.convert_state(state))
        if random() < self.exploration_rate:
            return tuple(choice(possible_moves))
        candidates = [self.convert_action(action) for action in possible_moves]
        candidate_values = [q_row[cell] for cell in candidates]
        best = candidates[np.argmax(candidate_values)]
        return best // 3, best % 3

    def get_game_reward(self, winner):
        """Map a final game state to this agent's reward (2 means draw)."""
        if winner == self.player_number:
            return REWARD_WIN
        elif winner == 2:
            return REWARD_DRAW
        else:
            return REWARD_LOSE

    def update_q_table(self, prev_state, action, reward, next_state):
        """Apply one Q-learning update to ``Q[prev_state][action]``.

        ``prev_state`` and ``next_state`` are encoded state keys, ``action``
        is a ``(row, col)`` pair. Missing rows are created as zeros, so the
        method can be called for states that have not been seen before.

        The bootstrap term is negated because ``train`` passes as
        ``next_state`` the position in which the opponent is to move.
        NOTE(review): ``train`` only ever updates rows of positions where
        this agent is to move, so the bootstrap term appears to stay 0
        while training against ``RandomAgent`` -- confirm this is intended.
        """
        cell = self.convert_action(action)
        q_prev = self._q_row(prev_state)
        future_value = -np.max(self._q_row(next_state))
        q_prev[cell] = q_prev[cell] * (1 - self.learning_rate) + self.learning_rate * (reward + self.discount_rate * future_value)

    def train(self, game, n_episodes):
        """Train against a ``RandomAgent`` for ``n_episodes`` games.

        Player 0 always moves first, matching what ``game.reset()`` sets up.
        Every move of this agent is updated with the reward returned by
        ``game.move``; when the game ends, the agent's last move is updated
        once more with the outcome reward from ``get_game_reward``. After
        each episode ``exploration_rate`` is set to
        ``exp(-exploration_decay * episode)`` clipped to
        ``[min_exploration_rate, 1]``.
        """
        opponent = RandomAgent(1 - self.player_number)
        game.reset()
        for episode in range(n_episodes):
            turn = 0
            game_state = -1
            # The agent's own most recent (state, action): the outcome of the
            # game must be credited to it even when the opponent moves last.
            last_state = None
            last_action = None
            while game_state == -1:
                possible_moves = game.possible_moves()
                if turn == self.player_number:
                    last_state = self.convert_state(game.board)
                    last_action = self.move(game.board, possible_moves)
                    reward, game_state = game.move(last_action, self.player_number)
                    self.update_q_table(last_state, last_action, reward, self.convert_state(game.board))
                else:
                    opponent_action = opponent.move(game.board, possible_moves)
                    # Keep the result: the opponent's move can end the game.
                    _, game_state = game.move(opponent_action, opponent.player_number)
                turn = 1 - turn
            if last_action is not None:
                self.update_q_table(last_state, last_action, self.get_game_reward(game_state), self.convert_state(game.board))
            game.reset()
            # Assign the public attribute that move() reads; the original
            # wrote to `_exploration_rate`, so exploration never decayed.
            self.exploration_rate = float(
                np.clip(np.exp(-self.exploration_decay * episode), self.min_exploration_rate, 1)
            )

In [775]:
# Build the learning agent as player 0 and train it on a fresh game.
q_agent = Q_Agent(
    player_number=0,
    learning_rate=0.1,
    discount_rate=0.9,
    exploration_rate=1,
    min_exploration_rate=0.1,
    exploration_decay=3e-6,
)
q_agent.train(Tic_Tac_Toe(), 500_000)
# Show the exploration rate the agent is left with after training.
print(q_agent.exploration_rate)

KeyboardInterrupt: 

In [None]:
# Evaluate the trained agent: 10000 games against a random player 1.
print(f"explored states: {len(q_agent.q_table)}")

rand1 = RandomAgent(1)
q_agent.exploration_rate = 0.0  # no random moves during evaluation
wins = 0
draw = 0
for _ in range(10000):
    result = match(q_agent, rand1)
    if result == 0:
        wins += 1
    elif result == 2:
        draw += 1
print(wins)
print(draw)



