Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [51]:
from itertools import combinations
from random import choice
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np

In [52]:
class Tic_Tac_Toe(object):
    """Tic-tac-toe game state.

    The board is a 3x3 array in which 0 marks an empty cell and 1 / 2 mark
    the cells taken by the two players. ``player`` holds the side to move.
    """

    def __init__(self):
        # 3x3 magic square: every row, column and diagonal sums to 15, so a
        # player wins when any three of their cells sum to 15.
        self.MAGIC = np.array([[2, 7, 6], [9, 5, 1], [4, 3, 8]])
        self.reset()

    def win(self, player):
        """Return True if `player` owns three cells whose magic values sum to 15."""
        owned = self.MAGIC[self.board == player]
        for triple in combinations(owned, 3):
            if sum(triple) == 15:
                return True
        return False

    def reward(self, player):
        """Return 1 if `player` has won, -1 if the opponent has won, else 0."""
        if self.win(player):
            return 1
        if self.win(3 - player):
            return -1
        return 0

    def get_state(self):
        """Return the board array itself (not a copy)."""
        return self.board

    def make_move(self, action):
        """Mark the cell at `action` for the current player and switch turns.

        `action` holds the board coordinates of the cell to mark. Returns
        True when the cell was free and the move was played, False (leaving
        the game untouched) when the cell was already taken.
        """
        if self.board[action] != 0:
            return False
        self.board[action] = self.player
        self.player = 2 if self.player == 1 else 1
        return True

    def next_step_moves(self):
        """List the (row, col) coordinates of the free cells.

        The list is empty once either player has won, so it doubles as the
        set of actions available in the next state for the Bellman update.
        """
        if self.win(1) or self.win(2):
            return []
        rows, cols = np.where(self.board == 0)
        return [cell for cell in zip(rows, cols)]

    def end_game(self):
        """Return True when no move is left (a player has won or the board is full)."""
        # next_step_moves() is already empty whenever a player has won.
        return not self.next_step_moves()

    def reset(self):
        """Clear the board and give the first move to player 1."""
        self.board = np.zeros((3, 3))
        self.player = 1


In [53]:
class Q_learning:
    """Tabular Q-learning agent with an epsilon-greedy move policy.

    The Q-table is a dictionary keyed by ``(state, move)``; a pair that has
    never been seen is inserted with value 0 the first time it is read.
    """

    def __init__(self, alpha, gamma, epsilon):
        # alpha and gamma are the step size and discount used in the Bellman
        # update; epsilon is the probability of picking a random move.
        self.table = {}
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha

    def get_epsilon(self):
        """Return the current exploration probability."""
        return self.epsilon

    def set_epsilon(self, eps):
        """Replace the exploration probability with `eps`."""
        self.epsilon = eps

    def choice_move(self, state, moves):
        """Pick a move from `moves` for `state` following the epsilon-greedy rule."""
        exploring = np.random.uniform() < self.epsilon
        if exploring:
            # Exploration: any available move, uniformly at random.
            return choice(moves)

        # Exploitation: best-valued move, ties broken at random.
        q_values = np.array([self.get_value(state, candidate) for candidate in moves])
        best = np.max(q_values)
        best_indices = np.where(q_values == best)[0]
        return moves[choice(best_indices)]

    def get_value(self, state, move):
        """Return Q(state, move), creating the entry with value 0 if it is missing."""
        return self.table.setdefault((state, move), 0)

    def update_value(self, state, move, reward, next_state, next_moves):
        """Apply one Bellman update to Q(state, move).

        `next_moves` are the moves available in `next_state`; when there are
        none, the best future value is taken as 0.
        """
        current = self.get_value(state, move)
        future = np.array([self.get_value(next_state, follow_up) for follow_up in next_moves])
        if len(future) > 0:
            best_future = np.max(future)
        else:
            best_future = 0
        target = reward + self.gamma * best_future
        self.table[(state, move)] = current + self.alpha * (target - current)

In [54]:
# Train the agent as player 1 against an opponent that always plays at random.
Q1 = Q_learning(0.5, 0.9, 1)  # alpha, gamma, initial epsilon
games = 100000
gamma_epsilon = 0.999  # multiplicative decay applied to epsilon once per game
min_eps = 0.15         # decay stops once epsilon is no longer above this value
game = Tic_Tac_Toe()

for i in tqdm(range(games)):
    # Keep a minimum amount of exploration: decay only while above the floor.
    if Q1.get_epsilon() > min_eps:
        Q1.set_epsilon(Q1.get_epsilon() * gamma_epsilon)
    game.reset()

    while not game.end_game():
        # Agent's turn: remember the state it acted from.
        state = game.get_state().copy()
        moves = game.next_step_moves()
        move = Q1.choice_move(str(state), moves)
        game.make_move(move)

        # Opponent replies at random unless the agent's move ended the game.
        if not game.end_game():
            enemy_moves = game.next_step_moves()
            enemy_move = enemy_moves[np.random.choice(range(len(enemy_moves)))]
            game.make_move(enemy_move)

        # The reward is read after the reply, so a loss caused by the
        # opponent's move is credited to the agent's last action; it is 0
        # while nobody has won.
        reward = game.reward(1)
        next_state = game.get_state().copy()
        next_moves = game.next_step_moves()
        Q1.update_value(str(state), move, reward, str(next_state), next_moves)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [55]:
# Evaluate the trained agent (player 1, moving first) against a random opponent.
# NOTE(review): Q1.choice_move still applies whatever epsilon training left
# behind, so some of the agent's moves here are random, not greedy.
wins, draws, loses = 0, 0, 0
game = Tic_Tac_Toe()
games = 2000

for i in range(games):
    game.reset()

    while not game.end_game():
        moves = game.next_step_moves()
        state = game.get_state()
        if game.player == 1:
            # Agent's turn: consult the Q-table.
            move = Q1.choice_move(str(state), moves)
        else:
            # Opponent's turn: uniformly random legal move.
            move = choice(moves)
        game.make_move(move)

    # Tally the outcome from the agent's point of view.
    if game.win(1):
        wins += 1
    elif game.win(2):
        loses += 1
    else:
        draws += 1

print(f"Playing as first:\n Number of Wins: {wins}\n Number of Draws: {draws}\n Number of Loses: {loses}\n Percentage of Wins: {wins/games *100}% \n")

Playing as first:
 Number of Wins: 1929
 Number of Draws: 44
 Number of Loses: 27
 Percentage of Wins: 96.45% 

