Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [1]:
import numpy as np
import hashlib as hl
import random as rd
from tqdm.auto import tqdm

In [2]:
def check_win(state):
    state=np.array(state).reshape(3,3)
     # Controllare le righe e le colonne
    for i in range(3):
        # Controllare le righe
        if all(state[i, j] == state[i, 0] and state[i, 0] != ' ' for j in range(3)):
            return True
        # Controllare le colonne
        if all(state[j, i] == state[0, i] and state[0, i] != ' ' for j in range(3)):
            return True
    # Controllare le diagonali
    if state[0, 0] == state[1, 1] == state[2, 2] and state[0, 0] != ' ':
        return True
    if state[0, 2] == state[1, 1] == state[2, 0] and state[0, 2] != ' ':
        return True
    return False

            

# Random Agent

In [3]:
class RandomPlayer:
    def __init__(self, symbol):
        self.symbol = symbol

    def make_move(self, state):
        empty_cells = np.where(np.array(state) == ' ')[0]
        # print("random player empty cells",empty_cells)
        action = np.random.choice(len(empty_cells))
        # print("random player action",action)    
        state[action] = self.symbol
        return state

# QLearningAgent 


In [10]:
def reward(state):
    if state.count(' ') == 0:
        return 0.5 #draw
    elif check_win(state)==True :
        return 10 #q-learning agent won
    else:
        # check if random player won
        shadow_state = state.copy()
        shadow_state = RandomPlayer('O').make_move(shadow_state)
        if check_win(shadow_state)==True:
            return -10 #q-learning agent lost
        else:
            return 0.1 #reward for generic move, game not ended  
        


def get_qvalue_max(qtable,state,action):
    best_act = None
    val_max = float('-inf')
    
    if action is not None and (tuple(state),action) not in qtable:
        return action,0
    else:
        for key, val in qtable.items():
            if key[0] == tuple(state) and val > val_max:
                val_max = val
                best_act = key[1]

    return best_act,val_max

def print_board(state):
    state_as_list = [list(state[i:i+3]) for i in range(0, len(state), 3)]
    for row in state_as_list:
        print(row)
    print("-----------------")
    return


In [8]:

QAGENTMODE="RANDOM"

class QLearningAgent():
    def __init__(self, symbol):
        self.symbol = symbol
        self.eps=0.4
    
    def make_move(self, state):
        # print("qply_state",state)
        possible_moves=np.where(np.array(state)==' ')[0]
        # print("board",state)
        # print("qply possible_moves",possible_moves)
        # action=np.random.choice(possible_moves)
        
        if QAGENTMODE=="EPSILON_GREEDY":
            # epsilon greedy
            if rd.random() < self.eps:
                action=np.random.choice(possible_moves)
                # print("qplayer action explorative",action)
            else:
                if all((tuple(state),val) not in qtable for val in possible_moves)==True:
                    action=np.random.choice(possible_moves)
                    # print("qplayer action explorative",action)
                else:
                    action,_=get_qvalue_max(qtable,state,None)
                    # print("qplayer action exploitative",action)

        elif QAGENTMODE=="RANDOM":
            action=np.random.choice(possible_moves)
        else:
            print("ERROR MODE NOT FOUND")
            return None
            
        hashable_state=tuple(state)

        # print(hashable_state,action)

        if (hashable_state,action) not in qtable:
            #if new state,action tuple is discovered assign as q-value 
            # a random number between -1 and 1
            qtable[(hashable_state,action)]=np.random.uniform(-1,1)
            # print("new move")
            # print("qtable",qtable)
            # print("action:",action)
            state[action]=self.symbol #update the state
            return state 
        else:
            #if a value already exists for the state, update the q-value
            state[action]=self.symbol #update the state 
            _,qvalue_max=get_qvalue_max(qtable,tuple(state),action)
            # print("already discovered")
            # print("qvalue_max",qvalue_max)
            # print("qtable",qtable)
            curr_value=qtable[(hashable_state,action)]
            qtable[(hashable_state,action)]=(1-lr)*curr_value+lr*(reward(state)+discount*qvalue_max)
            # print("updated qtable",qtable)
        return state

    def use_only_qtable(self,state):
        action,_=get_qvalue_max(qtable,state,None)
        if action is None: #new tuple state,action not found in training
            action=np.random.choice(np.where(np.array(state)==' ')[0])
        state[action]=self.symbol
        return state




# Training

In [60]:
lr=0.001
discount=0.5

qtable=dict() #key:(state,action) tuple, value: q-value

rndply= RandomPlayer('X')
qply=QLearningAgent('O')
results=[0,0,0] #draws,player1 wins,player2 wins
games=2000

for i in tqdm(range(games)):
    board=[' ',' ',' ',' ',' ',' ',' ',' ',' ']
    player=1 if games<games//2 else 0
    # print("game:{}",i+1)
    while check_win(board)==False and board.count(' ')>0:
        if player==1:
            # completly random Q-learning agent
            board=qply.make_move(board)
        else:
            board=rndply.make_move(board)
        player=1-player
        # print_board(board)
        # print("spaces left",board.count(' '))
    if board.count(' ')==0:
        # print("Game over,draw!")
        results[0]+=1
    else:
        # print("Game over,won by player",player+1,"!")
        results[player+1]+=1
print(results)
print("Q-learning agent win rate:",results[2]/games*100,"%")
print("Random player win rate:",results[1]/games*100,"%")
print("Draw rate:",results[0]/games*100,"%")
print("qtable size:",len(qtable))

  0%|          | 0/2000 [00:00<?, ?it/s]

[284, 865, 851]
Q-learning agent win rate: 42.55 %
Random player win rate: 43.25 %
Draw rate: 14.2 %
qtable size: 4906


# Play against human

In [47]:
class HumanPlayer():
    def __init__(self, symbol):
        self.symbol = symbol

    def make_move(self, state):
        while True:
            action = int(input("Choose your move, avaible moves are: " + str(np.where(np.array(state) == ' ')[0]) + "\n"))
            if state[action] == ' ':
                state[action] = self.symbol
                return state
            else:
                print("Invalid move!")

In [61]:
qply=QLearningAgent('O')
hlpy= HumanPlayer('X')

board=[' ',' ',' ',' ',' ',' ',' ',' ',' ']
while check_win(board)==False and board.count(' ')>0:
    if player==1:
        # Q-learning 4agent
        board=qply.use_only_qtable(board)
    else:
        board=hlpy.make_move(board)
    player=1-player
    print_board(board)
if board.count(' ')==0:
    print("Draw!")
elif player==1:
    print("Q-learning agent won!")
else:
    print("Human player won!")


[' ', ' ', ' ']
[' ', ' ', ' ']
['O', ' ', ' ']
-----------------
['X', ' ', ' ']
[' ', ' ', ' ']
['O', ' ', ' ']
-----------------
['X', ' ', ' ']
[' ', ' ', ' ']
['O', ' ', 'O']
-----------------
['X', ' ', 'X']
[' ', ' ', ' ']
['O', ' ', 'O']
-----------------
['X', 'O', 'X']
[' ', ' ', ' ']
['O', ' ', 'O']
-----------------
['X', 'O', 'X']
[' ', ' ', ' ']
['O', 'X', 'O']
-----------------
['X', 'O', 'X']
[' ', 'O', ' ']
['O', 'X', 'O']
-----------------
['X', 'O', 'X']
[' ', 'O', 'X']
['O', 'X', 'O']
-----------------
['X', 'O', 'X']
['O', 'O', 'X']
['O', 'X', 'O']
-----------------
Draw!
