Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10
Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))
* Reviews: Befana

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [None]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice, uniform
from copy import deepcopy
from IPython.display import clear_output
import time
import logging
import matplotlib.pyplot as plt 
import random

from tqdm.auto import tqdm
import numpy as np

## Logging configuration

In [None]:
# Route DEBUG-and-above records to a log file, each prefixed with a timestamp
# and the level name. force=True replaces any handlers already attached to the
# root logger, so re-running the cell re-applies this configuration.
logging.basicConfig(
    filename='tic_tac_toe_training_debug.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

## Parameters needed

In [None]:
# --- Number of games ---
num_episodes = 20_000  # episodes played while learning
test_game = 5_000      # games played in the statistical test

# --- Q-learning update coefficients ---
learning_rate = 0.1
discount_rate = 0.99

# --- Epsilon-greedy schedule ---
# exploration_rate starts at the maximum and is decayed exponentially towards
# the minimum after every training episode.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [None]:
# Cells of the 3x3 board, listed row by row and labelled with the entries of a
# magic square: a player owns a winning line when three of their numbers sum
# to 15.
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

# Game position: the set of numbers taken by X and the set taken by O.
State = namedtuple('State', 'x o')

## TicTacToeEnvironment

In [None]:
class TicTacToeEnvironment:
    """Tic-tac-toe environment played on a 3x3 magic square.

    Cells are identified by the numbers of ``MAGIC``; a player wins as soon as
    any three of the numbers they own sum to 15. "X" always moves first.

    Attributes set by ``reset``:
        board: ``State`` holding the set of cells taken by X and by O.
        current_player: ``"X"`` or ``"O"``, the player about to move.
        available: list of cells that are still empty.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Start a new game and return ``(board, current_player)``."""
        self.board = State(set(), set())
        self.current_player = "X"
        # Copy, so that removing played cells never mutates the shared MAGIC list.
        self.available = list(MAGIC)
        return (self.board, self.current_player)

    @staticmethod
    def _has_winning_line(cells):
        """Return True if any three numbers in *cells* sum to 15."""
        return any(sum(c) == 15 for c in combinations(cells, 3))

    def win(self, player):
        """Return True if *player* ("X", otherwise treated as O) has a winning line."""
        cells = self.board.x if player == "X" else self.board.o
        return self._has_winning_line(cells)

    def get_reward(self, done):
        """Return 1 when *done* is truthy (the mover just won), else 0."""
        return 1 if done else 0

    def render(self):
        """Print the board row by row using 'X', 'O' and '.' for empty cells."""
        for r in range(3):
            for c in range(3):
                cell = MAGIC[r * 3 + c]
                if cell in self.board.x:
                    print('X', end='')
                elif cell in self.board.o:
                    print('O', end='')
                else:
                    print('.', end='')
            print()
        print()

    def step(self, action):
        """Play *action* for the current player.

        Args:
            action: the cell number to take; must be in ``self.available``.

        Returns:
            ``(board, reward, done, info)`` where ``done`` is True when the
            mover has just won, ``reward`` is 1 in that case and 0 otherwise,
            and ``info`` is a one-element set holding a description string.
            The turn passes to the other player only when the game is not won.

        Raises:
            ValueError: if *action* is not an available cell. The position is
                left untouched in that case.
        """
        # Validate before mutating anything, so an illegal move cannot leave
        # the board and the list of available cells out of sync.
        if action not in self.available:
            raise ValueError(f"illegal move {action!r}: cell is not available")

        self.available.remove(action)
        mover_cells = self.board.x if self.current_player == "X" else self.board.o
        mover_cells.add(action)

        done = self.win(self.current_player)
        reward = self.get_reward(done)

        if done:
            info = {"The player: " + self.current_player + " wins!"}
        else:
            info = {"The player: " + self.current_player + " ,does action: " + str(action) }
            self.current_player = "O" if self.current_player == "X" else "X"

        return self.board, reward, done, info

    def compute(self, comb):
        """Return the reward (1 or 0) for owning the set of cells *comb*."""
        return self.get_reward(self._has_winning_line(comb))

    def evaluate_possible_outcome(self, new_state):
        """Score every reply O could make from *new_state*.

        For each available cell, O's cells plus that cell are checked for a
        winning line.

        Returns:
            A list with one entry per available cell (in ``self.available``
            order): 1 if taking that cell would win for O, else 0. The list is
            empty when no cell is available.
        """
        combs = [new_state.o.union({available_action}) for available_action in self.available]
        possible_outcomes_list = [self.compute(comb) for comb in combs]
        # Diagnostics go to the log rather than stdout: this method runs on
        # every training step.
        logging.debug('possible_outcomes:  %s', combs)
        logging.debug('possible_outcomes value:  %s', possible_outcomes_list)
        return possible_outcomes_list
        


## Generate the Q-table

In [None]:
# Q-table: maps a state key to a list of nine action values, created on first
# access and initialised to zero.
# Alternative initialisation (disabled): small Gaussian noise with standard
# deviation 0.01 instead of zeros, i.e.
#   defaultdict(lambda: [random.gauss(0, 0.01) for _ in range(9)])
q_table = defaultdict(lambda: [0.0 for _ in range(9)])

## Q-Learning algorithm vs random player
* X is the smart agent
* O is the random player

In [None]:
env = TicTacToeEnvironment()
rewards_all_episodes = []  # reward collected by X in each training episode

# Q-learning: X is the learning agent (epsilon-greedy), O plays uniformly at random.
# NOTE(review): Q-table keys are str() of a State of two sets. Set iteration
# order may depend on insertion order, so the same position could map to more
# than one key -- confirm whether a canonical key is wanted (the evaluation
# cells use the same str() keys, so any change must be made everywhere).
for episode in tqdm(range(num_episodes)):
    state, player = env.reset()
    # env.step mutates the board in place, so keep a private snapshot of the
    # position *before* the move to use as the Q-table key.
    current_state = deepcopy(state)

    rewards_current_episode = 0

    while True:
        logging.debug(f"\n*********** Player: {env.current_player} turn!********\n")
        logging.debug(f'\navailable: {env.available}')

        if env.current_player == "X":
            state_str = str(current_state)

            # Exploration-exploitation trade-off: exploit with probability
            # (1 - exploration_rate), otherwise play a random available cell.
            exploration_rate_threshold = uniform(0, 1)

            if exploration_rate_threshold > exploration_rate:
                # Only empty cells can be chosen. Cells are numbered 1..9,
                # while each Q-table row is indexed 0..8.
                available_actions = [a - 1 for a in env.available]
                available_q_values = [q_table[state_str][a] for a in available_actions]
                action = available_actions[np.argmax(available_q_values)] + 1
            else:
                action = choice(list(env.available))

            new_state, reward, done, info = env.step(action)

            logging.debug(f'current:  {current_state}')
            logging.debug(info)
            logging.debug(f'reward:  {reward}')
            logging.debug(f'new:  {new_state}')

            if done:
                # X has just won: the episode is over, so there is no opponent
                # reply to discount and the target is the reward alone.
                future_value = 0.0
            else:
                # Estimate the future as minus the best immediate outcome O
                # can reach with its reply (0 when the board is full).
                outcomes_from_future = env.evaluate_possible_outcome(new_state)
                future_value = -1.0 * max(outcomes_from_future, default=0)

            action_index = action - 1
            q_values = q_table[state_str]
            q_values[action_index] = (
                q_values[action_index] * (1 - learning_rate)
                + learning_rate * (reward + discount_rate * future_value)
            )

            rewards_current_episode += reward

        else:
            action = choice(list(env.available))
            new_state, reward, done, info = env.step(action)
            logging.debug(f'current:  {current_state}')
            logging.debug(info)
            logging.debug(f'new:  {new_state}')

        current_state = deepcopy(new_state)

        # Stop on a win or when the board is full (draw).
        if done or not env.available:
            break

    # Exploration rate decay
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

    rewards_all_episodes.append(rewards_current_episode)

## Final Q-Table

In [None]:
# Dump every state key stored in the Q-table (left-aligned in a 40-character
# column) followed by its list of action values.
for state_key, action_values in q_table.items():
    print(f"{state_key:<40} \t{action_values}")

## Calculate and print the average reward per thousand episodes

In [None]:
# Average reward collected by the learning agent over consecutive groups of
# episodes. Slicing (instead of np.split with a float section count) also
# handles an episode count that is not a multiple of the group size: the last
# group is then simply shorter.
episodes_per_group = 1000
all_rewards = np.asarray(rewards_all_episodes)
rewards_per_thousand_episodes = [
    all_rewards[start:start + episodes_per_group]
    for start in range(0, len(all_rewards), episodes_per_group)
]

x_axis = []  # number of episodes played at the end of each group
y_axis = []  # mean reward over the episodes of that group
count = 0
print("********Average reward per thousand episodes*******\n")
for r in rewards_per_thousand_episodes:
    count += len(r)
    # mean() avoids the rounding error accumulated by summing r/1000 term by term.
    avg = float(r.mean())
    print(count, " --> ", str(avg))
    x_axis.append(count)
    y_axis.append(avg)

## Average reward graph

In [None]:

# Line chart of the per-group average reward against the number of episodes
# played, drawn on the current axes.
ax = plt.gca()
ax.plot(x_axis, y_axis)
ax.set_xlabel('episodes')
ax.set_ylabel('Average reward per thousand episodes')
ax.set_title('Average reward over groups of 1000 episodes')
plt.show()

## Statistical Test

In [None]:
# Outcome tallies over the test games: X is the greedy Q-table player, O plays
# uniformly at random.
count_x = count_o = count_draw = 0

for episode in range(test_game):
    state, player = env.reset()
    current_state = deepcopy(state)

    finished = False
    while not finished:
        state_str = str(current_state)

        if env.current_player != "X":
            action = choice(list(env.available))
        else:
            # Greedy move: best Q-value among the empty cells (cells are
            # numbered 1..9, Q-table rows are indexed 0..8).
            available_actions = [cell - 1 for cell in env.available]
            q_row = q_table[state_str]
            available_q_values = [q_row[idx] for idx in available_actions]
            action = available_actions[np.argmax(available_q_values)] + 1

        new_state, reward, done, info = env.step(action)

        # The game ends on a win or when the board is full.
        finished = bool(done or not env.available)
        if not finished:
            current_state = deepcopy(new_state)
            continue

        # After a winning move env.current_player is still the winner.
        if reward == 1 and env.current_player == "X":
            count_x += 1
        elif reward == 1 and env.current_player == "O":
            count_o += 1
        else:
            count_draw += 1


## Test result

In [None]:
# Report each outcome as a percentage of the games played in the statistical test.
x_win_pct = count_x/test_game *100
o_win_pct = count_o/test_game *100
draw_pct = count_draw/test_game*100
print(f"Q-learning agent wins {x_win_pct}%\nRandom player wins: {o_win_pct}%\nDrawn: {draw_pct}%")

## Graphical Test

In [None]:

# Play three games move by move, redrawing the board after every move.
# X follows the Q-table greedily, O plays uniformly at random.
for episode in range(3):
    state, player = env.reset()
    current_state = deepcopy(state)

    print("*****Episode ", episode+1, "******\n\n")
    time.sleep(1.5)

    finished = False
    while not finished:
        clear_output(wait=True)

        print(f"\n*********** Player: {env.current_player} turn!********\n")
        print('available: ', env.available)

        state_str = str(current_state)
        if env.current_player != "X":
            action = choice(list(env.available))
        else:
            # Greedy move: best Q-value among the empty cells (cells are
            # numbered 1..9, Q-table rows are indexed 0..8).
            available_actions = [cell - 1 for cell in env.available]
            q_row = q_table[state_str]
            available_q_values = [q_row[idx] for idx in available_actions]
            action = available_actions[np.argmax(available_q_values)] + 1

        new_state, reward, done, info = env.step(action)
        print(info)
        print(current_state)
        print(new_state)
        env.render()
        time.sleep(3)

        # The game ends on a win or when the board is full.
        finished = bool(done or not env.available)
        if not finished:
            current_state = deepcopy(new_state)
            continue

        # After a winning move env.current_player is still the winner.
        if reward == 1 and env.current_player == "X":
            verdict = "****X wins the game ****"
        elif reward == 1 and env.current_player == "O":
            verdict = "****O wins the game****"
        else:
            verdict = "****Draw****"
        print(verdict)
        time.sleep(2)

