In [1]:
import matplotlib.pyplot as plt
import random
import numpy as np
import tqdm


In [2]:
class tic_tac_toe_model():
    """Square tic-tac-toe board backed by an ``n x n`` integer NumPy matrix.

    Cell values: ``0`` marks an empty cell; ``print_game`` renders ``1`` as
    "X" and ``2`` as "O".

    ``check_win`` status codes: the value of the piece that completed a line,
    ``3`` when the board is full without a winner, and ``4`` while the game
    is still open.
    """

    def __init__(self, n):
        """Create an empty board with ``n`` rows and ``n`` columns."""
        self.n = n
        self.matriz = np.full((n, n), 0, dtype=int)

    def reset_matrix(self):
        """Replace the board with a fresh all-zero (empty) matrix."""
        self.matriz = np.full((self.n, self.n), 0, dtype=int)

    def print_game(self):
        """Print the board, one row per line, each followed by a dashed rule."""
        substituicoes = {1: "X", 2: "O", 0: ' '}
        for linha in self.matriz:
            texto = " | ".join(str(substituicoes.get(valor, valor)) for valor in linha)
            print(texto)
            # The rule follows the row width (9 dashes on a 3x3 board) instead
            # of a hard-coded length, so other board sizes line up as well.
            print("-" * len(texto))

    def number_ij(self, number):
        """Convert a flat cell index into its ``(row, column)`` pair."""
        i, j = np.unravel_index(number, self.matriz.shape)
        return i, j

    def get_avaible_moves(self):
        """Return the flat indices of every empty cell as a list."""
        avaible_moves = np.ravel_multi_index(np.where(self.matriz == 0), self.matriz.shape)
        return list(avaible_moves)

    def get_random_move(self, piece):
        """Place ``piece`` on a uniformly chosen empty cell; no-op on a full board."""
        linhas_livres, colunas_livres = np.where(self.matriz == 0)
        quantidade = colunas_livres.shape[0]
        if quantidade > 0:
            # randint is inclusive on both ends, hence the "- 1"
            index = random.randint(0, quantidade - 1)
            self.move(linhas_livres[index], colunas_livres[index], piece)

    def move(self, index_i, index_j, piece):
        """Write ``piece`` into cell ``(index_i, index_j)`` if that cell is empty.

        Returns:
            bool: True when the piece was placed, False when the cell was
            already occupied (the board is left untouched in that case).
        """
        if self.matriz[index_i][index_j] != 0:
            return False
        self.matriz[index_i][index_j] = piece
        return True

    def reward_piece(self, piece):
        """Score the current position from the point of view of ``piece``.

        Returns:
            int: 1 if ``piece`` has won, -1 if another piece has won, and 0
            for a draw or a game that is still open.
        """
        w = self.check_win()
        if w == 3 or w == 4:
            return 0
        return 1 if w == piece else -1

    def check_win(self):
        """Return the status code of the current position.

        Lines are inspected in this order: main diagonal, anti-diagonal, then
        row ``i`` followed by column ``i`` for each ``i``. The first line made
        of a single non-zero value decides the winner.

        Returns:
            int: the winning piece value, 3 for a full board with no winner,
            or 4 when empty cells remain and nobody has won.
        """
        tabuleiro = self.matriz
        linhas = [tabuleiro.diagonal(), np.fliplr(tabuleiro).diagonal()]
        for i in range(0, self.n):
            linhas.append(tabuleiro[i, :])
            linhas.append(tabuleiro[:, i])

        for linha in linhas:
            valores = np.unique(linha)
            if valores.shape[0] == 1 and valores[0] != 0:
                # Plain int keeps the return type uniform with the 3/4 codes
                return int(valores[0])

        if np.where(tabuleiro == 0)[0].shape[0] == 0:
            return 3
        return 4

In [3]:
class QlearningAgent():
    """Tabular value agent with epsilon-greedy move selection.

    Values live in ``q_table``, keyed by ``(state_tuple, action, piece)``
    where ``state_tuple`` is the flattened board. ``q_table_qtd`` and
    ``q_table_values`` are created alongside every entry (initialised to 0)
    but are not modified by any method of this class.
    """

    def __init__(self, epsilon, alpha, discount_factor, train):
        """Store the hyper-parameters and start with empty tables.

        Args:
            epsilon: probability of picking a random move (only while
                ``train`` is truthy).
            alpha: step size used by ``update_q_value``.
            discount_factor: weight applied to the following step's reward.
            train: when falsy, ``choose_move`` never explores.
        """
        self.q_table = {}
        self.q_table_values = {}
        self.q_table_qtd = {}
        self.epsilon = epsilon
        self.alpha = alpha
        self.discount_factor = discount_factor
        self.train = train

    def _ensure_entry(self, key):
        """Create zeroed table entries for ``key`` if it has not been seen yet."""
        if key not in self.q_table:
            self.q_table[key] = 0.0
            self.q_table_qtd[key] = 0
            self.q_table_values[key] = 0

    def get_q_value(self, state, action, piece):
        """Return the stored value for ``(state, action, piece)``.

        ``state`` is a board array; it is flattened into a tuple to build the
        key. Unseen keys are inserted with value 0.0 as a side effect.
        """
        key = (tuple(state.flatten()), action, piece)
        self._ensure_entry(key)
        return self.q_table[key]

    def choose_move(self, state, available_moves, piece):
        """Pick one of ``available_moves`` epsilon-greedily.

        Raises:
            ValueError: if ``available_moves`` is empty.
        """
        if len(available_moves) == 0:
            raise ValueError("choose_move needs at least one available move")

        # Looked up even when exploring so that every candidate gets an entry
        q_values = [self.get_q_value(state, action, piece) for action in available_moves]

        if random.uniform(0, 1) < self.epsilon and self.train:
            return random.choice(available_moves)

        max_q_value = max(q_values)
        best_moves = [i for i, value in enumerate(q_values) if value == max_q_value]
        # Ties are broken at random; a unique maximum consumes no randomness
        i = random.choice(best_moves) if len(best_moves) > 1 else best_moves[0]
        return available_moves[i]

    def update_q_value(self, states, rewards):
        """Update the table for one finished episode.

        For the i-th recorded key the new value is
        ``q + alpha * (rewards[i] + discount_factor * rewards[i + 1] - q)``,
        with the following reward taken as 0 for the last key.

        NOTE(review): the original comment described the target as
        "reward + gamma * max(next state)"; the code uses the next recorded
        reward instead. That behaviour is kept as is.

        Args:
            states: sequence of ``(state_tuple, action, piece)`` keys.
            rewards: rewards aligned with ``states``.
        """
        last_index = len(states) - 1
        for i, state in enumerate(states):
            self._ensure_entry(state)
            next_reward = 0 if i == last_index else rewards[i + 1]
            current = self.q_table[state]
            self.q_table[state] = current + (
                self.alpha * (rewards[i] + self.discount_factor * (next_reward) - current)
            )


In [4]:
class environment():
    """Runs training games between the agent and a random or self-play opponent.

    Every game records, for each side controlled by the agent, a list of
    ``(flattened_board, action, piece)`` keys and a parallel list of rewards,
    then hands both lists to ``q_agent.update_q_value``.

    NOTE(review): ``train`` is stored but never read by this class; the table
    update runs after every game regardless of its value.
    """

    def __init__(self, tic_tac_toe: tic_tac_toe_model, q_agent: QlearningAgent, train: bool):
        self.board = tic_tac_toe
        self.q_agent = q_agent
        self.train = train

    def _agent_turn(self, piece, states, rewards):
        """Let the agent play ``piece`` once and record the step.

        The key uses the board as it was *before* the move; the reward is
        evaluated on the board *after* the move.
        """
        state = self.board.matriz.copy()
        avaible_moves = self.board.get_avaible_moves()
        action = self.q_agent.choose_move(state, avaible_moves, piece)
        i, j = self.board.number_ij(action)
        self.board.move(i, j, piece)
        states.append((tuple(state.flatten()), action, piece))
        rewards.append(self.board.reward_piece(piece))

    def _record_terminal(self, piece, states, rewards):
        """Record the final board for ``piece`` using action ``-1`` as terminal marker."""
        state = self.board.matriz.copy()
        states.append((tuple(state.flatten()), -1, piece))
        rewards.append(self.board.reward_piece(piece))

    def play_one_game(self, piece):
        """Play one game of the agent (as ``piece``) against random moves.

        With ``piece == 1`` the agent opens the game and the opponent plays
        2; for any other value the random opponent plays 1 and moves first.

        Returns:
            The ``check_win`` code of the finished game (winner piece or 3).
        """
        self.board.reset_matrix()
        opponent_first = piece != 1
        piece_enemy = 1 if opponent_first else 2
        states = []
        rewards = []

        while True:
            if opponent_first:
                self.board.get_random_move(piece_enemy)

            win_piece = self.board.check_win()
            if win_piece != 4:
                self._record_terminal(piece, states, rewards)
                break

            self._agent_turn(piece, states, rewards)

            win_piece = self.board.check_win()
            if win_piece != 4:
                self._record_terminal(piece, states, rewards)
                break

            if not opponent_first:
                self.board.get_random_move(piece_enemy)

        self.q_agent.update_q_value(states, rewards)
        return win_piece

    def play_ia_vs_ia(self):
        """Play one self-play game, the same agent controlling X (1) and O (2).

        Only the losing side gets a terminal record appended; a draw adds no
        terminal record for either side.

        Returns:
            The ``check_win`` code of the finished game (winner piece or 3).
        """
        self.board.reset_matrix()
        ia_x = 1
        ia_o = 2
        states_x, rewards_x = [], []
        states_o, rewards_o = [], []

        while True:
            win_piece = self.board.check_win()
            if win_piece != 4:
                break
            self._agent_turn(ia_x, states_x, rewards_x)  # x play

            win_piece = self.board.check_win()
            if win_piece != 4:
                break
            self._agent_turn(ia_o, states_o, rewards_o)  # o play

        if win_piece == 1:
            self._record_terminal(ia_o, states_o, rewards_o)
        elif win_piece == 2:
            self._record_terminal(ia_x, states_x, rewards_x)

        self.q_agent.update_q_value(states_x, rewards_x)
        self.q_agent.update_q_value(states_o, rewards_o)

        return win_piece

    def run(self, n):
        """Play ``n`` games as X, ``n`` as O and ``n`` of self-play, in that order.

        Returns:
            tuple: three lists with the result code of each game
            ``(wins_x, wins_o, wins_ia)``.
        """
        wins_x = []
        wins_o = []
        wins_ia = []

        print(f'Playing {n} games with X')
        for _ in tqdm.tqdm(range(0, n)):
            wins_x.append(self.play_one_game(piece=1))

        print(f'Playing {n} games with O')
        for _ in tqdm.tqdm(range(0, n)):
            wins_o.append(self.play_one_game(piece=2))

        print(f'Playing {n} games ia vs ia')
        for _ in tqdm.tqdm(range(0, n)):
            wins_ia.append(self.play_ia_vs_ia())

        return wins_x, wins_o, wins_ia

In [5]:
class Game():
    """Plays games with an agent without updating its table.

    Supports agent-vs-agent games and an interactive console game against a
    human. Piece 1 ("X") always moves first.
    """

    def __init__(self, tic_tac_toe: tic_tac_toe_model, q_agent: QlearningAgent):
        self.board = tic_tac_toe
        self.q_agent = q_agent

    def _ia_move(self, piece):
        """Ask the agent for a move for ``piece`` and apply it to the board."""
        state = self.board.matriz.copy()
        avaible_moves = self.board.get_avaible_moves()
        action = self.q_agent.choose_move(state, avaible_moves, piece)
        i, j = self.board.number_ij(action)
        self.board.move(i, j, piece)

    @staticmethod
    def _read_int():
        """Read one line from stdin; return it as int, or None if it is not an integer."""
        try:
            return int(input())
        except ValueError:
            return None

    def _read_user_move(self):
        """Prompt until the user enters the flat index of an empty cell.

        Returns:
            The ``(row, column)`` pair of the chosen cell.
        """
        while True:
            mv = self._read_int()
            if mv is not None and mv in self.board.get_avaible_moves():
                return self.board.number_ij(mv)
            print(f"Jogada invalida. Escolha uma casa livre entre 0 e {self.board.matriz.size - 1}")

    def ia_vs_ia(self):
        """Play one agent-vs-agent game and return its ``check_win`` code.

        The board status is re-evaluated before every single move, so play
        stops as soon as a side wins or the board fills up.
        """
        self.board.reset_matrix()
        piece = 1
        while True:
            win_piece = self.board.check_win()
            if win_piece != 4:
                return win_piece
            self._ia_move(piece)
            piece = 2 if piece == 1 else 1

    def run_ia_vs_ia(self, n):
        """Play ``n`` agent-vs-agent games and return the list of result codes."""
        return [self.ia_vs_ia() for _ in tqdm.tqdm(range(0, n))]

    def ia_vs_user(self):
        """Play an interactive console game between the agent and a human.

        The user either gets a random piece or chooses X (1) / O (2). Moves
        are entered as flat cell indices; invalid entries are asked again.

        Returns:
            The ``check_win`` code of the finished game (winner piece or 3).
        """
        print('Menu')
        print("1 - Iniciar aleatorio")
        print("2 - Escolher peça [X-O]")
        m1 = self._read_int()
        while m1 != 1 and m1 != 2:
            print('Insira um valor valido 1-2')
            print("1 - Iniciar aleatorio")
            print("2 - Escolher peça [X-O]")
            m1 = self._read_int()

        self.board.reset_matrix()
        pieces = [1, 2]
        if m1 == 1:
            user = random.choice(pieces)
        else:
            print('1 - X \n 2 - O')
            user = self._read_int()
            while user != 1 and user != 2:
                print("Insira um valor válido 1-2")
                print('1 - X \n 2 - O')
                user = self._read_int()

        ia = 2 if user == 1 else 1
        symbols = {1: 'X', 2: 'O'}
        print("Começo de jogo")
        self.board.print_game()

        current = 1
        while True:
            # Fresh status before every move: a stale value here lets the game
            # run past a win or onto a full board.
            win_piece = self.board.check_win()
            if win_piece != 4:
                break

            if current == ia:
                print(f'Jogada da IA - {symbols[ia]}')
                self._ia_move(ia)
            else:
                print(f'Jogue Usuario - {symbols[user]}')
                i, j = self._read_user_move()
                self.board.move(i, j, user)
            self.board.print_game()

            current = 2 if current == 1 else 1

        if win_piece == ia:
            print("IA venceu Humano Fraco")
        elif win_piece == user:
            print("Humano venceu, esta preparado para a revolução?")
        else:
            print('Deu velha, mas a I.A segue aprendendo e melhorando e você?')
        return win_piece
        

In [21]:
# Seed the `random` module (used for exploration, tie-breaking and the random
# opponent) so that a fresh kernel reproduces the same training run.
SEED = 42
random.seed(SEED)

# 3x3 board and a heavily exploring agent for the training phase
board = tic_tac_toe_model(3)
q_agent = QlearningAgent(epsilon=0.9, alpha=0.6, discount_factor=1, train=True)
exp = environment(board, q_agent, train=True)

# Games per phase (as X, as O, self-play); later cells read `n` as well
n = 100000
wins_x, wins_o, wins_ia = exp.run(n)


Playing 100000 games with X


100%|██████████| 100000/100000 [01:06<00:00, 1492.81it/s]


Playing 100000 games with O


100%|██████████| 100000/100000 [01:00<00:00, 1661.85it/s]


Playing 100000 games ia vs ia


100%|██████████| 100000/100000 [01:27<00:00, 1149.38it/s]


In [24]:
# Number of (state, action, piece) entries stored in the table so far
len(q_agent.q_table)

18083

In [26]:
# Summary of the most recent `exp.run` results.
# Each section: (header, recorded results, code counted as a win,
#                code counted as a loss, suffix appended to the labels).
print("IA VS IA")
report_sections = [
    ('IA[X] vs IA[O]', wins_ia, 1, 2, '[x]'),
    ('IA[X] vs Aleatorio ', wins_x, 1, 2, ''),
    ('IA[0] vs Aleatorio ', wins_o, 2, 1, ''),
]
for header, results, win_code, loss_code, suffix in report_sections:
    games = np.array(results)
    # Divide by the number of recorded games rather than the notebook-global
    # `n`, which may have been reassigned since these results were produced.
    total = games.size
    won = int(np.count_nonzero(games == win_code)) * 100 / total
    lost = int(np.count_nonzero(games == loss_code)) * 100 / total
    drawn = int(np.count_nonzero(games == 3)) * 100 / total
    print(header)
    print(f"Jogos ganhos{suffix}: {won}%\n",
          f"Jogos perdidos{suffix}: {lost}%\n",
          f"Jogos empatados: {drawn}%\n")

IA VS IA
IA[X] vs IA[O]
Jogos ganhos[x]: 58.449%
 Jogos perdidos[x]: 30.045%
 Jogos empatados: 11.506%

IA[X] vs Aleatorio 
Jogos ganhos: 63.105%
 Jogos perdidos: 25.458%
 Jogos empatados: 11.437%

IA[0] vs Aleatorio 
Jogos ganhos: 33.266%
 Jogos perdidos: 54.173%
 Jogos empatados: 12.561%



In [31]:
# Turn exploration off on the agent before replaying the three phases
q_agent.train = False
q_agent.epsilon = 0

# Shorter run; `n` is also read by the report cell that follows
n = 1000
wins_x, wins_o, wins_ia = exp.run(n)

Playing 1000 games with X


100%|██████████| 1000/1000 [00:00<00:00, 2075.75it/s]


Playing 1000 games with O


100%|██████████| 1000/1000 [00:00<00:00, 1957.19it/s]


Playing 1000 games ia vs ia


100%|██████████| 1000/1000 [00:00<00:00, 1149.64it/s]


In [32]:
# Summary of the most recent `exp.run` results.
# Each section: (header, recorded results, code counted as a win,
#                code counted as a loss, suffix appended to the labels).
print("IA VS IA")
report_sections = [
    ('IA[X] vs IA[O]', wins_ia, 1, 2, '[x]'),
    ('IA[X] vs Aleatorio ', wins_x, 1, 2, ''),
    ('IA[0] vs Aleatorio ', wins_o, 2, 1, ''),
]
for header, results, win_code, loss_code, suffix in report_sections:
    games = np.array(results)
    # Divide by the number of recorded games rather than the notebook-global
    # `n`, which may have been reassigned since these results were produced.
    total = games.size
    won = int(np.count_nonzero(games == win_code)) * 100 / total
    lost = int(np.count_nonzero(games == loss_code)) * 100 / total
    drawn = int(np.count_nonzero(games == 3)) * 100 / total
    print(header)
    print(f"Jogos ganhos{suffix}: {won}%\n",
          f"Jogos perdidos{suffix}: {lost}%\n",
          f"Jogos empatados: {drawn}%\n")

IA VS IA
IA[X] vs IA[O]
Jogos ganhos[x]: 41.7%
 Jogos perdidos[x]: 29.8%
 Jogos empatados: 28.5%

IA[X] vs Aleatorio 
Jogos ganhos: 97.1%
 Jogos perdidos: 1.7%
 Jogos empatados: 1.2%

IA[0] vs Aleatorio 
Jogos ganhos: 80.5%
 Jogos perdidos: 9.2%
 Jogos empatados: 10.3%



In [10]:
# Play-only wrapper sharing the board and the agent used above
game = Game(tic_tac_toe=board, q_agent=q_agent)

In [11]:
#game.ia_vs_user()

In [12]:
#game.run_ia_vs_ia(10)