# Treino e teste de modelos treinados com o algoritmo AlphaZero - <b>Go</b>

### - Inspirado no vídeo do freeCodeCamp: https://www.youtube.com/watch?v=wuSQpLinRB4


Imports necessários para o funcionamento do código.


In [45]:
from __future__ import print_function
import go
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import random
import torch
import math
from torch.autograd import Variable
from tqdm.notebook import trange

### Implementação da Rede Neural Convolucional (CNN) que será utilizada para a predição de probabilidades de jogadas e valores de estados 

In [46]:

class ResNet(nn.Module):
    """Residual CNN with a policy head and a value head.

    Args:
        game: Object exposing ``row_count``, ``column_count`` and
            ``action_size``; they size the two linear output layers.
        num_resBlocks: Number of ``ResBlock`` modules in the backbone.
        num_hidden: Channel width of the stem and of every residual block.
        device: Device the module is moved to; also kept as ``self.device``.
    """

    def __init__(self, game, num_resBlocks, num_hidden, device):
        super().__init__()
        self.device = device

        board_cells = game.row_count * game.column_count
        policy_channels = 32
        value_channels = 3

        # Stem: lifts the 3 input planes to ``num_hidden`` feature maps.
        self.startBlock = nn.Sequential(
            nn.Conv2d(3, num_hidden, kernel_size=3, padding=1),
            nn.BatchNorm2d(num_hidden),
            nn.ReLU(),
        )

        # Backbone: a stack of identical residual blocks.
        self.backBone = nn.ModuleList(
            [ResBlock(num_hidden) for _ in range(num_resBlocks)]
        )

        # Policy head: one raw logit per action (softmax is applied by callers).
        self.policyHead = nn.Sequential(
            nn.Conv2d(num_hidden, policy_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(policy_channels),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(policy_channels * board_cells, game.action_size),
        )

        # Value head: a single scalar squashed into [-1, 1] by tanh.
        self.valueHead = nn.Sequential(
            nn.Conv2d(num_hidden, value_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(value_channels),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(value_channels * board_cells, 1),
            nn.Tanh(),
        )

        self.to(device)

    def forward(self, x):
        """Return ``(policy_logits, value)`` for a batch of encoded states."""
        features = self.startBlock(x)
        for block in self.backBone:
            features = block(features)
        return self.policyHead(features), self.valueHead(features)


class ResBlock(nn.Module):
    """Two 3x3 convolution + batch-norm layers wrapped by an identity shortcut.

    Args:
        num_hidden: Number of input and output channels of both convolutions.
    """

    def __init__(self, num_hidden):
        super().__init__()
        self.conv1 = nn.Conv2d(num_hidden, num_hidden, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_hidden)
        self.conv2 = nn.Conv2d(num_hidden, num_hidden, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(num_hidden)

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + x)``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        # Shortcut connection: add the block input back before the last ReLU.
        return F.relu(hidden + x)


### Implementação do algoritmo MCTS (Monte Carlo Tree Search) que será utilizado para a escolha de jogadas
Neste caso é o MCTS Paralelo, que avalia vários jogos em simultâneo durante o treino, agrupando os estados de todos os jogos num único lote (batch) por chamada à rede neuronal (não são usadas threads), para escolher a melhor jogada possível em cada jogo

In [47]:

# Search-tree node shared by MCTS and MCTSParallel.
class Node:
    """One node of the Monte Carlo search tree.

    Args:
        game: Game object providing ``get_next_state``, ``change_perspective``
            and ``get_opponent_value``.
        args: Hyper-parameter dict; only ``args['C']`` is read here.
        state: Board held by this node.
        parent: Parent node, or ``None`` for the root.
        action_taken: Action index that led from ``parent`` to this node.
        prior: Prior probability assigned to this node by the policy.
        visit_count: Initial visit count.
    """

    def __init__(self, game, args, state, parent=None, action_taken=None, prior=0, visit_count=0):
        self.game = game
        self.args = args
        self.state = state
        self.parent = parent
        self.action_taken = action_taken
        self.prior = prior

        self.children = []

        self.visit_count = visit_count
        self.value_sum = 0

    def is_fully_expanded(self):
        """Return True once the node has at least one child."""
        return len(self.children) > 0

    def select(self):
        """Return the child with the highest UCB score (first one wins ties)."""
        best_child = None
        best_ucb = -np.inf

        for child in self.children:
            ucb = self.get_ucb(child)
            if ucb > best_ucb:
                best_child = child
                best_ucb = ucb

        return best_child

    def get_ucb(self, child):
        """Return the selection score of ``child`` seen from this node."""
        if child.visit_count == 0:
            q_value = 0
        else:
            # Rescale the child's mean value with (mean + 1) / 2 and take the
            # complement - presumably because the child's statistics are kept
            # from the opponent's point of view (backpropagate flips the sign
            # at every level).
            q_value = 1 - ((child.value_sum / child.visit_count) + 1) / 2
        exploration = math.sqrt(self.visit_count) / (child.visit_count + 1)
        return q_value + self.args['C'] * exploration * child.prior

    def expand(self, policy):
        """Create one child per action whose probability is positive.

        Args:
            policy: Iterable of per-action probabilities, indexed by action.

        Returns:
            The last child created, or ``None`` when no entry of ``policy`` is
            positive (previously this case raised ``UnboundLocalError``).
        """
        child = None
        for action, prob in enumerate(policy):
            if prob > 0:
                # The move is always played as player 1, then the board is
                # flipped so the child is again seen by the side to move.
                child_state = self.state.copy()
                child_state = self.game.get_next_state(child_state, action, 1)
                child_state = self.game.change_perspective(child_state, player=-1)

                child = Node(self.game, self.args, child_state, self, action, prob)
                self.children.append(child)

        return child

    def backpropagate(self, value):
        """Add ``value`` to this node and propagate its negation to the root."""
        self.value_sum += value
        self.visit_count += 1

        # The sign alternates at every level of the tree.
        value = self.game.get_opponent_value(value)
        if self.parent is not None:
            self.parent.backpropagate(value)

class MCTSParallel:
    """Monte Carlo tree search run over several self-play games at once.

    All games that need a network evaluation in a given search step are
    stacked into a single batch and evaluated with one model call.

    Args:
        game: Game object (rules, encoding, valid moves).
        args: Hyper-parameter dict; reads ``num_searches``,
            ``dirichlet_epsilon`` and ``dirichlet_alpha`` (``C`` is read by
            ``Node``).
        model: Network returning ``(policy_logits, value)`` and exposing a
            ``device`` attribute.
    """

    def __init__(self, game, args, model):
        self.game = game
        self.args = args
        self.model = model

    @torch.no_grad()
    def search(self, state, spGames, pre=None):
        """Build a search tree for every game in ``spGames``.

        The result is left on each game object: ``spg.root`` holds the tree,
        whose children carry the visit counts. Nothing is returned.

        Args:
            state: Stacked boards, one per entry of ``spGames``.
            spGames: Game records; ``root`` and ``node`` are overwritten.
            pre: Optional per-game previous boards forwarded to
                ``game.get_valid_moves``. ``None`` (the default) or a sequence
                whose first element is ``None`` means "no previous boards".
        """
        epsilon = self.args['dirichlet_epsilon']
        alpha = self.args['dirichlet_alpha']
        pass_action = self.game.action_size - 1  # last action index is "pass"

        policy, _ = self.model(
            torch.tensor(self.game.get_encoded_state(state), device=self.model.device)
        )
        policy = torch.softmax(policy, axis=1).cpu().numpy()
        # Mix Dirichlet noise into every root prior.
        policy = (1 - epsilon) * policy + epsilon \
            * np.random.dirichlet([alpha] * self.game.action_size, size=policy.shape[0])

        # Create and expand one root per game.
        for i, spg in enumerate(spGames):
            # Guard against the default pre=None, which used to raise
            # TypeError on pre[0].
            previous = pre[i] if (pre is not None and pre[0] is not None) else None
            spg_policy = policy[i]
            valid_moves = self.game.get_valid_moves(state[i], previous)
            spg_policy *= valid_moves
            spg_policy /= np.sum(spg_policy)

            spg.root = Node(self.game, self.args, state[i], visit_count=1)
            spg.root.expand(spg_policy)

        for _ in range(self.args['num_searches']):
            # Selection: walk each tree down to a leaf.
            for spg in spGames:
                spg.node = None
                node = spg.root

                while node.is_fully_expanded():
                    node = node.select()

                # Two consecutive passes (parent move and this move).
                double_pass = (
                    node.parent is not None
                    and node.parent.action_taken == pass_action
                    and node.action_taken == pass_action
                )

                value, is_terminal = self.game.get_value_and_terminated(node.state, double_pass)
                value = self.game.get_opponent_value(value)

                if is_terminal:
                    node.backpropagate(value)
                else:
                    # Leaf still needs a network evaluation.
                    spg.node = node

            expandable_spGames = [
                mappingIdx for mappingIdx in range(len(spGames))
                if spGames[mappingIdx].node is not None
            ]
            if len(expandable_spGames) == 0:
                continue

            # Evaluate every pending leaf in a single batch.
            leaf_states = np.stack([spGames[mappingIdx].node.state for mappingIdx in expandable_spGames])
            policy, value = self.model(
                torch.tensor(self.game.get_encoded_state(leaf_states), device=self.model.device)
            )
            policy = torch.softmax(policy, axis=1).cpu().numpy()
            value = value.cpu().numpy()

            # Expansion and backpropagation.
            for i, mappingIdx in enumerate(expandable_spGames):
                node = spGames[mappingIdx].node
                spg_policy, spg_value = policy[i], value[i]

                # The grandparent's board is used as the "previous" board
                # handed to get_valid_moves.
                previous = None
                if node.parent is not None and node.parent.parent is not None:
                    previous = node.parent.parent.state

                valid_moves = self.game.get_valid_moves(node.state, previous)
                spg_policy *= valid_moves
                spg_policy /= np.sum(spg_policy)

                node.expand(spg_policy)
                node.backpropagate(spg_value)




MCTS normal para testar os modelos treinados

In [48]:
class MCTS:
    """Single-game Monte Carlo tree search guided by the network.

    Args:
        game: Game object (rules, encoding, valid moves).
        args: Hyper-parameter dict; reads ``num_searches``,
            ``dirichlet_epsilon`` and ``dirichlet_alpha`` (``C`` is read by
            ``Node``).
        model: Network returning ``(policy_logits, value)`` and exposing a
            ``device`` attribute.
    """

    def __init__(self, game, args, model):
        self.game = game
        self.args = args
        self.model = model

    def _predict(self, state):
        """Run the network on one state; return ``(policy ndarray, value tensor)``."""
        encoded = torch.tensor(
            self.game.get_encoded_state(state), device=self.model.device
        ).unsqueeze(0)
        logits, value = self.model(encoded)
        return torch.softmax(logits, axis=1).squeeze(0).cpu().numpy(), value

    @torch.no_grad()
    def search(self, state, pre=None):
        """Return the normalised root visit counts as action probabilities.

        Args:
            state: Board to search from.
            pre: Optional previous board forwarded to ``game.get_valid_moves``
                for the root only.

        Returns:
            1-D array of length ``game.action_size`` that sums to 1.
        """
        epsilon = self.args['dirichlet_epsilon']
        alpha = self.args['dirichlet_alpha']
        pass_action = self.game.action_size - 1  # last action index is "pass"

        root = Node(self.game, self.args, state, visit_count=1)

        # Root prior: network policy mixed with Dirichlet noise, then masked
        # to the valid moves and renormalised.
        root_policy, _ = self._predict(state)
        root_policy = (1 - epsilon) * root_policy + epsilon \
            * np.random.dirichlet([alpha] * self.game.action_size)
        root_policy *= self.game.get_valid_moves(root.state, pre)
        root_policy /= np.sum(root_policy)
        root.expand(root_policy)

        for _ in range(self.args['num_searches']):
            # Selection: descend to a leaf.
            leaf = root
            while leaf.is_fully_expanded():
                leaf = leaf.select()

            parent = leaf.parent
            # Two consecutive passes (parent move and this move).
            double_pass = False
            if parent is not None:
                if parent.action_taken == pass_action and leaf.action_taken == pass_action:
                    double_pass = True

            value, is_terminal = self.game.get_value_and_terminated(leaf.state, double_pass)
            value = self.game.get_opponent_value(value)

            if not is_terminal:
                leaf_policy, net_value = self._predict(leaf.state)

                # The grandparent's board is used as the "previous" board
                # handed to get_valid_moves.
                previous = None
                if parent is not None and parent.parent is not None:
                    previous = parent.parent.state

                leaf_policy *= self.game.get_valid_moves(leaf.state, previous)
                leaf_policy /= np.sum(leaf_policy)

                # The network estimate replaces the game value for
                # non-terminal leaves.
                value = net_value.item()
                leaf.expand(leaf_policy)

            leaf.backpropagate(value)

        visit_counts = np.zeros(self.game.action_size)
        for child in root.children:
            visit_counts[child.action_taken] = child.visit_count
        visit_counts /= np.sum(visit_counts)
        return visit_counts

### Implementação do algoritmo AlphaZero que utiliza a CNN e o MCTS para treinar um modelo de IA para jogar Go

Neste caso é o AlphaZero Paralelo, que simula diversos jogos de self-play ao mesmo tempo, avaliando-os em lote (batch) numa única chamada à rede em vez de usar múltiplas threads

In [49]:
class AlphaZeroParallel:
    """AlphaZero training loop that plays several self-play games per batch.

    Args:
        model: Network returning ``(policy_logits, value)`` with a ``device``
            attribute.
        optimizer: Optimizer bound to ``model``'s parameters.
        game: Game object (rules, encoding, valid moves).
        args: Hyper-parameter dict; reads ``num_parallel_games``,
            ``temperature``, ``batch_size``, ``num_iterations``,
            ``num_selfPlay_iterations`` and ``num_epochs`` (plus the keys used
            by ``MCTSParallel``).
    """

    def __init__(self, model, optimizer, game, args):
        self.model = model
        self.optimizer = optimizer
        self.game = game
        self.args = args
        self.mcts = MCTSParallel(game, args, model)

    def selfPlay(self):
        """Play ``num_parallel_games`` games to the end.

        Returns:
            List of ``(encoded_state, action_probs, outcome)`` training tuples
            gathered from all finished games.
        """
        return_memory = []
        player = 1
        spGames = [SPG(self.game) for _ in range(self.args['num_parallel_games'])]
        pre_states = [None]

        # Keep playing until every game has finished.
        while len(spGames) > 0:
            print(len(spGames))
            states = np.stack([spg.state for spg in spGames])
            neutral_states = self.game.change_perspective(states, player)
            # NOTE(review): SPG.prev is a 2-element list indexed with
            # ``player`` (1 or -1); in Python both indices address the last
            # element, so the two players share one slot. Confirm whether a
            # per-player store was intended.
            if spGames[0].prev[player] is not None:
                pre_states = np.stack([spg.prev[player] for spg in spGames])
            self.mcts.search(neutral_states, spGames, pre_states)

            # Iterate backwards so finished games can be deleted in place.
            for i in range(len(spGames))[::-1]:
                spg = spGames[i]

                action_probs = np.zeros(self.game.action_size)
                for child in spg.root.children:
                    action_probs[child.action_taken] = child.visit_count
                action_probs /= np.sum(action_probs)

                spg.memory.append((spg.root.state, action_probs, player))

                # Sample the move from the temperature-adjusted visit counts.
                temperature_action_probs = action_probs ** (1 / self.args['temperature'])
                temperature_action_probs /= np.sum(temperature_action_probs)
                action = np.random.choice(self.game.action_size, p=temperature_action_probs)

                spg.prev[player] = spg.state
                spg.state = self.game.get_next_state(spg.state, action, player)

                # The last action index is "pass"; two in a row end the game.
                if action == self.game.action_size - 1:
                    spg.pas += 1
                    if spg.pas == 2:
                        spg.passe = True
                else:
                    spg.pas = 0

                value, is_terminal = self.game.get_value_and_terminated(spg.state, spg.passe)
                spg.passe = False

                if is_terminal:
                    # Label every stored position with the outcome seen by the
                    # player who was to move in that position.
                    for hist_neutral_state, hist_action_probs, hist_player in spg.memory:
                        hist_outcome = value if hist_player == player else self.game.get_opponent_value(value)
                        return_memory.append((
                            self.game.get_encoded_state(hist_neutral_state),
                            hist_action_probs,
                            hist_outcome
                        ))
                    del spGames[i]

            player = self.game.get_opponent(player)

        return return_memory

    def train(self, memory):
        """Run one epoch of mini-batch updates over ``memory``.

        Args:
            memory: List of ``(encoded_state, policy_target, value_target)``
                tuples; shuffled in place.
        """
        random.shuffle(memory)
        batch_size = self.args['batch_size']
        for batchIdx in range(0, len(memory), batch_size):
            # Plain slicing already clamps at the end of the list. The former
            # ``min(len(memory) - 1, ...)`` bound dropped the last sample and
            # produced an empty batch (crashing on the unpacking below) when
            # len(memory) % batch_size == 1.
            sample = memory[batchIdx:batchIdx + batch_size]
            state, policy_targets, value_targets = zip(*sample)

            state = np.array(state)
            policy_targets = np.array(policy_targets)
            value_targets = np.array(value_targets).reshape(-1, 1)

            state = torch.tensor(state, dtype=torch.float32, device=self.model.device)
            policy_targets = torch.tensor(policy_targets, dtype=torch.float32, device=self.model.device)
            value_targets = torch.tensor(value_targets, dtype=torch.float32, device=self.model.device)

            out_policy, out_value = self.model(state)

            policy_loss = F.cross_entropy(out_policy, policy_targets)
            value_loss = F.mse_loss(out_value, value_targets)
            loss = policy_loss + value_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def learn(self):
        """Alternate self-play and training for ``num_iterations`` rounds.

        After each round the model and optimizer state dicts are saved to the
        current working directory.
        """
        for iteration in range(self.args['num_iterations']):
            memory = []

            self.model.eval()
            for selfPlay_iteration in trange(self.args['num_selfPlay_iterations'] // self.args['num_parallel_games']):
                memory += self.selfPlay()

            self.model.train()
            for epoch in trange(self.args['num_epochs']):
                self.train(memory)

            # Checkpoint both the model and the optimizer.
            torch.save(self.model.state_dict(), f"model_7_{iteration}_{self.game}.pt")
            torch.save(self.optimizer.state_dict(), f"optimizer_7_{iteration}_{self.game}.pt")

class SPG:
    """Mutable record of one self-play game used by the parallel trainer.

    Args:
        game: Game object; only ``get_initial_state()`` is called.
    """

    def __init__(self, game):
        # Search bookkeeping, overwritten by MCTSParallel.search.
        self.root = None
        self.node = None

        # Pass tracking: running count of consecutive passes and a flag that
        # is raised once that count reaches two.
        self.pas = 0
        self.passe = False

        # Previously seen boards; indexed with the player value (1 or -1) by
        # AlphaZeroParallel.selfPlay.
        self.prev = [None, None]

        # Positions recorded during the game, and the current board.
        self.memory = []
        self.state = game.get_initial_state()


AlphaZero normal (não usamos na nossa implementação)

In [50]:
class AlphaZero:
    """AlphaZero training loop that plays one self-play game at a time.

    Args:
        model: Network returning ``(policy_logits, value)`` with a ``device``
            attribute.
        optimizer: Optimizer bound to ``model``'s parameters.
        game: Game object (rules, encoding, valid moves).
        args: Hyper-parameter dict; reads ``temperature``, ``batch_size``,
            ``num_iterations``, ``num_selfPlay_iterations`` and ``num_epochs``
            (plus the keys used by ``MCTS``).
    """

    def __init__(self, model, optimizer, game, args):
        self.model = model
        self.optimizer = optimizer
        self.game = game
        self.args = args
        self.mcts = MCTS(game, args, model)

    def selfPlay(self):
        """Play one game to the end.

        Returns:
            List of ``(encoded_state, action_probs, outcome)`` training tuples,
            one per position of the game.
        """
        memory = []
        player = 1
        state = self.game.get_initial_state()
        pas = 0  # consecutive passes so far

        while True:
            passe = False
            neutral_state = self.game.change_perspective(state, player)
            action_probs = self.mcts.search(neutral_state)

            memory.append((neutral_state, action_probs, player))

            # Sample the move from the temperature-adjusted visit counts.
            temperature_action_probs = action_probs ** (1 / self.args['temperature'])
            temperature_action_probs /= np.sum(temperature_action_probs)
            action = np.random.choice(self.game.action_size, p=temperature_action_probs)

            state = self.game.get_next_state(state, action, player)

            # The last action index is "pass"; two in a row end the game.
            if action == self.game.action_size - 1:
                pas += 1
                if pas == 2:
                    passe = True
            else:
                pas = 0

            value, is_terminal = self.game.get_value_and_terminated(state, passe)

            if is_terminal:
                # Label every stored position with the outcome seen by the
                # player who was to move in that position.
                return_memory = []
                for hist_neutral_state, hist_action_probs, hist_player in memory:
                    hist_outcome = value if hist_player == player else self.game.get_opponent_value(value)
                    return_memory.append((
                        self.game.get_encoded_state(hist_neutral_state),
                        hist_action_probs,
                        hist_outcome
                    ))
                return return_memory

            player = self.game.get_opponent(player)

    def train(self, memory):
        """Run one epoch of mini-batch updates over ``memory``.

        Args:
            memory: List of ``(encoded_state, policy_target, value_target)``
                tuples; shuffled in place.
        """
        random.shuffle(memory)
        batch_size = self.args['batch_size']
        for batchIdx in range(0, len(memory), batch_size):
            # Plain slicing already clamps at the end of the list. The former
            # ``min(len(memory) - 1, ...)`` bound dropped the last sample and
            # produced an empty batch (crashing on the unpacking below) when
            # len(memory) % batch_size == 1.
            sample = memory[batchIdx:batchIdx + batch_size]
            state, policy_targets, value_targets = zip(*sample)

            state = np.array(state)
            policy_targets = np.array(policy_targets)
            value_targets = np.array(value_targets).reshape(-1, 1)

            state = torch.tensor(state, dtype=torch.float32, device=self.model.device)
            policy_targets = torch.tensor(policy_targets, dtype=torch.float32, device=self.model.device)
            value_targets = torch.tensor(value_targets, dtype=torch.float32, device=self.model.device)

            out_policy, out_value = self.model(state)

            policy_loss = F.cross_entropy(out_policy, policy_targets)
            value_loss = F.mse_loss(out_value, value_targets)
            loss = policy_loss + value_loss

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

    def learn(self):
        """Alternate self-play and training for ``num_iterations`` rounds.

        After each round the model and optimizer state dicts are saved to the
        current working directory.
        """
        for iteration in range(self.args['num_iterations']):
            memory = []

            self.model.eval()
            for selfPlay_iteration in trange(self.args['num_selfPlay_iterations']):
                memory += self.selfPlay()

            self.model.train()
            for epoch in trange(self.args['num_epochs']):
                self.train(memory)

            torch.save(self.model.state_dict(), f"new_model_7x7_{iteration}_{self.game}.pt")
            torch.save(self.optimizer.state_dict(), f"new_optimizer_7x7_{iteration}_{self.game}.pt")


### Classe auxiliar para a criação de um tabuleiro de Go e para a realização de jogadas

In [51]:

class Connect2Game:
    """Adapter exposing an ``n`` x ``n`` Go board (backed by the ``go`` module).

    Actions ``0 .. n*n - 1`` address board cells in row-major order; action
    ``n*n`` is the pass move.
    """

    def __init__(self, n):
        self.row_count = n
        self.column_count = n
        self.action_size = n * n + 1  # every cell plus the pass move

    def __repr__(self):
        """Return the game name (also embedded in checkpoint file names)."""
        return "Go"

    def get_initial_state(self):
        """Return an all-zero (empty) board."""
        return np.zeros((self.row_count, self.column_count))

    def get_next_state(self, state, action, player):
        """Return the board after ``player`` plays ``action`` on ``state``."""
        position = go.GameState(state, play_idx=1)
        position.turn = player

        if action == self.column_count ** 2:
            # Pass move.
            outcome = position.pass_turn()
        else:
            # Row-major action index -> (row, col).
            row, col = divmod(action, self.column_count)
            outcome = position.move(row, col)
        return outcome.board

    def get_valid_moves(self, state, previous):
        """Return a 0/1 list of length ``action_size``; pass is always valid.

        Args:
            state: Board to analyse.
            previous: Earlier board stored in ``previous_boards[1]`` of the
                underlying ``go.GameState`` before the check, or ``None``.
        """
        position = go.GameState(state, play_idx=1)
        if previous is not None:
            position.previous_boards[1] = previous

        mask = [0] * self.action_size
        mask[-1] = 1
        # go.check_possible_moves yields coordinate pairs; convert each one
        # back to its row-major action index.
        for coords in go.check_possible_moves(position):
            mask[coords[0] * self.column_count + coords[1]] = 1
        return mask

    def get_value_and_terminated(self, state, pas):
        """Return ``(value, terminated)`` for ``state``.

        ``value`` comes from ``winner(state)``; ``terminated`` is whatever the
        underlying ``go.GameState`` reports. ``pas`` marks that two passes
        have just been played in a row.
        """
        position = go.GameState(state, play_idx=1)
        if pas:
            position.pass_count = 2
            # NOTE(review): this flag lands on the adapter, not on the
            # go.GameState - confirm this is intended.
            self.game_over = True
        _, terminated = position.get_value_and_terminated(position)
        return self.winner(state), terminated

    def winner(self, state):
        """Return 1, -1 or 0 from comparing the two scores of ``state``."""
        return value_scores(self.scores(state))

    def get_opponent(self, player):
        """Return the other player (sign flip)."""
        return -player

    def get_opponent_value(self, value):
        """Return ``value`` as seen by the opponent (sign flip)."""
        return -value

    def change_perspective(self, state, player):
        """Return ``state`` multiplied by ``player``."""
        return state * player

    def get_encoded_state(self, state):
        """Return float32 one-hot planes for the cell values -1, 0 and 1.

        A single 2-D board gives shape ``(3, rows, cols)``; a 3-D batch of
        boards gives ``(batch, 3, rows, cols)``.
        """
        planes = [state == cell_value for cell_value in (-1, 0, 1)]
        encoded = np.stack(planes).astype(np.float32)

        if state.ndim == 3:
            # Batched input: move the plane axis behind the batch axis.
            encoded = np.swapaxes(encoded, 0, 1)
        return encoded

    def scores(self, state):
        """Return the scores reported by ``go.GameState.get_scores``."""
        return go.GameState(state, play_idx=1).get_scores()
    
# Helper that turns a pair of scores into a game value.
def value_scores(scores):
    """Compare ``scores[1]`` with ``scores[-1]``.

    Returns:
        1 when ``scores[1]`` is larger, -1 when it is smaller, 0 otherwise.
    """
    score_plus, score_minus = scores[1], scores[-1]
    if score_plus > score_minus:
        return 1
    if score_plus < score_minus:
        return -1
    return 0



## Treino de um modelo de IA para jogar Go

In [None]:
# Device: use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 7x7 Go board.
game = Connect2Game(7)

# Network with 9 residual blocks of 128 channels.
model = ResNet(game, 9, 128, device)

# Adam with a small weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

# Search and training hyper-parameters.
args = dict(
    C=2,
    num_searches=300,
    num_iterations=10,
    num_selfPlay_iterations=50,
    num_parallel_games=25,
    num_epochs=10,
    batch_size=128,
    temperature=1.25,
    dirichlet_epsilon=0.25,
    dirichlet_alpha=0.3,
)

# Build the parallel trainer and run the full training loop.
alphaZero = AlphaZeroParallel(model, optimizer, game, args)
alphaZero.learn()

### Testes com os modelos treinados

Teste 1: Modelo vs Player Mau (só passa)

In [None]:
# Test 1: the trained model (player 1) against an opponent that always passes.
n_board = 7
game = Connect2Game(n_board)
player = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

args = {
    'C': 2,
    'num_searches': 200,
    'num_iterations': 5,
    'num_selfPlay_iterations': 50,
    'num_parallel_games': 25,
    'num_epochs': 10,
    'batch_size': 128,
    'temperature': 1.25,
    'dirichlet_epsilon': 0.25,
    'dirichlet_alpha': 0.3
}

model = ResNet(game, 9, 128, device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load("./Modelos/model_7_8_Go.pt", map_location=device))
model.eval()

mcts = MCTS(game, args, model)

state = game.get_initial_state()
previous_state = game.get_initial_state()

# The pass move is the last action index (49 on a 7x7 board); deriving it
# from the game keeps the script valid for other board sizes.
pass_action = game.action_size - 1
pass_count = 0

while True:
    print(state)

    if player != 1:
        # Scripted opponent: always passes.
        valid_moves = game.get_valid_moves(state, previous_state)
        print("valid moves", [i for i in range(game.action_size) if valid_moves[i] == 1])
        action = pass_action
        print("minha " + str(action))
        # Validate before touching the pass counter so a rejected move
        # cannot corrupt it.
        if valid_moves[action] == 0:
            print("Invalid move")
            continue
        pass_count = pass_count + 1 if action == pass_action else 0
        print("pass count", str(pass_count))

    else:
        # Model move: most visited action of the search.
        neutral_state = game.change_perspective(state, player)
        mtcs_probs = mcts.search(neutral_state)
        action = np.argmax(mtcs_probs)
        pass_count = pass_count + 1 if action == pass_action else 0
        print("bot", str(action))

    state = game.get_next_state(state, action, player)

    scores = game.scores(state)
    print(f"scores:  {scores[1]}, {scores[-1]}")
    # NOTE(review): previous_state is set to the board *after* the move, so
    # the next get_valid_moves call receives the current board as "previous".
    # Confirm against the go module whether the board before the move was
    # intended.
    previous_state = state

    # Two consecutive passes end the game.
    if pass_count == 2:
        print(state)
        print("Game over")
        value = game.winner(state)
        print(f"scores: {scores[1]}, {scores[-1]}")
        if value != 0:
            print(f"{value} wins")
        else:
            print("Draw")
        break

    player = game.get_opponent(player)
    



Teste 2: Modelo com menos treino vs Modelo com mais treino

In [None]:
# Test 2: two checkpoints play each other (model_7_9 as player 1, model_7_8
# as player -1). ``args`` is reused from the Test 1 cell.
n_board = 7
game = Connect2Game(n_board)
player = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ResNet(game, 9, 128, device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load("./Modelos/model_7_8_Go.pt", map_location=device))
model.eval()

mcts = MCTS(game, args, model)

model2 = ResNet(game, 9, 128, device)
model2.load_state_dict(torch.load("./Modelos/model_7_9_Go.pt", map_location=device))
model2.eval()

mcts2 = MCTS(game, args, model2)

state = game.get_initial_state()
previous_state = game.get_initial_state()

pass_action = game.action_size - 1  # last action index is "pass"
pass_count = 0

while True:
    print(state)
    scores = game.scores(state)
    print(f"scores: {scores[1]}, {scores[-1]}")

    # Both sides move the same way; only the searcher and the label differ.
    if player == 1:
        searcher, label = mcts2, "bot 1- "
    else:
        searcher, label = mcts, "bot 2- "

    neutral_state = game.change_perspective(state, player)
    mtcs_probs = searcher.search(neutral_state)
    action = np.argmax(mtcs_probs)
    pass_count = pass_count + 1 if action == pass_action else 0
    print(label, str(action))

    state = game.get_next_state(state, action, player)

    # Kept for the following cells; not read inside this loop.
    previous_state = state

    # Two consecutive passes end the game.
    if pass_count == 2:
        print(state)
        print("Game over")
        value = game.winner(state)
        # Recompute so the final scores describe the board after the last
        # move rather than the one printed at the top of this iteration.
        scores = game.scores(state)
        print(f"scores: {scores[1]}, {scores[-1]}")
        if value != 0:
            print(f"{value} wins")
        else:
            print("Draw")
        break

    player = game.get_opponent(player)
                            

[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
scores: 0, 5.5
bot 1-  10
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
scores: 49, 5.5
bot 2-  22
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]
scores: 1, 6.5
bot 1-  21
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1. -1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]]
scores: 2, 6.5
bot 2-  9
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 1. -1.  0.  0.  0.  0.  0.

Teste 3: Modelo treinado vs Humano

In [None]:
# Test 3: interactive game driven from the keyboard. ``game`` is reused from
# the previous cells.
# NOTE(review): the section title announces "model vs human", but both
# players read their move from input() - confirm whether player 1 was meant
# to use the MCTS search.
# value_scores is not redefined here: the module-level definition next to
# Connect2Game is the one Connect2Game.winner calls.
state = game.get_initial_state()
previous_state = game.get_initial_state()

player = 1

pass_action = game.action_size - 1  # last action index is "pass"
pass_count = 0

while True:
    print(state)
    scores = game.scores(state)
    print(f"scores: {scores[1]}, {scores[-1]}")

    valid_moves = game.get_valid_moves(state, previous_state)
    print("valid moves", [i for i in range(game.action_size) if valid_moves[i] == 1])

    # Non-numeric input is treated like any other invalid move.
    try:
        action = int(input("Enter action: "))
    except ValueError:
        print("Invalid move")
        continue
    print("minha " + str(action))

    # Reject out-of-range indices explicitly: a negative number would
    # otherwise wrap around the valid_moves list.
    if not 0 <= action < game.action_size or valid_moves[action] == 0:
        print("Invalid move")
        continue

    # Update the pass counter only for accepted moves, so an invalid entry
    # can neither reset nor advance it.
    pass_count = pass_count + 1 if action == pass_action else 0
    print("pass count", str(pass_count))

    state = game.get_next_state(state, action, player)

    # NOTE(review): this stores the board *after* the move, so the next
    # get_valid_moves call receives the current board as "previous" -
    # confirm against the go module.
    previous_state = state

    # Two consecutive passes end the game.
    if pass_count == 2:
        print(state)
        print("Game over")
        value = game.winner(state)
        scores = game.scores(state)
        print(f"scores: {scores[1]}, {scores[-1]}")
        if value != 0:
            print(f"{value} wins")
        else:
            print("Draw")
        break

    player = game.get_opponent(player)

[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
scores: 0, 5.5
valid moves [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
minha 49
pass count 1
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
scores: 0, 5.5
valid moves [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
minha 49
pass count 2
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
Game over
sco