In [43]:
import go
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import random
import torch
import math

In [44]:
class Connect2Model(nn.Module):

    def __init__(self, board_size, action_size, device):

        super(Connect2Model, self).__init__()

        self.device = device
        self.size = board_size
        self.action_size = action_size

        self.fc1 = nn.Linear(in_features=self.size, out_features=16)
        self.fc2 = nn.Linear(in_features=16, out_features=16)

        # Two heads on our network
        self.action_head = nn.Linear(in_features=16, out_features=self.action_size)
        self.value_head = nn.Linear(in_features=16, out_features=1)

        self.to(device)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        action_logits = self.action_head(x)
        value_logit = self.value_head(x)

        return F.softmax(action_logits, dim=1), torch.tanh(value_logit)

    def predict(self, board):
        board = torch.FloatTensor(board.astype(np.float32)).to(self.device)
        board = board.view(1, self.size)
        self.eval()
        with torch.no_grad():
            pi, v = self.forward(board)

        return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]

In [45]:
def ucb_score(parent, child):
    """
    The score for an action that would transition between the parent and child.
    """
    prior_score = child.prior * math.sqrt(parent.visit_count) / (child.visit_count + 1)
    if child.visit_count > 0:
        # The value of the child is from the perspective of the opposing player
        value_score = -child.value()
    else:
        value_score = 0

    return value_score + prior_score


class Node:
    def __init__(self, prior, to_play):
        self.visit_count = 0
        self.to_play = to_play
        self.prior = prior
        self.value_sum = 0
        self.children = {}
        self.state = None
        

    def expanded(self):
        return len(self.children) > 0

    def value(self):
        if self.visit_count == 0:
            return 0
        return self.value_sum / self.visit_count

    def select_action(self, temperature):
        """
        Select action according to the visit count distribution and the temperature.
        """
        visit_counts = np.array([child.visit_count for child in self.children.values()])
        actions = [action for action in self.children.keys()]
        if temperature == 0:
            action = actions[np.argmax(visit_counts)]
        elif temperature == float("inf"):
            action = np.random.choice(actions)
        else:
            # See paper appendix Data Generation
            visit_count_distribution = visit_counts ** (1 / temperature)
            visit_count_distribution = visit_count_distribution / sum(visit_count_distribution)
            action = np.random.choice(actions, p=visit_count_distribution)

        return action

    def select_child(self):
        """
        Select the child with the highest UCB score.
        """
        best_score = -np.inf
        best_action = -1
        best_child = None

        for action, child in self.children.items():
            score = ucb_score(self, child)
            if score > best_score:
                best_score = score
                best_action = action
                best_child = child

        return best_action, best_child

    def expand(self, state, to_play, action_probs):
        """
        We expand a node and keep track of the prior policy probability given by neural network
        """
        self.to_play = to_play
        self.state = state
        for a, prob in enumerate(action_probs):
            if prob != 0:
                self.children[a] = Node(prior=prob, to_play=self.to_play * -1)

    def __repr__(self):
        """
        Debugger pretty print node info
        """
        prior = "{0:.2f}".format(self.prior)
        return "{} Prior: {} Count: {} Value: {}".format(self.state.__str__(), prior, self.visit_count, self.value())


class MCTS:

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args

    def run(self, model, state, to_play):

        root = Node(0, to_play)


        # EXPAND root
        action_probs, value = model.predict(state)
        valid_moves = self.game.get_valid_moves(state)
        action_probs = action_probs * valid_moves  # mask invalid moves
        if(np.sum(action_probs)>1):
            action_probs /= np.sum(action_probs)
        root.expand(state, to_play, action_probs)

        for _ in range(self.args['num_simulations']):
            node = root
            search_path = [node]

            # SELECT
            while node.expanded():
                action, node = node.select_child()
                search_path.append(node)
            
            parent = search_path[-2]
            state = parent.state
            # Now we're at a leaf node and we would like to expand
            # Players always play from their own perspective
            next_state, _ = self.game.get_next_state(state, player=1, action=action)
            # Get the board from the perspective of the other player
            next_state = self.game.get_canonical_board(next_state, player=-1)

            # The value of the new state from the perspective of the other player
            value = self.game.get_reward_for_player(next_state, player=1)
            if value is None:
                # If the game has not ended:
                # EXPAND
                action_probs, value = model.predict(next_state)
                valid_moves = self.game.get_valid_moves(next_state)
                action_probs = action_probs * valid_moves  # mask invalid moves
                if np.sum(action_probs)>1:
                    action_probs /= np.sum(action_probs)
                node.expand(next_state, parent.to_play * -1, action_probs)

            self.backpropagate(search_path, value, parent.to_play * -1)

        return root

    def backpropagate(self, search_path, value, to_play):
        """
        At the end of a simulation, we propagate the evaluation all the way up the tree
        to the root.
        """
        for node in reversed(search_path):
            node.value_sum += value if node.to_play == to_play else -value
            node.visit_count += 1

In [46]:
from random import shuffle
import os
import torch
import torch.optim as optim

class Trainer:

    def __init__(self, game, model, args):
        self.game = game
        self.model = model
        self.args = args
        self.mcts = MCTS(self.game, self.model, self.args)

    def exceute_episode(self):

        train_examples = []
        current_player = 1
        state = self.game.get_init_board()

        while True:
            canonical_board = self.game.get_canonical_board(state, current_player)

            self.mcts = MCTS(self.game, self.model, self.args)
            root = self.mcts.run(self.model, canonical_board, to_play=1)

            action_probs = [0 for _ in range(self.game.get_action_size())]
            for k, v in root.children.items():
                action_probs[k] = v.visit_count
            if(np.sum(action_probs) >1):
                action_probs = action_probs / np.sum(action_probs)
            train_examples.append((canonical_board, current_player, action_probs))

            action = root.select_action(temperature=0)
            state, current_player = self.game.get_next_state(state, current_player, action)
            reward = self.game.get_reward_for_player(state, current_player)

            if reward is not None:
                ret = []
                for hist_state, hist_current_player, hist_action_probs in train_examples:
                    # [Board, currentPlayer, actionProbabilities, Reward]
                    ret.append((hist_state, hist_action_probs, reward * ((-1) ** (hist_current_player != current_player))))

                return ret

    def learn(self):
        for i in range(1, self.args['numIters'] + 1):

            print("{}/{}".format(i, self.args['numIters']))

            train_examples = []

            for eps in range(self.args['numEps']):
                iteration_train_examples = self.exceute_episode()
                train_examples.extend(iteration_train_examples)

            shuffle(train_examples)
            self.train(train_examples)
            filename = self.args['checkpoint_path']
            self.save_checkpoint(folder=".", filename=filename)

    def train(self, examples):
        optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        pi_losses = []
        v_losses = []

        for epoch in range(self.args['epochs']):
            self.model.train()

            batch_idx = 0

            while batch_idx < int(len(examples) / self.args['batch_size']):
                sample_ids = np.random.randint(len(examples), size=self.args['batch_size'])
                boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
                boards = torch.FloatTensor(np.array(boards).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                # predict
                boards = boards.contiguous()
                target_pis = target_pis.contiguous()
                target_vs = target_vs.contiguous()

                # compute output
                out_pi, out_v = self.model(boards)
                l_pi = self.loss_pi(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi + l_v

                pi_losses.append(float(l_pi))
                v_losses.append(float(l_v))

                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

                batch_idx += 1

            print()
            print("Policy Loss", np.mean(pi_losses))
            print("Value Loss", np.mean(v_losses))
            print("Examples:")
            print(out_pi[0].detach())
            print(target_pis[0])

    def loss_pi(self, targets, outputs):
        loss = -(targets * torch.log(outputs)).sum(dim=1)
        return loss.mean()

    def loss_v(self, targets, outputs):
        loss = torch.sum((targets-outputs.view(-1))**2)/targets.size()[0]
        return loss

    def save_checkpoint(self, folder, filename):
        if not os.path.exists(folder):
            os.mkdir(folder)

        filepath = os.path.join(folder, filename)
        torch.save({
            'state_dict': self.model.state_dict(),
        }, filepath)

In [47]:
import go
#connectar melhor as regras
class Connect2Game:

    def __init__(self,n):
        self.columns = n

    def get_init_board(self):
        b = np.zeros((self.columns,self.columns), dtype=int)
        return b

    def get_board_size(self):
        return self.columns**2

    def get_action_size(self):
        return self.columns**2 +1

    def get_next_state(self, board, player, action):
        b = go.GameState(board)
        b.turn =player
        row = action// self.columns
        col = action % self.columns
        if action == self.columns**2:
            boa= b.pass_turn()
        else:
            boa = b.move(row,col)

        # Return the new game, but
        # change the perspective of the game with negative
        return (boa.board, -player)

    def has_legal_moves(self, board):
        b = go.GameState(board)
        if( len(go.check_possible_moves(b))!=0):
            return True
        else:
            return False

    def get_valid_moves(self, board):
        # All moves are invalid by default
        valid_moves = [0] * self.get_action_size()
        valid_moves[-1]=1
        b = go.GameState(board)
        possi=go.check_possible_moves(b)
        for i in possi:
            action = i[0] * self.columns + i[1]
            valid_moves[action]=1
        print(board)
        print(valid_moves)
        return valid_moves

    def is_win(self, board, player):
        b = go.GameState(board)
        b.turn=player
        return go.is_game_finished(b)

    def get_reward_for_player(self, board, player):
        # return None if not ended, 1 if player 1 wins, -1 if player 1 lost

        if self.is_win(board, player):
            return 1
        if self.is_win(board, -player):
            return -1
        return None

    def get_canonical_board(self, board, player):
        return player * board

In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

args = {
    'batch_size': 12,
    'numIters': 50,                                # Total number of training iterations
    'num_simulations': 10,                         # Total number of MCTS simulations to run when deciding on a move to play
    'numEps': 10,                                  # Number of full games (episodes) to run during each iteration
    'numItersForTrainExamplesHistory': 20,
    'epochs': 2,                                    # Number of epochs of training per iteration
    'checkpoint_path': 'latest.pth'                 # location to save latest set of weights
}

game = Connect2Game(2)
board_size = game.get_board_size()
action_size = game.get_action_size()

model = Connect2Model(board_size, action_size, device)

trainer = Trainer(game, model, args)
trainer.learn()

1/50
[[0 0]
 [0 0]]
[1, 1, 1, 1, 1]
[[-1  0]
 [ 0  0]]
[0, 1, 1, 1, 1]
[[1 0]
 [0 0]]
[0, 1, 1, 1, 1]
[[0 0]
 [0 0]]
[1, 1, 1, 1, 1]
[[-1  0]
 [ 0  0]]
[0, 1, 1, 1, 1]
[[ 0  0]
 [-1  0]]
[1, 1, 0, 1, 1]
[[0 0]
 [1 0]]
[1, 1, 0, 1, 1]
[[ 0 -1]
 [ 0  0]]
[1, 0, 1, 1, 1]
[[-1  1]
 [ 0  0]]
[0, 0, 1, 1, 1]
[[ 0  0]
 [ 0 -1]]
[1, 1, 1, 0, 1]
[[-1  0]
 [ 0  1]]
[0, 1, 1, 0, 1]
[[-1  0]
 [ 0  0]]
[0, 1, 1, 1, 1]
[[ 1 -1]
 [ 0  0]]
[0, 0, 1, 1, 1]
[[-1  1]
 [ 0  0]]
[0, 0, 1, 1, 1]
[[1 0]
 [0 0]]
[0, 1, 1, 1, 1]
[[-1  0]
 [ 0  0]]
[0, 1, 1, 1, 1]
[[ 1  0]
 [-1  0]]
[0, 1, 0, 1, 1]
[[-1  0]
 [ 1  0]]
[0, 1, 0, 1, 1]
[[ 1  0]
 [ 0 -1]]
[0, 1, 1, 0, 1]
[[-1  0]
 [ 0  1]]
[0, 1, 1, 0, 1]
[[1 0]
 [0 0]]
[0, 1, 1, 1, 1]
[[-1 -1]
 [ 0  0]]
[0, 0, 1, 1, 1]
[[1 0]
 [0 0]]
[0, 1, 1, 1, 1]
[[-1 -1]
 [ 0  0]]
[0, 0, 1, 1, 1]
[[1 1]
 [0 0]]
[0, 0, 1, 1, 1]
[[-1  0]
 [ 0  0]]
[0, 1, 1, 1, 1]
[[1 0]
 [0 0]]
[0, 1, 1, 1, 1]
[[-1  0]
 [-1  0]]
[0, 1, 0, 1, 1]
[[1 0]
 [1 0]]
[0, 1, 0, 1, 1]
[[-1  0]
 [ 0 -1]]
[

KeyboardInterrupt: 