# Initialize a game

In [1]:
from ConnectN import ConnectN

# Keyword arguments forwarded to ConnectN; kept in a dict so later cells can
# build fresh games with the same configuration.
game_setting = dict(size=(6, 6), N=4, pie_rule=True)
game = ConnectN(**game_setting)


In [2]:
% matplotlib notebook

from Play import Play


gameplay=Play(ConnectN(**game_setting), 
              player1=None, 
              player2=None)


<IPython.core.display.Javascript object>

# Define our policy

Please go ahead and define your own policy! See if you can train it in under 1000 games, using only 1000 steps of exploration per move.

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import *
import numpy as np

from ConnectN import ConnectN
# Rebinds the notebook-wide game_setting/game used by every later cell.
# NOTE(review): unlike the first cell, this dict has no 'pie_rule' key, yet the
# checkpoints written during training are named '6-6-4-pie-*.mypolicy' --
# confirm whether dropping the pie rule here is intentional.
game_setting = {'size':(6,6), 'N':4}
game = ConnectN(**game_setting)

class Policy(nn.Module):
    """Two-headed convolutional network for a single 6x6 board.

    The network shares two convolution layers and then splits into
    an action head (a probability for each of the 36 cells) and a value head
    (a tanh-squashed scalar in [-1, 1]).

    The board size is hard-coded: the layer sizes and the final reshape
    assume 36 cells, and the normalisation runs over the whole tensor, so
    ``forward`` is meant to be called with one board at a time.
    """

    def __init__(self, game):
        """Build the layers.

        Args:
            game: accepted for interface compatibility; it is not read here.
        """
        super(Policy, self).__init__()

        # input: 1-channel 6x6 board
        # 2x2 kernel, stride 1: 6x6x1 -> 5x5x16
        self.conv1 = nn.Conv2d(1, 16, kernel_size=2, stride=1, bias=False)
        # 3x3 kernel, stride 1: 5x5x16 -> 3x3x32
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, bias=False)

        # number of features after flattening the conv output
        self.size = 3*3*32

        # the part for actions
        self.fc_action1 = nn.Linear(self.size, self.size//4)
        self.fc_action2 = nn.Linear(self.size//4, 36)

        # the part for the value function
        self.fc_value1 = nn.Linear(self.size, self.size//6)
        self.fc_value2 = nn.Linear(self.size//6, 1)
        self.tanh_value = nn.Tanh()

    def forward(self, x):
        """Evaluate one board.

        Args:
            x: tensor holding a single 6x6 board (36 elements once flattened,
               e.g. shape (1, 1, 6, 6)). Cells whose absolute value equals 1
               are treated as occupied.

        Returns:
            (prob, value): ``prob`` is a 6x6 tensor that is zero on occupied
            cells and sums to 1 over the remaining cells; ``value`` is the
            tanh output of the value head.
        """
        y = F.leaky_relu(self.conv1(x))
        y = F.leaky_relu(self.conv2(y))
        y = y.view(-1, self.size)

        # action head: one logit per cell
        a = self.fc_action2(F.leaky_relu(self.fc_action1(y)))

        # Boolean mask of free cells. Keeping it boolean (instead of casting
        # to torch.FloatTensor) leaves it on the same device as the input.
        legal = torch.abs(x).reshape(-1, 36) != 1

        # Masked softmax. Occupied cells are set to -inf *before* shifting and
        # exponentiating, so the shift is the largest legal logit. A large
        # logit on an occupied cell can therefore neither underflow every
        # legal term (0/0) nor overflow to inf.
        masked = a.masked_fill(~legal, float('-inf'))
        exp = torch.exp(masked - torch.max(masked))
        prob = exp/torch.sum(exp)

        # value head
        value = self.tanh_value(self.fc_value2(F.leaky_relu(self.fc_value1(y))))
        return prob.view(6, 6), value

# Freshly initialised (untrained) network; Policy_Player_MCTS reads this global
# at call time, and the training section later rebinds it to a new instance.
policy = Policy(game)


# Define an MCTS player for Play

In [4]:
import MCTS

from copy import copy

def Policy_Player_MCTS(game):
    """Choose a move for ``game`` by tree search driven by the global ``policy``.

    A search tree is rooted at a copy of ``game``, ``explore`` is called on
    it 1000 times, and the move that led to the node returned by
    ``next(temperature=0.1)`` is reported.

    Args:
        game: current game state; only a copy of it is given to the tree.

    Returns:
        The ``last_move`` recorded on the selected child's game.
    """
    root = MCTS.Node(copy(game))
    remaining = 1000
    while remaining > 0:
        root.explore(policy)
        remaining -= 1

    # The statistics tuple returned next to the node is not needed for play.
    chosen, (_v, _nn_v, _p, _nn_p) = root.next(temperature=0.1)
    return chosen.game.last_move

import random

def Random_Player(game):
    """Return one of ``game.available_moves()`` chosen uniformly at random."""
    candidates = game.available_moves()
    return random.choice(candidates)


# Play a game against the untrained (randomly initialized) policy

In [5]:
% matplotlib notebook

from Play import Play


gameplay=Play(ConnectN(**game_setting), 
              player1=Policy_Player_MCTS, 
              player2=None)


<IPython.core.display.Javascript object>

# Training

In [6]:
# initialize our alphazero agent and optimizer
import torch.optim as optim

game=ConnectN(**game_setting)
policy = Policy(game)
optimizer = optim.Adam(policy.parameters(), lr=.01, weight_decay=1.e-5)

! pip install progressbar

Collecting progressbar
  Downloading https://files.pythonhosted.org/packages/a3/a6/b8e451f6cff1c99b4747a2f7235aa904d2d49e8e1464e0b798272aa84358/progressbar-2.5.tar.gz
Building wheels for collected packages: progressbar
  Running setup.py bdist_wheel for progressbar ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/c0/e9/6b/ea01090205e285175842339aa3b491adeb4015206cda272ff0
Successfully built progressbar
Installing collected packages: progressbar
Successfully installed progressbar-2.5


Beware, training is **VERY VERY** slow!!

In [None]:
# Train the agent: for each episode, play one game by tree search, build the
# loss from the moves of that game, and take a single optimizer step.

# NOTE(review): deque is imported but not used anywhere in this cell.
from collections import deque
import MCTS

# number of games to train on (try a higher number)
episodes = 2000

import progressbar as pb
widget = ['training loop: ', pb.Percentage(), ' ', 
          pb.Bar(), ' ', pb.ETA() ]
timer = pb.ProgressBar(widgets=widget, maxval=episodes).start()

# per-episode history: final game outcome and scalar loss
outcomes = []
policy_loss = []

# upper bound on explore() calls per move, and the threshold on mytree.N
# that ends exploration early
Nmax = 1000

for e in range(episodes):

    # NOTE(review): the same `game` object is reused as the root for every
    # episode, whereas the player functions pass copy(game) -- assumes
    # Node/explore never mutate the root game; TODO confirm in MCTS.
    mytree = MCTS.Node(game)
    logterm = []
    vterm = []
    
    # play one game, one move per iteration, until an outcome is set
    while mytree.outcome is None:
        # explore until mytree.N (presumably the node's visit count) reaches
        # Nmax, with at most Nmax explore() calls
        for _ in range(Nmax):
            mytree.explore(policy)
            if mytree.N >= Nmax:
                break
            
        # read the player to move BEFORE advancing to the child node
        current_player = mytree.game.player
        # presumably p is the search-derived move distribution and nn_p/nn_v
        # are the network's own outputs -- verify against MCTS.Node.next
        mytree, (v, nn_v, p, nn_p) = mytree.next()
        # presumably drops the reference to the parent node -- confirm in MCTS
        mytree.detach_mother()
        
        # -sum(p*log(nn_p) - p*log(p)) = sum(p*log(p/nn_p)); the torch.where
        # zeroes the p*log(p) term wherever p == 0
        loglist = torch.log(nn_p)*p
        constant = torch.where(p>0, p*torch.log(p),torch.tensor(0.))
        logterm.append(-torch.sum(loglist-constant))

        # value estimate multiplied by the player that was to move
        vterm.append(nn_v*current_player)
        
    # combine the per-move terms of this game into one scalar loss:
    # squared error of each value term against the final outcome, plus the
    # per-move policy terms collected above
    outcome = mytree.outcome
    outcomes.append(outcome)
    
    loss = torch.sum( (torch.stack(vterm)-outcome)**2 + torch.stack(logterm) )
    optimizer.zero_grad()
    loss.backward()
    # store a plain float so the history does not keep the autograd graph alive
    policy_loss.append(float(loss))

    optimizer.step()
    
    # every 10 episodes: mean of the last 20 losses and the last 10 outcomes
    if e%10==0:
        print("game: ",e+1, ", mean loss: {:3.2f}".format(np.mean(policy_loss[-20:])),
              ", recent outcomes: ", outcomes[-10:])
    
    # checkpoint the whole module at e = 0, 500, 1000, ...
    # NOTE(review): with episodes = 2000 the last checkpoint is written at
    # e = 1500, so the policy after the final episode is never saved; the file
    # name says 'pie' although game_setting was redefined without 'pie_rule'.
    if e%500==0:
        torch.save(policy,'6-6-4-pie-{:d}.mypolicy'.format(e))
    del loss
    
    timer.update(e+1)
    
timer.finish()





  "type " + obj.__name__ + ". It won't be checked "
training loop:   0% |                                          | ETA:  17:22:56

game:  1 , mean loss: 15.72 , recent outcomes:  [-1]


training loop:   0% |                                          | ETA:  12:44:31

game:  11 , mean loss: 25.75 , recent outcomes:  [-1, 1, -1, -1, 1, -1, -1, -1, 1, 1]


training loop:   1% |                                          | ETA:  10:35:03

game:  21 , mean loss: 24.59 , recent outcomes:  [1, 1, -1, -1, -1, 1, 1, 1, 1, -1]


training loop:   1% |                                           | ETA:  9:49:26

game:  31 , mean loss: 21.53 , recent outcomes:  [1, -1, 1, -1, 1, -1, -1, -1, 1, -1]


training loop:   2% |                                           | ETA:  9:10:16

game:  41 , mean loss: 18.39 , recent outcomes:  [-1, 1, -1, 1, 1, 1, 1, 1, -1, -1]


training loop:   2% |#                                          | ETA:  8:27:38

game:  51 , mean loss: 15.51 , recent outcomes:  [1, -1, 1, -1, 1, 1, 1, 1, 1, 1]


training loop:   3% |#                                          | ETA:  7:51:36

game:  61 , mean loss: 14.02 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   3% |#                                          | ETA:  7:05:53

game:  71 , mean loss: 14.11 , recent outcomes:  [1, 1, 1, 1, -1, 1, 1, -1, 1, 1]


training loop:   4% |#                                          | ETA:  6:30:20

game:  81 , mean loss: 11.41 , recent outcomes:  [1, 1, 1, -1, 1, 1, 1, 1, 1, 1]


training loop:   4% |#                                          | ETA:  5:58:27

game:  91 , mean loss: 7.06 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   5% |##                                         | ETA:  5:32:59

game:  101 , mean loss: 6.59 , recent outcomes:  [-1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   5% |##                                         | ETA:  5:12:42

game:  111 , mean loss: 5.49 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   6% |##                                         | ETA:  4:57:35

game:  121 , mean loss: 3.85 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   6% |##                                         | ETA:  4:41:38

game:  131 , mean loss: 3.84 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   7% |###                                        | ETA:  4:29:15

game:  141 , mean loss: 4.61 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   7% |###                                        | ETA:  4:19:45

game:  151 , mean loss: 3.67 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   8% |###                                        | ETA:  4:07:17

game:  161 , mean loss: 2.73 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   8% |###                                        | ETA:  3:56:15

game:  171 , mean loss: 3.62 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   9% |###                                        | ETA:  3:46:52

game:  181 , mean loss: 3.21 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:   9% |####                                       | ETA:  3:38:32

game:  191 , mean loss: 2.53 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  10% |####                                       | ETA:  3:32:27

game:  201 , mean loss: 3.59 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  10% |####                                       | ETA:  3:26:46

game:  211 , mean loss: 3.42 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  11% |####                                       | ETA:  3:20:19

game:  221 , mean loss: 2.84 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  11% |####                                       | ETA:  3:14:42

game:  231 , mean loss: 2.64 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  12% |#####                                      | ETA:  3:10:03

game:  241 , mean loss: 1.61 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  12% |#####                                      | ETA:  3:06:45

game:  251 , mean loss: 2.54 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  13% |#####                                      | ETA:  3:03:34

game:  261 , mean loss: 2.57 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  13% |#####                                      | ETA:  3:01:56

game:  271 , mean loss: 7.42 , recent outcomes:  [1, -1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  14% |######                                     | ETA:  3:00:10

game:  281 , mean loss: 10.12 , recent outcomes:  [1, 1, 1, 1, -1, 1, 1, 1, 1, 1]


training loop:  14% |######                                     | ETA:  2:56:50

game:  291 , mean loss: 4.97 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  15% |######                                     | ETA:  2:54:33

game:  301 , mean loss: 3.03 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  15% |######                                     | ETA:  2:51:44

game:  311 , mean loss: 2.98 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  16% |######                                     | ETA:  2:49:20

game:  321 , mean loss: 2.71 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  16% |#######                                    | ETA:  2:47:09

game:  331 , mean loss: 1.89 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  17% |#######                                    | ETA:  2:46:05

game:  341 , mean loss: 3.32 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  17% |#######                                    | ETA:  2:43:29

game:  351 , mean loss: 3.42 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  18% |#######                                    | ETA:  2:40:42

game:  361 , mean loss: 1.64 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  18% |#######                                    | ETA:  2:38:33

game:  371 , mean loss: 3.09 , recent outcomes:  [1, 1, 1, 1, -1, 1, 1, 1, 1, 1]


training loop:  19% |########                                   | ETA:  2:36:24

game:  381 , mean loss: 4.91 , recent outcomes:  [1, 1, -1, 1, 1, 1, 1, 1, 1, 1]


training loop:  19% |########                                   | ETA:  2:34:31

game:  391 , mean loss: 3.16 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  20% |########                                   | ETA:  2:31:50

game:  411 , mean loss: 1.08 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  21% |#########                                  | ETA:  2:30:02

game:  421 , mean loss: 1.08 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  21% |#########                                  | ETA:  2:28:16

game:  431 , mean loss: 2.99 , recent outcomes:  [1, 1, 1, -1, 1, 1, 1, 1, 1, 1]


training loop:  22% |#########                                  | ETA:  2:27:21

game:  441 , mean loss: 2.92 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  22% |#########                                  | ETA:  2:25:50

game:  451 , mean loss: 1.05 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  23% |#########                                  | ETA:  2:25:33

game:  461 , mean loss: 3.13 , recent outcomes:  [1, 1, 1, 1, 1, 1, -1, 1, 1, 1]


training loop:  23% |##########                                 | ETA:  2:24:33

game:  471 , mean loss: 4.73 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, -1, 1]


training loop:  24% |##########                                 | ETA:  2:23:10

game:  481 , mean loss: 2.69 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  24% |##########                                 | ETA:  2:21:50

game:  491 , mean loss: 1.94 , recent outcomes:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


training loop:  24% |##########                                 | ETA:  2:21:39

# Set up the environment to pit your AI against the challenge policy '6-6-4-pie.policy'

In [7]:
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. Presumably this file holds a whole pickled module (like the
# torch.save(policy, ...) checkpoints written during training), in which case
# its class must already be defined/importable in this kernel -- confirm.
challenge_policy = torch.load('6-6-4-pie.policy')

def Challenge_Player_MCTS(game):
    """Choose a move for ``game`` by tree search driven by ``challenge_policy``.

    Mirrors the notebook's other MCTS player: the tree is rooted at a copy of
    ``game``, explored 1000 times, and the move behind the node selected with
    ``temperature=0.1`` is returned.

    Args:
        game: current game state; only a copy of it is given to the tree.

    Returns:
        The ``last_move`` recorded on the selected child's game.
    """
    search_root = MCTS.Node(copy(game))
    for _simulation in range(1000):
        search_root.explore(challenge_policy)

    # Only the selected node is used; the accompanying statistics are ignored.
    selection = search_root.next(temperature=0.1)
    picked, (_v, _nn_v, _p, _nn_p) = selection
    return picked.game.last_move





# Let the game begin!

In [14]:
% matplotlib notebook
gameplay=Play(ConnectN(**game_setting), 
              player2=Challenge_Player_MCTS,#Policy_Player_MCTS, 
              player1=None)

<IPython.core.display.Javascript object>