In [249]:
from typing import Any, Dict, Optional, Tuple
import numpy as np
import torch
from torch import nn

from check_submission import check_submission
from game_mechanics import (
    OthelloEnv,
    choose_move_randomly,
    load_network,
    play_othello_game,
    save_network,
    get_legal_moves
)

from einops.layers.torch import Rearrange
from einops import rearrange
from tqdm import tqdm_notebook
import random

In [14]:
env = OthelloEnv()
state, reward, done, info = env.reset(verbose=True)

Starting game. Player -1 has first move
 [['*' '*' '*' '*' '*' '*']
 ['*' '*' '*' '*' '*' '*']
 ['*' '*' 'X' 'O' '*' '*']
 ['*' '*' 'O' 'X' '*' '*']
 ['*' '*' '*' '*' '*' '*']
 ['*' '*' '*' '*' '*' '*']]

Player -1 places counter at row 4, column 3
[['*' '*' '*' '*' '*' '*']
 ['*' '*' '*' '*' '*' '*']
 ['*' '*' 'X' 'O' '*' '*']
 ['*' '*' 'O' 'O' '*' '*']
 ['*' '*' '*' 'O' '*' '*']
 ['*' '*' '*' '*' '*' '*']]



In [394]:
class OthelloNet(nn.Module):
    def __init__(self):
        super(OthelloNet, self).__init__()
        hidden = 20 
        # stride 1
        self.conv1 = nn.Conv2d(1,hidden, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(hidden*2,hidden, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(hidden*2,1, kernel_size=1, padding=0)
        
        # stride 2
        self.conv1_s2 = nn.Conv2d(1,hidden, kernel_size=3, padding=2,dilation=2)
        self.conv2_s2 = nn.Conv2d(hidden*2,hidden, kernel_size=3, padding=2,dilation=2)
    
    def forward(self, x):
        x_1a = self.conv1(x)
        x_1a = nn.functional.relu(x_1a)
        x_1b = self.conv1_s2(x)
        x_1b = nn.functional.relu(x_1b)
        
        x = torch.concat([x_1a, x_1b],dim=1)
        x_2a = self.conv2(x)
        x_2a = nn.functional.relu(x_2a)
        x_2b = self.conv2_s2(x)
        x_2b = nn.functional.relu(x_2b)

        x = torch.concat([x_2a, x_2b],dim=1)
        x = self.conv3(x)
        x = nn.functional.tanh(x)
        x = rearrange(x, 'b 1 w h -> b w h')
        return x

In [395]:
net = OthelloNet()

In [397]:
net(rearrange(tensor_state, 'w h -> 1 1 w h'))[0,0]:shape

torch.Size([1, 1, 6, 6])
torch.Size([1, 40, 6, 6])
torch.Size([1, 1, 6, 6])


torch.Size([1, 1, 6, 6])

In [197]:
network = nn.Sequential(
    # block 1
    nn.Conv2d(1,20, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Dropout2d(p=0.1),
    nn.BatchNorm2d(20),
    
    # block 2
    nn.Conv2d(20,20, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Dropout2d(p=0.1),
    nn.BatchNorm2d(20),
    
    # block 3
    nn.Conv2d(20,20, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Dropout2d(p=0.1),
    nn.BatchNorm2d(20),
    
    # compress channels to 1 and softmax
    nn.Conv2d(20, 1, kernel_size=1),
    Rearrange('b 1 w h -> b w h'),
    nn.Tanh()
#     nn.Softmax(dim = 1),
#     Rearrange('b (w h) -> b w h', w = 6, h = 6)
)
batch_boards = torch.as_tensor([[state], [state]], dtype=torch.float32)
# network.to(device)
# assert network(batch_boards).sum(axis=1).sum(axis=1)[0] == 1.0

In [198]:
preds = network(torch.as_tensor([[state]], dtype=torch.float32))

In [199]:
preds

tensor([[[ 0.4081,  0.3099,  0.3523, -0.2370,  0.1816,  0.1558],
         [ 0.1065,  0.7942, -0.0668, -0.4197, -0.4196, -0.5736],
         [-0.0603, -0.2125, -0.1890,  0.9293, -0.6834, -0.1090],
         [ 0.0640,  0.8236, -0.7896,  0.9443,  0.3014, -0.3497],
         [ 0.6135,  0.6385,  0.7154, -0.3351, -0.0139, -0.2774],
         [ 0.1625,  0.5912, -0.0162,  0.3044, -0.0772,  0.0234]]],
       grad_fn=<TanhBackward0>)

In [200]:
np.array(possible_moves)

array([[2, 4],
       [4, 2],
       [4, 4]])

In [201]:
estimated_values = preds[0][np.array(possible_moves).transpose()]

In [202]:
estimated_values

tensor([-0.6834,  0.7154, -0.0139], grad_fn=<IndexBackward0>)

In [229]:
batch_boards = torch.as_tensor([[state], [state+1]], dtype=torch.float32)
batch_boards

tensor([[[[ 0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  1., -1.,  0.,  0.],
          [ 0.,  0., -1., -1.,  0.,  0.],
          [ 0.,  0.,  0., -1.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.]]],


        [[[ 1.,  1.,  1.,  1.,  1.,  1.],
          [ 1.,  1.,  1.,  1.,  1.,  1.],
          [ 1.,  1.,  2.,  0.,  1.,  1.],
          [ 1.,  1.,  0.,  0.,  1.,  1.],
          [ 1.,  1.,  1.,  0.,  1.,  1.],
          [ 1.,  1.,  1.,  1.,  1.,  1.]]]])

In [235]:
preds = network(batch_boards)
preds.round(decimals=2)

tensor([[[ 0.3100,  0.5100,  0.0500,  0.0700, -0.0600,  0.1000],
         [ 0.1300,  0.1200,  0.1000, -0.2700, -0.5500,  0.2300],
         [ 0.1900, -0.5100,  0.0900,  0.1800, -0.1900, -0.2300],
         [-0.4700,  0.5700,  0.0000,  0.6000, -0.0300, -0.8900],
         [ 0.6200,  0.2700,  0.1000,  0.1000,  0.5800, -0.4500],
         [-0.0100,  0.3300,  0.1700,  0.5200,  0.1900, -0.3000]],

        [[ 0.6700,  0.3300, -0.1200, -0.0800, -0.1600,  0.3500],
         [ 0.5200,  0.1900,  0.1700,  0.0200,  0.6500,  0.4000],
         [-0.0900,  0.8800,  0.2500,  0.7200,  0.7700, -0.6500],
         [ 0.0400,  0.5400,  0.1700,  0.8200, -0.1100, -0.4800],
         [ 0.1900,  0.6800,  0.3100, -0.1300, -0.2400, -0.6900],
         [ 0.3400,  0.2500, -0.0800,  0.7300,  0.5600, -0.0800]]],
       grad_fn=<RoundBackward1>)

In [230]:
batch_moves = torch.as_tensor([(2,2), (2,2)], dtype=torch.long)
batch_moves

tensor([[2, 2],
        [2, 2]])

In [239]:
preds[range(preds.shape[0]),batch_moves[0], batch_moves[1]]

tensor([0.0943, 0.2469], grad_fn=<IndexBackward0>)

In [290]:
def greedy_move(net, state, possible_moves):
    if len(possible_moves) == 0: return None
    preds = net(rearrange(torch.as_tensor(state), 'w h -> 1 1 w h'))
    values = preds[0][np.array(possible_moves).transpose()]
    return possible_moves[values.argmax()]

In [205]:
class GameState:
    def __init__(self, state, gamma=0.95, player='player', parent = None):
        self.state = state
        self.player = player
        if parent_state: self.parent = parent
        if parent_state: self.depth = parent.depth + 1
        else: self.depth = 0
        

In [207]:
def build_tree(move_player, move_opponent, branching_factor = np.ones(36)):
    state, reward, done, info = env.reset(verbose=True)
    root = GameState(state)
    
    return root

In [248]:
rearrange(torch.as_tensor(state), 'w h -> 1 1 w h').shape

torch.Size([1, 1, 6, 6])

In [285]:
old_values.shape

torch.Size([64, 6, 6])

In [286]:
moves.shape

torch.Size([64, 2])

In [287]:
moves[:,0]

tensor([4, 2, 0, 4, 4, 0, 2, 4, 5, 0, 2, 2, 0, 3, 1, 2, 0, 0, 2, 5, 4, 4, 2, 1,
        4, 5, 4, 5, 1, 5, 5, 1, 0, 3, 2, 1, 4, 3, 1, 3, 1, 2, 4, 1, 4, 2, 0, 0,
        0, 1, 4, 3, 3, 4, 3, 4, 0, 5, 0, 1, 1, 0, 4, 5])

In [283]:
old_states.shape

torch.Size([64, 6, 6])

In [301]:
state, reward, done, info = env.reset()

In [302]:
state

array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0., -1.,  1.,  0.,  0.],
       [ 0.,  0.,  1., -1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])

In [303]:
tensor_state = torch.as_tensor(state, dtype=torch.float32)

In [None]:
tensor_state.se

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [311]:
state1 = (tensor_state == 1).to(torch.float32)
state0 = (tensor_state == 0).to(torch.float32)
state_1 = (tensor_state == -1).to(torch.float32)

In [330]:
pyramid = torch.ones((2,2))
pyramid = nn.functional.pad(pyramid, pad=(1,1,1,1),value=0)
pyramid = nn.functional.pad(pyramid, pad=(1,1,1,1),value=-1)
pyramid

tensor([[-1., -1., -1., -1., -1., -1.],
        [-1.,  0.,  0.,  0.,  0., -1.],
        [-1.,  0.,  1.,  1.,  0., -1.],
        [-1.,  0.,  1.,  1.,  0., -1.],
        [-1.,  0.,  0.,  0.,  0., -1.],
        [-1., -1., -1., -1., -1., -1.]])

In [314]:
torch.stack([state1, state0, state_1]).shape

torch.Size([3, 6, 6])

In [331]:
def tensorify(np_state):
    state = torch.as_tensor(state, dtype=torch.float32)
    state1 = (tensor_state == 1).to(torch.float32)
    state0 = (tensor_state == 0).to(torch.float32)
    state_1 = (tensor_state == -1).to(torch.float32)
    return torch.stack([state1,state0,state_1,pyramid])

In [292]:
def train():
    n_episodes = 100
    gamma = 0.9
    epsilon = 0.3
    epsilon_decay = 0.99
    env = OthelloEnv()
    loss_fn = torch.nn.MSELoss()

    optim = torch.optim.AdamW(network.parameters())
    memory = []

    N = 2000
    M = 64

    for episode in tqdm_notebook(range(n_episodes)):
        state, reward, done, info = env.reset()
        state = torch.as_tensor(state, dtype=torch.float32)

        while not done:
            prev_state = state
    #             prev_state_value = V(prev_state)
            possible_moves = get_legal_moves(state)
            if len(possible_moves) == 0:
                move = None

            elif random.random() < epsilon:
                move = random.choice(possible_moves)
            else:
                move = greedy_move(network, prev_state, possible_moves)
            state, reward, done, info = env.step(move)
            state = torch.as_tensor(state, dtype=torch.float32)
            if move is not None:
                memory.append((prev_state, reward, move, state))

            if len(memory) > N:
                memory.pop(0)

            if M < len(memory):

                random_choices = np.random.choice(range(len(memory)), size=M, replace=False)

                old_states = torch.stack([memory[idx][0] for idx in random_choices])
                old_states = rearrange(old_states, 'b w h -> b 1 w h')
                states = torch.stack([memory[idx][3] for idx in random_choices])
                states = rearrange(states, 'b w h -> b 1 w h')
                rewards = torch.tensor(np.array([memory[idx][1] for idx in random_choices]),
                                        dtype=torch.float32)
                moves = torch.as_tensor([memory[idx][2] for idx in random_choices], dtype=torch.long)
                old_values = network(old_states)
                old_value_moves = old_values[range(old_values.shape[0]),moves[:,0], moves[:,1]]

                with torch.no_grad():
                    new_values = network(states)
                    new_value_moves = new_values[range(new_values.shape[0]),moves[:,0], moves[:,1]]
                loss = loss_fn(old_value_moves, rewards + gamma * new_value_moves)
                optim.zero_grad()
                loss.backward()
                optim.step()

        epsilon *= epsilon_decay
        
train()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for episode in tqdm_notebook(range(n_episodes)):


  0%|          | 0/100 [00:00<?, ?it/s]

In [296]:
0.995**300

0.22229219984074694

In [295]:
def choose_move_no_value_fn(state: Any) -> Optional[Tuple[int, int]]:
    """The arguments in play_connect_4_game() require functions that only take the state as
    input.

    This converts choose_move() to that format.
    """
    state = torch.as_tensor(state, dtype=torch.float32)
    possible_moves = get_legal_moves(state)
    return greedy_move(network, state, possible_moves)

outcomes = {}
for _ in tqdm_notebook(range(1000)):
    reward = play_othello_game(
        your_choose_move=choose_move_no_value_fn,
        opponent_choose_move=choose_move_randomly,
        game_speed_multiplier=10000000000000000,
        verbose=False,
    )
    outcomes[reward] = outcomes.get(reward, 0) + 1
    
print(outcomes)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _ in tqdm_notebook(range(1000)):


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 