In [250]:
import os
import random
from datetime import datetime
from functools import partial
from typing import Any, Dict, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from einops.layers.torch import Rearrange
from tqdm import notebook

from game_mechanics import GoEnv, choose_move_randomly, load_pkl, play_go, save_pkl

In [271]:
def choose_move(observation: np.ndarray, legal_moves: np.ndarray, neural_network: nn.Module) -> int:
    """Sample a move from the policy produced by ``neural_network``.

    Args:
        observation: Board observation; it is passed through ``normalize``
            before being fed to the network.
        legal_moves: Indices of the moves that may be played; forwarded to
            the network, which uses them to mask the policy.
        neural_network: Callable taking ``(observation, legal_moves)`` and
            returning ``(move_probabilities, value)``.

    Returns:
        The index of the sampled move as a plain ``int``.
    """
    observation = normalize(observation)
    # Inference only: no autograd graph is needed to pick a move.
    with torch.no_grad():
        # Use the network that was passed in, not a notebook-level global,
        # so callers can evaluate any network they like.
        probs, _ = neural_network(observation, legal_moves)
    probs = probs.cpu().numpy()
    # The number of actions is taken from the policy itself.
    return int(np.random.choice(len(probs), p=probs))


def choose_move_human(observation: np.ndarray, legal_moves:np.ndarray, neural_network: nn.Module) -> int:
    """Ask a human for a move on standard input.

    The user types two 1-based integers separated by a single space (row,
    then column). They are converted to the flat index of a 9-wide board.
    All three arguments are ignored; they only keep the signature in line
    with ``choose_move``.
    """
    tokens = input().split(" ")
    row, col = (int(token) for token in tokens)
    zero_based_row = row - 1
    return zero_based_row * 9 + col - 1

def random_move(observation, legal_moves):
    """Pick one of ``legal_moves`` uniformly at random; ``observation`` is ignored."""
    picked_move = random.choice(legal_moves)
    return picked_move

In [277]:
class alpha_go_zero(nn.Module):
    """Fully-connected network with a policy head and a value head.

    The shared trunk maps an 81-element board to a 600-dimensional feature
    vector. ``head1`` turns it into a probability distribution over 82
    actions and ``head2`` into a single value squashed into ``[-1, 1]``.
    """

    def __init__(self):
        super().__init__()

        # Shared layers that run before the two heads.
        self.layer1 = nn.Linear(81, 600)
        self.layer2 = nn.Linear(600, 600)

        self.head1 = nn.Linear(600, 82)  # policy logits
        self.head2 = nn.Linear(600, 1)   # value estimate

    def forward(self, x, legal_moves):
        """Compute move probabilities and a value estimate for one board.

        Args:
            x: Tensor holding a single board; it is flattened and must
                contain exactly 81 elements.
            legal_moves: Integer indices (array, list or tensor) of the
                moves that may be played.

        Returns:
            ``(policy, value)`` where ``policy`` has shape ``(82,)`` and sums
            to 1, with zero probability on every board index (0-80) that is
            not in ``legal_moves``, and ``value`` has shape ``(1,)``.
        """
        flat_board = torch.flatten(x)

        hidden = F.elu(self.layer1(flat_board))
        hidden = F.elu(self.layer2(hidden))
        # NOTE(review): layer2 is applied a second time, i.e. both hidden
        # steps share one set of weights. A separate third layer may have
        # been intended; kept as-is so the parameter set is unchanged.
        hidden = F.elu(self.layer2(hidden))

        # Policy head.
        logits = self.head1(hidden)
        n_board_points = self.layer1.in_features
        # Only board indices can be masked; any index past the board
        # (index 81) is always left available.
        illegal = torch.zeros(logits.shape[-1], dtype=torch.bool, device=logits.device)
        illegal[:n_board_points] = True
        legal_index = torch.as_tensor(legal_moves, dtype=torch.long, device=logits.device)
        illegal[legal_index] = False
        # Out-of-place masking keeps the autograd graph free of in-place edits.
        policy = F.softmax(logits.masked_fill(illegal, float('-inf')), dim=-1)

        # Value head.
        value = torch.tanh(self.head2(hidden))

        return policy, value



In [9]:
def normalize(observation: np.ndarray) -> torch.Tensor:
    """Return ``observation`` as a float32 torch tensor (values are not rescaled)."""
    board_tensor = torch.as_tensor(observation, dtype=torch.float32)
    return board_tensor

In [72]:
def play_episode(network, env):
    """Play one full game in ``env`` with moves chosen by ``network``.

    Args:
        network: Network handed to ``choose_move`` for every move.
        env: Environment whose ``reset()`` and ``step(move)`` both return
            ``(observation, reward, done, info)``, with the legal moves
            under ``info['legal_moves']``.

    Returns:
        The reward returned by the last environment call of the episode.
    """
    observation, reward, done, info = env.reset()
    while not done:
        # choose_move normalises the observation itself, so the raw
        # observation is passed straight through.
        network_move = choose_move(observation, info['legal_moves'], network)
        observation, reward, done, info = env.step(network_move)
    return reward

In [None]:
# Scratch cell: two alternative learning-rate schedules attached to the
# current `optimizer` (which must already exist when this cell is run).
# The classes are imported here because, in the cells visible in this
# notebook, MultiStepLR is not imported at all and ExponentialLR is only
# imported in a later cell.
from torch.optim.lr_scheduler import ExponentialLR, MultiStepLR

scheduler1 = ExponentialLR(optimizer, gamma=0.9)
scheduler2 = MultiStepLR(optimizer, milestones=[30,80], gamma=0.1)

In [289]:
from torch.optim.lr_scheduler import LinearLR, ExponentialLR, ChainedScheduler, ReduceLROnPlateau

In [295]:
# Manually advance the `burn_in` scheduler by 20 steps, showing a progress bar.
warmup_progress = notebook.tqdm(range(20))
for _ in warmup_progress:
    burn_in.step()

  0%|          | 0/20 [00:00<?, ?it/s]

In [291]:
# Rebuild the optimizer over the network's parameters with a 1e-4 learning rate.
# NOTE(review): a scheduler created from an earlier optimizer object (such as
# `burn_in`) stays bound to that earlier object - confirm this is intended.
optimizer = torch.optim.AdamW(params=my_network.parameters(), lr=1e-4)


In [282]:
# --- Experiment configuration ------------------------------------------------
experiment_name = 'Baseline_L'
game_speed_multiplier = 1000000
render = False
verbose = False

num_episodes = 10_000        # training games
num_test_episodes = 25       # evaluation games per metrics block
block_train_episodes = 100   # evaluate and log every this many training games
gamma = 0.9                  # discount factor used in the losses

# --- Network, opponent and training environment -------------------------------
my_network = alpha_go_zero()
opponent_choose_move = random_move
env = GoEnv(
    opponent_choose_move,
    verbose=verbose,
    render=render,
    game_speed_multiplier=game_speed_multiplier,
)

# --- Optimiser and learning-rate warm-up --------------------------------------
optimizer = torch.optim.AdamW(my_network.parameters(), lr=0.001)
burn_in = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=10_000)

# --- Bookkeeping ---------------------------------------------------------------
metrics = []
test_eval_size = []
total_score = 0
total_played = 0
train_rewards = []
train_losses = {'policy': [], 'value': []}
# Create the metrics folder up front so the periodic CSV dump cannot fail on a
# missing directory.
os.makedirs('logs', exist_ok=True)

for episode in notebook.tqdm(range(num_episodes)):
    observation, reward, done, info = env.reset()
    observation = normalize(observation)
    # Value estimate carried over from the previous move; 0 before the first one.
    old_value = 0
    while not done:
        legal_moves = info['legal_moves']
        probs, value = my_network(observation, legal_moves=legal_moves)
        chosen_move = np.random.choice(probs.shape[0], p=probs.detach().numpy())
        observation, reward, done, info = env.step(chosen_move)
        observation = normalize(observation)

        log_prob = torch.log(probs[chosen_move])
        if done:
            # Terminal step: regress the value onto the final reward and weight
            # the policy gradient by that reward.
            loss_v = (value - reward)**2
            loss_policy = -log_prob*(torch.Tensor([reward]))
        else:
            # NOTE(review): the gradient flows through `value` (current state)
            # while `old_value` (previous state) is detached, so this pulls the
            # current estimate toward the previous one rather than the usual
            # TD(0) direction. Left unchanged - confirm this is intended.
            loss_v = (old_value - reward - value*gamma)**2
            loss_policy = -log_prob*(reward + value.detach()*gamma)

        loss = loss_v + loss_policy  # TODO: entropy regularization?
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses['policy'].append(loss_policy.item())
        train_losses['value'].append(loss_v.item())

        old_value = value.detach()

    train_rewards.append(reward)
    # Update the running totals before logging so the logged numbers include
    # the episode that just finished.
    total_score += reward
    total_played += 1
    # Advance the learning-rate warm-up once per episode (the call parentheses
    # are required: a bare `burn_in.step` does nothing).
    burn_in.step()

    if episode % block_train_episodes == 0:
        opponent_choose_move = random_move
        test_env = GoEnv(
            opponent_choose_move,
            verbose=verbose,
            render=render,
            game_speed_multiplier=game_speed_multiplier,
        )
        rewards = [play_episode(my_network, test_env) for _ in notebook.tqdm(range(num_test_episodes))]
        test_wr = sum(r == 1 for r in rewards)/num_test_episodes
        test_score = sum(rewards)/num_test_episodes
        test_ties = sum(r == 0 for r in rewards)/num_test_episodes

        # Average over the episodes actually collected: the very first block
        # holds a single episode, not `block_train_episodes` of them.
        n_train = len(train_rewards)
        train_wr = sum(r == 1 for r in train_rewards)/n_train
        train_score = sum(train_rewards)/n_train
        train_ties = sum(r == 0 for r in train_rewards)/n_train
        train_rewards = []

        # Guard against a block in which no training step was recorded.
        n_policy = len(train_losses['policy'])
        n_value = len(train_losses['value'])
        metrics.append({'test_win_rate': test_wr,
                        'test_score': test_score,
                        'test_ties': test_ties,
                        'train_win_rate': train_wr,
                        'train_score': train_score,
                        'train_ties': train_ties,
                        'episode': episode,
                        'total_score': total_score,
                        'total_played': total_played,
                        'train_loss_policy': sum(train_losses['policy'])/n_policy if n_policy else float('nan'),
                        'train_loss_value': sum(train_losses['value'])/n_value if n_value else float('nan')
                       })
        # TODO: once train_score > 0.2, raise the difficulty by rebuilding `env`
        # with opponent partial(choose_move, neural_network=my_network).

        pd.DataFrame(metrics).to_csv(f'logs/{experiment_name}_{episode}.csv')
        train_losses = {
            'policy': [],
            'value': []
        }


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

KeyboardInterrupt: 