In [1]:
import tensorflow as tf
import pickle
import keras
from keras.models import Model, load_model, clone_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.losses import binary_crossentropy
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
sys.path.append('src')  # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
from mcts import *
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


# Logging

In [2]:
logging.basicConfig(filename='logging.log', level=logging.INFO, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [3]:
# Shape of one network input: a 16x16 board with 5 feature planes
# (map size fixed to 16x16, 2 to 3 players).
INPUT_SIZE = (16, 16, 5) # Map size fixed to 16x16 (2 to 3 players)
# Width of the policy output; the symmetry helpers in this notebook use the
# ordering 0 right, 1 down, 2 left, 3 up.
N_ACTIONS = 4
# Number of GPUs; create_model() only wraps the network with multi_gpu_model
# when this is greater than 1.
gpus = 1

# Define the Layer Blocks

In [4]:
filters = 64  # Default number of convolution filters used by the blocks below


def conv_block(in_layer, name, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Apply a bias-free, L2-regularised Conv2D followed by optional BN and LeakyReLU.

    Args:
        in_layer: Tensor the convolution is applied to.
        name: Layer name of the convolution; the batch-norm and activation
            layers are named ``name + '_bn'`` and ``name + '_lkrelu'``.
        filters: Number of convolution filters.
        kernel_size: Convolution kernel size.
        bn: When true, add BatchNormalization over axis 3.
        relu: When true, finish with a LeakyReLU activation.

    Returns:
        The output tensor of the last layer that was applied.
    """
    out = Conv2D(
        filters,
        kernel_size,
        use_bias=False,
        padding='same',
        name=name,
        kernel_regularizer=l2(1e-4),
    )(in_layer)

    if bn:
        out = BatchNormalization(axis=3, name=name + '_bn')(out)

    if not relu:
        return out
    return LeakyReLU(name=name + '_lkrelu')(out)

# Residual Block
def residual_conv(in_layer, idx, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Build one residual-style block whose skip connection is a concatenation.

    The first stage is always a 3x3 ``conv_block`` with batch-norm and
    activation enabled, regardless of ``kernel_size``, ``bn`` and ``relu``;
    those three arguments only affect the second convolution and the final
    activation. The block input is concatenated (not added) with the second
    convolution's output, using Concatenate's default axis.

    Args:
        in_layer: Input tensor of the block.
        idx: Block index, used to build the ``'res_<idx>'`` layer-name prefix.
        filters: Number of filters of both convolutions.
        kernel_size: Kernel size of the second convolution.
        bn: When true, batch-normalise the second convolution (axis 3).
        relu: When true, apply LeakyReLU after the skip connection.

    Returns:
        Output tensor of the block.
    """
    prefix = 'res_' + str(idx)

    # Stage 1: full conv block of pre-defined shape.
    first = conv_block(in_layer, prefix + '_conv1', filters, kernel_size=(3,3), bn=True, relu=True)

    # Stage 2: plain convolution whose output feeds the skip connection.
    second = Conv2D(
        filters,
        kernel_size,
        use_bias=False,
        padding='same',
        name=prefix + '_conv2',
        kernel_regularizer=l2(1e-4),
    )(first)
    if bn:
        second = BatchNormalization(axis=3, name=prefix + '_conv2_bn')(second)

    merged = Concatenate()([in_layer, second])  # Skip connection

    if not relu:
        return merged
    return LeakyReLU(name=prefix + '_lkrelu')(merged)

def value_head(in_layer):
    """Attach the value head and return its tanh-activated single-unit output.

    Pipeline: 1x1 ``conv_block`` with one filter -> Flatten -> Dense(64) ->
    LeakyReLU -> BatchNormalization(axis=1) -> bias-free Dense(1, tanh) named
    ``'value'``.

    Args:
        in_layer: Tensor the head is attached to.

    Returns:
        The output tensor of the ``'value'`` layer.
    """
    features = conv_block(in_layer, 'value_head', filters=1, kernel_size=(1,1))
    features = Flatten(name='value_flatten')(features)

    hidden = Dense(64, kernel_regularizer=l2(1e-4), name='value_dense')(features)
    hidden = LeakyReLU(name='value_lkrelu')(hidden)
    hidden = BatchNormalization(axis=1, name='value_bn')(hidden)

    # Value output
    return Dense(
        1,
        use_bias=False,
        name='value',
        kernel_regularizer=l2(1e-4),
        activation='tanh',
    )(hidden)

def policy_head(in_layer):
    """Attach the policy head and return its raw (linear) action logits.

    Pipeline: 1x1 ``conv_block`` with two filters -> Flatten -> bias-free
    Dense(N_ACTIONS, linear) named ``'policy'``.

    Args:
        in_layer: Tensor the head is attached to.

    Returns:
        The output tensor of the ``'policy'`` layer.
    """
    features = conv_block(in_layer, 'policy_head', filters=2, kernel_size=(1,1))
    flat = Flatten(name='policy_flatten')(features)

    # Policy output
    logits_layer = Dense(
        N_ACTIONS,
        name='policy',
        use_bias=False,
        kernel_regularizer=l2(1e-4),
        activation='linear',
    )
    return logits_layer(flat)

# Model

In [5]:
def create_model():
    """Build the two-headed network and return ``(trainable_model, base_model)``.

    With ``gpus > 1`` the base model is declared under the ``/cpu:0`` device
    and wrapped with ``multi_gpu_model``; the wrapper is returned first and the
    base model second. Otherwise the same model object is returned twice.
    """
    def declare_model():
        # Stem convolution, six residual blocks, then the policy and value heads.
        n_residual = 6

        inputs = Input(INPUT_SIZE)
        trunk = conv_block(inputs, 'conv')
        for block_idx in range(1, n_residual + 1):
            trunk = residual_conv(trunk, idx=block_idx)

        policy = policy_head(trunk)
        value = value_head(trunk)
        return Model(inputs, [policy, value])

    if gpus <= 1:
        single = declare_model()
        return single, single

    with tf.device('/cpu:0'):
        template = declare_model()
    return multi_gpu_model(template, gpus=gpus), template

In [6]:
def policy_rot90(policy, k = 1):
    """Permute the action axis of ``policy`` to follow ``k`` quarter-turn rotations.

    One rotation re-indexes the last axis with ``[1, 2, 3, 0]``; ``k`` is taken
    modulo 4, so multiples of four return an unchanged copy.

    Args:
        policy: Array-like whose last axis holds the four action entries.
        k: Number of quarter turns.

    Returns:
        A new numpy array with the last axis permuted.
    """
    rotated = np.array(policy)
    turns = k % 4
    if turns == 0:
        return rotated

    # Applying [1, 2, 3, 0] `turns` times is the same as shifting by `turns`.
    order = [(slot + turns) % 4 for slot in range(4)]
    return rotated[..., order]

def policy_flip(policy, vert=False):
    """Permute the action axis of ``policy`` to follow a board flip.

    Args:
        policy: Array-like whose last axis holds the four action entries.
        vert: When true, swap entries 1 and 3; otherwise swap entries 0 and 2.

    Returns:
        A new numpy array with the last axis permuted.
    """
    order = [0, 3, 2, 1] if vert else [2, 1, 0, 3]
    return np.array(policy)[..., order]

def state_flip(state, vert=False):
    """Mirror a batch of states along axis 1 (``vert``) or axis 2 (default).

    Args:
        state: Array-like indexed with the batch dimension first.
        vert: When true, reverse axis 1; otherwise reverse axis 2.

    Returns:
        The reversed numpy array.
    """
    batch = np.array(state)
    reverse = slice(None, None, -1)
    keep = slice(None)

    selector = (keep, reverse) if vert else (keep, keep, reverse)
    return batch[selector]

def apply_simmetries(train_steps):
    """Return a randomly rotated/flipped copy of a batch of training steps.

    One random symmetry is drawn per call and applied to the whole batch:
    a rotation by 1-4 quarter turns (4 leaves the board unchanged), followed
    by a horizontal flip, a vertical flip, or no flip, each equally likely.
    States and policies are transformed consistently; values are unchanged.

    Action ordering used by the policy helpers: 0 right, 1 down, 2 left, 3 up.

    Args:
        train_steps: Iterable of objects exposing ``state``, ``policy`` and
            ``value`` attributes.

    Returns:
        A list of new ``TrainStep(state, value, policy)`` objects, one per
        input step. An empty input yields an empty list (``np.rot90`` would
        otherwise fail on an empty batch).
    """
    train_steps = list(train_steps)
    if not train_steps:
        return []

    states = np.array([step.state for step in train_steps])
    policies = [step.policy for step in train_steps]
    values = [step.value for step in train_steps]

    n_turns = np.random.randint(1, 5)    # 1..4 quarter turns
    flip_kind = np.random.randint(0, 3)  # 0 horizontal, 1 vertical, 2 none

    # Axes (1, 2) are the two board axes; axis 0 is the batch.
    states = np.rot90(states, k=n_turns, axes=(1, 2))
    policies = policy_rot90(policies, k=n_turns)

    if flip_kind == 0:  # Horizontal flip
        states = state_flip(states, vert=False)
        policies = policy_flip(policies, vert=False)
    elif flip_kind == 1:  # Vertical flip
        states = state_flip(states, vert=True)
        policies = policy_flip(policies, vert=True)

    return [TrainStep(s, v, p) for s, p, v in zip(states, policies, values)]

In [36]:
def manage_predictions():
    """Serve network predictions to the evaluation workers until enough games finish.

    Loops while ``winner_buffer`` is not full. Each pass drains the queued
    ``(index, state, net)`` requests from ``processable_buffer``, runs the
    requested network on them and sends every worker its own result through
    ``pipes[index]`` as a dict keyed by ``alphabot.output_names``.

    Requests are grouped by ``net`` so each state is evaluated by the network
    that was actually asked for; a mixed batch must not be answered by a
    single model.

    Raises:
        ValueError: If a request names a network other than ``'alphabot'`` or
            ``'alphabot_best'``.
    """
    last_report = 0

    while not winner_buffer.full():
        if processable_buffer.qsize() < min(num_threads, 2): # Wait until a bunch of requests are queued
            continue

        # net name -> (worker indices, states), in arrival order.
        batches = {}
        for _ in range(processable_buffer.qsize()):
            index, state, net = processable_buffer.get()
            indices, states = batches.setdefault(net, ([], []))
            indices.append(index)
            states.append(state)

        for net, (indices, states) in batches.items():
            if net == 'alphabot':
                model = alphabot
            elif net == 'alphabot_best':
                model = alphabot_best
            else:
                raise ValueError('Unknown network requested for prediction: %r' % (net,))

            predictions = model.predict(np.array(states, dtype=np.float32))
            # predictions[0] / predictions[1] are the two model outputs, row-aligned with `indices`.
            for index, pred in zip(indices, zip(predictions[0], predictions[1])):
                pipes[index].send(dict(zip(alphabot.output_names, pred)))

        if time.time() - last_report > 30: # Every 30 secs
            last_report = time.time()
            logging.info('Finished evaluation %d games' % winner_buffer.qsize())
        
def simulate_games():
    """Start the self-play workers and answer their prediction requests.

    Launches the workers through ``parallel_sim(evaluation=False)`` and then
    keeps serving batched ``alphabot`` predictions until ``history_buffer`` is
    full. Every request drained from ``processable_buffer`` is answered on
    ``pipes[index]`` with a dict keyed by ``alphabot.output_names``.
    """
    logging.debug('Starting Threads for parallel Games')

    parallel_sim(evaluation=False) # Parallel Games

    while not history_buffer.full():
        # Wait until a bunch of requests are queued
        if processable_buffer.qsize() < min(num_threads, 2):
            continue

        worker_ids = []
        batch = []
        for _ in range(processable_buffer.qsize()):
            worker_id, state, _net = processable_buffer.get()
            worker_ids.append(worker_id)
            batch.append(state)

        predictions = alphabot.predict(np.array(batch, dtype=np.float32))
        per_request = tuple(zip(predictions[0], predictions[1]))
        for position, outputs in enumerate(per_request):
            reply = dict(zip(alphabot.output_names, outputs))
            pipes[worker_ids[position]].send(reply)

    logging.info('Finished Simulating %s games', n_games)

In [11]:
def play_eval(reverted=False, pipe=None, process_id=None):
    """Play one evaluation game between the candidate and the best network.

    Each side gets its own MCTS tree; the searches are given the network name
    ('alphabot' for the candidate, 'alphabot_best' for the current best)
    through ``do_search``'s ``alphabot`` keyword, together with ``pipe``,
    ``process_id`` and the module-level ``ask_predict`` callback.

    Args:
        reverted: When false, player 0 is the candidate and player 1 the best
            network; when true the seats are swapped.
        pipe: Connection handed to ``do_search``.
            NOTE(review): the notebook records an AttributeError on ``recv``
            when this is left as None, so a real pipe appears to be required
            - confirm against do_search.
        process_id: Worker identifier handed to ``do_search``.

    Returns:
        An int: ``turn`` at the moment the game ended when ``reverted`` is
        false, its negation otherwise. training_cycle() maps 0 to
        'candidate' and 1 to 'best'.
    """
    # Declared global but only read indirectly (the nets are referenced by name).
    global alphabot_best
    global alphabot
    
    game = emulator.Game(2)  # presumably a 2-player game - TODO confirm
    mapp = game.reset()
  
    # One search tree per player, both using the evaluation alpha.
    tree_player0 = MCTS()
    tree_player0.alpha = MCTS_eval_alpha
  
    tree_player1 = MCTS()
    tree_player1.alpha = MCTS_eval_alpha

    old_mapp = None
    head = None
    turn = 0
    s = map_to_state(mapp, old_mapp, None, 0)
    old_mapp = copy.deepcopy(mapp)
  
    # Collected for every move but not returned or otherwise used here.
    states = []
    policies = []
    #reverted = np.random.random() > 0.5
    # Seat assignment: which network name plays as player 0 / player 1.
    if reverted:
        player1 = 'alphabot'
        player0 = 'alphabot_best'
    else:
        player0 = 'alphabot'
        player1 = 'alphabot_best'
    
    while True:
        states.append(np.array(s))
        # Search with the tree and network of the player whose turn it is.
        if turn == 0:
            policy = do_search(MCTS_eval_steps, s, mapp, game, tree_player0, pipe=pipe, process_id=process_id, ask_predict=ask_predict, alphabot=player0, allow_move=allow_move)
        else:
            policy = do_search(MCTS_eval_steps, s, mapp, game, tree_player1, pipe=pipe, process_id=process_id, ask_predict=ask_predict, alphabot=player1, allow_move=allow_move)
            
        # Either play the most-visited action or sample from the search policy.
        if not use_eval_choice:
            choosen = np.argmax(policy)
        else :
            choosen = np.random.choice(4, p=policy)

        policies.append(np.array(policy))
        mapp, tmp_head = game.step(mapp, s, choosen, turn, mcts=True)

        turn = 1 - turn
        if turn == 0:  # We update the state
            s = map_to_state(mapp, old_mapp, s, 0, head)  # TODO: Map to state
        else:
            # Player 0 just moved: remember the returned head and set the
            # last state plane to 1 (presumably the "player to move" plane -
            # TODO confirm against map_to_state).
            head = tmp_head
            s[..., -1] = 1

        if turn == 0:
            old_mapp = np.array(mapp)
        
        # Debug-only rendering of the board after the move.
        logging.debug('Turn of %d Policy was %s Took action %s' % (1 - turn, np.round(policy, 2), game.dir_name[choosen])) 
        printable_state = map_to_state(mapp, old_mapp, s, 0, head) 
        printable_mapp = copy.copy(mapp) 
        printable_mapp[np.where(printable_state[..., 1] == 1)] = 2 
        printable_mapp[np.where(printable_state[..., 3] == 1)] = 3 
        logging.debug('\n' + str(printable_mapp).replace('-1', '--')) 
        
        if game.game_ended():
            # `turn` was already flipped, so it names the player who did NOT
            # make the last move; un-swap the seats when reverted.
            if not reverted:
                return int(turn)
            else:
                return  int(not turn)

In [48]:
def train_model():
    """Run one optimisation step on a random mini-batch of ``complete_history``.

    Samples up to ``BATCH_SIZE`` steps without replacement, feeds their states
    (as a float32 array) with ``[policies, values]`` as targets to
    ``alphabot.train_on_batch`` and returns whatever that call returns.
    """
    batch_size = min(BATCH_SIZE, len(complete_history))
    picked_data = random.sample(complete_history, k=batch_size)
    #picked_data = apply_simmetries(picked_data)

    policy, state, value = [], [], []
    for step in picked_data:
        policy.append(step.policy)
        state.append(step.state)
        value.append(step.value)

    # Targets in output order: policy first, then value.
    y = [policy, value]

    logging.debug('The label is %s', y)
    inputs = np.array(state, dtype=np.float32)
    return alphabot.train_on_batch(inputs, y)

In [13]:
def training_cycle():
    """Run one full cycle: self-play, training, and periodic evaluation.

    1. Generate samples with ``simulate_games()`` and move everything from
       ``history_buffer`` into ``complete_history``.
    2. Train for ``t_steps + 1`` mini-batches, logging the mean total loss
       since the last reset every 100 iterations.
    3. Every ``eval_steps`` iterations, play ``eval_games`` evaluation games.
       If the candidate wins at least ``win_percent`` of them it is saved and
       copied into the best network; otherwise the candidate's weights are
       reset from the best network.
    4. Finally drop the ``n_games`` oldest history entries once the history
       holds at least ``k * n_games`` entries.
    """
    global alphabot
    global alphabot_best
    global total_improv

    logging.info('Starting Training Cycle')
    simulate_games()

    # history_buffer contains the games, we store them inside complete history
    for _ in range(history_buffer.qsize()):
        complete_history.append(history_buffer.get())
    stop_simulation() # We can now stop the simulation (will free the memory)

    logging.info('Starting Model Training')
    losses = [0, 0, 0] # For debug purpose
    sum_loss = 0
    n_steps = 0  # training steps accumulated into sum_loss since the last reset
    for i in range(t_steps + 1):
        if i % 100 == 0:
            # Divide by the number of steps actually summed (not steps + 1).
            mean_loss = sum_loss / n_steps if n_steps else 0.0
            logging.info('Training Interaction: %s losses: %s %s', i,
                         round(mean_loss, 2), np.round(losses, 2))

        losses = train_model()
        sum_loss += losses[0]
        n_steps += 1
        logging.debug('Losses: %s', losses)

        if i % eval_steps == 0 and i > 0:
            # Reset the running loss statistics
            n_steps = 0
            sum_loss = 0
            wins = {'candidate' : 0, 'best' : 0}
            n_c = {0 : 'candidate', 1 : 'best'}  # play_eval() result -> winner name

            logging.info('Starting self-play evaluation')
            parallel_sim(evaluation=True) # Start Parallel Games
            manage_predictions()
            # Separate loop variable: must not reuse the training index `i`.
            for _ in range(winner_buffer.qsize()):
                w = winner_buffer.get()
                wins[n_c[w]] += 1 # add a win to the winner
            stop_simulation()

            win_ratio = round(wins['candidate'] / eval_games, 2)
            if win_ratio >= win_percent:
                logging.info('Great! Our candidate won %s percent of games', round(win_ratio * 100))
                total_improv += 1
                logging.info('Our bot got better %s times', total_improv)
                alphabot.save('alphabot_best.pickle')
                replace_best()
            else:
                logging.info('Damn! Our candidate only won %s percent of games', round(win_ratio * 100, 2))
                logging.info('Cloning to best')
                reload_best()  # copies the best weights back into the candidate

    if len(complete_history) >= k * n_games: #* 2:
        logging.info('Removing oldest games')
        del complete_history[:n_games] # Delete n oldest games from history

In [14]:
def load_best(best_model):
    """Load the saved model twice: once as ``alphabot_best``, once as ``alphabot``.

    Both globals are independent model objects restored from the same file,
    with the custom policy loss registered for deserialisation.

    Args:
        best_model: Path of the saved model file.
    """
    global alphabot
    global alphabot_best

    def _restore():
        custom = {'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits}
        return load_model(best_model, custom_objects=custom)

    alphabot_best = _restore()
    alphabot = _restore()

In [15]:
def reload_best():
    """Overwrite the candidate's weights (``alphabot``) with those of ``alphabot_best``."""
    global alphabot
    global alphabot_best

    best_weights = alphabot_best.get_weights()
    alphabot.set_weights(best_weights)

In [16]:
def replace_best():
    """Overwrite the weights of ``alphabot_best`` with the candidate's (``alphabot``)."""
    global alphabot
    global alphabot_best

    candidate_weights = alphabot.get_weights()
    alphabot_best.set_weights(candidate_weights)

In [17]:
def train(cycles):
    """Run ``cycles`` consecutive training cycles.

    The module-level ``complete_history`` is deliberately left untouched, so
    samples gathered by earlier calls keep being used. (A previous local
    ``complete_history = []`` here only created an unused local variable and
    never reset the shared history.)

    Args:
        cycles: Number of times ``training_cycle()`` is invoked.
    """
    global alphabot_best
    global alphabot

    #replace_best()

    for i in range(cycles):
        logging.info('Training cycle: %s', i)
        training_cycle()

In [20]:
def ask_predict(idi, x, net=None):
    """Queue a prediction request on ``processable_buffer``.

    Args:
        idi: Identifier of the requesting process.
        x: Data to run the network on.
        net: Optional name of the network that should answer.
    """
    request = (idi, x, net)
    processable_buffer.put(request)

def sim(process_id, pipe, evaluation=False):
    """Worker loop: play games until the matching result queue is full.

    In evaluation mode each finished game pushes its winner onto
    ``winner_buffer``; otherwise every step returned by ``simulate_game`` is
    pushed onto ``history_buffer``. The loop ends as soon as a non-blocking
    put reports the queue as full (any remaining steps of that game are
    dropped). Unexpected queue errors are logged and also end the loop;
    KeyboardInterrupt/SystemExit are no longer swallowed.

    Args:
        process_id: Identifier of this worker, forwarded to the game helpers.
        pipe: Connection used by the game helpers to receive predictions.
        evaluation: Select evaluation games instead of self-play games.
    """
    import queue  # local import: only needed for the queue.Full exception type

    # Re-seed both generators so each worker gets its own random stream.
    np.random.seed()
    random.seed()

    if evaluation:
        while True:
            reverted = np.random.random() >= 0.5
            winner = play_eval(reverted, pipe, process_id)

            try:
                winner_buffer.put_nowait(winner)
            except queue.Full:
                break
            except Exception:
                logging.exception('Worker %s could not store an evaluation result', process_id)
                break

    else:
        while True:
            train_steps = simulate_game(MCTS_steps, MCTS_alpha, pipe, ask_predict, process_id)

            try:
                for step in train_steps:
                    history_buffer.put_nowait(step)
            except queue.Full:
                break
            except Exception:
                logging.exception('Worker %s could not store training steps', process_id)
                break
                    
def stop_simulation():
    """Terminate the workers and release the pipes and queues they used.

    Terminates every process in ``workers`` and resets it to an empty list,
    closes both ends of every pipe and the three queues, then removes the
    ``pipes``, ``child_pipes``, ``history_buffer``, ``processable_buffer`` and
    ``winner_buffer`` globals.

    Safe to call when nothing is running: resources that do not exist (fresh
    kernel, or a second call in a row) are simply skipped instead of raising
    NameError.
    """
    global workers

    module_globals = globals()

    for worker in module_globals.get('workers') or []:
        worker.terminate()
    workers = []

    # Close and forget the parent ends first, then the child ends.
    for name in ('pipes', 'child_pipes'):
        for pipe in module_globals.pop(name, None) or []:
            pipe.close()

    # Close and forget the queues; dropping the last reference frees them.
    for name in ('history_buffer', 'processable_buffer', 'winner_buffer'):
        buffer = module_globals.pop(name, None)
        if buffer is not None:
            buffer.close()

def parallel_sim(evaluation=False):
    """Create fresh queues/pipes and start ``num_threads`` worker processes.

    Any workers still registered are stopped first. Each worker runs
    ``sim(i, child_pipe, evaluation)`` as a non-daemon process; the parent
    ends of the pipes are kept in ``pipes`` (indexed by worker id) and the
    child ends in ``child_pipes``.

    Args:
        evaluation: Forwarded to ``sim`` to select evaluation games.
    """
    global workers
    global history_buffer
    global processable_buffer
    global winner_buffer
    global pipes
    global child_pipes

    already_running = 'workers' in globals() and len(workers) != 0
    if already_running:
        stop_simulation()

    # This numbers can be tweaked
    history_buffer = Queue(n_games)
    winner_buffer = Queue(eval_games)
    processable_buffer = Queue(num_threads)

    pipes, child_pipes, workers = [], [], []
    for worker_id in range(num_threads):
        # Pipe to communicate with childs
        parent_end, child_end = Pipe()
        pipes.append(parent_end)
        child_pipes.append(child_end)

        process = Thread(target=sim, args=[worker_id, child_end, evaluation])
        process.daemon = False
        process.start()
        workers.append(process)

In [40]:
import tensorflow as tf

def softmax_cross_entropy_with_logits(y_true, y_pred):
    """Policy loss: masked softmax cross-entropy, shifted down by 1 and floored at 0.

    Logits at positions where the target probability is exactly zero are
    replaced by -100 before the softmax cross-entropy is computed. The
    returned loss is ``max(0, cross_entropy - 1)``.

    Args:
        y_true: Target policy (used as the cross-entropy labels).
        y_pred: Raw logits predicted by the network.

    Returns:
        The per-sample loss tensor.
    """
    target = y_true
    logits = y_pred

    # Mask of entries whose target probability is zero.
    zeros = tf.zeros(shape = tf.shape(target), dtype=tf.float32)
    is_zero_target = tf.equal(target, zeros)

    # Push masked logits to a large negative value.
    masked_value = tf.fill(tf.shape(target), -100.0)
    logits = tf.where(is_zero_target, masked_value, logits)

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels = target, logits = logits)
    return tf.maximum(0., cross_entropy - 1)

In [49]:
# Build the candidate network; the second return value (the base model, which
# is the same object unless gpus > 1) is discarded here.
alphabot, _ = create_model()
# SGD with momentum; the output names 'value' and 'policy' select the
# per-head loss, and both heads are weighted equally.
alphabot.compile(optimizer=SGD(1e-4, momentum=0.9),
                          loss={'value' : 'mse', 'policy' : softmax_cross_entropy_with_logits},
                          loss_weights={'value' : 0.5, 'policy' : 0.5})
alphabot.summary(line_length=112)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_3 (InputLayer)                (None, 16, 16, 5)        0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 16, 16, 32)       1440          input_3[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 16, 16, 32)       128           conv[0][0]                           
________________________________________________________________________________________________________________
conv_lkrelu (LeakyReLU)             (None, 16, 16, 32)       0             conv_bn[0][0]        

In [50]:
# Persist the freshly built candidate and load it back as a separate model
# object that serves as the initial "best" network.
# NOTE(review): the '.pickle' name is only a file name; the file is written by
# Keras' Model.save and read by load_model, not by the pickle module.
alphabot.save('alphabot_best.pickle')
# The custom policy loss must be registered for the model to deserialise.
alphabot_best = load_model('alphabot_best.pickle', 
                           custom_objects={'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits})

In [15]:
load_best('alphabot_best.pickle')

ValueError: Unable to create group (no write intent on file)

In [46]:
# History of games for training
complete_history = []

# Game Params
n_players = 2
n_games = 10_000 # Simulate N games before each training
k = 10 # Games to be stored n_games * K

# Eval options
allow_move = True
use_eval_choice = False

# Simulation Params
num_threads = 6

MCTS_steps = 40
MCTS_eval_steps = 30
MCTS_alpha = 0.8
MCTS_eval_alpha = 0.8

# Training Params
t_steps = 3000 # Steps of training
eval_steps = 1000 # How many steps before evaluation
eval_games = 100 # How many games to play to evaluate how's best model
win_percent = 0.55 # Ratio of game won to become best model
BATCH_SIZE = 256
total_improv = 0

In [None]:
#complete_history = []
cycles = 1000

train(cycles)

In [33]:
K.set_value(alphabot.optimizer.lr, 1e-4)

In [64]:
# Debugging aid: train repeatedly on one fixed sample (complete_history[1]) to
# check that the network can fit it. Requires complete_history to be populated
# by an earlier training run; prints one loss list per step.
for i in range(10_000):
    #picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))
    picked_data = complete_history[1:2]
    
    state = []
    policy = []
    value = []
    for step in picked_data:
        policy.append(step.policy)
        state.append(step.state)
        value.append(step.value)

    # The zero placeholders are overwritten immediately by the real targets.
    y = [np.zeros((len(state), 4)), np.zeros((len(state), 1))]
    y[0] = policy
    y[1] = value
    
    losses = alphabot.train_on_batch(np.array(state, dtype=np.float32), y)
    print(losses)

[1.2112255, 0.13448942, 1.0]
[1.1683626, 0.09228718, 0.99933934]
[1.1323993, 0.05758381, 0.99807936]
[1.1333538, 0.060325146, 0.9962925]
[1.1539772, 0.08322132, 0.9940196]
[1.1650805, 0.097022295, 0.9913221]
[1.1548651, 0.089883566, 0.98824555]
[1.131397, 0.069838285, 0.98482263]
[1.1125907, 0.054754257, 0.98110026]
[1.109841, 0.05599785, 0.97710705]
[1.1177636, 0.068155766, 0.97287184]
[1.1214706, 0.076307416, 0.96842706]
[1.1126026, 0.07207155, 0.96379495]
[1.0965642, 0.060823083, 0.95900506]
[1.084106, 0.05330038, 0.9540695]
[1.0799371, 0.054194927, 0.94900596]
[1.0804355, 0.059860468, 0.94383883]
[1.0790234, 0.06371069, 0.93857646]
[1.0723771, 0.06240821, 0.93323296]
[1.0620867, 0.05752349, 0.92782724]
[1.0524385, 0.053340673, 0.92236185]
[1.0465485, 0.05296099, 0.9168514]
[1.0438011, 0.055759907, 0.9113051]
[1.0408003, 0.058337927, 0.9057264]
[1.0349436, 0.058081865, 0.9001258]
[1.026714, 0.055470705, 0.8945074]
[1.0185868, 0.05296719, 0.8888836]
[1.0124128, 0.052424908, 0.8832519

[0.3897641, 0.052271962, 0.26074836]
[0.38854542, 0.052271962, 0.25952965]
[0.38733545, 0.052271962, 0.2583196]
[0.38613325, 0.052271962, 0.25711733]
[0.38493913, 0.052271962, 0.25592312]
[0.38375482, 0.052271962, 0.25473875]
[0.38257703, 0.052271843, 0.25356102]
[0.38140923, 0.052271962, 0.2523931]
[0.3802479, 0.052271962, 0.25123176]
[0.37909412, 0.052271843, 0.25007802]
[0.37794873, 0.052271962, 0.24893244]
[0.37681168, 0.052271843, 0.24779546]
[0.3756824, 0.052271962, 0.24666606]
[0.37456027, 0.052271962, 0.24554385]
[0.37344518, 0.052271962, 0.24442868]
[0.37233838, 0.052271962, 0.2433218]
[0.37123948, 0.052271843, 0.24222295]
[0.37014708, 0.052271843, 0.24113046]
[0.3690635, 0.052271962, 0.24004674]
[0.3679854, 0.052271843, 0.23896879]
[0.36691552, 0.052271962, 0.23789878]
[0.36585388, 0.052271962, 0.23683703]
[0.36479706, 0.052271962, 0.23578015]
[0.3637489, 0.052271843, 0.23473209]
[0.36270592, 0.052271962, 0.23368892]
[0.36167252, 0.052271843, 0.23265558]
[0.36064452, 0.052272

[0.2424588, 0.052271962, 0.11343382]
[0.24212658, 0.052271962, 0.11310155]
[0.2417956, 0.052271843, 0.112770684]
[0.2414664, 0.05227208, 0.11244118]
[0.24113876, 0.052271962, 0.11211363]
[0.2408119, 0.052271843, 0.11178689]
[0.24048714, 0.052271962, 0.11146197]
[0.24016413, 0.052271843, 0.11113904]
[0.23984261, 0.052271962, 0.110817365]
[0.23952183, 0.052271962, 0.11049655]
[0.23920359, 0.052271962, 0.11017831]
[0.23888654, 0.052271962, 0.109861225]
[0.23857008, 0.052271962, 0.109544724]
[0.23825638, 0.052271962, 0.109231]
[0.23794392, 0.052271962, 0.10891848]
[0.23763257, 0.052271962, 0.10860711]
[0.23732287, 0.052271843, 0.10829752]
[0.23701403, 0.052271843, 0.10798865]
[0.23670733, 0.052271962, 0.10768178]
[0.23640168, 0.052271962, 0.10737613]
[0.23609771, 0.052271843, 0.10707224]
[0.23579493, 0.052271962, 0.10676933]
[0.2354934, 0.052271843, 0.106467895]
[0.23519352, 0.052271843, 0.10616797]
[0.23489492, 0.052271962, 0.10586921]
[0.23459762, 0.052271962, 0.10557188]
[0.23430228, 0.

[0.19529085, 0.052271962, 0.066261314]
[0.19515434, 0.052271843, 0.06612492]
[0.19501913, 0.052271962, 0.06598956]
[0.19488367, 0.052271962, 0.065854095]
[0.19474879, 0.052271962, 0.065719195]
[0.1946143, 0.05227208, 0.065584615]
[0.19448048, 0.052271962, 0.06545088]
[0.19434686, 0.052271843, 0.06531736]
[0.19421384, 0.052271843, 0.065184325]
[0.19408146, 0.052271962, 0.06505181]
[0.19394931, 0.052271962, 0.06491968]
[0.1938174, 0.052271843, 0.064787865]
[0.19368616, 0.052271962, 0.06465649]
[0.19355577, 0.052271962, 0.06452606]
[0.19342545, 0.052271962, 0.06439573]
[0.19329533, 0.052271843, 0.06426572]
[0.19316636, 0.052271962, 0.064136654]
[0.19303705, 0.052271962, 0.06400732]
[0.1929086, 0.052271962, 0.06387885]
[0.19278039, 0.052271843, 0.063750744]
[0.19265266, 0.052271843, 0.063623]
[0.19252571, 0.052271962, 0.063495934]
[0.19239867, 0.052271843, 0.063369]
[0.19227211, 0.052271843, 0.06324245]
[0.19214652, 0.052271962, 0.06311669]
[0.19202107, 0.052271962, 0.06299121]
[0.19189598

[0.17291653, 0.052271962, 0.043885287]
[0.17284887, 0.052271843, 0.043817736]
[0.1727817, 0.052271843, 0.04375061]
[0.17271473, 0.052271962, 0.043683488]
[0.17264761, 0.052271843, 0.043616492]
[0.17258112, 0.052271843, 0.043549992]
[0.17251465, 0.052271843, 0.04348352]
[0.17244819, 0.052271962, 0.04341695]
[0.1723821, 0.052271962, 0.043350853]
[0.172316, 0.052271843, 0.04328488]
[0.1722504, 0.052271962, 0.043219157]
[0.17218514, 0.05227208, 0.043153755]
[0.17211919, 0.052271962, 0.04308791]
[0.1720539, 0.052271843, 0.043022804]
[0.17198876, 0.05227208, 0.04295738]
[0.17192407, 0.05227208, 0.042892694]
[0.17185931, 0.052271962, 0.04282806]
[0.1717946, 0.052271962, 0.042763323]
[0.17173012, 0.052271962, 0.04269886]
[0.17166553, 0.052271843, 0.042634394]
[0.17160168, 0.052271962, 0.042570423]
[0.17153774, 0.052271843, 0.042506594]
[0.17147394, 0.052271962, 0.04244267]
[0.17141047, 0.052271962, 0.04237918]
[0.1713469, 0.052271962, 0.042315647]
[0.17128366, 0.05227208, 0.042252254]
[0.17122

[0.16119693, 0.05227208, 0.03216583]
[0.1611573, 0.052271843, 0.032126438]
[0.16111828, 0.052271962, 0.03208733]
[0.16107906, 0.052271962, 0.03204809]
[0.16104023, 0.052271962, 0.032009263]
[0.1610011, 0.052271962, 0.03197016]
[0.16096243, 0.052271962, 0.031931464]
[0.16092336, 0.052271843, 0.031892516]
[0.1608849, 0.052271962, 0.03185397]
[0.16084638, 0.052271962, 0.031815432]
[0.16080782, 0.052271962, 0.03177687]
[0.16076922, 0.052271843, 0.03173842]
[0.160731, 0.052271962, 0.03170008]
[0.16069278, 0.052271962, 0.031661883]
[0.16065457, 0.05227208, 0.031623546]
[0.16061625, 0.052271843, 0.031585462]
[0.16057836, 0.052271843, 0.03154759]
[0.16054031, 0.052271843, 0.03150951]
[0.16050237, 0.052271843, 0.031471603]
[0.16046469, 0.052271843, 0.03143393]
[0.160427, 0.052271962, 0.03139611]
[0.16038944, 0.052271962, 0.031358562]
[0.16035204, 0.05227208, 0.03132104]
[0.16031441, 0.052271962, 0.03128352]
[0.16027689, 0.052271962, 0.031246021]
[0.16023962, 0.052271843, 0.031208882]
[0.1602021

[0.15396063, 0.052271962, 0.024931198]
[0.1539355, 0.052271962, 0.024906076]
[0.15391032, 0.052271843, 0.024881022]
[0.15388551, 0.052271962, 0.024856115]
[0.15386043, 0.052271962, 0.02483105]
[0.15383546, 0.052271843, 0.024806203]
[0.15381052, 0.052271843, 0.024781257]
[0.15378581, 0.052271843, 0.024756547]
[0.15376106, 0.052271843, 0.024731813]
[0.15373649, 0.052271962, 0.024707148]
[0.15371163, 0.052271962, 0.024682308]
[0.15368702, 0.052271962, 0.024657706]
[0.1536625, 0.052271962, 0.02463319]
[0.15363793, 0.052271962, 0.02460861]
[0.15361321, 0.052271843, 0.024584025]
[0.15358898, 0.052271962, 0.024559695]
[0.15356448, 0.052271962, 0.024535209]
[0.15354, 0.052271962, 0.024510717]
[0.15351582, 0.052271962, 0.024486555]
[0.15349159, 0.05227208, 0.024462217]
[0.15346713, 0.052271962, 0.024437891]
[0.15344314, 0.052271962, 0.024413913]
[0.15341885, 0.052271962, 0.02438963]
[0.15339456, 0.052271843, 0.024365472]
[0.15337065, 0.052271962, 0.024341417]
[0.15334669, 0.05227208, 0.02431737

[0.14927885, 0.052271962, 0.02025182]
[0.1492616, 0.052271962, 0.020234536]
[0.14924432, 0.05227208, 0.02021721]
[0.149227, 0.052271962, 0.020199994]
[0.14920953, 0.052271843, 0.020182664]
[0.14919245, 0.052271962, 0.020165479]
[0.14917526, 0.052271843, 0.020148419]
[0.14915799, 0.052271962, 0.020131027]
[0.14914086, 0.052271962, 0.02011393]
[0.14912374, 0.052271962, 0.020096809]
[0.14910649, 0.052271843, 0.020079693]
[0.14908941, 0.052271962, 0.020062534]
[0.14907257, 0.05227208, 0.020045567]
[0.14905545, 0.052271962, 0.020028591]
[0.14903829, 0.052271962, 0.020011438]
[0.14902136, 0.052271962, 0.019994494]
[0.14900449, 0.052271962, 0.019977657]
[0.14898732, 0.052271843, 0.019960627]
[0.14897054, 0.052271962, 0.01994372]
[0.14895375, 0.052271962, 0.019926956]
[0.14893669, 0.052271843, 0.019910032]
[0.14892003, 0.052271962, 0.019893248]
[0.14890336, 0.05227208, 0.01987647]
[0.14888632, 0.052271843, 0.019859668]
[0.14886957, 0.052271843, 0.01984294]
[0.14885288, 0.052271962, 0.01982615]

[0.14587228, 0.052271962, 0.01684837]
[0.1458597, 0.052271962, 0.016835809]
[0.14584732, 0.052271962, 0.016823437]
[0.14583476, 0.052271962, 0.016810883]
[0.14582235, 0.052271962, 0.016798506]
[0.14580993, 0.052271962, 0.016786117]
[0.1457975, 0.052271962, 0.0167737]
[0.14578505, 0.052271843, 0.016761351]
[0.14577271, 0.052271962, 0.016748914]
[0.14576033, 0.052271962, 0.01673656]
[0.14574802, 0.052271962, 0.016724255]
[0.14573568, 0.052271962, 0.01671194]
[0.14572342, 0.052271962, 0.01669969]
[0.14571102, 0.052271962, 0.01668729]
[0.14569882, 0.052271962, 0.016675098]
[0.14568642, 0.052271843, 0.016662845]
[0.14567415, 0.052271843, 0.016650598]
[0.14566208, 0.052271962, 0.016638434]
[0.14564976, 0.052271962, 0.01662612]
[0.1456376, 0.052271962, 0.016613979]
[0.1456254, 0.052271962, 0.016601795]
[0.14561313, 0.052271962, 0.016589541]
[0.14560108, 0.052271962, 0.016577521]
[0.14558874, 0.052271843, 0.016565306]
[0.1455768, 0.052271962, 0.016553247]
[0.1455647, 0.052271962, 0.01654115]
[

[0.14338791, 0.052271962, 0.014367604]
[0.14337847, 0.052271962, 0.014358203]
[0.14336905, 0.052271962, 0.014348777]
[0.14335978, 0.052271962, 0.01433954]
[0.1433503, 0.052271962, 0.014330091]
[0.14334093, 0.052271962, 0.014320717]
[0.14333163, 0.052271962, 0.014311446]
[0.1433222, 0.052271843, 0.014302149]
[0.14331312, 0.05227208, 0.014292842]
[0.14330357, 0.052271843, 0.014283551]
[0.14329448, 0.05227208, 0.014274249]
[0.14328507, 0.052271962, 0.014264936]
[0.1432758, 0.052271962, 0.014255711]
[0.1432666, 0.052271962, 0.014246518]
[0.14325735, 0.052271962, 0.014237271]
[0.14324802, 0.052271962, 0.014227984]
[0.14323874, 0.052271843, 0.014218828]
[0.14322963, 0.052271962, 0.014209604]
[0.14322045, 0.052271962, 0.014200455]
[0.14321122, 0.052271962, 0.014191222]
[0.14320187, 0.052271843, 0.014182021]
[0.14319271, 0.052271843, 0.01417288]
[0.14318366, 0.052271962, 0.014163714]
[0.14317456, 0.052271962, 0.014154635]
[0.14316535, 0.052271962, 0.014145461]
[0.1431562, 0.052271962, 0.014136

[0.14147627, 0.052271843, 0.012460131]
[0.14146923, 0.052271962, 0.012452986]
[0.14146186, 0.052271962, 0.0124456445]
[0.14145459, 0.052271962, 0.012438384]
[0.14144741, 0.052271962, 0.012431219]
[0.14143997, 0.052271843, 0.012423923]
[0.14143287, 0.052271962, 0.012416735]
[0.14142567, 0.052271962, 0.012409563]
[0.1414185, 0.052271962, 0.012402394]
[0.14141111, 0.052271843, 0.012395146]
[0.14140405, 0.052271962, 0.01238798]
[0.14139688, 0.052271962, 0.012380816]
[0.14138976, 0.05227208, 0.012373628]
[0.1413823, 0.052271843, 0.012366429]
[0.14137512, 0.052271843, 0.012359271]
[0.14136817, 0.052271962, 0.012352196]
[0.14136082, 0.052271843, 0.012344989]
[0.14135379, 0.052271962, 0.01233785]
[0.1413466, 0.052271962, 0.012330702]
[0.14133956, 0.052271962, 0.01232366]
[0.14133237, 0.052271962, 0.012316489]
[0.14132518, 0.052271843, 0.012309425]
[0.14131826, 0.052271962, 0.012302402]
[0.14131121, 0.05227208, 0.01229529]
[0.14130391, 0.052271962, 0.012288114]
[0.14129683, 0.052271962, 0.01228

[0.13996051, 0.052271962, 0.010948695]
[0.13995475, 0.052271962, 0.010942933]
[0.13994889, 0.052271962, 0.0109371105]
[0.13994321, 0.052271962, 0.010931464]
[0.13993758, 0.05227208, 0.010925719]
[0.13993171, 0.052271962, 0.01092]
[0.13992602, 0.052271962, 0.010914332]
[0.13992028, 0.052271962, 0.010908605]
[0.13991438, 0.052271843, 0.010902841]
[0.13990885, 0.052271962, 0.010897215]
[0.139903, 0.052271843, 0.010891504]
[0.13989739, 0.052271962, 0.010885795]
[0.13989174, 0.052271962, 0.010880161]
[0.1398859, 0.052271843, 0.010874467]
[0.13988024, 0.052271843, 0.010868799]
[0.13987458, 0.052271962, 0.010863057]
[0.13986896, 0.052271962, 0.0108574545]
[0.13986324, 0.052271962, 0.010851767]
[0.13985753, 0.052271843, 0.010846179]
[0.13985196, 0.052271962, 0.010840506]
[0.13984625, 0.052271962, 0.010834835]
[0.13984051, 0.052271843, 0.010829214]
[0.139835, 0.052271962, 0.010823633]
[0.13982934, 0.052271962, 0.010817978]
[0.13982359, 0.052271843, 0.01081235]
[0.13981794, 0.052271843, 0.010806

[0.1387983, 0.052271962, 0.009790955]
[0.1387935, 0.052271843, 0.009786296]
[0.13878876, 0.052271843, 0.0097815795]
[0.13878438, 0.05227208, 0.00977697]
[0.13877952, 0.052271962, 0.009772267]
[0.13877477, 0.052271962, 0.009767519]
[0.13876998, 0.052271843, 0.00976289]
[0.13876541, 0.052271962, 0.009758226]
[0.13876086, 0.05227208, 0.0097535625]
[0.13875605, 0.052271962, 0.009748889]
[0.13875142, 0.052271962, 0.0097443]
[0.13874677, 0.052271962, 0.009739663]
[0.13874203, 0.052271962, 0.009734946]
[0.13873729, 0.052271843, 0.0097303605]
[0.13873275, 0.052271962, 0.009725704]
[0.13872802, 0.052271843, 0.00972112]
[0.13872355, 0.05227208, 0.009716442]
[0.13871863, 0.052271843, 0.009711778]
[0.13871416, 0.052271962, 0.00970722]
[0.13870949, 0.052271962, 0.009702558]
[0.13870493, 0.052271962, 0.0096979905]
[0.13870025, 0.052271962, 0.009693366]
[0.13869554, 0.052271843, 0.009688777]
[0.13869086, 0.052271843, 0.009684143]
[0.13868637, 0.052271962, 0.009679557]
[0.1386818, 0.052271962, 0.00967

KeyboardInterrupt: 

In [65]:
# Compare the network output with the stored targets for the sample used in
# the overfitting loop above.
step = complete_history[1]
state = step.state
value = step.value
pol = step.policy

# Add a batch axis so the single state can be fed to predict().
p = alphabot.predict(state[np.newaxis])
# Shown: raw policy logits, their softmax, target policy, target value, predicted value.
# NOTE(review): `softmax` is not defined in this notebook; presumably it comes
# from the wildcard `mcts` import - confirm.
p[0][0], softmax(p[0][0]), pol, value, p[1]

(array([-0.3981203 ,  0.6302162 ,  1.8937004 ,  0.49141195], dtype=float32),
 array([[0.06202191, 0.17343868, 0.6135786 , 0.15096076]], dtype=float32),
 array([0.05882353, 0.17647059, 0.61764706, 0.14705882]),
 -1,
 array([[-0.9343863]], dtype=float32))

In [89]:
# Play 10 evaluation games in the notebook process: 5 with the default seats
# and 5 with the seats swapped, counting wins per side.
# NOTE(review): play_eval() is called without a pipe here, and the recorded
# output of this cell is "AttributeError: 'NoneType' object has no attribute
# 'recv'" - the cell does not run as written.
bot, best = 0, 0
for i in range(10):
    if i < 5:
        winner = play_eval()
    else:
        winner = play_eval(True)
        
    # play_eval() returns 0 for the candidate and 1 for the best network.
    if winner == 0:
        bot += 1
    else:
        best += 1

bot, best

AttributeError: 'NoneType' object has no attribute 'recv'

In [27]:
# Play 100 in-process games with simulate_game() and count how often the
# returned winner is 0 versus anything else.
# NOTE(review): this call passes six arguments and unpacks two return values,
# while sim() calls simulate_game() with five arguments and iterates over the
# result - it appears to target a different version of the helper; confirm.
bot, best = 0, 0
for i in range(100):
    winner, _ = simulate_game(10, 0.8, None, None, None, alphabot)
    if winner == 0:
        bot += 1
    else:
        best += 1

bot, best

(48, 52)

In [43]:
logging.getLogger().setLevel(logging.INFO)

In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
complete_history[506].state[..., 2], complete_history[503].policy

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 array([0.33333333, 0.33333333, 0.        , 0.33333333]))

In [53]:
np.array([s.state for s in complete_history[506:1000:4]])[:, :, ::-1][0, ..., 2]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])