In [1]:
import tensorflow as tf
import pickle
import keras
from keras.models import Model, load_model, clone_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.losses import binary_crossentropy
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
sys.path.append('src')  # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
from mcts import *
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


# Logging

In [2]:
# Send log records (INFO and above) to 'logging.log', prefixed with a timestamp.
# filemode='w' opens the file in write mode, so a previous log is overwritten.
logging.basicConfig(filename='logging.log', level=logging.INFO, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [3]:
# Shape handed to Input() in create_model. The last axis is the channel axis
# (the conv blocks normalise over axis=3). Map size fixed to 16x16 (2 to 3 players).
INPUT_SIZE = (16, 16, 5)
# Width of the policy head output (one logit per action).
N_ACTIONS = 4
# create_model wraps the network with multi_gpu_model only when this is > 1.
gpus = 1

# Define the Layer Blocks

In [5]:
filters = 64  # default number of convolution filters used by the blocks below


def conv_block(in_layer, name, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Build a Conv2D stage, optionally followed by BatchNormalization and LeakyReLU.

    Args:
        in_layer: tensor the convolution is applied to.
        name: name of the Conv2D layer; the optional layers are named
            ``name + '_bn'`` and ``name + '_lkrelu'``.
        filters: number of convolution filters.
        kernel_size: convolution kernel size.
        bn: when true, add BatchNormalization over axis 3.
        relu: when true, add a LeakyReLU activation.

    Returns:
        The output tensor of the last layer that was added.
    """
    convolution = Conv2D(
        filters,
        kernel_size,
        use_bias=False,  # no bias term on the convolution
        padding='same',
        name=name,
        kernel_regularizer=l2(1e-4),
    )
    out = convolution(in_layer)

    if bn:
        out = BatchNormalization(axis=3, name=name + '_bn')(out)
    if relu:
        out = LeakyReLU(name=name + '_lkrelu')(out)
    return out

# Residual Block
def residual_conv(in_layer, idx, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Build one two-convolution block whose output is concatenated with its input.

    Args:
        in_layer: tensor entering the block.
        idx: block number; every layer name starts with ``'res_<idx>'``.
        filters: number of filters for both convolutions.
        kernel_size: kernel size of the SECOND convolution only.
        bn: when true, batch-normalise the second convolution.
        relu: when true, finish with a LeakyReLU.

    Returns:
        The output tensor of the block.

    Note:
        The skip connection is a ``Concatenate`` (not an ``Add``), so the
        block's output carries the input's channels plus ``filters`` new ones.
    """
    prefix = 'res_' + str(idx)

    # First stage: always a full 3x3 conv + BN + LeakyReLU, regardless of the
    # kernel_size / bn / relu arguments given to this function.
    branch = conv_block(in_layer, prefix + '_conv1', filters, kernel_size=(3,3), bn=True, relu=True)

    # Second stage: bare convolution, optionally batch-normalised.
    second_conv = Conv2D(filters, kernel_size, use_bias=False, padding='same',
                         name=prefix + '_conv2', kernel_regularizer=l2(1e-4))
    branch = second_conv(branch)
    if bn:
        branch = BatchNormalization(axis=3, name=prefix + '_conv2_bn')(branch)

    # Skip connection.
    merged = Concatenate()([in_layer, branch])

    if not relu:
        return merged
    return LeakyReLU(name=prefix + '_lkrelu')(merged)

def value_head(in_layer):
    """Build the value head: 1x1 conv -> flatten -> Dense(64) -> LeakyReLU -> BN -> Dense(1, tanh).

    Args:
        in_layer: tensor produced by the shared trunk.

    Returns:
        The output tensor of the layer named ``'value'`` (a single tanh unit).
    """
    squeezed = conv_block(in_layer, 'value_head', filters=1, kernel_size=(1,1))
    flat = Flatten(name='value_flatten')(squeezed)

    hidden = Dense(64, kernel_regularizer=l2(1e-4), name='value_dense')(flat)
    hidden = LeakyReLU(name='value_lkrelu')(hidden)
    hidden = BatchNormalization(axis=1, name='value_bn')(hidden)

    # Value output
    value_layer = Dense(1, use_bias=False, name='value', kernel_regularizer=l2(1e-4),
                        activation='tanh')
    return value_layer(hidden)

def policy_head(in_layer):
    """Build the policy head: 1x1 conv (2 filters) -> flatten -> Dense(N_ACTIONS, linear).

    Args:
        in_layer: tensor produced by the shared trunk.

    Returns:
        The output tensor of the layer named ``'policy'``. The activation is
        linear, so the values are unnormalised scores (logits).
    """
    reduced = conv_block(in_layer, 'policy_head', filters=2, kernel_size=(1,1))
    flat = Flatten(name='policy_flatten')(reduced)

    # Policy output
    logits_layer = Dense(N_ACTIONS, name='policy', use_bias=False,
                         kernel_regularizer=l2(1e-4), activation='linear')
    return logits_layer(flat)

# Model

In [6]:
def create_model():
    """Build the two-headed (policy, value) network.

    Returns:
        A pair ``(model_to_train, template_model)``. With ``gpus <= 1`` both
        entries are the same model. With ``gpus > 1`` the template is built
        under ``/cpu:0`` and the first entry is its ``multi_gpu_model`` wrapper.
    """
    def declare_model():
        # Trunk: one conv block followed by n_residual residual blocks.
        n_residual = 4

        board = Input(INPUT_SIZE)
        trunk = conv_block(board, 'conv')
        for block_number in range(1, n_residual + 1):
            trunk = residual_conv(trunk, idx=block_number)

        # Heads are built policy first, then value (this is also the output order).
        policy = policy_head(trunk)
        value = value_head(trunk)
        return Model(board, [policy, value])

    if gpus <= 1:
        single = declare_model()
        return single, single

    with tf.device('/cpu:0'):
        template = declare_model()
    parallel = multi_gpu_model(template, gpus=gpus)
    return parallel, template

In [7]:
def manage_predictions():
    """Serve network predictions to the evaluation workers until enough games finished.

    Loops until ``winner_buffer`` is full. Each round drains the queued
    ``(worker_index, state, net)`` requests from ``processable_buffer``, runs
    one batched ``predict`` per requested network and sends every worker its
    own ``{output_name: prediction}`` dict through ``pipes[worker_index]``.

    Requests are grouped by ``net`` so that a batch mixing 'alphabot' and
    'alphabot_best' requests is never answered by a single network.

    Raises:
        ValueError: if a request names a network other than 'alphabot' or
            'alphabot_best'.
    """
    last_report = 0

    while not winner_buffer.full():
        if processable_buffer.qsize() < min(num_threads, 2): # Wait until a bunch of requests are queued
            continue

        # Collect the pending requests, one (indices, states) bucket per network.
        pending = {}
        for _ in range(processable_buffer.qsize()):
            index, state, net = processable_buffer.get()
            indices, states = pending.setdefault(net, ([], []))
            indices.append(index)
            states.append(state)

        for net, (indices, states) in pending.items():
            if net == 'alphabot':
                model = alphabot
            elif net == 'alphabot_best':
                model = alphabot_best
            else:
                raise ValueError('Unknown network requested: %r' % (net,))

            predictions = model.predict(np.array(states, dtype=np.float32))
            # predictions[0] / predictions[1] are the two model outputs; pair them
            # up per sample and route each pair back to the worker that asked.
            for index, pred in zip(indices, zip(predictions[0], predictions[1])):
                pipes[index].send(dict(zip(alphabot.output_names, pred)))

        if time.time() - last_report > 30: # Every 30 secs
            last_report = time.time()
            logging.info('Finished evaluation %d games' % winner_buffer.qsize())
        
def simulate_games():
    """Run self-play workers and answer their prediction requests until the history is full.

    Starts the workers with ``parallel_sim(evaluation=False)`` and then loops
    until ``history_buffer`` is full: queued ``(worker_index, state, _)``
    requests are batched, predicted with ``alphabot`` and each result is sent
    back through ``pipes[worker_index]`` as ``{output_name: prediction}``.

    Note:
        The workers enqueue individual training steps, so ``history_buffer``
        (capacity ``n_games``) fills up with steps rather than whole games.
    """
    logging.debug('Starting Threads for parallel Games')

    parallel_sim(evaluation=False) # Parallel Games

    while not history_buffer.full():
        # Wait until a bunch of requests are queued
        if processable_buffer.qsize() < 2:
            continue

        worker_ids = []
        batch = []
        for _ in range(processable_buffer.qsize()):
            worker_id, state, _net = processable_buffer.get()
            worker_ids.append(worker_id)
            batch.append(state)

        predictions = alphabot.predict(np.array(batch, dtype=np.float32))
        per_sample = zip(predictions[0], predictions[1])
        for worker_id, pred in zip(worker_ids, per_sample):
            answer = dict(zip(alphabot.output_names, pred))
            pipes[worker_id].send(answer)

    logging.info('Finished Simulating %s games', n_games)

In [8]:
def play_eval(reverted=False, pipe=None, process_id=None):
    """Play one two-player evaluation game between 'alphabot' and 'alphabot_best'.

    Each side gets its own MCTS tree (alpha = MCTS_eval_alpha) and searches for
    MCTS_eval_steps steps per move via do_search. The move is the argmax of the
    returned policy, or a sample from it when use_eval_choice is true.

    Args:
        reverted: when False, side 0 is 'alphabot' and side 1 is
            'alphabot_best'; when True the two are swapped.
        pipe: forwarded to do_search (the recorded notebook run with
            pipe=None failed inside the search with
            "'NoneType' object has no attribute 'recv'").
        process_id: forwarded to do_search.

    Returns:
        int: the value of `turn` after the final step (i.e. the side that
        would move next) when `reverted` is False, and its negation when
        `reverted` is True. training_cycle reads 0 as a win for the candidate
        and 1 as a win for the best model.
        NOTE(review): this presumably means the side that made the last move
        lost - confirm against emulator.Game.

    Reads the globals MCTS_eval_alpha, MCTS_eval_steps, use_eval_choice,
    allow_move and ask_predict. `copy` is not imported in the visible import
    cell - presumably it comes from `from mcts import *`; verify.
    """
    # The global declarations are not required: neither name is assigned here.
    global alphabot_best
    global alphabot
    
    game = emulator.Game(2)
    mapp = game.reset()
  
    # One independent search tree per side.
    tree_player0 = MCTS()
    tree_player0.alpha = MCTS_eval_alpha
  
    tree_player1 = MCTS()
    tree_player1.alpha = MCTS_eval_alpha

    old_mapp = None
    turn = 0
    s = map_to_state(mapp, old_mapp, None, 0)
    old_mapp = copy.deepcopy(mapp)
  
    # states / policies are filled during the game but never read or returned.
    states = []
    policies = []
    #reverted = np.random.random() > 0.5
    # Decide which network name is sent with each side's prediction requests.
    if reverted:
        player1 = 'alphabot'
        player0 = 'alphabot_best'
    else:
        player0 = 'alphabot'
        player1 = 'alphabot_best'
    
    while True:
        states.append(np.array(s))
        # Search with the tree and network belonging to the side to move.
        if turn == 0:
            policy = do_search(MCTS_eval_steps, s, mapp, game, tree_player0, pipe=pipe, process_id=process_id, ask_predict=ask_predict, alphabot=player0, allow_move=allow_move)
        else:
            policy = do_search(MCTS_eval_steps, s, mapp, game, tree_player1, pipe=pipe, process_id=process_id, ask_predict=ask_predict, alphabot=player1, allow_move=allow_move)
            
        # Greedy move by default; sample from the policy when use_eval_choice is set.
        if not use_eval_choice:
            choosen = np.argmax(policy)
        else :
            choosen = np.random.choice(4, p=policy)

        policies.append(np.array(policy))
        mapp = game.step(mapp, s, choosen, turn)

        turn = 1 - turn
        if turn == 0:  # We update the state
            s = map_to_state(mapp, old_mapp, s, 0)  # TODO: Map to state
        else:
            # Only the last channel is changed for side 1's move; the state is
            # rebuilt from the map once both sides have moved.
            s[..., -1] = 1

        if turn == 0:
            old_mapp = np.array(mapp)
        
        # Debug rendering. The map_to_state call and the copy below run on every
        # turn, even when DEBUG logging is disabled.
        logging.debug('Turn of %d Policy was %s Took action %s' % (1 - turn, np.round(policy, 2), game.dir_name[choosen])) 
        printable_state = map_to_state(mapp, old_mapp, s, 0) 
        printable_mapp = copy.copy(mapp) 
        printable_mapp[np.where(printable_state[..., 1] == 1)] = 2 
        printable_mapp[np.where(printable_state[..., 3] == 1)] = 3 
        logging.debug('\n' + str(printable_mapp).replace('-1', '--')) 
        
        if game.game_ended():
            # Translate the side index into 0 = 'alphabot', 1 = 'alphabot_best'.
            if not reverted:
                return int(turn)
            else:
                return  int(not turn)

In [9]:
def train_model():
    """Run one optimisation step on a random, symmetry-augmented mini-batch.

    Samples up to ``BATCH_SIZE`` steps from ``complete_history``, passes them
    through ``apply_simmetries`` and trains ``alphabot`` on the result.

    Returns:
        Whatever ``alphabot.train_on_batch`` returns (the batch losses).
    """
    picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))
    picked_data = apply_simmetries(picked_data)

    # Build explicit float32 arrays instead of handing raw Python lists to Keras.
    states = np.array([step.state for step in picked_data], dtype=np.float32)
    policies = np.array([step.policy for step in picked_data], dtype=np.float32)
    # One column per sample, matching the single-unit 'value' output.
    values = np.array([step.value for step in picked_data], dtype=np.float32).reshape(-1, 1)

    # Label order follows the model outputs: policy first, then value.
    y = [policies, values]

    logging.debug('The label is %s', y)
    return alphabot.train_on_batch(states, y)

In [10]:
def training_cycle():
    """Run one self-play -> train -> evaluate cycle.

    1. ``simulate_games`` fills ``history_buffer``; its content is moved into
       ``complete_history`` and the workers are stopped.
    2. ``train_model`` is called ``t_steps + 1`` times.
    3. Every ``eval_steps`` steps the candidate plays evaluation games against
       the best model. If it wins at least ``win_percent`` of the games that
       were actually collected it becomes the new best (and is saved),
       otherwise its weights are reset to the best model's.
    4. Once the history holds ``k * n_games`` entries the oldest ``n_games``
       are dropped.
    """
    global alphabot
    global alphabot_best
    global total_improv

    logging.info('Starting Training Cycle')
    simulate_games()

    # history_buffer contains the games, we store them inside complete history
    for _ in range(history_buffer.qsize()):
        complete_history.append(history_buffer.get())
    stop_simulation() # We can now stop the simulation (will free the memory)

    logging.info('Starting Model Training')
    losses = [0, 0, 0] # For debug purpose
    sum_loss = 0
    n_losses = 0 # Number of losses accumulated in sum_loss
    for i in range(t_steps + 1):
        if i % 100 == 0:
            mean_loss = sum_loss / n_losses if n_losses else 0.0
            logging.info('Training Interaction: %s losses: %s %s', i,
                         round(mean_loss, 2), np.round(losses, 2))

        losses = train_model()
        sum_loss += losses[0]
        n_losses += 1
        logging.debug('Losses: %s', losses)

        if i % eval_steps == 0 and i > 0:
            # Reset the running loss average
            n_losses = 0
            sum_loss = 0
            wins = {'candidate' : 0, 'best' : 0}
            n_c = {0 : 'candidate', 1 : 'best'}

            logging.info('Starting self-play evaluation')
            parallel_sim(evaluation=True) # Start Parallel Games
            manage_predictions()
            # Separate loop variable: do not reuse the training-step counter.
            for _ in range(winner_buffer.qsize()):
                w = winner_buffer.get()
                wins[n_c[w]] += 1 # add a win to the winner
            stop_simulation()

            # Use the number of results actually drained; qsize() is only
            # approximate, so it can differ from eval_games.
            played = wins['candidate'] + wins['best']
            win_ratio = round(wins['candidate'] / played, 2) if played else 0.0
            if win_ratio >= win_percent:
                logging.info('Great! Our candidate won %s percent of games', round(win_ratio * 100))
                total_improv += 1
                logging.info('Our bot got better %s times', total_improv)
                alphabot.save('alphabot_best.pickle')
                replace_best()
            else:
                logging.info('Damn! Our candidate only won %s percent of games', round(win_ratio * 100, 2))
                logging.info('Cloning to best')
                reload_best()

    if len(complete_history) >= k * n_games: #* 2:
        logging.info('Removing oldest games')
        del complete_history[:n_games] # Delete n oldest games from history

In [11]:
def load_best(best_model):
    """Load a saved model file into both ``alphabot_best`` and ``alphabot``.

    The file is read twice, so the two globals end up as independent model
    objects built from the same file.

    Args:
        best_model: path handed to ``load_model``.
    """
    global alphabot
    global alphabot_best

    def _load():
        # The custom policy loss has to be registered by name for load_model.
        return load_model(
            best_model,
            custom_objects={'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits},
        )

    alphabot_best = _load()
    alphabot = _load()

In [12]:
def reload_best():
    """Overwrite the candidate's weights (``alphabot``) with those of ``alphabot_best``."""
    best_weights = alphabot_best.get_weights()
    alphabot.set_weights(best_weights)

In [13]:
def replace_best():
    """Promote the candidate: copy the weights of ``alphabot`` into ``alphabot_best``."""
    candidate_weights = alphabot.get_weights()
    alphabot_best.set_weights(candidate_weights)

In [14]:
def train(cycles):
    """Run ``cycles`` consecutive training cycles.

    The module-level ``complete_history`` is deliberately left untouched:
    games collected by earlier cycles (or earlier calls) stay available for
    training. Clear that list yourself before calling if a fresh start is
    wanted.

    Args:
        cycles: number of times ``training_cycle`` is run.
    """
    #replace_best()

    for i in range(cycles):
        logging.info('Training cycle: %s', i)
        training_cycle()

In [15]:
def ask_predict(idi, x, net=None):
    """Queue a prediction request on ``processable_buffer``.

    Args:
        idi: id of the requesting process.
        x: data to run the prediction on.
        net: optional name of the network that should answer.
    """
    request = (idi, x, net)
    processable_buffer.put(request)

def sim(process_id, pipe, evaluation=False):
    """Worker entry point: keep playing games until the result queue is full.

    Args:
        process_id: id of this worker, forwarded to the game functions.
        pipe: this worker's end of the pipe, forwarded to the game functions.
        evaluation: when true, play evaluation games with ``play_eval`` and put
            each winner on ``winner_buffer``; otherwise play self-play games
            with ``simulate_game`` and put every returned step on
            ``history_buffer``.

    The loop ends as soon as the target queue reports ``queue.Full``. Any other
    exception now propagates instead of being silently swallowed.
    """
    import queue  # local import: only the Full exception type is needed

    # Re-seed both generators in every worker.
    np.random.seed()
    random.seed()

    if evaluation:
        while True:
            # Pick at random which side each network plays.
            reverted = np.random.random() >= 0.5
            winner = play_eval(reverted, pipe, process_id)

            try:
                winner_buffer.put_nowait(winner)
            except queue.Full:
                break

    else:
        while True:
            train_steps = simulate_game(MCTS_steps, MCTS_alpha, pipe, ask_predict, process_id)

            try:
                for step in train_steps:
                    history_buffer.put_nowait(step)
            except queue.Full:
                break
                    
def stop_simulation():
    """Stop all worker processes and release the pipes and queues.

    Terminates and joins every process in ``workers``, closes both ends of
    every pipe and closes the three queues. Afterwards ``workers`` is an empty
    list and the globals ``pipes``, ``child_pipes``, ``history_buffer``,
    ``processable_buffer`` and ``winner_buffer`` are removed.

    Safe to call when nothing is running (before the first ``parallel_sim``
    or twice in a row): globals that do not exist are simply skipped.
    """
    global workers

    running = globals().get('workers', [])
    for worker in running:
        worker.terminate()
    # Reap the terminated processes so they do not linger as zombies.
    for worker in running:
        worker.join()
    workers = []

    # Close and forget both pipe lists.
    for name in ('pipes', 'child_pipes'):
        for pipe in globals().pop(name, []):
            pipe.close()

    # Close and forget the queues; dropping the globals frees their content.
    for name in ('history_buffer', 'processable_buffer', 'winner_buffer'):
        buffer = globals().pop(name, None)
        if buffer is not None:
            buffer.close()

def parallel_sim(evaluation=False):
    """Create fresh queues and pipes and start ``num_threads`` worker processes.

    Any workers still registered are stopped first. Every worker runs ``sim``
    with its index, the child end of its own pipe and the ``evaluation`` flag.

    Args:
        evaluation: forwarded to ``sim``.

    Side effects:
        Rebinds the globals ``workers``, ``history_buffer``, ``winner_buffer``,
        ``processable_buffer``, ``pipes`` and ``child_pipes``.
    """
    global workers, history_buffer, processable_buffer, winner_buffer, pipes, child_pipes

    if 'workers' in globals() and len(workers) != 0:
        stop_simulation()

    # This numbers can be tweaked
    history_buffer = Queue(n_games)
    winner_buffer = Queue(eval_games)
    processable_buffer = Queue(num_threads)

    pipes = []
    child_pipes = []
    workers = []

    for worker_index in range(num_threads):
        # Pipe to communicate with childs
        parent_end, child_end = Pipe()
        pipes.append(parent_end)
        child_pipes.append(child_end)

        process = Thread(target=sim, args=[worker_index, child_end, evaluation])
        process.daemon = False
        process.start()
        workers.append(process)

In [16]:
import tensorflow as tf

def softmax_cross_entropy_with_logits(y_true, y_pred):
    """Policy loss: softmax cross-entropy on logits, shifted down by 1 and clamped at 0.

    Args:
        y_true: target policy, used as the labels.
        y_pred: network output, used as the logits.

    Returns:
        ``max(0, cross_entropy - 1)`` per sample. A sample whose cross-entropy
        is already at or below 1 therefore contributes a loss of exactly 0.
    """
    # An earlier variant (disabled) replaced the logits of zero-probability
    # targets with -100 before computing the cross-entropy.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels = y_true, logits = y_pred)
    shifted = cross_entropy - 1
    return tf.maximum(0., shifted)

In [56]:
# Build the candidate network. With gpus == 1 create_model returns the same
# model twice, so the second value is ignored.
alphabot, _ = create_model()
# The 'value' output is trained with MSE and the 'policy' output with the custom
# logits cross-entropy; both losses have weight 1. Optimiser: SGD, lr 1e-3, momentum 0.9.
alphabot.compile(optimizer=SGD(1e-3, momentum=0.9),
                          loss={'value' : 'mse', 'policy' : softmax_cross_entropy_with_logits},
                          loss_weights={'value' : 1., 'policy' : 1.})
alphabot.summary(line_length=112)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_2 (InputLayer)                (None, 16, 16, 5)        0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 16, 16, 64)       2880          input_2[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 16, 16, 64)       256           conv[0][0]                           
________________________________________________________________________________________________________________
conv_lkrelu (LeakyReLU)             (None, 16, 16, 64)       0             conv_bn[0][0]        

In [64]:
# Initialise the "best" model as a copy of the current candidate by saving it
# and loading the file back as a separate model object. The custom loss must be
# passed by name so load_model can resolve it.
alphabot.save('alphabot_best.pickle')
alphabot_best = load_model('alphabot_best.pickle', 
                           custom_objects={'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits})

In [15]:
# Resume from the saved best model: rebinds both alphabot and alphabot_best.
# NOTE(review): the recorded run of this cell failed with a ValueError - confirm
# the file is readable and not in use before relying on it.
load_best('alphabot_best.pickle')

ValueError: Unable to create group (no write intent on file)

In [65]:
# History of games for training (train_model samples its mini-batches from this list)
complete_history = []

# Game Params
n_players = 2
# Capacity of history_buffer filled before each training phase.
# NOTE(review): the workers enqueue individual steps, so this looks like a
# number of steps rather than whole games - confirm.
n_games = 10_000
# complete_history is trimmed (oldest n_games entries dropped) once it holds k * n_games entries
k = 10

# Eval options
allow_move = False # Forwarded to do_search during evaluation games
use_eval_choice = False # False: play the argmax of the policy; True: sample a move from it

# Simulation Params
num_threads = 6 # Number of worker processes started by parallel_sim

MCTS_steps = 30 # Passed to simulate_game for self-play
MCTS_eval_steps = 25 # Passed to do_search for evaluation games
MCTS_alpha = 0.8 # Passed to simulate_game for self-play
MCTS_eval_alpha = 0.8 # Set as .alpha on both evaluation search trees

# Training Params
t_steps = 3000 # Steps of training
eval_steps = 1000 # How many steps before evaluation
eval_games = 100 # How many games to play to evaluate how's best model
win_percent = 0.55 # Ratio of game won to become best model
BATCH_SIZE = 256 # Maximum number of steps sampled per training batch
total_improv = 0 # Counts how many times the candidate replaced the best model

In [66]:
# Start from an empty game history, then run the full training loop.
# Each cycle generates self-play data, trains and evaluates, so this cell runs
# for a very long time (the recorded run was interrupted by hand).
complete_history = []
cycles = 1000

train(cycles)

KeyboardInterrupt: 

In [44]:
# Set the optimiser's learning rate in place, without recompiling the model.
K.set_value(alphabot.optimizer.lr, 1e-3)

In [92]:
# Debug experiment: train repeatedly on ONE stored step (index 26) to check that
# the network is able to fit it. The body mirrors train_model, but without
# random sampling or symmetry augmentation.
for i in range(10_000):
    #picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))
    picked_data = complete_history[26:27]
    
    state = []
    policy = []
    value = []
    for step in picked_data:
        policy.append(step.policy)
        state.append(step.state)
        value.append(step.value)

    # The zero arrays are placeholders only: both entries are replaced right below.
    y = [np.zeros((len(state), 4)), np.zeros((len(state), 1))]
    y[0] = policy
    y[1] = value
    
    logging.debug('The label is %s', y)
    losses = alphabot.train_on_batch(np.array(state, dtype=np.float32), y)
    # One line per step: this produces a very long output.
    print(losses)

[3.215687, 1.153866, 1.9886234]
[3.1616054, 1.0961393, 1.9922682]
[3.085728, 1.0182159, 1.9943142]
[3.0276783, 0.9595599, 1.9949206]
[2.9945414, 0.9271091, 1.9942347]
[2.9822223, 0.91663516, 1.9923894]
[2.9790895, 0.9164022, 1.9894896]
[2.9758108, 0.91696846, 1.9856446]
[2.967691, 0.91354114, 1.9809523]
[2.9543338, 0.9056436, 1.9754927]
[2.937847, 0.8953041, 1.9693457]
[2.9211876, 0.88541293, 1.9625776]
[2.9067554, 0.8783034, 1.955255]
[2.8958123, 0.87519014, 1.9474254]
[2.8877192, 0.8753841, 1.9391384]
[2.8799603, 0.8763156, 1.9304483]
[2.8699787, 0.87539154, 1.9213907]
[2.8566792, 0.871474, 1.9120089]
[2.8404942, 0.86497015, 1.902328]
[2.8228474, 0.8572745, 1.892377]
[2.805739, 0.8503532, 1.8821903]
[2.7904654, 0.84547484, 1.8717953]
[2.777458, 0.8430606, 1.861202]
[2.766418, 0.8427756, 1.8504474]
[2.7565696, 0.84384346, 1.8395313]
[2.747082, 0.84540343, 1.8284839]
[2.7373033, 0.84678805, 1.8173207]
[2.7268264, 0.84758985, 1.8060424]
[2.7155576, 0.8476863, 1.7946773]
[2.703611, 0.847

[1.2365007, 0.83704495, 0.32626647]
[1.2344781, 0.83704215, 0.3242467]
[1.2324734, 0.8370395, 0.3222446]
[1.2304862, 0.8370368, 0.3202601]
[1.2285179, 0.8370342, 0.31829426]
[1.2265688, 0.8370315, 0.316348]
[1.2246361, 0.83702886, 0.31441772]
[1.2227219, 0.8370263, 0.31250608]
[1.2208239, 0.83702374, 0.31061062]
[1.2189432, 0.83702135, 0.30873227]
[1.2170796, 0.8370187, 0.30687124]
[1.2152345, 0.8370162, 0.3050284]
[1.2134056, 0.8370137, 0.30320203]
[1.2115912, 0.83701134, 0.3013899]
[1.2097948, 0.8370089, 0.2995958]
[1.2080132, 0.8370065, 0.2978168]
[1.2062478, 0.83700407, 0.29605374]
[1.2044997, 0.8370017, 0.29430795]
[1.2027663, 0.8369993, 0.2925768]
[1.2010471, 0.836997, 0.29085988]
[1.199345, 0.8369946, 0.28916]
[1.1976566, 0.83699226, 0.28747392]
[1.1959842, 0.8369899, 0.28580394]
[1.1943253, 0.8369877, 0.28414708]
[1.1926823, 0.83698535, 0.28250623]
[1.1910515, 0.83698297, 0.28087792]
[1.1894364, 0.8369808, 0.27926508]
[1.1878353, 0.83697855, 0.27766612]
[1.1862495, 0.8369763, 0

[1.0162197, 0.83667296, 0.10634641]
[1.0158802, 0.8366722, 0.10600777]
[1.0155435, 0.8366715, 0.10567161]
[1.0152075, 0.83667064, 0.105336435]
[1.0148743, 0.8366697, 0.1050042]
[1.014542, 0.83666897, 0.1046724]
[1.014212, 0.83666825, 0.10434326]
[1.013884, 0.8366673, 0.10401609]
[1.0135572, 0.8366666, 0.103690006]
[1.0132324, 0.8366658, 0.10336589]
[1.0129092, 0.836665, 0.10304344]
[1.0125874, 0.8366642, 0.1027224]
[1.0122674, 0.83666337, 0.10240308]
[1.0119494, 0.83666265, 0.102085866]
[1.0116333, 0.8366618, 0.10177044]
[1.0113183, 0.8366611, 0.10145617]
[1.0110046, 0.83666015, 0.10114346]
[1.0106928, 0.83665955, 0.10083229]
[1.0103824, 0.8366588, 0.100522615]
[1.0100744, 0.836658, 0.10021531]
[1.0097674, 0.8366572, 0.09990911]
[1.0094624, 0.8366564, 0.09960489]
[1.0091583, 0.83665574, 0.0993015]
[1.0088568, 0.83665496, 0.09900076]
[1.0085558, 0.8366543, 0.098700434]
[1.0082569, 0.8366534, 0.09840243]
[1.0079588, 0.83665264, 0.09810499]
[1.0076624, 0.8366519, 0.097809345]
[1.0073687, 

[0.9662973, 0.8365284, 0.05656344]
[0.9661815, 0.836528, 0.056448054]
[0.9660668, 0.8365276, 0.056333657]
[0.96595204, 0.8365271, 0.05621941]
[0.96583736, 0.83652675, 0.05610533]
[0.96572405, 0.8365264, 0.055992328]
[0.96561056, 0.83652604, 0.055879157]
[0.9654976, 0.8365257, 0.055766582]
[0.96538484, 0.8365252, 0.05565423]
[0.9652727, 0.83652484, 0.0555425]
[0.96516085, 0.83652437, 0.055431075]
[0.96504945, 0.836524, 0.055320043]
[0.9649389, 0.83652365, 0.055209767]
[0.96482754, 0.8365232, 0.055098876]
[0.9647178, 0.8365228, 0.054989435]
[0.96460766, 0.83652234, 0.054879796]
[0.96449834, 0.8365221, 0.054770716]
[0.9643892, 0.8365216, 0.05466202]
[0.96428025, 0.8365212, 0.054553404]
[0.96417165, 0.8365208, 0.054445285]
[0.964064, 0.83652043, 0.054338027]
[0.9639562, 0.8365201, 0.05423051]
[0.9638486, 0.8365196, 0.054123405]
[0.96374196, 0.83651936, 0.054016937]
[0.9636353, 0.8365188, 0.053910818]
[0.96352947, 0.8365185, 0.053805247]
[0.9634235, 0.83651805, 0.053699754]
[0.9633181, 0.83

[0.9477442, 0.83645535, 0.03808204]
[0.9476864, 0.83645505, 0.038024463]
[0.94762874, 0.83645487, 0.037967045]
[0.9475713, 0.8364545, 0.037909877]
[0.94751406, 0.8364543, 0.037852917]
[0.9474569, 0.83645403, 0.037795976]
[0.9473998, 0.8364537, 0.03773924]
[0.9473431, 0.83645356, 0.037682615]
[0.94728637, 0.83645326, 0.03762615]
[0.9472296, 0.83645296, 0.037569728]
[0.94717306, 0.83645284, 0.03751328]
[0.9471169, 0.8364526, 0.03745747]
[0.9470611, 0.83645236, 0.03740189]
[0.9470052, 0.83645207, 0.037346277]
[0.94694906, 0.8364518, 0.037290476]
[0.9468936, 0.83645153, 0.03723518]
[0.9468376, 0.83645123, 0.03717958]
[0.94678235, 0.836451, 0.037124548]
[0.946727, 0.8364508, 0.037069466]
[0.946672, 0.8364505, 0.03701475]
[0.9466172, 0.83645034, 0.03696007]
[0.94656235, 0.8364501, 0.03690538]
[0.9465077, 0.83644986, 0.03685106]
[0.946453, 0.8364495, 0.036796637]
[0.9463988, 0.8364493, 0.036742646]
[0.94634444, 0.836449, 0.036688533]
[0.9462904, 0.8364488, 0.036634732]
[0.9462367, 0.83644867,

[0.93707025, 0.83640444, 0.027459439]
[0.9370378, 0.8364042, 0.02742727]
[0.9370057, 0.836404, 0.027395256]
[0.93697333, 0.83640385, 0.027363144]
[0.9369412, 0.8364037, 0.02733115]
[0.93690926, 0.8364036, 0.02729927]
[0.9368771, 0.83640337, 0.027267313]
[0.9368453, 0.83640325, 0.02723561]
[0.93681324, 0.836403, 0.027203787]
[0.9367812, 0.8364029, 0.027172044]
[0.93674976, 0.8364028, 0.027140591]
[0.9367179, 0.83640254, 0.02710908]
[0.93668634, 0.83640236, 0.027077587]
[0.9366547, 0.83640224, 0.027046211]
[0.93662316, 0.83640206, 0.027014852]
[0.936592, 0.8364019, 0.026983825]
[0.9365606, 0.8364018, 0.026952462]
[0.9365295, 0.83640164, 0.02692151]
[0.9364979, 0.8364014, 0.026890224]
[0.93646675, 0.8364012, 0.02685923]
[0.9364356, 0.836401, 0.026828254]
[0.9364045, 0.836401, 0.026797177]
[0.93637383, 0.83640075, 0.026766704]
[0.93634266, 0.8364006, 0.026735704]
[0.93631184, 0.8364004, 0.026705168]
[0.9362812, 0.8364003, 0.026674554]
[0.9362504, 0.8364001, 0.026643936]
[0.9362197, 0.8364,

KeyboardInterrupt: 

In [93]:
# Compare the network's outputs for stored step 26 with that step's training targets.
step = complete_history[26]
state = step.state
value = step.value
pol = step.policy

# predict expects a batch, hence the added leading axis.
p = alphabot.predict(state[np.newaxis])
# Displayed: raw policy output, its softmax, target policy, target value, predicted value.
# NOTE(review): `softmax` is not defined in the visible cells - presumably it
# comes from `from mcts import *`; verify.
p[0][0], softmax(p[0][0]), pol, value, p[1]

(array([6.8011874e-01, 1.3403143e-01, 1.8561162e-01, 2.3815435e-04],
       dtype=float32),
 array([[0.3709528 , 0.21486019, 0.22623353, 0.18795344]], dtype=float32),
 array([0.68518519, 0.12962963, 0.18518519, 0.        ]),
 1,
 array([[-0.01875848]], dtype=float32))

In [89]:
# Play 10 evaluation games in this process: 5 with the default sides, 5 with the
# sides swapped. play_eval returns 0 for a candidate win in both cases.
# NOTE: play_eval is called without a pipe here; the recorded run failed with
# "'NoneType' object has no attribute 'recv'", so this cell needs a prediction
# pipe (or an in-process prediction path) to work.
bot, best = 0, 0
for i in range(10):
    if i < 5:
        winner = play_eval()
    else:
        winner = play_eval(True)
        
    if winner == 0:
        bot += 1
    else:
        best += 1

bot, best

AttributeError: 'NoneType' object has no attribute 'recv'

In [27]:
# Play 100 self-play games in this process and count how often side 0 wins.
# NOTE(review): simulate_game is called here with six arguments and unpacked
# into two values, while sim() calls it with five arguments and iterates over
# the result - this cell presumably targets a different version of mcts; confirm.
bot, best = 0, 0
for i in range(100):
    winner, _ = simulate_game(10, 0.8, None, None, None, alphabot)
    if winner == 0:
        bot += 1
    else:
        best += 1

bot, best

(48, 52)

In [92]:
# Switch the root logger back to INFO (hides the per-turn DEBUG output).
logging.getLogger().setLevel(logging.INFO)

In [67]:
# Reload edited modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [40]:
# Show channel 2 of a stored state next to a stored policy.
# NOTE(review): the state comes from entry 506 but the policy from entry 503 -
# confirm that the different indices are intentional.
complete_history[506].state[..., 2], complete_history[503].policy

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 array([0.33333333, 0.33333333, 0.        , 0.33333333]))

In [53]:
# Sanity check of the horizontal flip: stack every 4th stored state from 506 on,
# reverse axis 2 of the batch and show channel 2 of the first state.
np.array([s.state for s in complete_history[506:1000:4]])[:, :, ::-1][0, ..., 2]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [60]:
def policy_rot90(policy, k = 1):
    """Cyclically shift the four action entries of ``policy`` by ``k`` positions.

    One step maps the last axis through the index order ``[1, 2, 3, 0]``;
    ``k`` steps apply that mapping ``k % 4`` times.

    Args:
        policy: array-like whose last axis holds the four action values.
        k: number of steps (any integer, reduced modulo 4).

    Returns:
        A numpy array with the shifted policy.
    """
    turns = k % 4
    shifted = np.array(policy)
    if turns == 0:
        return shifted

    # Composing [1, 2, 3, 0] with itself `turns` times gives this index order.
    order = [(position + turns) % 4 for position in range(4)]
    return shifted[..., order]

def policy_flip(policy, vert=False):
    """Swap the two opposite action entries that a board flip exchanges.

    Args:
        policy: array-like whose last axis holds the four action values.
        vert: when true, swap entries 1 and 3; otherwise swap entries 0 and 2.

    Returns:
        A numpy array with the two entries swapped.
    """
    order = [0, 3, 2, 1] if vert else [2, 1, 0, 3]
    return np.array(policy)[..., order]

def state_flip(state, vert=False):
    """Reverse one axis of a stack of states.

    Args:
        state: array-like. Axis 0 is left untouched, so the input is expected
            to be a batch (as passed by ``apply_simmetries``).
        vert: when true, reverse axis 1; otherwise reverse axis 2.

    Returns:
        A numpy array with the chosen axis reversed.
    """
    batch = np.array(state)
    return batch[:, ::-1] if vert else batch[:, :, ::-1]

def apply_simmetries(train_steps):
    """Apply one random rotation and an optional random flip to a batch of steps.

    The same transformation is used for every step of the batch: a rotation by
    1 to 4 quarter turns, followed by a horizontal flip, a vertical flip or no
    flip, each picked with equal probability. The policies are transformed to
    match (action order: 0 right, 1 down, 2 left, 3 up); the values are kept.

    Args:
        train_steps: iterable of objects exposing ``state``, ``policy`` and
            ``value``.

    Returns:
        A list of new ``TrainStep(state, value, policy)`` objects.
    """
    states = [step.state for step in train_steps]
    policies = [step.policy for step in train_steps]
    values = [step.value for step in train_steps]

    quarter_turns = np.random.randint(1, 5)
    flip_mode = np.random.randint(0, 3)  # 0: horizontal, 1: vertical, 2: none

    # 90;180:270 degrees rotations (4 quarter turns give the original back)
    state = np.rot90(states, k=quarter_turns, axes=(1, 2))
    policy = policy_rot90(policies, k=quarter_turns)

    # Flips
    if flip_mode in (0, 1):
        vertical = bool(flip_mode == 1)
        state = state_flip(state, vert=vertical)
        policy = policy_flip(policy, vert=vertical)

    return [TrainStep(s, v, p) for s, p, v in zip(state, policy, values)]