In [1]:
import tensorflow as tf
import pickle
import keras
from keras.models import Model, load_model, clone_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.losses import binary_crossentropy
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
sys.path.append('src')  # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
from mcts import *
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


# Logging

In [2]:
# Route log records (INFO and above) to logging.log, prefixed with a timestamp.
# filemode='w' opens the file for writing, so an existing log is truncated rather than appended to.
logging.basicConfig(filename='logging.log', level=logging.INFO, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [3]:
#INPUT_SIZE = (16, 16, 5) # Alternative setting: 16x16 map (2 to 3 players)
INPUT_SIZE = (9, 9, 5) # Network input shape, channels last: 9x9 map with 5 channels
N_ACTIONS = 4 # Width of the policy head output
gpus = 1 # Values > 1 make create_model() wrap the network with multi_gpu_model

# Define the Layer Blocks

In [4]:
filters = 128  # default number of convolution filters used by the blocks below

# Convolutional Block
def conv_block(in_layer, name, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Apply Conv2D, then optionally BatchNormalization and LeakyReLU.

    Args:
        in_layer: Keras tensor fed to the convolution.
        name: layer name of the convolution; the optional layers are named
            ``name + '_bn'`` and ``name + '_lkrelu'``.
        filters: number of convolution filters.
        kernel_size: convolution kernel size.
        bn: when true, batch-normalise over axis 3 after the convolution.
        relu: when true, finish with a LeakyReLU activation.

    Returns:
        The output tensor of the last layer that was applied.
    """
    # Bias is disabled and the kernel carries an L2 penalty of 1e-4.
    out = Conv2D(filters, kernel_size, use_bias=False, padding='same',
                 name=name, kernel_regularizer=l2(1e-4))(in_layer)

    if bn:
        out = BatchNormalization(axis=3, name=name + '_bn')(out)

    if not relu:
        return out

    # LeakyReLU is used here in place of a plain 'relu' Activation layer.
    return LeakyReLU(name=name + '_lkrelu')(out)

# Residual Block
def residual_conv(in_layer, idx, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Two-convolution block whose output is concatenated with its input.

    Args:
        in_layer: Keras tensor entering the block.
        idx: block index; every layer name is prefixed with ``'res_<idx>'``.
        filters: number of filters of both convolutions.
        kernel_size: kernel size of the SECOND convolution only.
        bn: when true, batch-normalise the second convolution.
        relu: when true, apply LeakyReLU after the skip connection.

    Returns:
        The block output tensor.

    Note:
        The first stage is always built with a 3x3 kernel, batch
        normalisation and activation, whatever ``kernel_size``, ``bn`` and
        ``relu`` are. The skip connection is a Concatenate (not an Add), so
        the output carries the channels of ``in_layer`` plus ``filters``.
    """
    prefix = 'res_' + str(idx)

    # Stage 1: full conv block with a fixed configuration.
    branch = conv_block(in_layer, prefix + '_conv1', filters, kernel_size=(3,3), bn=True, relu=True)

    # Stage 2: bare convolution, optionally batch-normalised.
    branch = Conv2D(filters, kernel_size, use_bias=False, padding='same',
                    name=prefix + '_conv2', kernel_regularizer=l2(1e-4))(branch)
    if bn:
        branch = BatchNormalization(axis=3, name=prefix + '_conv2_bn')(branch)

    # Skip connection: stack the block input and the branch along the channel axis.
    merged = Concatenate()([in_layer, branch])

    if relu:
        merged = LeakyReLU(name=prefix + '_lkrelu')(merged)

    return merged

def value_head(in_layer):
    """Build the value output on top of ``in_layer``.

    Pipeline: 1x1 conv block with a single filter -> Flatten -> Dense(128)
    -> LeakyReLU -> BatchNormalization -> Dense(1) with tanh.

    Args:
        in_layer: Keras tensor produced by the network trunk.

    Returns:
        The tensor of the layer named ``'value'`` (one tanh unit per sample).
    """
    features = conv_block(in_layer, 'value_head', filters=1, kernel_size=(1,1))
    features = Flatten(name='value_flatten')(features)

    hidden = Dense(128, kernel_regularizer=l2(1e-4), name='value_dense')(features)
    hidden = LeakyReLU(name='value_lkrelu')(hidden)
    hidden = BatchNormalization(axis=1, name='value_bn')(hidden)

    # Value output
    return Dense(1, use_bias=False, name='value', kernel_regularizer=l2(1e-4),
                 activation='tanh')(hidden)

def policy_head(in_layer):
    """Build the policy output on top of ``in_layer``.

    Pipeline: 1x1 conv block with two filters -> Flatten -> Dense(N_ACTIONS)
    with softmax.

    Args:
        in_layer: Keras tensor produced by the network trunk.

    Returns:
        The tensor of the layer named ``'policy'`` (softmax over N_ACTIONS).
    """
    features = conv_block(in_layer, 'policy_head', filters=2, kernel_size=(1,1))
    flat = Flatten(name='policy_flatten')(features)

    # Policy output
    return Dense(N_ACTIONS, name='policy', use_bias=False, kernel_regularizer=l2(1e-4),
                 activation='softmax')(flat)

# Model

In [5]:
def create_model():
    """Build the two-headed network.

    Returns:
        A pair ``(trainable, template)``. With ``gpus > 1`` the template is
        declared under the CPU device scope and ``trainable`` is its
        ``multi_gpu_model`` wrapper; otherwise both entries are the same
        model object.
    """
    def declare_model():
        """Input -> conv block -> 4 residual blocks -> (policy, value) heads."""
        n_residual = 4

        input_layer = Input(INPUT_SIZE)
        trunk = conv_block(input_layer, 'conv')
        for block_idx in range(1, n_residual + 1):
            trunk = residual_conv(trunk, idx=block_idx)

        policy = policy_head(trunk)
        value = value_head(trunk)

        return Model(input_layer, [policy, value])

    if gpus > 1:
        # Keep the template on the CPU and replicate it across the GPUs.
        with tf.device('/cpu:0'):
            template = declare_model()
        replicated = multi_gpu_model(template, gpus=gpus)
        return replicated, template

    single = declare_model()
    return single, single

In [7]:
def policy_rot90(policy, k = 1):
    """Cyclically shift the four action entries of ``policy`` ``k`` times.

    One shift moves entry ``j + 1`` into slot ``j`` along the last axis
    (index order ``[1, 2, 3, 0]``). ``k`` is taken modulo 4, so negative
    values and multiples of 4 are accepted.

    Args:
        policy: array-like whose last axis holds the action entries.
        k: number of shifts to apply.

    Returns:
        A new numpy array with the shifted entries.
    """
    turns = k % 4
    rotated = np.array(policy)
    if turns == 0:
        return rotated

    # Applying [1, 2, 3, 0] `turns` times equals this single permutation.
    order = [(slot + turns) % 4 for slot in range(4)]
    return rotated[..., order]

def policy_flip(policy, vert=False):
    """Swap a pair of opposite action entries along the last axis.

    Args:
        policy: array-like whose last axis holds the four action entries.
        vert: when true swap entries 1 and 3, otherwise swap entries 0 and 2.

    Returns:
        A new numpy array with the two entries exchanged.
    """
    order = [0, 3, 2, 1] if vert else [2, 1, 0, 3]
    return np.array(policy)[..., order]

def state_flip(state, vert=False):
    """Reverse one axis of ``state``.

    Args:
        state: array-like with at least two (``vert=True``) or three
            (``vert=False``) dimensions.
        vert: when true reverse axis 1, otherwise reverse axis 2.

    Returns:
        A numpy array with the chosen axis reversed.
    """
    axis = 1 if vert else 2
    selector = (slice(None),) * axis + (slice(None, None, -1),)
    return np.array(state)[selector]

def apply_simmetries(data):
    """Apply one random rotation, and possibly one flip, to a batch of steps.

    The same transformation is used for every step of the batch: the states
    are rotated with ``np.rot90`` over axes (1, 2) and the policies are
    shifted with ``policy_rot90`` by the same number of quarter turns, then
    both may be flipped. Values are carried over unchanged.

    Action index convention used by the policy helpers:
    0 right, 1 down, 2 left, 3 up.

    Args:
        data: iterable of steps exposing ``state``, ``policy`` and ``value``.

    Returns:
        A list of new ``TrainStep`` objects, one per input step.
    """
    # Materialise once; the input steps themselves are never modified, so no
    # copy of them is required.
    steps = list(data)
    states = [step.state for step in steps]
    policies = [step.policy for step in steps]
    values = [step.value for step in steps]

    quarter_turns = np.random.randint(1, 5)  # 1..4; four quarter turns is the identity
    flip_mode = np.random.randint(0, 3)      # 0 horizontal, 1 vertical, 2 no flip

    state = np.rot90(states, k=quarter_turns, axes=(1, 2))
    policy = policy_rot90(policies, k=quarter_turns)

    if flip_mode == 0:  # Horizontal flip
        state = state_flip(state, vert=False)
        policy = policy_flip(policy, vert=False)
    elif flip_mode == 1:  # Vertical flip
        state = state_flip(state, vert=True)
        policy = policy_flip(policy, vert=True)

    # TrainStep is built as (state, value, policy), matching the original call.
    return [TrainStep(s, v, p) for s, p, v in zip(state, policy, values)]

In [8]:
def manage_predictions():
    """Serve evaluation workers with predictions until ``winner_buffer`` is full.

    Requests are read from ``processable_buffer`` as ``(index, state, net)``
    tuples, routed to either ``alphabot`` (candidate) or ``alphabot_best``,
    predicted in one batch per network and answered through
    ``pipes[index]`` as a dict keyed by ``alphabot.output_names``.

    Routing: the last state channel tells whose move it is (``play_eval``
    sets it to 1 for the second mover). With ``net == False`` the candidate
    answers while that channel is not all non-zero and the best network
    answers otherwise; for any other ``net`` value the roles are swapped.
    """
    idle_sleep = 0.001  # seconds to yield while too few requests are queued
    t = 0

    while not winner_buffer.full():
        if processable_buffer.qsize() < min(num_threads, 2): # Wait until a bunch of requests are queued
            # Yield the CPU instead of spinning; the workers need it for their searches.
            time.sleep(idle_sleep)
            continue

        # network name -> (worker indices, states), filled in arrival order
        batches = {'alphabot': ([], []), 'alphabot_best': ([], [])}
        for _ in range(processable_buffer.qsize()):
            index, state, net = processable_buffer.get()
            channel_not_full = state[..., -1].all() == 0
            if net == False:
                target = 'alphabot' if channel_not_full else 'alphabot_best'
            else:
                target = 'alphabot_best' if channel_not_full else 'alphabot'
            indices, states = batches[target]
            indices.append(index)
            states.append(state)

        # Run both networks first, then answer the workers.
        predictions = {}
        for target, model in (('alphabot', alphabot), ('alphabot_best', alphabot_best)):
            states = batches[target][1]
            if len(states) > 0:
                predictions[target] = model.predict(np.array(states, dtype=np.float32))

        for target in ('alphabot', 'alphabot_best'):
            if target not in predictions:
                continue
            outputs = predictions[target]
            for index, pred in zip(batches[target][0], zip(outputs[0], outputs[1])):
                # Both networks reply with the candidate's output names as keys.
                pipes[index].send(dict(zip(alphabot.output_names, pred)))

        if time.time() - t > 30: # Every 30 secs
            t = time.time()
            logging.info('Finished evaluation %d games' % winner_buffer.qsize())
        
def simulate_games():
    """Start the self-play workers and serve them until ``history_buffer`` is full.

    Requests are read from ``processable_buffer`` as ``(index, state, _)``
    tuples, predicted in one batch with ``alphabot`` and answered through
    ``pipes[index]`` as a dict keyed by ``alphabot.output_names``.
    """
    idle_sleep = 0.001  # seconds to yield while too few requests are queued

    logging.debug('Starting Threads for parallel Games')

    parallel_sim(evaluation=False) # Parallel Games

    while not history_buffer.full():
        if processable_buffer.qsize() < min(num_threads, 2): # Wait until a bunch of requests are queued
            # Yield the CPU instead of spinning; the workers need it for their searches.
            time.sleep(idle_sleep)
            continue

        indices, states = [], []
        for _ in range(processable_buffer.qsize()):
            index, state, _net = processable_buffer.get()
            indices.append(index)
            states.append(state)

        predictions = alphabot.predict(np.array(states, dtype=np.float32))
        for index, pred in zip(indices, zip(predictions[0], predictions[1])):
            pipes[index].send(dict(zip(alphabot.output_names, pred)))

    logging.info('Finished Simulating %s games', n_games)

In [9]:
def play_eval(reverted=False, pipe=None, process_id=None):
    """Play one two-player evaluation game driven by MCTS and report the result.

    Each player owns its own search tree (alpha ``MCTS_eval_alpha``); the
    first mover searches ``MCTS_eval_steps`` steps and the second
    ``MCTS_eval_steps2``. The move is the argmax of the search policy, or a
    sample from it when ``use_eval_choice`` is set.

    Args:
        reverted: forwarded to ``do_search`` as ``alphabot``; also inverts
            the returned index.
        pipe: connection forwarded to ``do_search`` for prediction replies.
        process_id: worker identifier forwarded to ``do_search``.

    Returns:
        ``turn`` as it stands when the game ends (already advanced past the
        last move), or ``1 - turn`` when ``reverted`` is true. The caller
        maps 0 to the candidate and 1 to the best network.
    """
    game = emulator.Game(2)
    mapp = game.reset()

    tree_player0 = MCTS()
    tree_player0.alpha = MCTS_eval_alpha

    tree_player1 = MCTS()
    tree_player1.alpha = MCTS_eval_alpha

    # Indexed by `turn`: (search tree, number of search steps).
    players = ((tree_player0, MCTS_eval_steps), (tree_player1, MCTS_eval_steps2))

    old_mapp = None
    head = None
    turn = 0
    s = map_to_state(mapp, old_mapp, None, 0)
    old_mapp = copy.deepcopy(mapp)

    while True:
        tree, search_steps = players[turn]
        policy = do_search(search_steps, s, mapp, game, tree, pipe=pipe, process_id=process_id,
                           ask_predict=ask_predict, alphabot=reverted, allow_move=allow_move)

        if use_eval_choice:
            choosen = np.random.choice(4, p=policy)
        else:
            choosen = np.argmax(policy)

        mapp, tmp_head = game.step(mapp, s, choosen, turn, mcts=True)

        turn = 1 - turn
        if turn == 0:
            # Both players have moved: rebuild the state and remember the map.
            s = map_to_state(mapp, old_mapp, s, 0, head)  # TODO: Map to state
            old_mapp = np.array(mapp)
        else:
            # Only the first player has moved: keep its head and flag the
            # last channel for the second mover.
            head = tmp_head
            s[..., -1] = 1

        if game.game_ended():
            logging.debug('GAME ENDED, %s won, %s <- reverted' % (turn, reverted))
            if not reverted:
                return turn
            return 1 - turn

In [10]:
def train_model():
    """Run one optimisation step of ``alphabot`` on a random augmented batch.

    Up to ``BATCH_SIZE`` steps are sampled without replacement from
    ``complete_history``, passed through ``apply_simmetries`` and fed to
    ``train_on_batch`` with the targets ordered as ``[policy, value]``.

    Returns:
        Whatever ``alphabot.train_on_batch`` returns for the batch.
    """
    picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))
    picked_data = apply_simmetries(picked_data)

    states = np.array([step.state for step in picked_data], dtype=np.float32)
    # Hand Keras real arrays rather than nested Python lists.
    targets = [
        np.asarray([step.policy for step in picked_data]),
        np.asarray([step.value for step in picked_data]),
    ]

    return alphabot.train_on_batch(states, targets)

In [11]:
def training_cycle():
    """Run one cycle: self-play, training steps and periodic evaluation.

    1. ``simulate_games`` fills ``history_buffer``; its content is moved to
       ``complete_history`` and the workers are stopped.
    2. ``t_steps + 1`` calls to ``train_model`` are made; progress is logged
       every 25 steps.
    3. Every ``eval_steps`` steps (except step 0) evaluation games are
       played. If the candidate's share of ``eval_games`` reaches
       ``win_percent`` it is saved and copied into ``alphabot_best``;
       otherwise the candidate is reset to the best weights.
    4. Once ``complete_history`` holds at least ``k * n_games`` entries, the
       first ``n_games`` entries are dropped.
    """
    global total_improv

    logging.info('Starting Training Cycle')
    simulate_games()

    # history_buffer contains the games, we store them inside complete history
    for _ in range(history_buffer.qsize()):
        complete_history.append(history_buffer.get())
    stop_simulation() # We can now stop the simulation (will free the memory)

    logging.info('Starting Model Training')
    losses = [0, 0, 0] # For debug purpose
    sum_loss = 0
    cc = 1
    for i in range(t_steps + 1):
        if i % 25 == 0:
            logging.info('Training Interaction: %s losses: %s %s', i,
                         round(sum_loss / cc, 2), np.round(losses, 2))

        losses = train_model()
        sum_loss += losses[0]
        logging.debug('Losses: %s', losses)

        cc += 1
        if i % eval_steps == 0 and i > 0:
            cc = 1 # Reset loss counter
            sum_loss = 0
            wins = {'candidate' : 0, 'best' : 0}
            n_c = {0 : 'candidate', 1 : 'best'}

            logging.info('Starting self-play evaluation')
            parallel_sim(evaluation=True) # Start Parallel Games
            manage_predictions()
            # A throwaway index keeps the training-step counter `i` intact.
            for _ in range(winner_buffer.qsize()):
                w = winner_buffer.get()
                wins[n_c[w]] += 1 # add a win to the winner
            stop_simulation()

            win_ratio = round(wins['candidate'] / eval_games, 2)
            if win_ratio >= win_percent:
                logging.info('Great! Our candidate won %s percent of games', round(win_ratio * 100))
                total_improv += 1
                logging.info('Our bot got better %s times', total_improv)
                alphabot.save('alphabot_best.pickle')
                replace_best()
            else:
                logging.info('Damn! Our candidate only won %s percent of games', round(win_ratio * 100, 2))
                logging.info('Cloning to best')
                reload_best()

    if len(complete_history) >= k * n_games: #* 2:
        logging.info('Removing oldest games')
        del complete_history[:n_games] # Delete the n_games oldest entries from history

In [219]:
def load_best(best_model):
    """Load ``best_model`` from disk into both ``alphabot_best`` and ``alphabot``.

    The file is read twice so the two globals are independent model objects.

    Args:
        best_model: path handed to Keras ``load_model``.
    """
    global alphabot
    global alphabot_best

    # Custom losses that Keras must be able to resolve by name while loading.
    custom_losses = {'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits,
                     'categorical_weighted' : categorical_weighted}

    alphabot_best = load_model(best_model, custom_objects=dict(custom_losses))
    alphabot = load_model(best_model, custom_objects=dict(custom_losses))

In [13]:
def reload_best():
    """Overwrite the candidate's weights with those of ``alphabot_best``."""
    best_weights = alphabot_best.get_weights()
    alphabot.set_weights(best_weights)

In [14]:
def replace_best():
    """Overwrite the weights of ``alphabot_best`` with the candidate's."""
    candidate_weights = alphabot.get_weights()
    alphabot_best.set_weights(candidate_weights)

In [15]:
def train(cycles):
    """Run ``cycles`` consecutive training cycles.

    ``training_cycle`` and ``train_model`` read the module-level
    ``complete_history`` list. It is created here when it does not exist
    yet, so the notebook also works on a fresh kernel; an existing history
    is kept so training can resume with the games already collected.

    Args:
        cycles: number of calls to ``training_cycle``.
    """
    global complete_history

    #replace_best()

    if 'complete_history' not in globals():
        complete_history = []

    for i in range(cycles):
        logging.info('Training cycle: %s', i)
        training_cycle()

In [16]:
def ask_predict(idi, x, net=None):
    """Queue a prediction request on ``processable_buffer``.

    Args:
        idi: identifier of the requesting process.
        x: data to be predicted.
        net: network selector, passed through untouched.
    """
    request = (idi, x, net)
    processable_buffer.put(request)

def sim(process_id, pipe, evaluation=False):
    """Worker entry point: play games until the result queue is full.

    Both random generators are re-seeded first so workers do not share one
    random stream.

    Args:
        process_id: worker index, forwarded to the game functions.
        pipe: connection on which this worker receives predictions.
        evaluation: when true play evaluation games and push each winner to
            ``winner_buffer``; otherwise play self-play games and push
            every returned training step to ``history_buffer``.
    """
    import queue  # local import: supplies the Full exception raised by put_nowait

    np.random.seed()
    random.seed()

    if evaluation:
        while True:
            reverted = np.random.random() >= 0.5
            winner = play_eval(reverted, pipe, process_id)

            try:
                winner_buffer.put_nowait(winner)
            except queue.Full:
                # Enough games were played; any other error now surfaces
                # instead of silently ending the worker.
                break

    else:
        while True:
            train_steps = simulate_game(MCTS_steps, MCTS_alpha, pipe, ask_predict, process_id)

            try:
                for step in train_steps:
                    history_buffer.put_nowait(step)
            except queue.Full:
                # The buffer filled up, possibly part-way through this game.
                break
                    
def stop_simulation():
    """Stop the worker processes and release their pipes and queues.

    Every worker is terminated and then joined, both pipe lists and the
    three queues are closed, and their global names are removed. Names that
    do not exist are skipped, so calling this twice in a row, or before any
    simulation was started, is harmless. ``workers`` is left as an empty list.
    """
    global workers

    namespace = globals()
    join_timeout = 5  # seconds to wait for each terminated worker to exit

    running = list(namespace.get('workers', []))
    for worker in running:
        worker.terminate()
    for worker in running:
        # Reap the process so it does not linger as a zombie.
        worker.join(join_timeout)
    workers = []

    for name in ('pipes', 'child_pipes'):
        for pipe in namespace.pop(name, []):
            pipe.close()

    # Closing and dropping the queues frees whatever is still stored in them.
    for name in ('history_buffer', 'processable_buffer', 'winner_buffer'):
        buffer = namespace.pop(name, None)
        if buffer is not None:
            buffer.close()

def parallel_sim(evaluation=False):
    """Create fresh queues and pipes and start ``num_threads`` worker processes.

    Workers left over from an earlier call are stopped first. Each worker
    runs ``sim(worker_id, child_end, evaluation)`` and is started as a
    non-daemon process.

    Args:
        evaluation: forwarded to ``sim``.
    """
    global workers, history_buffer, processable_buffer, winner_buffer, pipes, child_pipes

    if 'workers' in globals() and len(workers) != 0:
        stop_simulation()

    # Queue capacities; these numbers can be tweaked.
    history_buffer = Queue(n_games)
    winner_buffer = Queue(eval_games)
    processable_buffer = Queue(num_threads)
    pipes, child_pipes, workers = [], [], []

    for worker_id in range(num_threads):
        # One duplex pipe per worker; the parent keeps one end of each.
        parent_end, child_end = Pipe()
        pipes.append(parent_end)
        child_pipes.append(child_end)

        proc = Thread(target=sim, args=[worker_id, child_end, evaluation])
        proc.daemon = False
        proc.start()
        workers.append(proc)

In [17]:
import tensorflow as tf

def softmax_cross_entropy_with_logits(y_true, y_pred):
    """Masked softmax cross-entropy, shifted down by 1 and clipped at 0.

    Wherever the target equals zero the prediction is replaced by -100
    before the cross-entropy is computed.

    Args:
        y_true: target tensor, used as the labels.
        y_pred: prediction tensor, used as the logits.

    Returns:
        ``max(0, cross_entropy - 1)`` as a tensor.
    """
    target_is_zero = tf.equal(y_true, tf.zeros(shape=tf.shape(y_true), dtype=tf.float32))

    # Push the logits of zero-target entries far down.
    masked_logits = tf.where(target_is_zero, tf.fill(tf.shape(y_true), -100.0), y_pred)

    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=masked_logits)
    return tf.maximum(0., cross_entropy - 1)

def categorical_weighted(y_true, y_pred):
    """Categorical cross-entropy shifted down by 1 and clipped at 0.

    Args:
        y_true: target distribution.
        y_pred: predicted distribution.

    Returns:
        ``max(0, categorical_crossentropy(y_true, y_pred) - 1)`` as a tensor.
    """
    cross_entropy = keras.losses.categorical_crossentropy(y_true, y_pred)
    return tf.maximum(0., cross_entropy - 1)

In [18]:
# Build the candidate network and compile it with Adam (learning rate 1e-3).
# The 'value' output is trained with MSE and the 'policy' output with
# categorical_weighted; both losses have a weight of 0.5.
alphabot, _ = create_model()
alphabot.compile(optimizer=Adam(1e-3),#SGD(1e-3, momentum=0.9),
                          loss={'value' : 'mse', 'policy': categorical_weighted},
                          loss_weights={'value' : 0.5, 'policy' : 0.5})
alphabot.summary(line_length=112)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_1 (InputLayer)                (None, 9, 9, 5)          0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 9, 9, 128)        5760          input_1[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 9, 9, 128)        512           conv[0][0]                           
________________________________________________________________________________________________________________
conv_lkrelu (LeakyReLU)             (None, 9, 9, 128)        0             conv_bn[0][0]        

In [19]:
# Load the best network from disk. The file is a Keras model file despite its
# '.pickle' name: training_cycle writes it with alphabot.save().
# NOTE(review): the commented-out save below presumably bootstraps the file on a first run - confirm.
#alphabot.save('alphabot_best.pickle')
alphabot_best = load_model('alphabot_best.pickle', 
                           custom_objects={'categorical_weighted' : categorical_weighted})

In [224]:
# Reload BOTH alphabot and alphabot_best from the saved file; this replaces
# the alphabot object compiled earlier in the notebook.
load_best('alphabot_best.pickle')

In [20]:
# History of games for training
# (train() and training_cycle() expect a module-level list named complete_history.)
#complete_history = []

# Game Params
n_players = 2
n_games = 1 #5_000 #10_000 # Capacity of history_buffer: amount simulated before each training
k = 10 # History is trimmed once it holds k * n_games entries

# Eval options
allow_move = False # Forwarded to do_search by play_eval
use_eval_choice = False # False: play the argmax of the search policy; True: sample from it

# Simulation Params
num_threads = 6 # Number of worker processes started by parallel_sim

MCTS_steps = 35 # Search steps per move during self-play
MCTS_eval_steps = 25 # Search steps per move for the first mover in evaluation games
MCTS_eval_steps2 = MCTS_eval_steps # Same, for the second mover
MCTS_alpha = 1.
MCTS_eval_alpha = 1.

# Training Params
t_steps = 900 #2000 # Steps of training
eval_steps = 1 #300 #500 # How many steps before evaluation
eval_games = 300 #150 # How many games to play to evaluate which model is best
win_percent = 0.55 # Ratio of games won to become best model
BATCH_SIZE = 256 # Maximum number of steps sampled per training batch
total_improv = 0 # Counts how many times the candidate replaced the best model

In [30]:
# Launch training. Uncomment the next line to start from an empty history.
#complete_history = []
cycles = 1000 # Number of training cycles to run

train(cycles)

Process Process-2306:
Process Process-2310:
Process Process-2309:
Process Process-2308:
Process Process-2305:
Process Process-2307:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/adryw/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/adryw/miniconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/adryw/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/adryw/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "<ipython-input-16-f2d8177e37a8>", line 12, in sim
    winner = play_eval(reverted, pipe, process_id)
  File "/home/adryw/miniconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.ru

KeyboardInterrupt: 

  File "src/mcts.py", line 160, in do_search
    tree.search(s, mapp, game, pipe, ask_predict, process_id, alphabot=alphabot, allow_move=allow_move)
  File "src/mcts.py", line 84, in search
    v = self.search(sp, new_map, game, pipe, ask_predict, process_id, allow_move, alphabot, head_pos)
  [Previous line repeated 3 more times]
  File "/home/adryw/miniconda3/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "src/mcts.py", line 84, in search
    v = self.search(sp, new_map, game, pipe, ask_predict, process_id, allow_move, alphabot, head_pos)
  File "src/mcts.py", line 32, in search
    raw_prediction = pipe.recv()
  File "src/mcts.py", line 84, in search
    v = self.search(sp, new_map, game, pipe, ask_predict, process_id, allow_move, alphabot, head_pos)
KeyboardInterrupt
  File "/home/adryw/miniconda3/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/home/adryw/miniconda3/lib

In [27]:
# Set the learning rate of the candidate's optimizer to 1e-3 without recompiling.
K.set_value(alphabot.optimizer.lr, 1e-3)

In [162]:
# Train on the stored history only (no self-play, no evaluation).
# Each iteration is exactly one train_model() step: sample a batch from
# complete_history, augment it and call train_on_batch.
for i in range(10_000):
    losses = train_model()
    print(losses)

[1.1583364, 1.446481, 0.6207268]
[1.331359, 1.4504187, 0.9629259]
[1.209579, 1.473572, 0.6963061]
[1.164076, 1.4366484, 0.6423168]
[1.1588454, 1.4081893, 0.6604059]
[1.2281443, 1.4697251, 0.7375589]
[1.0960766, 1.4094826, 0.5337554]
[1.3415568, 1.4282053, 1.0060816]
[1.3080968, 1.4753394, 0.89211404]
[1.1649989, 1.448045, 0.63329995]
[1.326799, 1.4644306, 0.94060147]
[1.4240673, 1.3936629, 1.2059927]
[1.1683818, 1.4462812, 0.6420899]
[1.365388, 1.4447434, 1.0377253]
[1.396484, 1.4784896, 1.0662558]
[1.2087141, 1.4179596, 0.7513251]
[1.1451406, 1.4722918, 0.56992173]
[1.1463888, 1.4786279, 0.5661542]
[1.082129, 1.4391775, 0.47715548]
[1.2509357, 1.4245348, 0.8294823]
[1.2369635, 1.4686078, 0.75753635]
[1.1617712, 1.4352183, 0.6406163]
[1.2207134, 1.504924, 0.68887216]
[1.2449014, 1.4160061, 0.82624316]
[1.2152234, 1.4736707, 0.7093023]
[1.2662878, 1.4693744, 0.8158107]
[1.2466354, 1.4522547, 0.79370886]
[1.1386575, 1.4999915, 0.53010005]
[1.2118541, 1.4471065, 0.7294631]
[1.1510406, 1.4

[1.1077349, 1.426611, 0.5488308]
[1.128782, 1.4809525, 0.53658205]
[1.0053349, 1.4577618, 0.3128789]
[1.1363585, 1.4828447, 0.54984343]
[1.0767553, 1.4569311, 0.45654386]
[1.1495633, 1.4893489, 0.5697256]
[1.0929075, 1.4614887, 0.48424965]
[1.0910349, 1.5019503, 0.44001088]
[1.0323409, 1.4724289, 0.35210967]
[1.0044079, 1.4322128, 0.33642077]
[0.98479426, 1.4211771, 0.30818796]
[0.88544625, 1.4318893, 0.09873828]
[1.0326995, 1.499598, 0.32549775]
[1.0039341, 1.4403822, 0.3271491]
[0.9628965, 1.4283788, 0.25705153]
[1.1579598, 1.4651093, 0.61042774]
[0.90291744, 1.4109294, 0.15450667]
[0.9473989, 1.472036, 0.18235049]
[0.9774379, 1.4263264, 0.28813177]
[0.9569534, 1.4122491, 0.26124322]
[1.0729852, 1.4608078, 0.44475755]
[0.93858117, 1.4660792, 0.17069337]
[1.0351478, 1.4290572, 0.40086967]
[1.0629655, 1.4681504, 0.41743517]
[1.0566732, 1.4706595, 0.4023657]
[0.98024964, 1.4077466, 0.3124576]
[1.0525099, 1.4757541, 0.38899714]
[0.9478945, 1.4150343, 0.240513]
[0.96371955, 1.4672663, 0.2

[0.93271863, 1.4344535, 0.2008587]
[0.8318081, 1.3798126, 0.053772457]
[0.9447155, 1.4218234, 0.23767027]
[0.8865703, 1.3670175, 0.17627962]
[0.84903854, 1.3684036, 0.099922866]
[0.9449357, 1.4274201, 0.23279248]
[0.9018521, 1.428623, 0.14551418]
[0.9490514, 1.46156, 0.20706443]
[0.83064735, 1.3281784, 0.10372459]
[0.8590277, 1.4169012, 0.07184754]
[0.96876293, 1.4382551, 0.2700479]
[0.88658136, 1.4193282, 0.12469597]
[0.93059134, 1.3886297, 0.2434997]
[0.8895987, 1.39657, 0.1536583]
[0.8688957, 1.4132872, 0.09561723]
[0.9276795, 1.4328805, 0.1936716]
[0.8600406, 1.3950293, 0.09632321]
[0.91161925, 1.4615555, 0.13303375]
[0.9077297, 1.3908076, 0.19608387]
[1.039326, 1.4996051, 0.35056028]
[0.8990393, 1.49244, 0.07723526]
[0.95898026, 1.4760132, 0.21362877]
[0.83877516, 1.4120436, 0.037274383]
[0.9226861, 1.424116, 0.19311292]
[0.9712648, 1.462036, 0.2524418]
[0.9161403, 1.4718732, 0.13244843]
[0.8643099, 1.4342909, 0.06646303]
[0.92989296, 1.4397804, 0.1922318]
[0.84677505, 1.4054078, 

KeyboardInterrupt: 

In [497]:
# Compare the network's output with the stored targets for one training step.
step = complete_history[125]
state = step.state
value = step.value
pol = step.policy
policy, v = alphabot.predict(state[np.newaxis])
policy = policy[0]

# Rebuild a map from channels 0 and 2 of the state. The map takes the state's
# own spatial shape so the boolean masks always line up with it.
mapp = np.full(state.shape[:2], -1)
mapp[state[..., 0] == 1] = 1
mapp[state[..., 2] == 1] = 1

# Zero the probability of actions the emulator reports as invalid, then renormalise.
valid_actions = emulator.Game(2).valid_actions(mapp, state, state[..., -1].all() == 1)
if len(valid_actions) < 4:
    missing_idx = [action for action in [0, 1, 2, 3] if action not in valid_actions]
    policy[missing_idx] = 0

if sum(policy) > 0:
    policy = policy / sum(policy)


# (masked network policy, stored policy, stored value, network value)
policy, pol, value, v

(array([0.33652508, 0.        , 0.33197975, 0.3314952 ], dtype=float32),
 array([0.57352941, 0.        , 0.22058824, 0.20588235]),
 1,
 array([[0.00925609]], dtype=float32))

In [210]:
# Play one game with the candidate network and save it as an animated GIF.
states = simulate_game(10, 0.8, alphabot=alphabot, eval_g=True, return_state=True)

# (cell code, RGB channel, intensity): codes 1 and 3 are drawn in the red
# channel, codes 2 and 4 in the green channel.
palette = ((1, 0, 128), (2, 1, 128), (3, 0, 255), (4, 1, 255))

maps = []
for state in states:
    # Work on a copy: state[..., 0] is a view, and editing it in place would
    # silently corrupt the recorded states.
    codes = state[..., 0].copy()
    codes += state[..., 2] * 2
    codes[state[..., 1] == 1] = 3
    codes[state[..., 3] == 1] = 4

    # Expand the code grid to three channels, then paint each code.
    frame = np.tile(codes[..., np.newaxis], [1, 1, 3])
    for code, channel, intensity in palette:
        mask = codes == code
        frame[mask, :] = 0
        frame[mask, channel] = intensity

    maps.append(frame)

maps = np.array(maps)

write_gif(maps, './test.gif', fps=5)

In [500]:
# Set the root logger's threshold to INFO (hides the per-step DEBUG messages).
logging.getLogger().setLevel(logging.INFO)

In [22]:
%load_ext autoreload
%autoreload 2

In [320]:
# Ad-hoc inspection: channel 2 of one stored state, and the policy of a nearby step.
complete_history[506].state[..., 2], complete_history[503].policy

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 array([0.        , 0.14814815, 0.33333333, 0.51851852]))

In [53]:
# Ad-hoc check of the axis-2 reversal used by state_flip(vert=False):
# stack every 4th state from index 506, reverse axis 2, show channel 2 of the first one.
np.array([s.state for s in complete_history[506:1000:4]])[:, :, ::-1][0, ..., 2]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])