In [1]:
import tensorflow as tf
import pickle
import keras
from keras.models import Model, load_model, clone_model
from keras.layers import *
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.losses import categorical_crossentropy
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
sys.path.append('src')  # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
from mcts import *
from custom_layers import *
from simmetries import *
import scipy.integrate as integrate
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


# Logging

In [2]:
logging.basicConfig(filename='logging.log', level=logging.INFO, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [23]:
#INPUT_SIZE = (None, None, 5) # Map size fixed to 16x16 (2 to 3 players)
INPUT_SIZE = (16, 16, 5) # Map size fixed to 16x16 (2 to 3 players)
N_ACTIONS = 4
gpus = 1

def manage_predictions():
    t = time.time()
    
    while not winner_buffer.full():
        indices1, states1 = [], []
        indices2, states2 = [], []
        
        if processable_buffer.qsize() < min(num_threads, 2): # Wait until a bunch of requests are queued
            continue

        net0, net1 = [], []
        for i in range(processable_buffer.qsize()):
            index, state, net = processable_buffer.get()
            if net == False:
                if state[..., -1].all() == 0:
                    net = 'alphabot'
                else:
                    net = 'alphabot_best'
            else:
                if state[..., -1].all() == 0:
                    net = 'alphabot_best'
                else:
                    net = 'alphabot'
                
            if net == 'alphabot':
                indices1.append(index)
                states1.append(state)
            elif net == 'alphabot_best':
                indices2.append(index)
                states2.append(state)
        
        predictions1, predictions2 = [], []
        if len(states1) > 0:
            states1 = np.array(states1, dtype=np.float32)
            predictions1 = alphabot.predict(states1)
        if len(states2) > 0:
            states2 = np.array(states2, dtype=np.float32)
            predictions2 = alphabot_best.predict(states2)

        if len(predictions1) > 0:
            for i, pred in enumerate(tuple(zip(predictions1[0], predictions1[1]))):
                pipes[indices1[i]].send(dict(zip(alphabot.output_names, pred)))
        
        if len(predictions2) > 0:
            for i, pred in enumerate(tuple(zip(predictions2[0], predictions2[1]))):
                pipes[indices2[i]].send(dict(zip(alphabot.output_names, pred)))
        
        if time.time() - t > 120:
            t = time.time()
            logging.info('Finished evaluation %d games' % winner_buffer.qsize())
        
def simulate_games():
    logging.debug('Starting Threads for parallel Games')
    
    parallel_sim(evaluation=False) # Parallel Games
    
    while not history_buffer.full():
        indices, states = [], []
        if processable_buffer.qsize() < min(num_threads, 2): # Wait until a bunch of requests are queued
            continue

        for i in range(processable_buffer.qsize()):
            index, state, _ = processable_buffer.get()
            indices.append(index)
            states.append(state)
            
        states = np.array(states, dtype=np.float32)
        predictions = alphabot.predict(states)
        for i, pred in enumerate(tuple(zip(predictions[0], predictions[1]))):
            pipes[indices[i]].send(dict(zip(alphabot.output_names, pred)))

    logging.info('Finished Simulating %s games', n_games)

def play_eval(MAP_SIZE, reverted=False, pipe=None, process_id=None):
    global alphabot_best
    global alphabot
    
    if isinstance(MCTS_eval_steps, list):
        steps_long = MCTS_eval_steps[1]
        steps = MCTS_eval_steps[0]
    else:
        steps_long = MCTS_eval_steps   
    
    game = emulator.Game(2, MAP_SIZE)
    mapp = game.reset()
  
    old_mapp = None
    head = None
    turn = 0
    s = map_to_state(mapp, old_mapp, None, 0, MAP_SIZE)
    old_mapp = copy.deepcopy(mapp)
    counter_turn = 0
  
    while True:
        tree_player0 = MCTS()
        tree_player0.alpha = MCTS_eval_alpha
  
        tree_player1 = MCTS()
        tree_player1.alpha = MCTS_eval_alpha
        
        if counter_turn > MAP_SIZE * 2:
            steps = steps_long

        if turn == 0:
            policy = do_search(steps, s, mapp, game, tree_player0, MAP_SIZE, 
                               pipe=pipe, process_id=process_id, ask_predict=ask_predict, 
                               alphabot=reverted, allow_move=allow_move)
        else:
            policy = do_search(steps, s, mapp, game, tree_player1, MAP_SIZE,
                               pipe=pipe, process_id=process_id, ask_predict=ask_predict,
                               alphabot=reverted, allow_move=allow_move)

        choosen = np.argmax(policy)
        mapp, tmp_head = game.step(mapp, s, choosen, turn, mcts=True)
        counter_turn += 1/2

        turn = 1 - turn
        if turn == 0:  # We update the state
            s = map_to_state(mapp, old_mapp, s, 0, MAP_SIZE, head)  # TODO: Map to state
        else:
            head = tmp_head
            s[..., -1] = 1

        if turn == 0:
            old_mapp = np.array(mapp)
        
        if game.game_ended():
            logging.debug('GAME ENDED, %s won, %s <- reverted' % (turn, reverted))
            if not reverted:
                return turn
            else:
                return  1 - turn

def train_model():        
        picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history))) 
        
        state = []
        policy = []
        value = []
        for step in picked_data:
            policy.append(step.policy)
            state.append(step.state)
            value.append(step.value)
            
        y = [np.zeros((len(state), 4)), np.zeros((len(state), 1))]
        y[0] = policy
        y[1] = value
        
        losses = alphabot.train_on_batch(np.array(state, dtype=np.float32), y)
        return losses

def training_cycle():
    global alphabot
    global alphabot_best
    global total_improv
    
    simulate_games()
    
    # history_buffer contains the games, we store them inside complete history    
    tmp_buffer = {} # We store on different buffers based on image size
    for g in range(history_buffer.qsize()):
        sample = history_buffer.get()
        if not sample.map_size in tmp_buffer.keys():
            tmp_buffer[sample.map_size] = []
        tmp_buffer[sample.map_size].append(sample)
        
    for MAP_SIZE, samples in tmp_buffer.items():
        tmp_buffer = apply_simmetries(samples, MAP_SIZE)
        complete_history.extend(tmp_buffer)
        
    stop_simulation() # We can now stop the simulation (will free the memory)
    
    logging.info('Starting Model Training')
    losses = [0, 0, 0] # For debug purpose
    sum_loss = 0
    cc = 1
    for i in range(t_steps + 1):
        if i % 250 == 0 and i > 0:
            logging.info('Training Interaction: %s losses: %s %s', i, 
                         round(sum_loss / cc, 2), np.round(losses, 2))
        
        losses = train_model()
        sum_loss += losses[0]
        logging.debug('Losses: %s', losses)
        
        cc += 1
        if i % eval_steps == 0 and i > 0:
            cc = 1 # Reset loss counter
            sum_loss = 0
            wins = {'candidate' : 0, 'best' : 0}
            n_c = {0 : 'candidate', 1 : 'best'}
            
            logging.info('Starting self-play evaluation')    
            parallel_sim(evaluation=True) # Start Parallel Games
            manage_predictions()
            for i in range(winner_buffer.qsize()):
                w = winner_buffer.get()
                wins[n_c[w]] += 1 # add a win to the winner
            stop_simulation()
            
            win_ratio = round(wins['candidate'] / eval_games, 2)
            if win_ratio >= win_percent:
                logging.info('Great! Our candidate won %s percent of games', round(win_ratio * 100))
                total_improv += 1
                logging.info('Our bot got better %s times', total_improv)
                alphabot.save('alphabot_best.pickle')
                replace_best()    
            else:
                logging.info('Damn! Our candidate only won %s percent of games', round(win_ratio * 100, 2))         
                logging.info('Cloning to best')
                reload_best()
            
    if len(complete_history) >= k * n_games * 7:
        logging.info('Removing oldest games')
        del complete_history[:n_games * 7] # Delete n oldest games from history

def load_best(best_model):
    global alphabot
    global alphabot_best
    alphabot_best = load_model(best_model, 
                               custom_objects={'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits,
                                          'categorical_weighted' : categorical_weighted})
    alphabot = load_model(best_model, 
                               custom_objects={'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits,
                                          'categorical_weighted' : categorical_weighted})

def reload_best():
    global alphabot
    global alphabot_best
    
    alphabot.set_weights(alphabot_best.get_weights())

def replace_best():
    global alphabot
    global alphabot_best
    
    alphabot_best.set_weights(alphabot.get_weights())

def train(cycles):
    global alphabot_best
    global alphabot
        
    for i in range(cycles):
        logging.info('Training cycle: %s', i)
        training_cycle()

def ask_predict(idi, x, net=None):
    # Adds to queue id and data from process
    processable_buffer.put((idi, x, net))

def sim(process_id, pipe, evaluation=False):
    np.random.seed()
    random.seed()
    
    if evaluation:
        while True:
            reverted = np.random.random() >= 0.5
            MAP_SIZE = random_mapsize()
            winner = play_eval(MAP_SIZE, reverted, pipe, process_id)
            
            try:
                winner_buffer.put_nowait(winner)
            except:
                break
    
    else:
        while True:
            MAP_SIZE = random_mapsize()
            train_steps = simulate_game(MCTS_steps, MCTS_alpha, MAP_SIZE, pipe, ask_predict, process_id)            
            
            try:
                for step in train_steps:
                    history_buffer.put_nowait(step)
            except:
                break
                                
def stop_simulation():
    global workers
    global history_buffer
    global processable_buffer
    global winner_buffer
    global pipes
    global child_pipes
    
    if 'workers' in globals() and len(workers) != 0:
        for worker in workers:
            worker.terminate()
    workers = []
    
    for pipe in pipes:
        pipe.close()

    for pipe in child_pipes:
        pipe.close()
        
    history_buffer.close()
    processable_buffer.close()
    winner_buffer.close()
    
    # Then we empty the queues
    del history_buffer
    del processable_buffer
    del pipes
    del child_pipes
    del winner_buffer

def parallel_sim(evaluation=False):
    global workers
    global history_buffer
    global processable_buffer
    global winner_buffer
    global pipes
    global child_pipes
    
    if 'workers' in globals() and len(workers) != 0:
        stop_simulation()
    
    history_buffer = Queue(n_games) # This numbers can be tweaked
    winner_buffer = Queue(eval_games)
    processable_buffer = Queue(num_threads)
    pipes = []
    child_pipes = []
    
    workers = []
    for i in range(num_threads):
        parent_pipe, child_pipe = Pipe() # Pipe to communicate with childs
        pipes.append(parent_pipe)
        child_pipes.append(child_pipe)
        
        worker = Thread(target=sim, args=[i, child_pipe, evaluation])
        worker.daemon = False
        worker.start()
        workers.append(worker)

def random_mapsize(mu=10.5, sigma=np.sqrt(10), maxsize=16):
    normal = lambda x : (1 / np.sqrt(2*np.pi * np.square(sigma)) \
                         * np.power(np.e, -(np.square((x - mu)) \
                                            / (2 * np.square(sigma)))))
    
    values = [normal(i) for i in range(5, 17)]
    a = np.array(values) / np.sum(values)
    sample = np.random.choice(range(5, 17), p=a, size=1)[0]
    
    return sample

In [14]:
alphabot = declare_model(n_channels=75, n_residual=3)
alphabot.compile(optimizer=SGD(1e-4, momentum=0.9),
                          loss={'value' : 'mse', 'policy': categorical_crossentropy},
                          loss_weights={'value' : 0.25, 'policy' : 1})
alphabot.summary(line_length=112)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_1 (InputLayer)                (None, 16, 16, 5)        0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 16, 16, 75)       3375          input_1[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 16, 16, 75)       300           conv[0][0]                           
________________________________________________________________________________________________________________
conv_relu (Activation)              (None, 16, 16, 75)       0             conv_bn[0][0]        

In [15]:
#alphabot.save('alphabot_best.pickle')
alphabot_best = load_model('alphabot_best.pickle', custom_objects={'tf': tf, 'ZeroConv': ZeroConv})

In [16]:
reload_best()

In [224]:
load_best('alphabot_best.pickle')

In [18]:
# History of games for training
complete_history = []

# Game Params
n_players = 2
n_games = 15_000 # Simulate N games before each training
k = 4 # Games to be stored n_games * K

# Eval options
allow_move = False
use_eval_choice = False

# Simulation Params
num_threads = 6

#MCTS_steps = [50, 130]
#MCTS_eval_steps = [30, 60]
MCTS_steps = [45, 100]
MCTS_eval_steps = [40, 60]
#MCTS_steps = [35, 90]
#MCTS_eval_steps = [35, 55]
#MCTS_steps = [25, 65]
#MCTS_eval_steps = [30, 50]

MCTS_alpha = 1.1
MCTS_eval_alpha = 1.1

# Training Params
t_steps = 3000 # Steps of training
eval_steps = 1000 # How many steps before evaluation
eval_games = 100 # How many games to play to evaluate who's best model
win_percent = 0.53 # Ratio of game won to become best model
BATCH_SIZE = 256
total_improv = 0

In [29]:
complete_history = []
cycles = 1000

train(cycles)

KeyboardInterrupt: 

In [None]:
cycles = 100
K.set_value(alphabot.optimizer.lr, 1e-2)
train(cycles)

In [None]:
cycles = 200
K.set_value(alphabot.optimizer.lr, 1e-3)
train(cycles)

In [None]:
cycles = 200
K.set_value(alphabot.optimizer.lr, 1e-4)
train(cycles)

In [None]:
K.set_value(alphabot.optimizer.lr, 1e-3)

In [31]:
logging.getLogger().setLevel(logging.INFO)

In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
alphabot_best = load_model('selected/dynsizebkp.pickle', 
                      custom_objects={'tf': tf, 'ZeroConv' : ZeroConv})

In [18]:
alphabot = load_model('alphabot_best.pickle',
                      custom_objects={'tf': tf,
                                      'ZeroConv' : ZeroConv})

In [19]:
# History of games for training
complete_history = []

# Game Params
n_players = 2
n_games = 1 #8_000 #10_000 # Simulate N games before each training
k = 5 # Games to be stored n_games * K

# Eval options
allow_move = False
use_eval_choice = False

# Simulation Params
num_threads = 6

MCTS_steps = 70
MCTS_eval_steps = [40, 70]
MCTS_eval_steps2 = MCTS_eval_steps
MCTS_alpha = 1.
MCTS_eval_alpha = 1.

# Training Params
t_steps = 2000 # Steps of training
eval_steps = 1 #500 # How many steps before evaluation
eval_games = 200 # How many games to play to evaluate who's best model
win_percent = 0.55 # Ratio of game won to become best model
BATCH_SIZE = 512
total_improv = 0

cycles = 1000

train(cycles)

ValueError: Cannot feed value of shape (1, 1, 75, 1) for Tensor 'Placeholder_345:0', which has shape '(3, 3, 75, 75)'

# Play Human

In [27]:
from IPython.display import display
from ipywidgets import Button, Layout, HBox

up = Button(description="^", layout=Layout(margin='10px 0px 0px 60px', width='100px', height='75px'))
left = Button(description="<", layout=Layout(margin='0px 10px 0px 0px', width='100px', height='75px'))
right = Button(description=">", layout=Layout(margin='0px 0px 0px 10px', width='100px', height='75px'))
down = Button(description="˅", layout=Layout(margin='0px 0px 0px 60px', width='100px', height='75px'))

def display_buttons():
    display(up)
    display(HBox([left, right]))
    display(down)

def on_button_clicked(b):
    actions = {'>' : 0, '˅' : 1, '<' : 2, '^' : 3}
    human_action = actions[b.description]
    display_buttons()
    s, game_end = step(human_action)
    if game_end:
        print('Game Ended')
        reset()
        
    clear_output(True)
    
up.on_click(on_button_clicked)
down.on_click(on_button_clicked)
left.on_click(on_button_clicked)
right.on_click(on_button_clicked)

display_buttons()

Button(description='^', layout=Layout(height='75px', margin='10px 0px 0px 60px', width='100px'), style=ButtonS…

HBox(children=(Button(description='<', layout=Layout(height='75px', margin='0px 10px 0px 0px', width='100px'),…

Button(description='˅', layout=Layout(height='75px', margin='0px 0px 0px 60px', width='100px'), style=ButtonSt…

[[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0]
 [1 0 1 1 1 1 0 0 1 0 0 0 0 1 1 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 1 1 1 1 1 1 1 1 1 1 3 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 1 1 1 1 1 2 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[('right', 1.0), ('down', 0.0), ('left', 0.0), ('up', 0.0)] [0.9731855] 155


In [26]:
reset()

In [25]:
m = 16

def reset():
    global s, mapp, old_mapp, game, mcts_tree, count_turn, time_to_move
    
    game = emulator.Game(2, MAP_SIZE=m)
    old_mapp = None
    mapp = game.reset()
    mcts_tree = MCTS()
    mcts_tree.alpha = 0.8
    s = map_to_state(mapp, old_mapp, None, 0, m)
    old_mapp = copy.copy(mapp)

    running = True
    count_turn = 1
    time_to_move = 2  # Time in seconds to pick a move
    #print(s[..., -1] + np.ones_like(s[..., -2:-1]) - s[..., -2])

def step(human_action):
    global s, mapp, old_mapp, game, mcts_tree, count_turn, time_to_move
    
    policy, steps_done, value = time_search(time_to_move, s, mapp, game, mcts_tree, alphabot,
                                            INPUT_SIZE=m)
    action = np.argmax(policy)
    count_turn += 1

    mapp, tmp_head = game.step(mapp, s, action, 0, mcts=True)  # Process player turn
    mapp, _ = game.step(mapp, s, human_action, 1, mcts=True)  # Process enemy turn
    s = map_to_state(mapp, old_mapp, s, 0, m, tmp_head)
    old_mapp = np.array(mapp)
    
    print(s[..., 0] + s[..., 2] + s[..., 1] + s[..., 3] * 2 \
          + np.ones_like(s[..., -2]) - s[..., -2])
    print([d for d in zip(game.dir_name.values(), np.around(policy, 2))], value, steps_done)
    
    return s, game.game_ended()

# Test fix conv layer

In [29]:
inp = alphabot.input

out = Conv2D(75, kernel_size=(4,4), strides=(1, 1), 
             kernel_initializer='ones', padding='same', use_bias=False)(inp)
out = BatchNormalization()(out)
conv1 = Activation('relu')(out)
#### From second

out = Conv2D(75, kernel_size=(4,4), strides=(1, 1), 
             kernel_initializer='ones', padding='same', use_bias=False)(conv1)
out = BatchNormalization()(out)
out = Activation('relu')(out)

out = ZeroConv()([conv1, out])

out = Conv2D(75, kernel_size=(4,4), strides=(1, 1), 
             kernel_initializer='ones', padding='same', use_bias=False)(conv1)
out = BatchNormalization()(out)
out = Activation('relu')(out)

out = ZeroConv()([conv1, out])

out = Conv2D(75, kernel_size=(4,4), strides=(1, 1), 
             kernel_initializer='ones', padding='same', use_bias=False)(conv1)
out = BatchNormalization()(out)
out = Activation('relu')(out)

out = ZeroConv()([conv1, out])

mini_model = Model(inp, out)

ones = np.zeros((9, 9, 5)) # Is a 9x9 -> on a 16x16
ones[:3, :3] = 1
ones[2, 2] = 0
mapp = np.zeros((16, 16, 5))
mapp[:ones.shape[0], :ones.shape[1]] = ones

after_conv = mini_model.predict(mapp[np.newaxis])[0, ..., 0]

plt.imshow(after_conv), after_conv[2, 2]

ValueError: Dimensions must be equal, but are 75 and 5 for 'zero_conv_52/mul' (op: 'Mul') with input shapes: [?,16,16,75], [?,16,16,5].

In [18]:
In [1]: %load_ext autoreload

In [2]: %autoreload 2

# Train Check

In [242]:
K.set_value(alphabot.optimizer.lr, 1e-2)
for i in range(10_000):
    picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))
    #picked_data = complete_history[0:50]
    
    state = []
    policy = []
    value = []
    for step in picked_data:
        policy.append(step.policy)
        state.append(step.state)
        value.append(step.value)

    y = [np.zeros((len(state), 4)), np.zeros((len(state), 1))]
    y[0] = policy
    y[1] = value
    
    losses = alphabot.train_on_batch(np.array(state, dtype=np.float32), y)
    print(losses)

[1.7725269, 1.3938543, 1.0267297]
[1.7904012, 1.4002347, 1.0727074]
[1.786456, 1.3974284, 1.0681561]
[1.7759346, 1.3955733, 1.0334966]
[1.7989341, 1.388261, 1.1547515]
[1.7792519, 1.387355, 1.079653]
[1.7678536, 1.3951814, 1.0027612]
[1.778127, 1.3886386, 1.0700347]
[1.7846489, 1.3923812, 1.0811597]
[1.7772214, 1.3864143, 1.0753275]
[1.773211, 1.3852948, 1.0637723]
[1.7750975, 1.3914261, 1.0468024]
[1.7937036, 1.3854845, 1.1450013]
[1.7704127, 1.3955926, 1.011414]
[1.7637405, 1.3866225, 1.020614]
[1.767346, 1.3938321, 1.0062056]
[1.7942452, 1.3967673, 1.1020703]
[1.7791798, 1.3935301, 1.054767]
[1.7636298, 1.3943883, 0.98914135]
[1.7662634, 1.3915446, 1.0110588]
[1.7569499, 1.3891838, 0.9832578]
[1.7656646, 1.3936056, 1.000438]
[1.7641922, 1.3933655, 0.99551696]
[1.7543228, 1.3889644, 0.97365457]
[1.761158, 1.3885818, 1.002536]
[1.764947, 1.3911164, 1.0075645]
[1.7618557, 1.38915, 1.0030742]
[1.7625796, 1.3902191, 1.0017064]
[1.7528853, 1.3881037, 0.97140193]
[1.7608098, 1.3896224, 0.9

[1.7315748, 1.3867378, 0.89517516]
[1.7285777, 1.382275, 0.9010537]
[1.7339944, 1.3890682, 0.89556277]
[1.7172102, 1.3866556, 0.83809245]
[1.7117151, 1.3840388, 0.82659453]
[1.7272698, 1.3881212, 0.8724991]
[1.7277074, 1.3840177, 0.8906796]
[1.727348, 1.3851744, 0.88463104]
[1.7018734, 1.3845347, 0.78530806]
[1.7193623, 1.3860453, 0.84923464]
[1.7132047, 1.3855042, 0.82678455]
[1.7432407, 1.3876896, 0.9382026]
[1.7267184, 1.3838177, 0.88761795]
[1.7285677, 1.3872237, 0.88140684]
[1.7340068, 1.3849967, 0.91208696]
[1.7308737, 1.3848231, 0.90026534]
[1.7319034, 1.3884945, 0.8897135]
[1.7223256, 1.387814, 0.8541395]
[1.7134112, 1.3854355, 0.82801217]
[1.7268507, 1.3871934, 0.87475455]
[1.7066108, 1.3851004, 0.8021827]
[1.7152191, 1.3871859, 0.8282912]
[1.724009, 1.3875551, 0.8619884]
[1.710766, 1.3839949, 0.823274]
[1.7141908, 1.3818411, 0.8456049]
[1.7231104, 1.3827217, 0.8777768]
[1.724673, 1.3853133, 0.87367755]
[1.7472934, 1.3846614, 0.9667819]
[1.7257141, 1.3863485, 0.87373275]
[1.72

[1.7006714, 1.3837204, 0.7874161]
[1.7199203, 1.3869075, 0.85167867]
[1.6752592, 1.384351, 0.6832764]
[1.6986374, 1.3806394, 0.7916495]
[1.696151, 1.3866227, 0.7577858]
[1.7030736, 1.3856689, 0.7893065]
[1.6885335, 1.383364, 0.7403809]
[1.6858808, 1.3789624, 0.7473911]
[1.6894382, 1.381589, 0.7511285]
[1.6929202, 1.3791658, 0.77476394]
[1.6855782, 1.3832663, 0.7290088]
[1.7066889, 1.3870928, 0.79816115]
[1.7016041, 1.3944845, 0.74827117]
[1.6917176, 1.3794973, 0.7686881]
[1.6861491, 1.382174, 0.73572326]
[1.681063, 1.379161, 0.72744566]
[1.6884592, 1.3892214, 0.7168037]
[1.7028852, 1.3864964, 0.78542316]
[1.700686, 1.3887731, 0.7675353]
[1.684516, 1.3846366, 0.71941733]
[1.6911515, 1.384937, 0.74477226]
[1.6881603, 1.3857439, 0.72959465]
[1.71309, 1.3900101, 0.8122644]
[1.6864887, 1.3816707, 0.7392317]
[1.697224, 1.381245, 0.78389096]
[1.7211286, 1.3897429, 0.84553367]
[1.6958171, 1.3838384, 0.7679212]
[1.6896296, 1.3834212, 0.7448553]
[1.6756016, 1.3810602, 0.6982032]
[1.6822121, 1.38

KeyboardInterrupt: 

In [53]:
step = complete_history[10000]
state = step.state
value = step.value
pol = step.policy
map_size = step.map_size
policy, v = alphabot.predict(state[np.newaxis])
policy = policy[0]

mapp = np.full((state.shape[0], state.shape[1]), -1)
mapp[state[..., 0] == 1] = 1
mapp[state[..., 2] == 1] = 1

valid_actions = emulator.Game(2, map_size).valid_actions(mapp, state, state[..., -1].all() == 1)
if len(valid_actions) < 4:
    missing_idx = [v for v in [0, 1, 2, 3] if v not in valid_actions]
    policy[missing_idx] = 0
             
if sum(policy) > 0:
    policy = policy / sum(policy)


policy, pol, value, v

(array([0.32682526, 0.08554007, 0.        , 0.5876347 ], dtype=float32),
 array([1.09939634e-08, 9.99895142e-01, 0.00000000e+00, 1.04846605e-04]),
 -1,
 array([[-0.16418928]], dtype=float32))

In [63]:
# Average value on set (should be ~ 0)
preds = []
for i in range(len(complete_history) // 8192 - 1):
    print(i)
    preds.extend(alphabot.predict(np.array([step.state for step in complete_history[i*8192: (i+1)*8192]]))[1])

np.average(np.array(preds)), del preds

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


0.078885645

In [49]:
# Count positive and negative exp. (should be ~ equal)

pos, neg = 0, 0
for experience in complete_history:
    if experience.value < 0:
        neg += 1
    else:
        pos += 1

pos, neg

(125426, 126574)

In [11]:
for i in range(1):
    states = simulate_game(10, 0.8, 9, alphabot=alphabot, eval_g=True, return_state=True)

maps = []
for state in states:
    mapp = state[..., 0]
    mapp += state[..., 2] * 2
    mapp[np.where(state[..., 1] == 1)] = 3
    mapp[np.where(state[..., 3] == 1)] = 4
    mapp = np.expand_dims(mapp, axis=-1)
    mapp = np.tile(mapp, [1, 1, 3])
        
    idx, cols, c = np.where(mapp == 1)
    mapp[idx, cols, :] = 0
    mapp[idx, cols, 0] = 128
    
    idx, cols, c = np.where(mapp == 2)
    mapp[idx, cols, :] = 0
    mapp[idx, cols, 1] = 128
    
    idx, cols, c = np.where(mapp == 3)
    mapp[idx, cols, :] = 0
    mapp[idx, cols, 0] = 255
    
    idx, cols, c = np.where(mapp == 4)
    mapp[idx, cols, :] = 0
    mapp[idx, cols, 1] = 255
    
    mapp = cv2.resize(mapp.astype(np.uint8), (480, 480), interpolation=cv2.INTER_AREA)
    
    maps.append(mapp)
    
maps = np.array(maps)

#write_gif(maps, './test.gif', fps=5)

NameError: name 'alphabot' is not defined

In [320]:
complete_history[506].state[..., 2], complete_history[503].policy

(array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 array([0.        , 0.14814815, 0.33333333, 0.51851852]))

In [53]:
np.array([s.state for s in complete_history[506:1000:4]])[:, :, ::-1][0, ..., 2]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])