In [1]:
import tensorflow as tf
import pickle
import keras
from keras.models import Model, load_model, clone_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.losses import binary_crossentropy
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
sys.path.append('src')  # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

Using TensorFlow backend.


# Logging

In [2]:
logging.basicConfig(filename='logging.log', level=logging.DEBUG, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [3]:
INPUT_SIZE = (16, 16, 5) # Map size fixed to 16x16 (2 to 3 players)
N_ACTIONS = 4
gpus = 1

# Define the Layers Blocks

In [4]:
filters = 48

# Convolutional Block
def conv_block(in_layer, name, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    l = Conv2D(filters, kernel_size, padding='same', name = name, kernel_regularizer=l2(1e-4),
              kernel_initializer='truncated_normal')(in_layer)
    if bn:
        l = BatchNormalization(axis=3, name = name + '_bn')(l)
    if relu:
        l = Activation('relu', name = name + '_relu')(l)
    
    l = Dropout(0.3)(l)
    return l

# Residual Block
def residual_conv(in_layer, idx, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    name = 'res_' + str(idx)
    # Full conv block of pre-defined shape
    l = conv_block(in_layer, name + '_conv1', filters, kernel_size=(3,3), bn=True, relu=True)
    # Second block with skip connection
    l = Conv2D(filters, kernel_size, padding='same', name = name + '_conv2', kernel_regularizer=l2(1e-4),
              kernel_initializer='truncated_normal')(l)
    if bn:
        l = BatchNormalization(axis=3, name = name + '_conv2_bn')(l)
    l = Concatenate()([in_layer, l]) # Skip conn.
    if relu:
        l = Activation('relu', name = name + '_relu')(l)
        
    l = Dropout(0.3)(l)
    return l

def value_head(in_layer):
    l = conv_block(in_layer, 'value_head', filters=1, kernel_size=(1,1))
    l = Flatten(name = 'value_flatten')(l)
    l = Dense(64, name = 'value_dense',
             kernel_initializer='truncated_normal')(l)
    l = Activation('relu', name = 'value_relu')(l)
    l = Dropout(0.3)(l)
    l = BatchNormalization(axis=1, name = 'value_bn')(l)

    l = Dense(1, name = 'value', activation='tanh')(l) # Value output
    return l

def policy_head(in_layer):
    l = conv_block(in_layer, 'policy_head', filters=2, kernel_size=(1,1))
    l = Flatten(name = 'policy_flatten')(l)
    l = Dense(64, name='policy_dense', kernel_initializer='truncated_normal')(l)
    l = Dropout(0.3)(l)
    l = BatchNormalization(axis=1, name = 'policy_bn')(l)
    
    l = Dense(N_ACTIONS, name = 'policy', activation=None,
              kernel_initializer='truncated_normal')(l) # Policy output
    return l

# Model

In [5]:
def create_model():
    def declare_model():
        n_residual = 1#5

        input_layer = Input(INPUT_SIZE)
        l = conv_block(input_layer, 'conv')
        for i in range(n_residual):
            l = residual_conv(l, idx=i + 1)

        policy = policy_head(l)
        
        value = value_head(l)

        alphabot = Model(input_layer, [policy, value])
        return alphabot
    
    if gpus > 1:
        with tf.device('/cpu:0'):
            alphabot = declare_model()
        alphabot_multi = multi_gpu_model(alphabot, gpus=gpus)
        return alphabot_multi, alphabot
    
    alphabot = declare_model()
    return alphabot, alphabot

In [6]:
def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis] # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis]
    return e_x / div

In [7]:
def pre_train():
    #global complete_history
    #complete_history = []
    
    logging.info('Starting pretraining')
    for i in range(pretrain_games):
        if i % n_games == 0 and i > 0:
            logging.info('Simulated %s pretrain-games', n_games)
            
        games_buffer = [GameRecorder() for player in range(n_players)] # Create a place to store games
        game = emulator.Game(n_players) # TODO: Wrap the following lines in a function
        gmap = game.map # Access map manually on first step
        gmap_old = None # First frame has no older map
        p_alive = game.players_alive # Players alive
        n_alive = game.count_alive()

        def take_action_brain(game):
            actions = []
            for idx, s in enumerate(game.players_alive):
                player_idx = idx
                p_x, p_y = game.history[player_idx][-1]
                empty = -1
                size = INPUT_SIZE[0]
                liberties = np.array([0, 0, 0, 0]) # Right, Down, Left, Up
                liberties[0] = int(game.map[p_x % size, (p_y + 1) % size] == empty)
                liberties[1] = int(game.map[(p_x + 1) % size, p_y % size] == empty)
                liberties[2] = int(game.map[p_x % size, (p_y - 1) % size] == empty)
                liberties[3] = int(game.map[(p_x - 1) % size, p_y % size] == empty)
                if liberties.any() == 0:
                    actions.append(np.random.randint(0, len(liberties)))
                else:
                    while True:
                        x = np.random.randint(0, len(liberties))
                        if np.random.random() < 0.35: # A small chance of getting the action without even trying
                            actions.append(x)
                            break
                        if liberties[x] == 1:
                            actions.append(x)
                            break
            return actions
                
        while True:
            state = map_to_state(gmap, gmap_old, p_alive) # State for each player alive
            chosen_action = take_action_brain(game)
            
            gmap_old = copy.copy(gmap)
            gmap, p_alive_new, n_alive, reward, game_end = game.step(chosen_action)
        
            idx_alive = 0
            for alive in p_alive: # Players which were alive at the start of the step
                if alive == 0: # Player is dead, skip it
                    continue
                
                games_buffer[idx_alive].store(state[idx_alive], reward[idx_alive], chosen_action[idx_alive])
                idx_alive += 1
            p_alive = copy.copy(p_alive_new)
            
            if game_end:
                logging.debug('Game ended, rewards %s', reward)
                break
        
        for g in games_buffer:
            if len(g.actions_taken) == 0:
                logging.debug('WHAT A 0 STEPS GAME')
                continue

            complete_history.append(g)
    
    sum_losses = 0
    for i in range(pretrain_steps):
        losses = train_model()
        sum_losses += losses[0]
        if i % 100 == 0 and i > 0:
            logging.info('Pretrain step %s losses: %s', i, sum_losses / (i+1))

In [8]:
def simulate_games():
    logging.debug('Starting Threads for parallel Games')
    
    parallel_sim() # Parallel Games
    while not history_buffer.full():
        indices, states = [], []
        if processable_buffer.qsize() < num_threads * 2: # Wait until a bunch of requests are queued
            continue

        for i in range(processable_buffer.qsize()):
            index, state = processable_buffer.get()
            indices.append(index)
            states.append(state)
            
        states = np.array(states, dtype=np.float64)
        #print(states[:, :, :, 0])
        predictions = alphabot.predict(states)
        #print(predictions)
        for i, pred in enumerate(tuple(zip(predictions[0], predictions[1]))):
            pipes[indices[i]].send(dict(zip(alphabot.output_names, pred)))
        # We have to predict until buffer is full
    logging.info('Finished Simulating %s games', n_games)

In [9]:
def play_eval(log_game=False):
    game = emulator.Game(n_players) # TODO: Wrap the following lines in a function
    gmap = game.map # Access map manually on first step

    gmap_old = None # First frame has no older map
    p_alive = game.players_alive # Players alive
    n_alive = game.count_alive()
    
    #maps = [] # Initialise buffer for log
    #maps.append(copy.copy(gmap))
    
    while True:
        assert n_alive == 2, 'Multi player eval is not implemented yet'
        state = map_to_state(gmap, gmap_old, p_alive) # State for each player alive
        
        # The predictions from the candidate and the best bot
        p0 = alphabot.predict(state[0][np.newaxis])
        p1 = alphabot_best.predict(state[1][np.newaxis])
        
        # Split in value and policy
        candidate_policy = p0[0]
        candidate_value = p0[1]
        best_policy = p1[0]
        best_value = p1[1]
        
        logging.debug('Candidate Policy: %s Candidate Value: %s', candidate_policy, candidate_value)
        logging.debug('Best Bot Policy: %s Best Bot Value: %s', best_policy, best_value)
        
        policy = [candidate_policy[0], best_policy[0]]
        policy = softmax(np.array(policy)) # We softmax the policy logits
        chosen_action = np.argmax(policy, axis=-1)
        
        gmap_old = copy.copy(gmap)
        gmap, p_alive, n_alive, reward, game_end = game.step(chosen_action)
        #maps.append(copy.copy(gmap))
        
        if game_end:
            if sum(reward == -1) == 2:
                return play_eval()
            winner = np.where(np.array(p_alive) == 1)[0][0]
            if log_game:
                return maps

            return winner

In [10]:
def train_model():
        # Get a BATCH_SIZE of games
        picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))
        # Get a State from each game selected
        x = np.empty((len(picked_data), 16, 16, 4), dtype=np.float64)
        actions_taken = []
        rewards = []
        for j, game in enumerate(picked_data):
            index = np.random.randint(0, len(game.actions_taken)) # Get game length and generate index
            action = game.actions_taken[index]
            state = game.states[index]
            #state, action = simmetries(state, action) # Apply random simmetry
            x[j] = np.array(state, dtype=np.float64)
            actions_taken.append(action)    
            rewards.append(np.array(game.rewards[-1], dtype=np.float64))
        
        #rewards = np.array(rewards)
        actions_taken = np.array(actions_taken)
        y = alphabot.predict(x)
        #y = [np.zeros((x.shape[0], 4)), np.zeros((x.shape[0], 1))]
        logging.debug('The predict is %s', y)
        
        for idx, action in enumerate(actions_taken):
            if rewards[idx] == -1: # Loss
                #y[0][idx, :] = 1 # Every other actions is good
                y[0][idx, action] = 0 # Selected is bad
            else: # Win
                y[0][idx, action] = 1 # Every action is bad except this one
            y[1][idx, 0] = rewards[idx] # Policy is easier to manage
        logging.debug('The label is %s', y)
        losses = alphabot.train_on_batch(x, y)
        return losses

In [11]:
def training_cycle():
    global alphabot
    global alphabot_best
    global total_improv
    
    # Simulate n_games (exception made by first interaction)
    logging.info('Starting Training Cycle')
    while len(complete_history) < k * n_games:
        simulate_games()
        # history_buffer contains the games, we store them inside complete history    
        for g in range(history_buffer.qsize()):
            complete_history.append(history_buffer.get())
        stop_simulation() # We can now stop the simulation (will free the memory)
    logging.debug('Complete history should be full, it contains %s elements', len(complete_history))
    # Now we are ready for the training process
    logging.info('Starting Model Training')
    losses = [None, None, None] # For debug purpose
    sum_loss = 0
    cc = 1
    for i in range(t_steps + 1):
        if i % 100 == 0:
            logging.info('Training Interaction: %s losses: %s', i, 
                         round(sum_loss / cc, 2)) # Works?

        losses = train_model()
        sum_loss += losses[0]
        logging.debug('Losses: %s', losses)
        
        improved = False
        evalued_step = False
        cc += 1
        if i % eval_steps == 0 and i > 0:
            evalued_step = True
            cc = 1 # Reset loss counter
            sum_loss = 0
            wins = {'candidate' : 0, 'best' : 0}
            n_c = {0 : 'candidate', 1 : 'best'}
            
            logging.info('Starting self-play evaluation')    
            for j in range(eval_games):
                # 0 is Candidate, 1 is the (soon to be old) best
                wins[n_c[play_eval()]] += 1 # add a win to the winner
                if j % 100 == 0:
                    logging.info('Win state Candidate: %s Best: %s', wins['candidate'], wins['best'])
            win_ratio = wins['candidate'] / eval_games
            if win_ratio > win_percent:
                logging.info('Great! Our candidate won %s percent of games', round(win_ratio * 100, 2))
                total_improv += 1
                logging.info('Our bot got better %s times', total_improv)
                improved = True
                with open(r"alphabot_best.pickle", "wb") as output_file:
                    pickle.dump(alphabot, output_file)
                with open(r"alphabot_best.pickle", "rb") as input_file:
                    alphabot_best = pickle.load(input_file)
            else:
                logging.info('Damn! Our candidate only won %s percent of games', round(win_ratio * 100, 2))            
        if not improved and evalued_step:
            logging.info('Not improved, cloning to best')
            with open(r"alphabot_best.pickle", "rb") as input_file:
                alphabot = pickle.load(input_file)
            
    del complete_history[:n_games] # Delete n oldest games from history

In [12]:
def load_best(best_model):
    global alphabot
    global alphabot_best
    alphabot_best = load_model(best_model)
    alphabot.set_weights(alphabot_best.get_weights())

In [13]:
def train(cycles, best_model = None):
    global alphabot_best
    global alphabot
    
    if best_model != None:
        with open(r"alphabot_best.pickle", "rb") as input_file:
            alphabot_best = pickle.load(input_file)
        with open(r"alphabot_best.pickle", "rb") as input_file:
            alphabot = pickle.load(input_file)
    else:
        with open(r"alphabot_best.pickle", "rb") as input_file:
            alphabot_best = pickle.load(input_file)

    pre_train()
    with open(r"alphabot_best.pickle", "wb") as output_file:
        pickle.dump(alphabot, output_file)
    
    with open(r"alphabot_best.pickle", "rb") as input_file:
        alphabot_best = pickle.load(input_file)
    
    complete_history = []
    for i in range(cycles):
        training_cycle()

In [14]:
class GameRecorder():
    def __init__(self):
        self.states = []
        self.rewards = []
        self.actions_taken = []
        
    def store(self, state, reward, action_taken):
        self.states.append(state)
        self.rewards.append(reward)
        self.actions_taken.append(action_taken)

In [15]:
def ask_predict(idi, x):
    # Adds to queue id and data from process
    [processable_buffer.put((idi, xi)) for xi in x]

def sim(process_id, pipe):
    np.random.seed()
    random.seed()
    
    while True:
        games_buffer = [GameRecorder() for player in range(n_players)] # Create a place to store games
        
        # Simulate the game, if a prediction is needed use ask_predict
        game = emulator.Game(n_players) # TODO: Wrap the following lines in a function
        gmap = game.map # Access map manually on first step
        gmap_old = None # First frame has no older map
        p_alive = game.players_alive # Players alive
        n_alive = game.count_alive()

        while True:
            state = map_to_state(gmap, gmap_old, p_alive) # State for each player alive
            ask_predict(process_id, state)
            policy, value = [], []
            for i in range(n_alive):
                raw_prediction = pipe.recv() # Receive actions from main
                policy.append(raw_prediction['policy'])
                value.append(raw_prediction['value'])
            
            policy = softmax(np.array(policy)) # We softmax the policy logits
            #chosen_action = [np.random.choice(N_ACTIONS, p=act) for act in policy]
            chosen_action = np.argmax(policy, axis=-1)
            logging.debug('Choosen Actions %s Raw Actions %s', chosen_action, policy)
            
            gmap_old = copy.copy(gmap)
            gmap, p_alive_new, n_alive, reward, game_end = game.step(chosen_action)
        
            idx_alive = 0
            for alive in p_alive: # Players which were alive at the start of the step
                if alive == 0: # Player is dead, skip it
                    continue
                
                games_buffer[idx_alive].store(state[idx_alive], reward[idx_alive], chosen_action[idx_alive])
                idx_alive += 1
            p_alive = copy.copy(p_alive_new)
            
            if game_end:
                logging.debug('Game ended, rewards %s', reward)
                break
        try:
            for g in games_buffer:
                # I didn't find a bug yet that makes some games be of 0 steps, gonna skip them for now
                if len(g.actions_taken) == 0:
                    logging.debug('WHAT A 0 STEPS GAME')
                    continue
                    
                history_buffer.put_nowait(g)
        except:
            break
                    
def stop_simulation():
    global workers
    global history_buffer
    global processable_buffer
    global pipes
    global child_pipes
    
    if 'workers' in globals() and len(workers) != 0:
        for worker in workers:
            worker.terminate()
    workers = []
    
    for pipe in pipes:
        pipe.close()
    
    for pipe in child_pipes:
        pipe.close()
    
    
    #for _ in range(history_buffer.qsize()):
    #    try:
    #        history_buffer.get_nowait()
    #    except:
    #        break
            
    #for _ in range(processable_buffer.qsize()):
    #    try:
    #        processable_buffer.get_nowait()
    #    except:
    #        break
        
    
        
    history_buffer.close()
    processable_buffer.close()
    
    # Then we empty the queues
    del history_buffer
    del processable_buffer
    del pipes
    del child_pipes

def parallel_sim():
    global workers
    global history_buffer
    global processable_buffer
    global pipes
    global child_pipes
    
    if 'workers' in globals() and len(workers) != 0:
        stop_simulation()
    
    history_buffer = Queue(n_games) # This numbers can be tweaked
    processable_buffer = Queue(num_threads * n_players)
    pipes = []
    child_pipes = []
    
    workers = []
    for i in range(num_threads):
        parent_pipe, child_pipe = Pipe() # Pipe to communicate with childs
        pipes.append(parent_pipe)
        child_pipes.append(child_pipe)
        
        worker = Thread(target=sim, args=[i, child_pipe])
        worker.daemon = False
        worker.start()
        workers.append(worker)

In [16]:
def simmetries(state, action):
    # There are these simmetries:
    # +90;+180:-90;-180 degrees rotations
    # Flips
    
    def rotate_state(state, rot):
        N = INPUT_SIZE[0] - 1
        if rot == 0: # Rotation of 0 is simple
            return state
        
        new_state = np.empty(INPUT_SIZE, dtype=np.int)
        it = np.nditer(state, flags=['multi_index'])
        
        if rot == 3: # Rot of 90
            while not it.finished:
                x, y, c = it.multi_index
                new_state[x, y, c] = state[y, N - x, c]
                it.iternext()
                
        elif rot == 2: # Rot of 180
            while not it.finished:
                x, y, c = it.multi_index
                new_state[x, y, c] = state[N - x, N - y, c]
                it.iternext()
        elif rot == 1: # Rot of 270
            while not it.finished:
                x, y, c = it.multi_index
                new_state[x, y, c] = state[N - y, x, c]
                it.iternext()
        return new_state
    
    # First we apply a random rotation simmetry
    simmetry = random.sample([0, 1, 2, 3], 1)[0]
    #print(simmetry * 90)
    #print(action, simmetry)
    action = (action + simmetry) % N_ACTIONS
    state = rotate_state(state, simmetry)
    return state, action

In [17]:
alphabot, _ = create_model()
alphabot.compile(optimizer=Adam(1e-4), 
                          loss={'value' : 'mse', 'policy' : 'categorical_crossentropy'},
                          loss_weights={'value' : 0.1, 'policy' : 2.0})
alphabot.summary(line_length=112)

with open(r"alphabot_best.pickle", "wb") as output_file:
    pickle.dump(alphabot, output_file)
    
with open(r"alphabot_best.pickle", "rb") as input_file:
    alphabot_best = pickle.load(input_file)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_1 (InputLayer)                (None, 16, 16, 5)        0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 16, 16, 48)       2208          input_1[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 16, 16, 48)       192           conv[0][0]                           
________________________________________________________________________________________________________________
conv_relu (Activation)              (None, 16, 16, 48)       0             conv_bn[0][0]        

In [18]:
# History of games for training
complete_history = []

# Game Params
n_players = 2
n_games = 15_000 # Simulate N games before each training
k = 6 # Games to be stored n_games * K

# Simulation Params
num_threads = 30

# Training Params
t_steps = 2000 # Steps of training
eval_steps = 1000 # How many steps before evaluation
eval_games = 500 # How many games to play to evaluate how's best model
win_percent = 0.55 # Ratio of game won to become best model
BATCH_SIZE = 384
total_improv = 0
pretrain_steps = 0 #750
pretrain_games = 0 #30_000

In [19]:
complete_history = []
cycles = 1000
assert False
train(cycles)

AssertionError: 

In [20]:
class MCTS():
    
    def __init__(self):
        self.tree = []
        self.P = {}
        self.Q = {}
        self.N = {}
        self.alpha = 0.8
    
    def search(self, s, mapp, game, nnet):
        logging.debug('Starting search')
        s_k = s.tobytes()
                
        if game.game_ended():
            logging.debug('Game Ended during search')
            game.reward = 0
            return -1
        
        if s_k not in self.tree:
            logging.debug('New state encountered')
            self.tree.append(s_k)
            policy, value = nnet.predict(s[np.newaxis])
            self.P[s_k], v = policy[0], value[0]
            self.Q[s_k] = np.zeros((4))
            self.N[s_k] = np.zeros((4))
            return -v
        
        max_u, best_a = -float('inf'), -1
        logging.debug('Evaluating UCB')
        for a in range(4): # The actions
            u = self.Q[s_k][a] + self.alpha * self.P[s_k][a] * np.sqrt(np.sum(self.N[s_k]) / (1 + self.N[s_k][a]))
            if u > max_u:
                max_u = u
                best_a = a
            logging.debug('Action %d has a value of %f' % (a, u))
        a = best_a
        
        turn = get_turn(mapp)
        #print('before is turn of', turn)
        logging.debug('\n ' + str(mapp))
        new_map = copy.deepcopy(mapp)
        new_map = game.step(new_map, s, a, turn)
        turn = get_turn(new_map)
        #print('now is turn of', turn)
        #if (mapp == new_map).all():
        #    print(mapp, '\n', new_map)
        #    assert False
        
        logging.debug(game.reward)
        logging.debug('New map after move, now is turn of %d' %turn)
        logging.debug('\n' + str(new_map))
        
        if turn == 0: # We update the state
            logging.debug('Player 0 turn, updating the map')
            sp = map_to_state(new_map, mapp, s, 0) # TODO: Map to state
        else:
            logging.debug('Player 1 turn, not updating the map')
            # But we have to change the point of view of it!
            sp = copy.copy(s)
            sp[..., -1] = 1
        
        v = self.search(sp, new_map, game, nnet)
        
        self.Q[s_k][a] = (self.N[s_k][a] * self.Q[s_k][a] + v) / (self.N[s_k][a] + 1)
        self.N[s_k][a] += 1
        
        return -v

In [21]:
game = emulator.Game(2)
mapp = game.reset()
s = map_to_state(mapp, None, None, 0)
tree = MCTS()
tree.alpha = 1

t = time.time()
for i in range(10):
    tree.search(s, mapp, game, alphabot)

NameError: name 'map_to_state' is not defined

In [22]:
def get_turn(x):
    idx, count = np.unique(x, return_counts=True)
    idx, count = idx[1:], count[1:]
    if count[0] == count[1]:
        return 0
    
    return idx[np.argmin(count)]

In [None]:
np.set_printoptions(precision=4)
np.set_printoptions(suppress=False)

In [None]:
x = tree.Q[s.tobytes()]
x / sum(x)

In [None]:
x = tree.N[s.tobytes()]
x / sum(x)

In [None]:
for x, y in tree.N.items():
    print(y)

In [None]:
sys.setrecursionlimit(80)

In [None]:
a = np.full((10, 10), fill_value=-1)
a[0, 0] = 1
np.unique(a, return_counts=True)

In [23]:
def map_to_state(gmap, gmap_old, state, turn):
    if type(gmap_old) != np.ndarray:
        gmap_old = np.full_like(gmap, -1)
    
    states = np.empty(INPUT_SIZE, dtype=np.int)
    
    states = process_map(gmap, gmap_old, state, turn)
    return states

def process_map(gmap, gmap_old, state, idx):
    pov_0 = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    pov_0_last = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    pov_1 = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    pov_1_last = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    
    pov_0[np.where(gmap == 0)] = 1 # Set to 1 where player 0 is
    pov_0_last[np.where(gmap_old == 0)] = 1
    
    pov_1[np.where(gmap == 1)] = 1 # Set to  1 where player 1 is
    pov_1_last[np.where(gmap_old == 1)] = 1
    
    pov_0_last = pov_0 - pov_0_last
    pov_1_last = pov_1 - pov_1_last
    
    if sum(sum(pov_0_last)) == 0:
        #print('No player 0 head')
        pov_0_last = np.expand_dims(state[..., 1], axis=-1)
    
    if sum(sum(pov_1_last)) == 0:
        #print('No player 1 head')
        pov_1_last = np.expand_dims(state[..., 3], axis=-1)
    
    turn_m = np.full((*INPUT_SIZE[:2], 1), dtype=np.int, fill_value=idx)
    
    return np.concatenate([pov_0, pov_0_last, pov_1, pov_1_last, turn_m], axis=2)

In [44]:
def do_search(n, s, mapp, game, alphabot):
    for i in range(n):
        tree.search(s, mapp, game, alphabot)
    x = tree.N[s.tobytes()]
    x / (sum(x) + 1e-7)
    return x

game = emulator.Game(2)
mapp = game.reset()
tree = MCTS()

old_mapp = None
turn = 0
s = map_to_state(mapp, old_mapp, None, 0)
old_mapp = copy.deepcopy(mapp)
print(str(old_mapp).replace('-1', '--'))
i = 0
while not game.game_ended():        
    #print('HEAD 0 \n' + str(s[..., 1]).replace('0', '--'))
    #print('-------------------')
    #print(str(old_mapp).replace('-1', '--'))
    #print('-------------------')
    choosen = np.argmax(do_search(30, s, mapp, game, alphabot))
    #choosen = 0
    mapp = game.step(mapp, s, choosen, turn)
    
    turn = get_turn(mapp)
    if turn == 0: # We update the state
        logging.debug('Player 0 turn, updating the STATE')
        #print('turn player 0')
        s = map_to_state(mapp, old_mapp, s, 0) # TODO: Map to state
    else:
        logging.debug('Player 1 turn, not updating the STATE')
        #print('turn player 1') 
        # But we have to change the point of view of it!
        s[..., -1] = 1
    
    if turn == 0:
        old_mapp = copy.deepcopy(mapp)
    
    if turn == 0:
        print(str(mapp).replace('-1', '--'))
        print('-------------------')
    
    i += 1
    if game.game_ended():
        print('Game ended %d steps', i)
        
    clear_output(True)

Game ended %d steps 132


In [None]:
%load_ext autoreload
%autoreload 2