In [1]:
import tensorflow as tf
import keras
from keras.models import Model, load_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras.optimizers import Adam
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
sys.path.append('src') # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"

Using TensorFlow backend.


# Logging

In [2]:
logging.basicConfig(filename='logging.log', level=logging.INFO, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [3]:
INPUT_SIZE = (16, 16, 4) # Map size fixed to 16x16 (2 to 3 players)
N_ACTIONS = 4
gpus = 1

# Define the Layers Blocks

In [4]:
# Convolutional Block
def conv_block(in_layer, name, filters=128, kernel_size=(3,3), bn=True, relu=True):
    l = Conv2D(filters, kernel_size, padding='same', name = name, kernel_initializer='glorot_normal')(in_layer)
    if bn:
        l = BatchNormalization(name = name + '_bn')(l)
    if relu:
        l = Activation('relu', name = name + '_relu')(l)
    return l

# Residual Block
def residual_conv(in_layer, idx, filters=128, kernel_size=(3,3), bn=True, relu=True):
    name = 'res_' + str(idx)
    # Full conv block of pre-defined shape
    l = conv_block(in_layer, name + '_conv1', filters, kernel_size=(3,3), bn=True, relu=True)
    # Second block with skip connection
    l = Conv2D(filters, kernel_size, padding='same', name = name + '_conv2', kernel_initializer='glorot_normal')(l)
    if bn:
        l = BatchNormalization(name = name + '_conv2_bn')(l)
    l = Concatenate()([in_layer, l]) # Skip conn.
    if relu:
        l = Activation('relu', name = name + '_relu')(l)
    return l

def value_head(in_layer):
    l = conv_block(in_layer, 'value_head', filters=1, kernel_size=(1,1))
    l = Flatten(name = 'value_flatten')(l)
    l = Dense(128, name = 'value_dense', kernel_initializer='glorot_normal')(l)
    l = Activation('relu', name = 'value_relu')(l)
    l = Dense(1, name = 'value', activation='tanh')(l) # Value output
    return l

def policy_head(in_layer):
    l = conv_block(in_layer, 'policy_head', filters=2, kernel_size=(1,1))
    l = Flatten(name = 'policy_flatten')(l)
    l = Dense(N_ACTIONS, name = 'policy')(l) # Policy output
    return l

# Model

In [5]:
def create_model():
    def declare_model():
        n_residual = 16

        input_layer = Input(INPUT_SIZE)
        l = conv_block(input_layer, 'conv')
        for i in range(n_residual):
            l = residual_conv(l, idx=i + 1)

        policy = policy_head(l)
        value = value_head(l)

        alphabot = Model(input_layer, [policy, value])
        return alphabot
    
    if gpus > 1:
        alphabot = declare_model()
        alphabot_multi = multi_gpu_model(alphabot)
        return alphabot_multi, alphabot
    
    alphabot = declare_model()
    return alphabot, _

In [6]:
def softmax(z):
    assert len(z.shape) == 2
    s = np.max(z, axis=1)
    s = s[:, np.newaxis] # necessary step to do broadcasting
    e_x = np.exp(z - s)
    div = np.sum(e_x, axis=1)
    div = div[:, np.newaxis] # dito
    return e_x / div

In [7]:
alphabot, _ = create_model()
alphabot.compile(optimizer=Adam(), loss='mse')
alphabot.summary(line_length=112)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_1 (InputLayer)                (None, 16, 16, 4)        0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 16, 16, 128)      4736          input_1[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 16, 16, 128)      512           conv[0][0]                           
________________________________________________________________________________________________________________
conv_relu (Activation)              (None, 16, 16, 128)      0             conv_bn[0][0]        

In [119]:
# History of games for training
complete_history = []

# Game Params
n_players = 2
n_games = 100 #10_000 # Simulate N games before each training
k = 1 # Games to be stored n_games * K

# Simulation Params
num_threads = 1 #48

# Training Params
t_steps = 1_000 * 10 # Steps of training
eval_steps = 2_000 # How many steps before evaluation
eval_games = 300 # How many games to play to evaluate how's best model
win_percent = 0.55 # Ratio of game won to become best model
BATCH_SIZE = 1024

In [9]:
def simulate_games():
    logging.debug('Starting Threads for parallel Games')
    
    parallel_sim() # Parallel Games
    while not history_buffer.full():
        indices, states = [], []
        if processable_buffer.qsize() < num_threads * 2: # Wait until a bunch of requests are queued
            continue

        for i in range(processable_buffer.qsize()):
            index, state = processable_buffer.get()
            indices.append(index)
            states.append(state)
            
        predictions = alphabot.predict(np.array(states, dtype=np.float64))
        for i, pred in enumerate(tuple(zip(predictions[0], predictions[1]))):
            pipes[indices[i]].send(dict(zip(alphabot.output_names, pred)))
        # We have to predict until buffer is full
    logging.info('Finished Simulating %s games', n_games)

In [10]:
def play_eval():
    game = emulator.Game(n_players) # TODO: Wrap the following lines in a function
    gmap = game.map # Access map manually on first step
    gmap_old = None # First frame has no older map
    p_alive = game.players_alive # Players alive
    n_alive = game.count_alive()
        
    while True:
        assert n_alive > 2, 'Multi player eval is not implemented yet'
        state = map_to_state(gmap, gmap_old, p_alive) # State for each player alive
        
        # The predictions from the candidate and the best bot
        p = alphabot.predict(state)
        candidate_pred, best_pred = tuple(zip(p[0], p[1]))
        
        # Split in value and policy
        candidate_policy = candidate_pred[0]
        candidate_value = candidate_pred[1]
        best_policy = best_pred[0]
        best_value = best_pred[1]
        
        logging.debug('Candidate Policy: %s Candidate Value: %s', candidate_policy, candidate_value)
        logging.debug('Best Bot Policy: %s Best Bot Value: %s', best_policy, best_value)
        
        policy = [candidate_policy, best_policy]
        policy = softmax(np.array(policy)) # We softmax the policy logits
        chosen_action = np.argmax(policy, axis=-1)

        gmap, p_alive, n_alive, reward, game_end = game.step(chosen_action)

        if game_end:
            winner = p_alive[np.where(p_alive == 1)]
            return winner

In [76]:
def training_cycle(best_model = None):
    if best_model != None:
        alphabot_best = load_model(best_model)
    else:
        alphabot_best = copy.copy(alphabot)
        
    # Simulate n_games (exception made by first interaction)
    while len(complete_history) < k * n_games:
        simulate_games()
        # history_buffer contains the games, we store them inside complete history    
        for g in range(history_buffer.qsize()):
            complete_history.append(history_buffer.get())
        stop_simulation() # We can now stop the simulation (will free the memory)
    logging.debug('Complete history should be full, it contains %s elements', len(complete_history))
    # Now we are ready for the training process
    logging.info('Starting Training')
    for i in range(t_steps):
        # Get a BATCH_SIZE of games
        picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))
        # Get a State from each game selected
        x = np.array(random.sample([game.states for game in picked_data], k=1)[0], dtype=np.float64)
        actions_taken = [game.actions_taken for game in picked_data]
        rewards = [np.array(game.rewards[-1], dtype=np.float64) for game in picked_data]
        y = alphabot.predict(x)
        y[0][:, actions_taken] = rewards
        y[1][:, 0] = rewards
        logging.debug('Policy training: %s Value Training: %s', prediction[0], prediction[1])
        losses = alphabot.train_on_batch(x, prediction)
        logging.debug('Losses: %s', losses)
        
        if i % eval_steps == 0:
            wins = {'candidate' : 0, 'best' : 0}
            for j in range(eval_games):
                logging.info('Starting self-play evaluation')
                # 0 is Candidate, 1 is the (soon to be old) best
                wins[list(wins.keys())[play_eval()]] += 1 # add a win to the winner            
                logging.info('Eval game Ended, win state is: %s', wins)
            win_ratio = wins['candidate'] / eval_games
            if win_ratio > win_percent:
                logging.info('Great! Our candidate won %s of games', round(win_ratio * 100, 2))
                alphabot_best = copy.copy(alphabot)
            else:
                logging.info('Damn! Our candidate only won %s of games', round(win_ratio * 100, 2))
    del complete_history[:n_games] # Delete n oldest games from history

In [127]:
for g in complete_history:
    if len(g.rewards) == 0:
        print('WTF', g.states, g.actions_taken)

WTF [] []
WTF [] []


In [132]:
t = time.time()
cycles = 1
complete_history = []

for i in range(cycles):
    training_cycle()
    
time.time() - t

11
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
36
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
16
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
2
[0.0, -1.0]
[0.0, 1.0]
12
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
1
[1.0]
[]
18
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0]
2
[0

IndexError: list index out of range

In [12]:
def map_to_state(gmap, gmap_old, p_alive):
    if type(gmap_old) != np.ndarray:
        gmap_old = np.full_like(gmap, -1)
    
    n_alive = sum(p_alive == 1)
    states = np.empty((n_alive, *INPUT_SIZE), dtype=np.int)
    
    idx_alive = 0
    for idx, alive in enumerate(p_alive):
        if alive == 0: # Skip dead player
            continue
            
        # Player is alive, we collect its state
        states[idx_alive] = process_map(idx, gmap, gmap_old)
        idx_alive += 1
        
    return states

def process_map(idx, gmap, gmap_old):
    pov_me = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    pov_me_last = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    pov_not_me = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    pov_not_me_last = np.zeros((*INPUT_SIZE[:2], 1), dtype=np.int)
    
    pov_me[np.where(gmap == idx)] = 1 # Set to 1 where bot is
    pov_me_last[np.where(gmap_old == idx)] = 1
    
    pov_not_me[np.where(~np.isin(gmap, [idx, -1]))] = 1 # Set to 1 where bot is not
    pov_not_me_last[np.where(~np.isin(gmap_old, [idx, -1]))] = 1
    
    return np.concatenate([pov_me, pov_me_last, pov_not_me, pov_not_me_last], axis=2)

In [13]:
class GameRecorder():
    def __init__(self):
        self.states = []
        self.rewards = []
        self.actions_taken = []
        
    def store(self, state, reward, action_taken):
        self.states.append(state)
        self.rewards.append(reward)
        self.actions_taken.append(action_taken)

In [131]:
def ask_predict(id, x):
    # Adds to queue id and data from process
    [processable_buffer.put((id, xi)) for xi in x]

def sim(process_id, pipe):    
    while True:
        games_buffer = [GameRecorder() for player in range(n_players)] # Create a place to store games
        
        # Simulate the game, if a prediction is needed use ask_predict
        game = emulator.Game(n_players) # TODO: Wrap the following lines in a function
        gmap = game.map # Access map manually on first step
        gmap_old = None # First frame has no older map
        p_alive = game.players_alive # Players alive
        n_alive = game.count_alive()
        
        k = 0
        while True:
            state = map_to_state(gmap, gmap_old, p_alive) # State for each player alive
            ask_predict(process_id, state)
            policy, value = [], []
            for i in range(n_alive):
                raw_prediction = pipe.recv() # Receive actions from main
                policy.append(raw_prediction['policy'])
                value.append(raw_prediction['value'])
            
            policy = softmax(np.array(policy)) # We softmax the policy logits
            #chosen_action = [np.random.choice(N_ACTIONS, p=act) for act in policy]
            chosen_action = np.argmax(policy, axis=-1)
            logging.debug('Choosen Actions %s Raw Actions %s', chosen_action, policy)
            
            gmap, p_alive_new, n_alive, reward, game_end = game.step(chosen_action)
        
            idx_alive = 0
            for alive in p_alive: # Players which were alive at the start of the step
                if alive == 0: # Player is dead, skip it
                    continue
                
                games_buffer[idx_alive].store(state[idx_alive], reward[idx_alive], chosen_action[idx_alive])
                idx_alive += 1
            p_alive = copy.copy(p_alive_new)
            
            k += 1
            if game_end:
                print(k)
                logging.debug('Game ended, rewards %s', reward)
                break
        try:
            for g in games_buffer:
                print(g.rewards)
                history_buffer.put_nowait(g)
        except:
            break
                    
def stop_simulation():
    global workers
    global history_buffer
    global processable_buffer
    global pipes
    global child_pipes
    
    if 'workers' in globals() and len(workers) != 0:
        for worker in workers:
            worker.terminate()
    workers = []
    
    for pipe in pipes:
        pipe.close()
    
    for pipe in child_pipes:
        pipe.close()
    
    # Then we empty the queues
    del history_buffer
    del processable_buffer
    del pipes
    del child_pipes

def parallel_sim():
    global workers
    global history_buffer
    global processable_buffer
    global pipes
    global child_pipes
    
    if 'workers' in globals() and len(workers) != 0:
        stop_simulation()
    
    history_buffer = Queue(n_games) # This numbers can be tweaked
    processable_buffer = Queue(num_threads * n_players)
    pipes = []
    child_pipes = []
    
    workers = []
    for i in range(num_threads):
        parent_pipe, child_pipe = Pipe() # Pipe to communicate with childs
        pipes.append(parent_pipe)
        child_pipes.append(child_pipe)
        
        worker = Thread(target=sim, args=[i, child_pipe])
        worker.daemon = False
        worker.start()
        workers.append(worker)