In [1]:
import tensorflow as tf
import keras
from keras.models import Model, load_model, clone_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.losses import binary_crossentropy
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
import copy  # used by play_eval and sim; was not imported explicitly before
import queue  # provides queue.Full, raised by multiprocessing.Queue.put_nowait
sys.path.append('src')  # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


# Logging

In [2]:
# Write log records to logging.log; filemode='w' truncates the file every time this cell runs.
# level=INFO means the logging.debug(...) calls used throughout the notebook are dropped.
logging.basicConfig(filename='logging.log', level=logging.INFO, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [3]:
INPUT_SIZE = (16, 16, 4) # Network input shape. Map size fixed to 16x16 (2 to 3 players); process_map builds the 4 planes
N_ACTIONS = 4 # Width of the policy head's output layer
gpus = 1 # create_model only builds a multi_gpu_model replica when this is > 1

# Define the Layer Blocks

In [4]:
# Default number of convolution filters. conv_block and residual_conv capture this value
# as a default argument when they are defined, so re-run their cell after changing it.
filters = 96

# Convolutional Block
def conv_block(in_layer, name, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Apply a 'same'-padded Conv2D, optionally followed by batch norm and ReLU.

    Args:
        in_layer: Keras tensor to convolve.
        name: Name of the Conv2D layer; '_bn' and '_relu' are appended for the
            optional follow-up layers.
        filters: Number of convolution filters.
        kernel_size: Convolution kernel size.
        bn: Add a BatchNormalization layer after the convolution.
        relu: Add a ReLU activation as the last layer.

    Returns:
        The output tensor of the last layer that was added.
    """
    out = Conv2D(
        filters,
        kernel_size,
        padding='same',
        name=name,
        kernel_regularizer=l2(5e-5),
        kernel_initializer='glorot_normal',
    )(in_layer)
    if bn:
        out = BatchNormalization(name=name + '_bn')(out)
    if not relu:
        return out
    return Activation('relu', name=name + '_relu')(out)

# Residual Block
def residual_conv(in_layer, idx, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Build a two-convolution block whose output is joined with its own input.

    Layers are named 'res_<idx>_...'. The first convolution is always a full
    3x3 conv + BN + ReLU block; `kernel_size`, `bn` and `relu` only affect the
    second convolution and the final activation.

    NOTE(review): the skip connection uses Concatenate rather than Add, so the
    block's output carries the input's channels plus `filters` new ones.
    Confirm this is intended; changing it would invalidate saved weights.

    Args:
        in_layer: Keras tensor entering the block.
        idx: Block number used to build unique layer names.
        filters: Number of filters for both convolutions.
        kernel_size: Kernel size of the second convolution.
        bn: Batch-normalise the second convolution's output.
        relu: Apply a ReLU after joining the skip connection.

    Returns:
        The block's output tensor.
    """
    prefix = 'res_' + str(idx)

    # First half: fixed-shape conv block.
    branch = conv_block(in_layer, prefix + '_conv1', filters, kernel_size=(3,3), bn=True, relu=True)

    # Second half: bare convolution, normalised on request.
    second_conv = Conv2D(filters, kernel_size, padding='same', name=prefix + '_conv2',
                         kernel_regularizer=l2(5e-5), kernel_initializer='glorot_normal')
    branch = second_conv(branch)
    if bn:
        branch = BatchNormalization(name=prefix + '_conv2_bn')(branch)

    merged = Concatenate()([in_layer, branch]) # Skip connection
    if not relu:
        return merged
    return Activation('relu', name=prefix + '_relu')(merged)

def value_head(in_layer):
    """Attach the value head and return its output tensor.

    The head is a 1-filter 1x1 conv block, a flatten, a 64-unit ReLU dense layer
    and a single tanh unit named 'value'.
    """
    features = conv_block(in_layer, 'value_head', filters=1, kernel_size=(1,1))
    features = Flatten(name='value_flatten')(features)

    hidden = Dense(
        64,
        name='value_dense',
        kernel_regularizer=l2(5e-5),
        kernel_initializer='glorot_normal',
    )(features)
    hidden = Activation('relu', name='value_relu')(hidden)

    # Value output
    return Dense(1, name='value', activation='tanh')(hidden)

def policy_head(in_layer):
    """Attach the policy head and return its output tensor.

    The head is a 2-filter 1x1 conv block, a flatten and a linear dense layer
    named 'policy' with N_ACTIONS units (no activation, so it emits raw scores).
    """
    features = Flatten(name='policy_flatten')(
        conv_block(in_layer, 'policy_head', filters=2, kernel_size=(1,1))
    )
    # Policy output
    return Dense(N_ACTIONS, name='policy')(features)

# Model

In [5]:
def create_model():
    """Build the two-headed network.

    Returns:
        A `(training_model, base_model)` pair. With `gpus > 1` the base model is
        declared on the CPU and the training model is its `multi_gpu_model`
        wrapper; otherwise both entries are the very same model object.
    """
    def declare_model():
        """Stem conv block, 12 residual_conv blocks, then policy and value heads."""
        n_residual = 12

        board = Input(INPUT_SIZE)
        trunk = conv_block(board, 'conv')
        for block_no in range(1, n_residual + 1):
            trunk = residual_conv(trunk, idx=block_no)

        policy = policy_head(trunk)
        value = value_head(trunk)
        return Model(board, [policy, value])

    if gpus <= 1:
        single = declare_model()
        return single, single

    with tf.device('/cpu:0'):
        template = declare_model()
    replicated = multi_gpu_model(template, gpus=gpus)
    return replicated, template

In [6]:
def softmax(z):
    """Row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating so large scores do
    not overflow.

    Args:
        z: Array of shape (rows, classes).

    Returns:
        Array of the same shape whose rows sum to 1.

    Raises:
        AssertionError: If `z` is not 2-dimensional.
    """
    assert len(z.shape) == 2
    row_max = np.max(z, axis=1, keepdims=True) # keepdims makes the result broadcast against z
    exps = np.exp(z - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)

In [7]:
# create_model returns (training model, base model); with gpus == 1 both names refer to the same object.
alphabot_training, alphabot = create_model()
# Both heads are fitted with MSE (the policy head on its raw scores); the value loss is weighted 0.6.
alphabot_training.compile(optimizer=SGD(1e-3), 
                          loss={'value' : 'mse', 'policy' : 'mse'},
                          loss_weights={'value' : 0.6, 'policy' : 1.0})
alphabot.summary(line_length=112)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_1 (InputLayer)                (None, 16, 16, 4)        0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 16, 16, 96)       3552          input_1[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 16, 16, 96)       384           conv[0][0]                           
________________________________________________________________________________________________________________
conv_relu (Activation)              (None, 16, 16, 96)       0             conv_bn[0][0]        

In [8]:
# History of games for training
# History of games for training (training_cycle appends the recorded games here)
complete_history = []

# Game Params
n_players = 2
n_games = 12_500 #10_000 # Games simulated per simulate_games() call; also the capacity of history_buffer
k = 7 # training_cycle keeps simulating until k * n_games games are stored

# Simulation Params
num_threads = 16 # Number of worker processes started by parallel_sim (Thread is an alias of multiprocessing.Process)

# Training Params
t_steps = 1000 * 2 # Steps of training (training_cycle runs t_steps + 1 iterations)
eval_steps = 500 # How many steps before evaluation
eval_games = 350 # How many games to play to decide which model is the best one
win_percent = 0.55 # Fraction of evaluation games the candidate must exceed to become the best model
BATCH_SIZE = 200 # Games sampled per training step (one state is drawn from each)
alphabot_best = None # Set by train() / load_best()
total_improv = 0 # Number of times the candidate replaced the best model

In [9]:
def simulate_games():
    """Serve predictions to the simulation workers until `history_buffer` is full.

    Starts the workers through `parallel_sim()`, then repeatedly drains
    `processable_buffer` (pairs of worker index and state), runs one batched
    `alphabot.predict` and sends every result back through the pipe of the
    worker that asked for it, as a dict keyed by the model's output names.

    While too few requests are queued the loop now sleeps briefly instead of
    spinning, so the serving process does not burn a core that the workers need.
    """
    logging.debug('Starting Threads for parallel Games')

    parallel_sim() # Parallel Games
    min_pending = (num_threads - 1) * 2 # Wait until a bunch of requests are queued
    while not history_buffer.full():
        pending = processable_buffer.qsize()
        if pending < min_pending:
            time.sleep(0.001) # Yield the CPU rather than busy-waiting
            continue

        # Drain exactly the requests that were counted above.
        indices, states = [], []
        for _ in range(pending):
            index, state = processable_buffer.get()
            indices.append(index)
            states.append(state)

        predictions = alphabot.predict(np.array(states, dtype=np.float64))
        # predictions[0] / predictions[1] are the two model outputs; pair them up per request.
        for worker_index, pred in zip(indices, zip(predictions[0], predictions[1])):
            pipes[worker_index].send(dict(zip(alphabot.output_names, pred)))
        # We have to predict until buffer is full
    logging.info('Finished Simulating %s games', n_games)

In [10]:
def play_eval(log_game=False):
    """Play one evaluation game: candidate (`alphabot`) against `alphabot_best`.

    The candidate always controls player 0 and the best model player 1. Both
    act greedily on their softmaxed policy output.

    Args:
        log_game: When true, return the list of map snapshots instead of the
            winner.

    Returns:
        The list of maps if `log_game` is set; otherwise the index of the first
        surviving player (0 = candidate, 1 = best). If the game ends with no
        survivor the result is 1, so a draw never counts as a candidate win.
        NOTE(review): whether the emulator can end a game without a survivor is
        not visible here; previously that case raised IndexError.

    Raises:
        AssertionError: If a step starts with a number of players other than 2.
    """
    game = emulator.Game(n_players) # TODO: Wrap the following lines in a function
    gmap = game.map # Access map manually on first step

    gmap_old = None # First frame has no older map
    p_alive = game.players_alive # Players alive
    n_alive = game.count_alive()

    maps = [copy.copy(gmap)] # Buffer for the game log, starting with the initial map

    while True:
        assert n_alive == 2, 'Multi player eval is not implemented yet'
        state = map_to_state(gmap, gmap_old, p_alive) # State for each player alive

        # Each model returns [policy, value] for a batch of one state.
        candidate_policy, candidate_value = alphabot.predict(state[0][np.newaxis])
        best_policy, best_value = alphabot_best.predict(state[1][np.newaxis])

        logging.debug('Candidate Policy: %s Candidate Value: %s', candidate_policy, candidate_value)
        logging.debug('Best Bot Policy: %s Best Bot Value: %s', best_policy, best_value)

        # We softmax the policy logits and let every player take its best action.
        policy = softmax(np.array([candidate_policy[0], best_policy[0]]))
        chosen_action = np.argmax(policy, axis=-1)

        gmap_old = copy.copy(gmap)
        gmap, p_alive, n_alive, reward, game_end = game.step(chosen_action)
        maps.append(copy.copy(gmap))

        if not game_end:
            continue

        if log_game:
            return maps

        survivors = np.flatnonzero(np.array(p_alive) == 1)
        if survivors.size == 0:
            # Nobody survived: credit the incumbent instead of crashing on an empty lookup.
            logging.info('Evaluation game ended without a survivor, counted for the best model')
            return 1
        return int(survivors[0])

In [11]:
def training_cycle():
    """Run one simulate / train / evaluate round.

    1. Simulate games until `complete_history` holds at least `k * n_games`.
    2. Run `t_steps + 1` training steps. Each step samples up to `BATCH_SIZE`
       games, takes one random state per game, and uses the current
       predictions as targets with the taken action's policy entry and the
       value overwritten by the game's final reward.
    3. Every `eval_steps` steps, play `eval_games` evaluation games. A
       candidate that wins more than `win_percent` of them becomes
       `alphabot_best`; otherwise the candidate is reset to the best weights.
    4. Drop the `n_games` oldest games from the history.

    The reset in step 3 now copies the weights into the existing `alphabot`
    instead of rebinding the name to a fresh clone. `alphabot_training` is the
    compiled model created together with `alphabot`, so rebinding left it
    training an orphaned network while simulation and evaluation used a frozen
    copy of the best model.
    """
    global alphabot_best
    global total_improv

    # Simulate n_games (exception made by first interaction)
    logging.info('Starting Training Cycle')
    while len(complete_history) < k * n_games:
        simulate_games()
        # history_buffer contains the games, we store them inside complete history
        for _ in range(history_buffer.qsize()):
            complete_history.append(history_buffer.get())
        stop_simulation() # We can now stop the simulation (will free the memory)
    logging.debug('Complete history should be full, it contains %s elements', len(complete_history))

    # Now we are ready for the training process
    logging.info('Starting Model Training')
    sum_loss = 0

    for i in range(t_steps + 1):
        if i % 100 == 0:
            # Exactly i losses have been accumulated so far (none at step 0).
            logging.info('Training Interaction: %s losses: %s', i, round(sum_loss / max(i, 1), 2))

        # Get a BATCH_SIZE of games
        t = time.time()
        picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))

        # Get a State from each game selected
        x = np.empty((len(picked_data), *INPUT_SIZE), dtype=np.float64)
        actions_taken = []
        rewards = []
        for j, game in enumerate(picked_data):
            index = np.random.randint(0, len(game.actions_taken)) # Random step of this game
            x[j] = np.array(game.states[index], dtype=np.float64)
            actions_taken.append(game.actions_taken[index])
            rewards.append(np.array(game.rewards[-1], dtype=np.float64)) # Last recorded reward of the game

        actions_taken = np.array(actions_taken)
        logging.debug('BEFORE PREDICT: %s', time.time() - t)

        t = time.time()
        y = alphabot_training.predict(x)
        logging.debug('PREDICT: %s', time.time() - t)

        # Targets start as the current predictions; only the taken action and the value are replaced.
        for idx, action in enumerate(actions_taken):
            y[0][idx, action] = rewards[idx]
            y[1][idx, 0] = rewards[idx]
        logging.debug('Policy training: %s Value Training: %s', y[0], y[1])

        t = time.time()
        losses = alphabot_training.train_on_batch(x, y)
        logging.debug('TRAINING: %s', time.time() - t)
        sum_loss += losses[0]
        logging.debug('Losses: %s', losses)

        if i % eval_steps != 0 or i == 0:
            continue

        wins = {'candidate' : 0, 'best' : 0}
        n_c = {0 : 'candidate', 1 : 'best'}

        logging.info('Starting self-play evaluation')
        for j in range(eval_games):
            # 0 is Candidate, 1 is the (soon to be old) best
            wins[n_c[play_eval()]] += 1 # add a win to the winner
            if j % 100 == 0:
                logging.info('Win state Candidate: %s Best: %s', wins['candidate'], wins['best'])

        win_ratio = wins['candidate'] / eval_games
        if win_ratio > win_percent:
            logging.info('Great! Our candidate won %s percent of games', round(win_ratio * 100, 2))
            total_improv += 1
            logging.info('Our bot got better %s times', total_improv)
            # Snapshot the candidate as an independent model.
            alphabot_best = clone_model(alphabot)
            alphabot_best.set_weights(alphabot.get_weights())
        else:
            logging.info('Damn! Our candidate only won %s percent of games', round(win_ratio * 100, 2))
            logging.info('Not improved, cloning to best')
            # Reset in place so the compiled training model keeps sharing these weights.
            alphabot.set_weights(alphabot_best.get_weights())

    del complete_history[:n_games] # Delete n oldest games from history

In [12]:
def load_best(best_model):
    """Load a saved model as the best model and start the candidate from it.

    `clone_model` only copies the architecture, so the previous code left
    `alphabot` with freshly initialised weights and detached from the compiled
    `alphabot_training`. The loaded weights are now copied into the existing
    `alphabot` instead.

    Args:
        best_model: Path of the saved Keras model to load.

    Raises:
        ValueError: From `set_weights` if the saved model's weights do not
            match the architecture of the current `alphabot`.
    """
    global alphabot_best
    alphabot_best = load_model(best_model)
    alphabot.set_weights(alphabot_best.get_weights())

In [18]:
def train(cycles, best_model = None):
    """Initialise the best model and run `cycles` training cycles.

    Args:
        cycles: Number of `training_cycle()` rounds to run.
        best_model: Optional path of a saved Keras model. When given it
            becomes `alphabot_best` and its weights are copied into the
            existing `alphabot`. Previously the candidate was replaced by a
            `clone_model` copy, which has fresh weights and is not the model
            that `alphabot_training` trains. When omitted, `alphabot_best`
            starts as a weight-for-weight copy of the current candidate.
    """
    global alphabot_best

    if best_model is not None:
        alphabot_best = load_model(best_model)
        alphabot.set_weights(alphabot_best.get_weights())
    else:
        alphabot_best = clone_model(alphabot)
        alphabot_best.set_weights(alphabot.get_weights())

    for _ in range(cycles):
        training_cycle()

In [150]:
# Restore a previously saved best model.
# NOTE(review): absolute, machine-specific path, and this cell's execution count (150) is out of
# sequence with its neighbours; confirm it is still wanted in a top-to-bottom run.
load_best('/data/rw/cp_bot/best_first_run.h5')



In [19]:
#complete_history = []
cycles = 1000 # Number of training_cycle() rounds to run

# No checkpoint path is passed, so train() starts alphabot_best as a copy of the current candidate.
# The helpers used during training (map_to_state, sim, parallel_sim, ...) are defined in the cells
# further down, so those cells must be run before this one.
train(cycles)

KeyboardInterrupt: 

In [14]:
def map_to_state(gmap, gmap_old, p_alive):
    """Build the network input for every player that is still alive.

    Args:
        gmap: Current map array.
        gmap_old: Previous map array, or anything that is not an ndarray
            (e.g. None on the first frame), in which case a map filled with -1
            is used instead.
        p_alive: Per-player flags, 1 for alive and 0 for dead. Lists are
            accepted as well as arrays.

    Returns:
        Integer array of shape `(n_alive, *INPUT_SIZE)` holding the
        `process_map` output of each living player, in player order.
    """
    if not isinstance(gmap_old, np.ndarray):
        gmap_old = np.full_like(gmap, -1)

    alive_players = np.flatnonzero(np.asarray(p_alive) == 1)
    # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
    states = np.empty((len(alive_players), *INPUT_SIZE), dtype=int)

    for slot, player_idx in enumerate(alive_players):
        # Player is alive, we collect its state
        states[slot] = process_map(int(player_idx), gmap, gmap_old)

    return states

def process_map(idx, gmap, gmap_old):
    """Encode the current and previous map from one player's point of view.

    Args:
        idx: Index of the player whose view is built.
        gmap: Current map array.
        gmap_old: Previous map array.

    Returns:
        Integer array of shape `(*INPUT_SIZE[:2], 4)` with 0/1 planes, in order:
        cells equal to `idx` now, cells equal to `idx` in the previous map,
        cells that are neither `idx` nor -1 now, and the same for the previous map.
    """
    def plane(mask):
        # The builtin int replaces the np.int alias, which NumPy 1.24 removed.
        layer = np.zeros((*INPUT_SIZE[:2], 1), dtype=int)
        layer[np.where(mask)] = 1
        return layer

    own_or_empty = [idx, -1]
    return np.concatenate(
        [
            plane(gmap == idx),                       # where the bot is
            plane(gmap_old == idx),                   # where the bot was
            plane(~np.isin(gmap, own_or_empty)),      # where the bot is not
            plane(~np.isin(gmap_old, own_or_empty)),  # where the bot was not
        ],
        axis=2,
    )

In [15]:
class GameRecorder:
    """Step-by-step record of one player's game.

    The three lists grow in lockstep, one entry per `store` call.
    """

    def __init__(self):
        self.states, self.rewards, self.actions_taken = [], [], []

    def store(self, state, reward, action_taken):
        """Append one step (state, reward, action taken) to the record."""
        step = (
            (self.states, state),
            (self.rewards, reward),
            (self.actions_taken, action_taken),
        )
        for log, entry in step:
            log.append(entry)

In [16]:
def ask_predict(id, x):
    """Queue every element of `x` on `processable_buffer`, tagged with `id`.

    Each element is put as an `(id, element)` tuple, in iteration order.
    """
    for sample in x:
        processable_buffer.put((id, sample))

def sim(process_id, pipe):
    """Worker loop: play games and push one record per player to `history_buffer`.

    For every step the worker queues the state of each living player through
    `ask_predict`, reads one prediction per living player from `pipe`, plays
    the highest-scoring action and records (state, reward, action). The loop
    ends, and with it the worker, once `history_buffer` refuses a game.

    Changes from the previous version:
      * A step is stored in the recorder of the player it belongs to. It used
        to be indexed by position among the living players, which files later
        players' steps under the wrong recorder once a player has died. With
        two players the result is unchanged.
      * Only `queue.Full` is treated as the normal stop signal; any other error
        while handing over games is logged before the worker stops.

    Args:
        process_id: Index of this worker; also used to route predictions back.
        pipe: Connection on which predictions are received as dicts with
            'policy' and 'value' entries.
    """
    while True:
        games_buffer = [GameRecorder() for _ in range(n_players)] # One recorder per player

        # Simulate the game, if a prediction is needed use ask_predict
        game = emulator.Game(n_players) # TODO: Wrap the following lines in a function
        gmap = game.map # Access map manually on first step
        gmap_old = None # First frame has no older map
        p_alive = game.players_alive # Players alive
        n_alive = game.count_alive()

        while True:
            state = map_to_state(gmap, gmap_old, p_alive) # State for each player alive
            ask_predict(process_id, state)

            # Receive one prediction per living player from main; only the policy is used here.
            policy = []
            for _ in range(n_alive):
                raw_prediction = pipe.recv()
                policy.append(raw_prediction['policy'])

            policy = softmax(np.array(policy)) # We softmax the policy logits
            # Greedy choice (sampling from the policy is the disabled alternative).
            chosen_action = np.argmax(policy, axis=-1)
            logging.debug('Choosen Actions %s Raw Actions %s', chosen_action, policy)

            gmap_old = copy.copy(gmap)
            gmap, p_alive_new, n_alive, reward, game_end = game.step(chosen_action)

            # state and chosen_action hold one entry per player alive at the start of the step.
            # NOTE(review): reward is indexed the same way as before; confirm game.step
            # returns it per living player rather than per player.
            idx_alive = 0
            for player_idx, alive in enumerate(p_alive):
                if alive == 0: # Player is dead, skip it
                    continue

                games_buffer[player_idx].store(state[idx_alive], reward[idx_alive], chosen_action[idx_alive])
                idx_alive += 1
            p_alive = copy.copy(p_alive_new)

            if game_end:
                logging.debug('Game ended, rewards %s', reward)
                break

        try:
            for g in games_buffer:
                # I didn't find a bug yet that makes some games be of 0 steps, gonna skip them for now
                if len(g.actions_taken) == 0:
                    logging.debug('WHAT A 0 STEPS GAME')
                    continue

                history_buffer.put_nowait(g)
        except queue.Full:
            # Enough games have been collected: this worker is done.
            break
        except Exception:
            logging.exception('Worker %s could not store its games', process_id)
            break
                    
def stop_simulation():
    """Terminate the simulation workers and release their pipes and queues.

    Safe to call repeatedly, or before `parallel_sim()` has ever run: whatever
    is missing is skipped. Previously a second call raised NameError because
    the first one deletes the globals. Terminated workers are also joined so
    they do not linger as zombie processes.

    Afterwards `workers` is an empty list and the globals `pipes`,
    `child_pipes`, `history_buffer` and `processable_buffer` no longer exist.
    """
    global workers
    module_state = globals()

    for worker in module_state.get('workers', []):
        worker.terminate()
        worker.join(timeout=1.0) # Reap the process; bounded so a stuck worker cannot block us
    workers = []

    for name in ('pipes', 'child_pipes'):
        for pipe in module_state.pop(name, []):
            pipe.close()

    # Close the queues and drop the module-level references to them.
    for name in ('history_buffer', 'processable_buffer'):
        buffer = module_state.pop(name, None)
        if buffer is not None:
            buffer.close()

def parallel_sim():
    """Start `num_threads` simulation workers with fresh queues and pipes.

    Workers left over from a previous run are stopped first. Each worker runs
    `sim(worker_index, child_end)` in its own process (`Thread` is an alias of
    `multiprocessing.Process`). The parent ends of the pipes are kept in
    `pipes`, the child ends in `child_pipes`, the processes in `workers`.
    """
    global workers, history_buffer, processable_buffer, pipes, child_pipes

    still_running = 'workers' in globals() and len(workers) != 0
    if still_running:
        stop_simulation()

    # This numbers can be tweaked
    history_buffer = Queue(n_games)
    processable_buffer = Queue(num_threads * n_players)
    pipes, child_pipes, workers = [], [], []

    for worker_index in range(num_threads):
        parent_end, child_end = Pipe() # Pipe to communicate with childs
        pipes.append(parent_end)
        child_pipes.append(child_end)

        process = Thread(target=sim, args=[worker_index, child_end])
        process.daemon = False
        process.start()
        workers.append(process)