In [1]:
import tensorflow as tf
import pickle
import keras
from keras.models import Model, load_model, clone_model
from keras.utils import multi_gpu_model
from keras.layers import *
from keras.optimizers import Adam, SGD
from keras.regularizers import l2
from keras.losses import binary_crossentropy
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import sys
import random
sys.path.append('src')  # Fix for jupyter
import src.emulator as emulator
import src.emulator_utils as emulator_utils
import src.emulator_vis as emulator_vis
%matplotlib inline
from IPython.display import clear_output
import time
import multiprocessing
from multiprocessing import Event, Queue, Pipe
from multiprocessing import Process as Thread
import os
import logging
from mcts import *
# Ask CUDA to order devices by PCI bus ID (see issue #152).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# Expose only device index 0 to CUDA in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


# Logging

In [2]:
# Log to logging.log at INFO level; filemode='w' truncates the file each time this
# cell configures logging. logging.debug(...) calls are dropped at this level.
logging.basicConfig(filename='logging.log', level=logging.INFO, format='%(asctime)s %(message)s', filemode='w')

# Model Params

In [3]:
# Shape of one network input; presumably (rows, cols, channels) since the conv
# blocks batch-normalise over axis 3 — TODO confirm against map_to_state.
INPUT_SIZE = (16, 16, 5) # Map size fixed to 16x16 (2 to 3 players)
# Number of discrete actions; sets the width of the policy head output.
N_ACTIONS = 4
# GPU count: create_model() wraps the network in multi_gpu_model only when this is > 1.
gpus = 1

# Define the Layer Blocks

In [4]:
# Default number of convolution filters; captured as the default argument of
# conv_block and residual_conv at definition time.
filters = 64

# Convolutional Block
def conv_block(in_layer, name, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Bias-free 'same'-padded convolution, optionally followed by BN and LeakyReLU.

    Args:
        in_layer: Keras tensor the block is applied to.
        name: Name of the Conv2D layer; the optional layers are named
            ``name + '_bn'`` and ``name + '_lkrelu'``.
        filters: Number of convolution filters.
        kernel_size: Convolution kernel size.
        bn: When true, add BatchNormalization over axis 3.
        relu: When true, add a LeakyReLU activation.

    Returns:
        The output tensor of the last layer that was added.
    """
    convolution = Conv2D(filters, kernel_size, padding='same', use_bias=False,
                         kernel_regularizer=l2(1e-4), name=name)
    out = convolution(in_layer)

    if bn:
        out = BatchNormalization(axis=3, name=name + '_bn')(out)

    if not relu:
        return out

    # LeakyReLU is used in place of a plain ReLU activation.
    return LeakyReLU(name=name + '_lkrelu')(out)

# Residual Block
def residual_conv(in_layer, idx, filters=filters, kernel_size=(3,3), bn=True, relu=True):
    """Two-convolution block whose output is concatenated with its input.

    The first stage is always a full 3x3 conv_block with BN and LeakyReLU; the
    ``kernel_size``, ``bn`` and ``relu`` arguments only affect the second
    convolution and the final activation. The skip connection is a channel
    concatenation (not an addition), so the output has more channels than the
    input.

    Args:
        in_layer: Keras tensor the block is applied to.
        idx: Block index; layers are named with the prefix ``'res_<idx>'``.
        filters: Number of filters of both convolutions.
        kernel_size: Kernel size of the second convolution.
        bn: When true, batch-normalise the second convolution (axis 3).
        relu: When true, apply LeakyReLU after the concatenation.

    Returns:
        The output tensor of the block.
    """
    prefix = f'res_{idx}'

    # Stage 1: fixed-shape conv block.
    branch = conv_block(in_layer, prefix + '_conv1', filters, kernel_size=(3,3), bn=True, relu=True)

    # Stage 2: bare convolution, optionally normalised.
    second_conv = Conv2D(filters, kernel_size, padding='same', use_bias=False,
                         kernel_regularizer=l2(1e-4), name=prefix + '_conv2')
    branch = second_conv(branch)
    if bn:
        branch = BatchNormalization(axis=3, name=prefix + '_conv2_bn')(branch)

    # Skip connection by concatenating input and branch.
    merged = Concatenate()([in_layer, branch])

    if relu:
        merged = LeakyReLU(name=prefix + '_lkrelu')(merged)
    return merged

def value_head(in_layer):
    """Build the value head: 1x1 conv -> flatten -> Dense(64) -> LeakyReLU -> BN -> tanh scalar.

    Args:
        in_layer: Keras tensor produced by the network trunk.

    Returns:
        The output tensor of the final Dense layer, named ``'value'``.
    """
    features = conv_block(in_layer, 'value_head', filters=1, kernel_size=(1,1))

    head_layers = (
        Flatten(name='value_flatten'),
        Dense(64, name='value_dense'),
        LeakyReLU(name='value_lkrelu'),
        BatchNormalization(axis=1, name='value_bn'),
        Dense(1, use_bias=False, name='value', activation='tanh'),  # Value output
    )

    out = features
    for layer in head_layers:
        out = layer(out)
    return out

def policy_head(in_layer):
    """Build the policy head: 1x1 conv (2 filters) -> flatten -> linear Dense(N_ACTIONS).

    The output is left as raw logits (``activation='linear'``).

    Args:
        in_layer: Keras tensor produced by the network trunk.

    Returns:
        The output tensor of the final Dense layer, named ``'policy'``.
    """
    features = conv_block(in_layer, 'policy_head', filters=2, kernel_size=(1,1))
    flat = Flatten(name='policy_flatten')(features)
    logits_layer = Dense(N_ACTIONS, name='policy', use_bias=False, activation='linear')  # Policy output
    return logits_layer(flat)

# Model

In [5]:
def create_model():
    """Build the two-headed network, replicated across GPUs when ``gpus > 1``.

    Returns:
        tuple: ``(training_model, base_model)``. With more than one GPU the
        first element is the ``multi_gpu_model`` wrapper and the second the
        template declared on the CPU; otherwise both are the same object.
    """
    def declare_model():
        """Assemble stem conv, six residual blocks and the policy/value heads."""
        n_residual = 6

        board = Input(INPUT_SIZE)
        trunk = conv_block(board, 'conv')
        for block_idx in range(1, n_residual + 1):
            trunk = residual_conv(trunk, idx=block_idx)

        # Heads are built policy first; the model outputs are [policy, value].
        policy_out = policy_head(trunk)
        value_out = value_head(trunk)
        return Model(board, [policy_out, value_out])

    if gpus <= 1:
        single = declare_model()
        return single, single

    # Multi-GPU: declare the template on the CPU, then wrap it.
    with tf.device('/cpu:0'):
        template = declare_model()
    replicated = multi_gpu_model(template, gpus=gpus)
    return replicated, template

In [6]:
def simulate_games():
    """Start the self-play workers and serve their prediction requests.

    Workers queue ``(worker_index, state)`` requests on ``processable_buffer``.
    This loop batches the pending requests, runs one ``alphabot.predict`` call
    on the batch and sends each worker its own result back through
    ``pipes[worker_index]`` as a dict keyed by ``alphabot.output_names``.
    It returns once ``history_buffer`` reports that it is full.
    """
    logging.debug('Starting Threads for parallel Games')

    parallel_sim() # Parallel Games

    # Batch at least two requests when possible. With a single worker only one
    # request can ever be pending, so a fixed threshold of 2 would never pass.
    min_batch = max(1, min(2, num_threads))

    while not history_buffer.full():
        pending = processable_buffer.qsize()
        if pending < min_batch:
            # Sleep briefly instead of spinning, so this loop does not take a
            # full CPU core away from the worker processes.
            time.sleep(0.001)
            continue

        indices, states = [], []
        for _ in range(pending):
            index, state = processable_buffer.get()
            indices.append(index)
            states.append(state)

        states = np.array(states, dtype=np.float32)
        predictions = alphabot.predict(states)
        # predictions[0] / predictions[1] are the two model outputs, row-aligned
        # with `indices`.
        for index, pred in zip(indices, zip(predictions[0], predictions[1])):
            pipes[index].send(dict(zip(alphabot.output_names, pred)))

    logging.info('Finished Simulating %s games', n_games)

In [7]:
def play_eval(reverted=False):
    """Play one evaluation game between the candidate and the best model.

    ``alphabot`` (candidate) plays as player 0 and ``alphabot_best`` as
    player 1, or the other way round when ``reverted`` is true. Each side
    searches with its own MCTS tree. The move is sampled from the search
    policy when ``use_eval_choice`` is set, otherwise the argmax is played.

    Args:
        reverted: When true, swap which model plays as player 0.

    Returns:
        int: 0 when the game is credited to the candidate, 1 when it is
        credited to the best model. The side that did NOT make the final
        move is the one credited.
    """
    global alphabot_best
    global alphabot

    game = emulator.Game(2)
    mapp = game.reset()

    tree_player0 = MCTS()
    tree_player0.alpha = MCTS_eval_alpha

    tree_player1 = MCTS()
    tree_player1.alpha = MCTS_eval_alpha

    old_mapp = None
    turn = 0
    s = map_to_state(mapp, old_mapp, None, 0)
    old_mapp = copy.deepcopy(mapp)

    if reverted:
        player0, player1 = alphabot_best, alphabot
    else:
        player0, player1 = alphabot, alphabot_best

    while True:
        if turn == 0:
            tree, player = tree_player0, player0
        else:
            tree, player = tree_player1, player1
        policy = do_search(MCTS_eval_steps, s, mapp, game, tree, alphabot=player, allow_move=allow_move)

        if use_eval_choice:
            choosen = np.random.choice(N_ACTIONS, p=policy)
        else:
            choosen = np.argmax(policy)

        mapp = game.step(mapp, s, choosen, turn)

        turn = 1 - turn
        if turn == 0:
            # Both players have moved: rebuild the state from the new map and
            # remember that map for the next rebuild.
            s = map_to_state(mapp, old_mapp, s, 0)  # TODO: Map to state
            old_mapp = np.array(mapp)
        else:
            # Only player 0 has moved: set the last channel of the state to 1.
            s[..., -1] = 1

        if game.game_ended():
            # `turn` was already flipped, so it names the side that did not
            # make the final move.
            return int(not turn) if reverted else int(turn)

In [8]:
def train_model():
    """Run one gradient step on a random mini-batch drawn from ``complete_history``.

    Up to ``BATCH_SIZE`` steps are sampled without replacement. Each step
    contributes its ``state`` as input and its ``policy`` and ``value`` as the
    two targets, in the order ``[policy, value]``.

    Returns:
        Whatever ``alphabot.train_on_batch`` returns; ``training_cycle`` reads
        element 0 as the loss to average.
    """
    picked_data = random.sample(complete_history, k=min(BATCH_SIZE, len(complete_history)))

    # Build real arrays up front rather than handing nested Python lists to
    # Keras; values become an (n, 1) column to match the value head output.
    state = np.array([step.state for step in picked_data], dtype=np.float32)
    policy = np.array([step.policy for step in picked_data], dtype=np.float32)
    value = np.array([step.value for step in picked_data], dtype=np.float32).reshape(-1, 1)

    y = [policy, value]

    logging.debug('The label is %s', y)
    losses = alphabot.train_on_batch(state, y)
    return losses

In [9]:
def training_cycle():
    """Run one self-play -> train -> evaluate cycle.

    1. ``simulate_games`` fills ``history_buffer``; its content is moved into
       ``complete_history`` and the workers are stopped.
    2. The candidate ``alphabot`` is trained for up to ``t_steps + 1`` batches.
    3. Every ``eval_steps`` batches the candidate plays ``eval_games`` games
       against ``alphabot_best`` (second half with sides swapped). If it wins
       at least ``win_percent`` of them it is saved as the new best and the
       cycle ends early; otherwise the candidate is reloaded from the best
       model on disk.
    4. Once the history holds at least ``k * n_games`` entries, the oldest
       ``n_games`` entries are dropped.
    """
    global alphabot
    global alphabot_best
    global total_improv

    custom_objects = {'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits}

    # Simulate n_games (exception made by first interaction)
    logging.info('Starting Training Cycle')
    simulate_games()
    # history_buffer contains the games, we store them inside complete history
    for _ in range(history_buffer.qsize()):
        complete_history.append(history_buffer.get())
    stop_simulation() # We can now stop the simulation (will free the memory)
    logging.debug('Complete history should be full, it contains %s elements', len(complete_history))

    # Now we are ready for the training process
    logging.info('Starting Model Training')
    sum_loss = 0
    n_losses = 0  # number of losses accumulated in sum_loss
    for i in range(t_steps + 1):
        if i % 100 == 0:
            # Average over the losses actually accumulated (guard the empty case).
            logging.info('Training Interaction: %s losses: %s', i,
                         round(sum_loss / max(n_losses, 1), 2))

        losses = train_model()
        sum_loss += losses[0]
        n_losses += 1
        logging.debug('Losses: %s', losses)

        if i % eval_steps != 0 or i == 0:
            continue

        # ---- Evaluation step ----
        sum_loss = 0 # Reset loss counter
        n_losses = 0
        wins = {'candidate' : 0, 'best' : 0}
        n_c = {0 : 'candidate', 1 : 'best'}

        logging.info('Starting self-play evaluation')
        for j in range(eval_games):
            # Second half of the games is played with sides swapped.
            reverted = j >= eval_games // 2
            wins[n_c[play_eval(reverted)]] += 1 # add a win to the winner
            if j % 10 == 0:
                logging.info('Win state Candidate: %s Best: %s', wins['candidate'], wins['best'])

        win_ratio = round(wins['candidate'] / eval_games, 2) if eval_games else 0.0
        if win_ratio >= win_percent:
            logging.info('Great! Our candidate won %s percent of games', round(win_ratio * 100, 2))
            total_improv += 1
            logging.info('Our bot got better %s times', total_improv)
            alphabot.save('alphabot_best.pickle')
            alphabot_best = load_model('alphabot_best.pickle', custom_objects=custom_objects)
            logging.info('Already improved, simulating more games')
            break

        logging.info('Damn! Our candidate only won %s percent of games', round(win_ratio * 100, 2))
        # The candidate did not beat the best model: restart from the best one.
        logging.info('Not improved, cloning to best')
        alphabot = load_model('alphabot_best.pickle', custom_objects=custom_objects)

    # `>=` rather than `==`: if the length ever overshoots the cap, an equality
    # test would never trim again and the history would grow without bound.
    if len(complete_history) >= k * n_games:
        logging.debug('Removing oldest games')
        del complete_history[:n_games] # Delete n oldest games from history

In [10]:
def load_best(best_model):
    """Load a saved model as ``alphabot_best`` and copy its weights into ``alphabot``.

    Args:
        best_model: Path handed to Keras ``load_model``. The custom policy
            loss is registered so the saved compile state can be restored.
    """
    global alphabot
    global alphabot_best

    loss_registry = {'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits}
    alphabot_best = load_model(best_model, custom_objects=loss_registry)

    # The candidate keeps its own object; only the weights are replaced.
    best_weights = alphabot_best.get_weights()
    alphabot.set_weights(best_weights)

In [11]:
def train(cycles, best_model = None):
    """Run ``cycles`` training cycles, starting from the current ``alphabot``.

    The current candidate is first snapshotted to ``'alphabot_best.pickle'``
    and reloaded as ``alphabot_best``, so both sides start from the same
    weights. Despite the extension, the file is written by Keras
    ``Model.save``, not by ``pickle``.

    The global ``complete_history`` is left untouched, so experience gathered
    by earlier calls is reused.

    Args:
        cycles: Number of times ``training_cycle`` is run.
        best_model: Optional path of a saved model. When given, it is loaded
            through ``load_best`` before the snapshot is taken, so training
            starts from those weights. This parameter was previously ignored.
    """
    global alphabot_best
    global alphabot

    if best_model is not None:
        load_best(best_model)

    alphabot.save('alphabot_best.pickle')
    alphabot_best = load_model('alphabot_best.pickle',
                           custom_objects={'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits})

    for _ in range(cycles):
        training_cycle()

In [12]:
def ask_predict(idi, x):
    """Queue a prediction request for the main process.

    Args:
        idi: Identifier of the requesting worker process.
        x: Data to be evaluated by the model.
    """
    request = (idi, x)
    processable_buffer.put(request)

def sim(process_id, pipe):
    """Worker-process entry point: play games until ``history_buffer`` is full.

    Each finished game yields training steps that are pushed, one by one and
    without blocking, onto ``history_buffer``. The worker stops as soon as a
    step no longer fits.

    Args:
        process_id: Index of this worker, forwarded to ``simulate_game``.
        pipe: This worker's end of the pipe, forwarded to ``simulate_game``.
    """
    import queue  # local import: only needed for the Full exception type

    # Reseed both RNGs, presumably so workers do not replay the same stream.
    np.random.seed()
    random.seed()

    while True:
        train_steps = simulate_game(MCTS_steps, MCTS_alpha, pipe, ask_predict, process_id)

        try:
            for step in train_steps:
                history_buffer.put_nowait(step)
        except queue.Full:
            # Expected stop condition: enough steps have been collected.
            break
        except Exception:
            # Still stop the worker, but leave a trace instead of hiding the
            # error. KeyboardInterrupt/SystemExit are deliberately not caught.
            logging.exception('Worker %s stopped: could not queue training steps', process_id)
            break
                    
def stop_simulation():
    """Terminate the worker processes and release their pipes and queues.

    Safe to call repeatedly, and before any simulation was started: every
    global is looked up defensively, so a missing one is simply skipped.
    Afterwards ``workers`` is an empty list and ``history_buffer``,
    ``processable_buffer``, ``pipes`` and ``child_pipes`` are removed from the
    module globals.
    """
    global workers

    state = globals()

    running = state.get('workers') or []
    for worker in running:
        worker.terminate()
    # Reap the terminated processes so they do not linger as zombies; the
    # timeout keeps a stuck child from blocking the notebook forever.
    for worker in running:
        worker.join(timeout=5)
    workers = []

    for pipe in list(state.get('pipes') or []) + list(state.get('child_pipes') or []):
        pipe.close()

    for queue_name in ('history_buffer', 'processable_buffer'):
        buffer = state.get(queue_name)
        if buffer is not None:
            buffer.close()

    # Drop the references so the queues and pipes can be garbage collected.
    for name in ('history_buffer', 'processable_buffer', 'pipes', 'child_pipes'):
        state.pop(name, None)

def parallel_sim():
    """Create the shared queues and start ``num_threads`` worker processes.

    Any pool that is still registered is stopped first. Each worker runs
    ``sim`` with its index and the child end of a dedicated pipe; the parent
    ends are kept in ``pipes`` at the same index.
    """
    global workers
    global history_buffer
    global processable_buffer
    global pipes
    global child_pipes

    already_running = 'workers' in globals() and len(workers) != 0
    if already_running:
        stop_simulation()

    # Queue capacities; these numbers can be tweaked.
    history_buffer = Queue(n_games)
    processable_buffer = Queue(num_threads)
    pipes, child_pipes, workers = [], [], []

    for worker_id in range(num_threads):
        # One pipe per worker, used to talk to the child process.
        parent_end, child_end = Pipe()
        pipes.append(parent_end)
        child_pipes.append(child_end)

        process = Thread(target=sim, args=[worker_id, child_end])
        process.daemon = False
        process.start()
        workers.append(process)

In [13]:
import tensorflow as tf

def softmax_cross_entropy_with_logits(y_true, y_pred):
    """Softmax cross-entropy on policy logits, masked by the target policy.

    Wherever the target probability is exactly 0, the predicted logit is
    replaced by -100 before the softmax, so those actions get practically no
    probability mass in the loss.

    Args:
        y_true: Target policy tensor.
        y_pred: Predicted policy logits, same shape as ``y_true``.

    Returns:
        The loss tensor from ``tf.nn.softmax_cross_entropy_with_logits``.
    """
    target_is_zero = tf.equal(y_true, tf.zeros(shape=tf.shape(y_true), dtype=tf.float32))

    suppressed = tf.fill(tf.shape(y_true), -100.0)
    masked_logits = tf.where(target_is_zero, suppressed, y_pred)

    return tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=masked_logits)

In [15]:
# Build the network. With gpus == 1 create_model() returns the same model twice,
# so the second element can be discarded.
alphabot, _ = create_model()
# The dict keys match the names of the two output layers ('value' and 'policy').
# The policy output is linear (logits), hence the custom logits-based loss.
alphabot.compile(optimizer=SGD(1e-3, momentum=0.9),
                          loss={'value' : 'mse', 'policy' : softmax_cross_entropy_with_logits},
                          loss_weights={'value' : 0.5, 'policy' : 0.5})
alphabot.summary(line_length=112)

________________________________________________________________________________________________________________
Layer (type)                        Output Shape             Param #       Connected to                         
input_2 (InputLayer)                (None, 16, 16, 5)        0                                                  
________________________________________________________________________________________________________________
conv (Conv2D)                       (None, 16, 16, 64)       2880          input_2[0][0]                        
________________________________________________________________________________________________________________
conv_bn (BatchNormalization)        (None, 16, 16, 64)       256           conv[0][0]                           
________________________________________________________________________________________________________________
conv_lkrelu (LeakyReLU)             (None, 16, 16, 64)       0             conv_bn[0][0]        

In [26]:
# Snapshot the current candidate and reload it as the reference "best" model.
# The file is written by Keras Model.save; the .pickle extension is only a name.
alphabot.save('alphabot_best.pickle')
alphabot_best = load_model('alphabot_best.pickle', 
                           custom_objects={'softmax_cross_entropy_with_logits' : softmax_cross_entropy_with_logits})

In [18]:
# Alternative to the cell above: restore a previously saved best model and copy
# its weights into the candidate. Requires alphabot to exist already.
load_best('alphabot_best.pickle')

In [20]:
# History of games for training
complete_history = []

# Game Params
n_players = 2
n_games = 5_000 # Simulate N games before each training
k = 8 # Games to be stored n_games * K

# Eval options
allow_move = False
use_eval_choice = True

# Simulation Params
num_threads = 6

MCTS_steps = 40
MCTS_eval_steps = 20
MCTS_alpha = 0.6
MCTS_eval_alpha = 0.4

# Training Params
t_steps = 2000 # Steps of training
eval_steps = 1000 # How many steps before evaluation
eval_games = 100 # How many games to play to evaluate how's best model
win_percent = 0.55 # Ratio of game won to become best model
BATCH_SIZE = 512
total_improv = 0

In [None]:
# Uncomment the next line to discard previously collected experience first.
#complete_history = []
cycles = 1000   # number of training cycles to run

# Long-running: each cycle simulates games, trains and may evaluate.
train(cycles)

In [20]:
# Manually set the optimizer learning rate of the compiled model.
K.set_value(alphabot.optimizer.lr, 1e-3)

In [24]:
# Quick sanity check: 10 evaluation games, the last 5 with sides swapped.
# play_eval() returns 0 when the game is credited to the candidate.
bot, best = 0, 0
for i in range(10):
    winner = play_eval() if i < 5 else play_eval(True)

    if winner == 0:
        bot += 1
    else:
        best += 1

bot, best

(6, 4)

In [26]:
# Tally 100 games played through simulate_game with the current model.
# NOTE(review): this call passes six arguments and unpacks two results, while
# sim() calls simulate_game with five and iterates the result — this cell
# presumably targets a different version of the mcts module; verify.
bot, best = 0, 0
for i in range(100):
    winner, _ = simulate_game(10, 0.8, None, None, None, alphabot)
    bot, best = (bot + 1, best) if winner == 0 else (bot, best + 1)

bot, best

(31, 69)

In [27]:
# Repeat of the 100-game tally (same code as the previous cell, re-run to
# compare outcomes).
# NOTE(review): the six-argument / two-result use of simulate_game differs from
# how sim() calls it; verify against the current mcts module.
bot, best = 0, 0
for i in range(100):
    winner, _ = simulate_game(10, 0.8, None, None, None, alphabot)
    bot, best = (bot + 1, best) if winner == 0 else (bot, best + 1)

bot, best

(48, 52)

In [29]:
# Set the root logger level to INFO (DEBUG records are dropped).
logging.getLogger().setLevel(logging.INFO)

In [None]:
# Reload imported modules automatically before executing each cell.
%load_ext autoreload
%autoreload 2

In [None]:
def simmetries(state, action):
    """Apply one random quarter-turn rotation to a (state, action) pair.

    A rotation count in {0, 1, 2, 3} is drawn at random. The state's first two
    axes are rotated by that many quarter turns and the action index is
    shifted by the same amount modulo ``N_ACTIONS``. Only rotations are
    applied; flips are not implemented.

    NOTE(review): that the action shift matches the rotation direction depends
    on how actions are numbered in the emulator — verify.

    Args:
        state: Array whose first two axes are the square board axes.
        action: Integer action index.

    Returns:
        tuple: ``(rotated_state, rotated_action)``. The rotated state keeps
        the dtype of ``state``. For rotation 0 the input array itself is
        returned; otherwise a new array is returned.
    """
    def rotate_state(state, rot):
        """Rotate the first two axes of ``state`` by ``rot`` quarter turns.

        Index mapping, with N = side length - 1:
            rot == 1: new[x, y] = state[N - y, x]
            rot == 2: new[x, y] = state[N - x, N - y]
            rot == 3: new[x, y] = state[y, N - x]
        """
        if rot == 0: # Rotation of 0 is simple
            return state
        # np.rot90(m, 1)[x, y] == m[y, N - x], which is the rot == 3 mapping
        # above, so `rot` corresponds to k = -rot. This avoids the per-element
        # loop and the removed `np.int` alias, and keeps the input dtype.
        return np.rot90(state, k=-rot, axes=(0, 1)).copy()

    # First we apply a random rotation simmetry
    simmetry = random.randrange(4)
    action = (action + simmetry) % N_ACTIONS
    state = rotate_state(state, simmetry)
    return state, action