In [None]:
import os.path
import sys
import time
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf #OLD VERSION ##########UPDATE THIS##########

from othello import Board

# Always print arrays in full. NumPy rejects a NaN threshold with a ValueError;
# sys.maxsize is the documented way to disable summarisation.
np.set_printoptions(threshold=sys.maxsize)

def reset_graph():
    """Close the global session ``sess`` if one is set, then reset the default TF graph."""
    active_session = globals().get("sess")
    if active_session:
        active_session.close()
    tf.reset_default_graph()
    
def variable_summaries(var):
    """Attach mean, stddev, max, min and histogram summaries to a tensor.

    Every summary is created inside one ``summaries`` name scope so that all
    five end up under the same prefix instead of only the mean.

    Args:
        var: Tensor or variable to summarise.
    """
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        # The intermediate ops of the standard deviation get their own
        # sub-scope; the summary itself stays alongside the others.
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

def buildGraph(inputDim, dataType, hLayersDim, learning_rate = 0.01, name="player"):
    """Build a fully connected policy network together with its training ops.

    The network maps a flattened state of size ``inputDim`` to ``inputDim``
    logits. Hidden layers use ReLU; the last layer is linear and is turned
    into probabilities with a softmax. The loss is the mean of the per-sample
    sparse softmax cross-entropy multiplied by the fed rewards.

    Args:
        inputDim: Size of the flattened input state (also the number of logits).
        dataType: TensorFlow dtype used for placeholders and variables.
        hLayersDim: Sequence with the width of each hidden layer. Any number
            of hidden layers is accepted.
        learning_rate: Learning rate passed to the Adam optimizer.
        name: Variable scope under which the whole graph is created.

    Returns:
        dict with the keys ``name``, ``x``, ``actions``, ``rewards``,
        ``negLogProb``, ``loss``, ``layers``, ``biases``, ``hiddenStates``,
        ``output``, ``outputRaw`` and ``train_step``.
    """
    with tf.variable_scope(name):
        x = tf.placeholder(dataType, [None, inputDim])
        actions = tf.placeholder(tf.int32, [None])
        rewards = tf.placeholder(dataType, [None])

        # Layer widths from input to output; consecutive pairs give each
        # weight matrix its (fan_in, fan_out) shape.
        dims = [inputDim] + list(hLayersDim) + [inputDim]
        shapes = list(zip(dims[:-1], dims[1:]))

        layers = []
        biases = []

        with tf.variable_scope("Weights"):
            for i, (fanIn, fanOut) in enumerate(shapes):
                layers.append(tf.get_variable("W%d" % i, [fanIn, fanOut], dataType, initializer=tf.contrib.layers.xavier_initializer()))
                variable_summaries(layers[-1])

        with tf.variable_scope("Biases"):
            for i, (_, fanOut) in enumerate(shapes):
                biases.append(tf.get_variable("b%d" % i, [fanOut], dataType, initializer=tf.contrib.layers.xavier_initializer()))
                variable_summaries(biases[-1])

        hiddenStates = []

        with tf.variable_scope("Activation"):
            current = x
            lastIndex = len(layers) - 1
            for i, (weights, bias) in enumerate(zip(layers, biases)):
                current = tf.matmul(current, weights) + bias
                if i != lastIndex:
                    # Consider relu6 here; it might work better.
                    current = tf.nn.relu(current)
                hiddenStates.append(current)
                tf.summary.histogram('hiddenStates', hiddenStates[-1])

        # The last (linear) layer holds the logits.
        output_raw = hiddenStates[-1]
        output = tf.nn.softmax(output_raw)

        with tf.variable_scope("cross_entropy"):
            neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output_raw, labels=actions)
            loss = tf.reduce_mean(neg_log_prob * rewards)  # reward guided loss
            tf.summary.scalar('cross_entropy', loss)

        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        return {
            "name": name,
            "x": x,
            "actions": actions,
            "rewards": rewards,
            "negLogProb": neg_log_prob,
            "loss": loss,
            "layers": layers,
            "biases": biases,
            "hiddenStates": hiddenStates,
            "output": output,
            "outputRaw": output_raw,
            "train_step": train_step
        }
    
def predict(agent, state):
    """Evaluate ``agent["output"]`` for ``state`` using the global session ``sess``."""
    feed = {agent["x"]: state}
    return sess.run(agent["output"], feed_dict=feed)

def makeMove(agent, board, name):
    """Let ``agent`` pick and play one move for the side called ``name``.

    "BLACK" plays as player 1 on ``board.board``; any other name plays as
    player -1 on ``board.inverted_board()``. The agent's output is restricted
    to the moves reported by ``board.possible_moves`` and renormalised before
    sampling. If every allowed cell has zero probability, a move is drawn
    uniformly instead.

    Returns:
        ``(move, player)`` where ``move`` is ``row*8 + col``, or
        ``(-1, player)`` when the side has no possible move.
    """
    playsBlack = name == "BLACK"
    player = 1 if playsBlack else -1
    view = board.board if playsBlack else board.inverted_board()

    legal = board.possible_moves(player)
    if len(legal) == 0:
        return (-1, player)

    policy = predict(agent, view.reshape([1, -1])).squeeze()

    # Keep only the probability mass that sits on allowed cells.
    masked = np.zeros(boardDim)
    for row, col, _ in legal:
        cell = row*8 + col
        masked[cell] = policy[cell]

    mass = np.sum(masked)
    if mass == 0:
        pick = np.random.choice(len(legal), 1, p = [1/len(legal)]*len(legal))[0]
        chosen = legal[pick]
        selected_move = chosen[0]*8 + chosen[1]
    else:
        masked = masked/mass
        selected_move = np.random.choice(boardDim, 1, p=masked)[0]

    board.move(selected_move//8, selected_move%8, player)
    return (selected_move, player)
        
def maybePrint(shouldI, s):
    """Print ``s`` only when ``shouldI`` is truthy."""
    if not shouldI:
        return
    print(s)

def discount_and_norm_rewards(winLoseReward, actions, gamma):
    """Spread a terminal reward over an episode and normalise the result.

    Only the last move receives the raw reward; move ``t`` of an ``n``-move
    episode receives ``winLoseReward * gamma**(n-1-t)``. The discounted
    rewards are then shifted to zero mean and scaled to unit standard
    deviation.

    Args:
        winLoseReward: Reward assigned to the final move of the episode.
        actions: Sequence of the moves played; only its length is used.
        gamma: Discount factor applied per step going backwards in time.

    Returns:
        1-D float array with one normalised reward per action. An empty
        episode yields an empty array. When all discounted rewards are equal
        (single move, or ``gamma == 1``) the result is all zeros rather than
        NaN, since the standard deviation is zero.
    """
    steps = len(actions)
    if steps == 0:
        return np.zeros(0)

    # discount episode rewards
    exponents = np.arange(steps - 1, -1, -1)
    discounted_ep_rs = winLoseReward * np.power(float(gamma), exponents)

    # normalize episode rewards
    discounted_ep_rs -= np.mean(discounted_ep_rs)
    spread = np.std(discounted_ep_rs)
    if spread > 0:
        # Skipping the division for a zero spread avoids filling the
        # training data with NaN.
        discounted_ep_rs /= spread
    return discounted_ep_rs
    
def playGame(board, gamma, v = False):
    """Play one self-play game on ``board`` and collect training samples.

    The global agent ``p`` plays both sides through ``makeMove``. For every
    move actually played, the board as seen by the moving side (before the
    move) and the chosen move are recorded. After the game each side's moves
    receive discounted, normalised rewards; a draw is rewarded as -1 for both
    sides.

    Args:
        board: Game board; it is reset at the start and mutated during play.
        gamma: Discount factor forwarded to ``discount_and_norm_rewards``.
        v: When truthy, print the board and each played move.

    Returns:
        ``(boardHistory, moveSequence, rewardSequence)``: three parallel lists
        with all of player 1's samples followed by all of player 2's.
    """
    board.reset()

    boardHistoryP1 = []
    boardHistoryP2 = []
    moveSequenceP1 = []
    moveSequenceP2 = []

    maybePrint(v, board)
    while(not board.finished()):
        boardBeforeMoveP1 = deepcopy(board.board)
        P1Move = makeMove(p, board, "BLACK")[0]
        if(P1Move != -1):
            boardHistoryP1.append(boardBeforeMoveP1)
            moveSequenceP1.append(P1Move)
            # Report only moves that were really played; indexing the move
            # list unconditionally fails while it is still empty.
            maybePrint(v, "====================")
            maybePrint(v, (P1Move//8, P1Move%8))
            maybePrint(v, board)

        boardBeforeMoveP2 = deepcopy(board.inverted_board())
        P2Move = makeMove(p, board, "WHITE")[0]
        if(P2Move != -1):
            boardHistoryP2.append(boardBeforeMoveP2)
            moveSequenceP2.append(P2Move)
            maybePrint(v, "====================")
            maybePrint(v, (P2Move//8, P2Move%8))
            maybePrint(v, board)
    maybePrint(v, board.score())

    r = winLoseReward(board)
    # A draw (r == 0) is treated as a loss for both sides.
    rewardSequenceP1 = list(discount_and_norm_rewards(r if r != 0 else -1, moveSequenceP1, gamma))
    rewardSequenceP2 = list(discount_and_norm_rewards(-r if r != 0 else -1, moveSequenceP2, gamma))

    boardHistory = boardHistoryP1 + boardHistoryP2
    moveSequence = moveSequenceP1 + moveSequenceP2
    rewardSequence = rewardSequenceP1 + rewardSequenceP2

    return (boardHistory, moveSequence, rewardSequence)
    
def winLoseReward(board):
    """Return the sign of the sum of ``board.board``: 1, -1, or 0 when it sums to zero."""
    balance = np.sum(board.board)
    return int(balance > 0) - int(balance < 0)

    
def sampleBatch(mem, size, stateDim=None):
    """Draw ``size`` distinct samples from the replay memory.

    Args:
        mem: dict with the parallel lists ``"states"``, ``"actions"`` and
            ``"rewards"``.
        size: Number of distinct samples to draw.
        stateDim: Flattened size of one state. Defaults to the module-level
            ``boardDim``.

    Returns:
        ``(boards, actions, rewards)`` where ``boards`` is an array of shape
        ``(size, stateDim)``, ``actions`` is a list and ``rewards`` is an
        array, all in the same sampled order.

    Raises:
        ValueError: If ``size`` exceeds the number of stored samples.
            Rejection sampling would never terminate in that case.
    """
    available = len(mem["states"])
    if size > available:
        raise ValueError(
            "cannot sample %d distinct entries from a memory of %d" % (size, available)
        )
    if stateDim is None:
        stateDim = boardDim

    # Sampling without replacement gives distinct indices in one call.
    indexes = np.random.choice(available, size, replace=False)

    boards = [mem["states"][i] for i in indexes]
    actions = [mem["actions"][i] for i in indexes]
    rewards = [mem["rewards"][i] for i in indexes]
    return np.array(boards).reshape((size, stateDim)), actions, np.array(rewards)

def randomPlay(board, name):
    """Play a uniformly random possible move for the side called ``name``.

    "BLACK" plays as player 1, any other name as player -1.

    Returns:
        -1 when the side has no possible move; otherwise ``None`` after the
        move has been applied to ``board``.
    """
    player = 1 if name == "BLACK" else -1

    candidates = board.possible_moves(player)
    count = len(candidates)
    if count == 0:
        return -1

    uniform = [1/count]*count
    pick = np.random.choice(count, 1, p = uniform)[0]
    chosen = candidates[pick]
    board.move(chosen[0], chosen[1], player)
    
def testPlayer(board, v = False):
    """Play the global agent ``p`` (as BLACK) against a random WHITE player.

    Args:
        board: Game board; it is reset at the start and mutated during play.
        v: When truthy, print the board after every move that is played.

    Returns:
        Whatever ``board.score()`` returns once the game has finished.
    """
    board.reset()

    maybePrint(v, board)
    # Use the board that was passed in throughout, not the global one.
    while(not board.finished()):
        P1Move = makeMove(p, board, "BLACK")[0]
        if(P1Move != -1):
            maybePrint(v, board)
        if (randomPlay(board, "WHITE") != -1):
            maybePrint(v, board)
    return board.score()

# Start from a clean graph so the cell can be re-run.
reset_graph()

batchSize = 128              # games played per epoch and samples per training step
epochs = 50
gamma = 0.95                 # reward discount factor
boardDim = 8*8
hLayersDim = [128, 256, 128]
gpuNum = 1
saveEvery = 100              # checkpoint period, in epochs

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
b = Board()
p = buildGraph(boardDim, tf.float32, hLayersDim)
# NOTE(review): the replay memory grows without bound over the whole run.
replayMemory = {
    "states": [],
    "actions": [],
    "rewards": []
}
losses = []
testGames = []
summaryCounter = 0

#with tf.device('/gpu:'+str(gpuNum)):
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())

    merged = tf.summary.merge_all()
    # Build the saver once; creating it inside the loop adds new save/restore
    # ops to the graph on every checkpoint.
    saver = tf.train.Saver()
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs("./pesos", exist_ok=True)
    data_writer = tf.summary.FileWriter('./data')

    try:
        for j in range(epochs):
            # Collect fresh self-play experience.
            for i in range(batchSize):
                boards, moves, rewards = playGame(b, gamma)

                replayMemory["states"].extend(boards)
                replayMemory["actions"].extend(moves)
                replayMemory["rewards"].extend(rewards)

            # One training step on a random batch from the replay memory.
            boards, actions, rewards = sampleBatch(replayMemory, batchSize)
            summary, outputRaw, negLogProb, loss, _ = sess.run([merged, p["outputRaw"], p["negLogProb"], p["loss"], p["train_step"]], feed_dict={p["x"]:boards, p["actions"]: actions, p["rewards"]: rewards})
            data_writer.add_summary(summary, summaryCounter)
            summaryCounter += 1
            # Also checkpoint after the last epoch; otherwise a run shorter
            # than saveEvery would only ever save the untrained weights.
            if(j % saveEvery == 0 or j == epochs - 1):
                save_path = saver.save(sess, "./pesos/pesos-"+str(time.time())+".ckpt")
                print("Model saved in file: %s" % save_path)

            losses.append(loss)
            print("Loss: "+str(loss))

            # Evaluate against the random player.
            # NOTE(review): ">=" counts equal scores as wins, and this assumes
            # score[0] is BLACK's score. Confirm against Board.score().
            black_wins = 0
            for _ in range(100):
                score = testPlayer(b)
                if(score[0] >= score[1]):
                    black_wins += 1
            testGames.append(black_wins)
    finally:
        # Flush and release the event file even if training is interrupted.
        data_writer.close()

    plt.plot(losses)
    plt.show()

    plt.plot(testGames)
    plt.show()
    

Model saved in file: ./pesos/pesos-1504819574.9374185.ckpt
Loss: 0.145857
Loss: 0.575047
Loss: 0.189925
Loss: 0.361393
Loss: 0.203815
Loss: 0.196249
Loss: -0.967964
Loss: -0.519173
Loss: -3.25292
Loss: -1.44447
Loss: -7.44761
Loss: -8.6772
Loss: -19.7533
Loss: -27.7486
Loss: -15.3925
Loss: -47.7877
Loss: -44.2792
Loss: -80.4518
Loss: -193.538
Loss: -208.01


In [None]:
# Print the average of the per-epoch evaluation results, then of the losses.
for series in (testGames, losses):
    print(np.average(series))