In [1]:
import numpy as np
import tensorflow as tf #OLD VERSION ##########UPDATE THIS##########
from othello import Board
from copy import deepcopy
import matplotlib.pyplot as plt
import time

# Always print arrays in full, never summarised with "...". NumPy rejects a
# NaN threshold, so use infinity, which no array size can exceed.
np.set_printoptions(threshold=np.inf)

def reset_graph():
    """Close the global session `sess` if one exists, then reset the default TF graph."""
    # Look the name up dynamically: `sess` may not have been created yet.
    live_session = globals().get("sess")
    if live_session:
        live_session.close()
    tf.reset_default_graph()

def buildGraph(inputDim, dataType, hLayersDim, learning_rate = 0.01, name="player"):
    """Build a fully connected policy network and its training op.

    The network maps an input of width ``inputDim`` through one tanh layer per
    entry of ``hLayersDim`` and a final linear layer back to ``inputDim``
    logits. Any number of hidden layers is supported; variables are named
    ``W0..Wn`` / ``b0..bn`` inside the variable scope ``name``.

    Args:
        inputDim: width of the input vector and of the output logits.
        dataType: TensorFlow dtype used for the inputs, rewards and variables.
        hLayersDim: sequence with the width of each hidden layer, in order.
        learning_rate: learning rate handed to the Adam optimizer.
        name: variable scope under which every variable is created.

    Returns:
        dict with the placeholders ("x", "actions", "rewards"), the variables
        ("layers", "biases"), the per-layer activations ("hiddenStates"), the
        logits ("outputRaw"), their softmax ("output"), the per-sample
        cross-entropy ("negLogProb"), the scalar "loss", the "train_step" op
        and the scope "name".
    """
    with tf.variable_scope(name):
        x = tf.placeholder(dataType, [None, inputDim])
        actions = tf.placeholder(tf.int32, [None])
        rewards = tf.placeholder(dataType, [None])

        # Layer widths from input to output; the output is as wide as the input.
        dims = [inputDim] + list(hLayersDim) + [inputDim]
        numLayers = len(dims) - 1

        # All weights are created first, then all biases, each with the
        # Xavier initializer.
        layers = [
            tf.get_variable("W" + str(i), [dims[i], dims[i + 1]], dataType,
                            initializer=tf.contrib.layers.xavier_initializer())
            for i in range(numLayers)
        ]
        biases = [
            tf.get_variable("b" + str(i), [dims[i + 1]], dataType,
                            initializer=tf.contrib.layers.xavier_initializer())
            for i in range(numLayers)
        ]

        hiddenStates = []
        activation = x
        for i in range(numLayers):
            preActivation = tf.matmul(activation, layers[i]) + biases[i]
            if i < numLayers - 1:
                # TODO: try relu6 here, it might work better than tanh.
                activation = tf.nn.tanh(preActivation)
            else:
                # The last layer stays linear: it produces the raw logits.
                activation = preActivation
            hiddenStates.append(activation)

        # Raw logits, as expected by sparse_softmax_cross_entropy_with_logits.
        output_raw = hiddenStates[-1]
        output = tf.nn.softmax(output_raw)

        # Cross-entropy of the action taken, weighted by that action's reward.
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output_raw, labels=actions)
        loss = tf.reduce_mean(neg_log_prob * rewards)  # reward guided loss

        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        return {
            "name": name,
            "x": x,
            "actions": actions,
            "rewards": rewards,
            "negLogProb": neg_log_prob,
            "loss": loss,
            "layers": layers,
            "biases": biases,
            "hiddenStates": hiddenStates,
            "output": output,
            "outputRaw": output_raw,
            "train_step": train_step
        }
    
def predict(agent, state):
    """Evaluate the agent's softmax output for `state` in the global session `sess`."""
    run = sess.run
    fetch = agent["output"]
    feed = {agent["x"]: state}
    return run(fetch, feed_dict=feed)

def makeMove(agent, board, name):
    """Sample a legal move from the agent's policy and play it on `board`.

    `name` equal to "BLACK" plays as player 1 on `board.board`; any other
    value plays as player -1 on `board.inverted_board()`.

    Returns `(-1, player)` without touching the board when the player has no
    legal move, otherwise `(cell, player)` where `cell` is the flat index
    `first_coord*8 + second_coord` of the move that was played.
    """
    is_black = name == "BLACK"
    player = 1 if is_black else -1
    # Presumably the inverted board shows the position from white's side, so
    # the network always sees its own stones the same way; confirm in Board.
    tempBoard = board.board if is_black else board.inverted_board()

    legal = board.possible_moves(player)
    if len(legal) == 0:
        return (-1, player)

    policy = predict(agent, tempBoard.reshape([1, -1])).squeeze()

    # Keep probability mass only on the legal cells.
    masked = np.zeros(boardDim)
    for i, j, _ in legal:
        cell = i*8 + j
        masked[cell] = policy[cell]

    mass = np.sum(masked)
    if mass == 0:
        # The policy gave zero weight to every legal move: pick one uniformly.
        uniform = [1/len(legal)]*len(legal)
        pick = legal[np.random.choice(len(legal), 1, p = uniform)[0]]
        selected_move = pick[0]*8 + pick[1]
    else:
        selected_move = np.random.choice(boardDim, 1, p=masked/mass)[0]

    board.move(selected_move//8, selected_move%8, player)
    return (selected_move, player)
        
def maybePrint(shouldI, s):
    """Print `s` only when `shouldI` is truthy."""
    if not shouldI:
        return
    print(s)

def discount_and_norm_rewards(winLoseReward, actions, gamma):
    """Spread the final game reward back over the moves and standardise it.

    The last move receives `winLoseReward`; each earlier move receives the
    reward of its successor multiplied by `gamma`. The result is then shifted
    to zero mean and, when possible, scaled to unit standard deviation.

    Args:
        winLoseReward: reward assigned to the final move.
        actions: sequence of moves; only its length/shape is used.
        gamma: per-step discount factor.

    Returns:
        Float array with one normalised reward per action. An empty `actions`
        yields an empty array. When all discounted rewards are equal (for
        example a single action) the result is all zeros instead of NaN.
    """
    discounted_ep_rs = np.zeros(np.array(actions).shape)
    if discounted_ep_rs.size == 0:
        # Nothing to reward; indexing [-1] below would raise.
        return discounted_ep_rs

    # discount episode rewards
    discounted_ep_rs[-1] = winLoseReward
    running_add = 0
    for t in reversed(range(len(actions))):
        running_add = running_add * gamma + discounted_ep_rs[t]
        discounted_ep_rs[t] = running_add

    # normalize episode rewards
    discounted_ep_rs -= np.mean(discounted_ep_rs)
    spread = np.std(discounted_ep_rs)
    if spread > 0:
        # A zero spread would turn every reward into NaN (0/0).
        discounted_ep_rs /= spread
    return discounted_ep_rs
    
def playGame(board, gamma, v = False):
    """Play one self-play game with the global agent `p` and collect training data.

    Both colours are played by the same agent. For every move actually made,
    the position seen by the mover (before the move) and the chosen cell are
    recorded; turns where the player has to pass are skipped.

    Args:
        board: game board; it is reset and then played to the end in place.
        gamma: discount factor passed to `discount_and_norm_rewards`.
        v: when truthy, print the board and each move as the game goes on.

    Returns:
        `(boardHistory, moveSequence, rewardSequence)`: three parallel lists
        holding all of player 1's entries followed by all of player 2's.
    """
    board.reset()

    boardHistoryP1 = []
    boardHistoryP2 = []
    moveSequenceP1 = []
    moveSequenceP2 = []

    maybePrint(v, board)
    while(not board.finished()):
        boardBeforeMoveP1 = deepcopy(board.board)
        P1Move = makeMove(p, board, "BLACK")[0]
        if(P1Move != -1):
            boardHistoryP1.append(boardBeforeMoveP1)
            moveSequenceP1.append(P1Move)
            # Report only moves that were really played; a pass has no
            # coordinates to show.
            maybePrint(v, "====================")
            maybePrint(v, (P1Move//8, P1Move%8))
            maybePrint(v, board)

        boardBeforeMoveP2 = deepcopy(board.inverted_board())
        P2Move = makeMove(p, board, "WHITE")[0]
        if(P2Move != -1):
            boardHistoryP2.append(boardBeforeMoveP2)
            moveSequenceP2.append(P2Move)
            maybePrint(v, "====================")
            maybePrint(v, (P2Move//8, P2Move%8))
            maybePrint(v, board)
    maybePrint(v, board.score())

    # Player 2 gets the opposite of player 1's final reward.
    r = winLoseReward(board)
    rewardSequenceP1 = list(discount_and_norm_rewards(r, moveSequenceP1, gamma))
    rewardSequenceP2 = list(discount_and_norm_rewards(-r, moveSequenceP2, gamma))

    boardHistory = boardHistoryP1 + boardHistoryP2
    moveSequence = moveSequenceP1 + moveSequenceP2
    rewardSequence = rewardSequenceP1 + rewardSequenceP2

    return (boardHistory, moveSequence, rewardSequence)
    
def winLoseReward(board):
    """Return 1 when the sum of `board.board` is positive, otherwise -1."""
    balance = np.sum(board.board)
    # A zero sum (tie) is deliberately punished exactly like a loss.
    return 1 if balance > 0 else -1
    
def idx2onehot(idx):
    """Turn a sequence of cell indices into rows of one-hot vectors of width `boardDim`."""
    onehots = np.zeros((len(idx), boardDim))
    for row, hot in enumerate(idx):
        onehots[row][hot] = 1
    return onehots
    
def sampleBatch(mem, size):
    """Draw `size` distinct transitions uniformly at random from the replay memory.

    Args:
        mem: dict with parallel lists under "states", "actions" and "rewards".
        size: number of distinct entries to draw.

    Returns:
        `(boards, actions, rewards)`: `boards` is an array reshaped to
        `(size, boardDim)`, `actions` is a list and `rewards` is an array.

    Raises:
        ValueError: if `size` is larger than the number of stored entries.
            Distinct sampling is impossible in that case.
    """
    available = len(mem["states"])
    if size > available:
        raise ValueError(
            "cannot sample {} distinct entries from a memory of {}".format(size, available))

    # A shuffled index prefix gives `size` distinct indices in one pass.
    indexes = np.random.permutation(available)[:size]

    boards = [mem["states"][i] for i in indexes]
    actions = [mem["actions"][i] for i in indexes]
    rewards = [mem["rewards"][i] for i in indexes]
    return np.array(boards).reshape((size, boardDim)), actions, np.array(rewards)

def randomPlay(board, name):
    """Play one uniformly random legal move for the given colour.

    `name` equal to "BLACK" plays as player 1, anything else as player -1.
    Returns -1 when the player has no legal move; otherwise the move is
    applied to `board` and nothing (None) is returned.
    """
    player = 1 if name == "BLACK" else -1

    legal = board.possible_moves(player)
    count = len(legal)
    if count == 0:
        return -1

    pick = np.random.choice(count, 1, p = [1/count]*count)[0]
    chosen = legal[pick]
    board.move(chosen[0], chosen[1], player)
    
def testPlayer(board, v = False):
    """Play one evaluation game: the global agent `p` as BLACK against a random WHITE.

    Args:
        board: game board; it is reset and then played to the end in place.
        v: when truthy, print the board after every move that was played.

    Returns:
        Whatever `board.score()` returns once the game is finished.
    """
    board.reset()

    maybePrint(v, board)
    # Use the `board` argument throughout rather than the global `b`, so the
    # game that is reset is also the game that is played and scored.
    while(not board.finished()):
        P1Move = makeMove(p, board, "BLACK")[0]
        if(P1Move != -1):
            maybePrint(v, board)
        if (randomPlay(board, "WHITE") != -1):
            maybePrint(v, board)
    return board.score()

reset_graph()

# Training configuration.
batchSize = 128   # self-play games per epoch, and samples per gradient step
epochs = 100000
gamma = 0.95      # reward discount handed to playGame
boardDim = 8*8
hLayersDim = [128, 256, 128]
gpuNum = 1        # only referenced by the disabled tf.device line below
# Upper bound on stored transitions. Every epoch appends all positions of
# batchSize games, so without a bound the memory grows for the whole run.
replayMemoryCap = 200000

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
b = Board()
p = buildGraph(boardDim, tf.float32, hLayersDim)
replayMemory = {
    "states": [],
    "actions": [],
    "rewards": []
}
losses = []
testGames = []

#with tf.device('/gpu:'+str(gpuNum)):
with tf.Session(config=config) as sess: 
    sess.run(tf.global_variables_initializer())

    for j in range(epochs):
        # Generate fresh self-play data.
        for i in range(batchSize):
            boards, moves, rewards = playGame(b, gamma)

            replayMemory["states"].extend(boards)
            replayMemory["actions"].extend(moves)
            replayMemory["rewards"].extend(rewards)

        # Drop the oldest transitions once the cap is exceeded. The three
        # lists are always extended together, so they are trimmed together.
        overflow = len(replayMemory["states"]) - replayMemoryCap
        if overflow > 0:
            for key in ("states", "actions", "rewards"):
                del replayMemory[key][:overflow]

        # One gradient step on a random batch from the memory.
        boards, actions, rewards = sampleBatch(replayMemory, batchSize)
        outputRaw, negLogProb, loss, _ = sess.run([p["outputRaw"], p["negLogProb"], p["loss"], p["train_step"]], feed_dict={p["x"]:boards, p["actions"]: actions, p["rewards"]: rewards})
        # Checkpointing is currently disabled. If it is re-enabled, create the
        # tf.train.Saver outside any explicit GPU tf.device scope: a previous
        # run with the saver pinned to /device:GPU:1 failed because SaveV2 has
        # no GPU kernel.

        losses.append(loss)
        print("Loss: "+str(loss))

        # Evaluate against the random player over 100 games.
        # NOTE(review): ">=" also counts games with equal scores as wins here,
        # presumably score() is (black, white); confirm against Board.score.
        black_wins = 0
        for _ in range(100):
            score = testPlayer(b)
            if(score[0] >= score[1]):
                black_wins += 1
        testGames.append(black_wins)

    plt.plot(losses)
    plt.show()

    plt.plot(testGames)
    plt.show()
    
    

InvalidArgumentError: Cannot assign a device for operation 'save/SaveV2': Could not satisfy explicit device specification '/device:GPU:1' because no supported kernel for GPU devices is available.
	 [[Node: save/SaveV2 = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/device:GPU:1"](save/Const, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, player/W0, player/W1, player/W2, player/W3, player/b0, player/b1, player/b2, player/b3, player/beta1_power, player/beta2_power, player/player/W0/Adam, player/player/W0/Adam_1, player/player/W1/Adam, player/player/W1/Adam_1, player/player/W2/Adam, player/player/W2/Adam_1, player/player/W3/Adam, player/player/W3/Adam_1, player/player/b0/Adam, player/player/b0/Adam_1, player/player/b1/Adam, player/player/b1/Adam_1, player/player/b2/Adam, player/player/b2/Adam_1, player/player/b3/Adam, player/player/b3/Adam_1)]]

Caused by op 'save/SaveV2', defined at:
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2683, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2787, in run_ast_nodes
    if self.run_code(code, result):
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2847, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-53bb50f25cbc>", line 243, in <module>
    saver = tf.train.Saver()
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\training\saver.py", line 1140, in __init__
    self.build()
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\training\saver.py", line 1172, in build
    filename=self._filename)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\training\saver.py", line 686, in build
    save_tensor = self._AddSaveOps(filename_tensor, saveables)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\training\saver.py", line 276, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\training\saver.py", line 219, in save_op
    tensors)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 768, in save_v2
    tensors=tensors, name=name)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\ops.py", line 2630, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "c:\users\rapha\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1204, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): Cannot assign a device for operation 'save/SaveV2': Could not satisfy explicit device specification '/device:GPU:1' because no supported kernel for GPU devices is available.
	 [[Node: save/SaveV2 = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/device:GPU:1"](save/Const, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, player/W0, player/W1, player/W2, player/W3, player/b0, player/b1, player/b2, player/b3, player/beta1_power, player/beta2_power, player/player/W0/Adam, player/player/W0/Adam_1, player/player/W1/Adam, player/player/W1/Adam_1, player/player/W2/Adam, player/player/W2/Adam_1, player/player/W3/Adam, player/player/W3/Adam_1, player/player/b0/Adam, player/player/b0/Adam_1, player/player/b1/Adam, player/player/b1/Adam_1, player/player/b2/Adam, player/player/b2/Adam_1, player/player/b3/Adam, player/player/b3/Adam_1)]]


In [None]:
# Report the average of the recorded test results, then of the losses.
for series in (testGames, losses):
    print(np.average(series))