In [1]:
import src.Omok as omok
import src.MCTS as mcts
import numpy as np
import numpy.random as rnd
import os

In [2]:
L = 3
goal = 3
N = L*L
c_puct = 1.0
tau = 1.
epsilon = 0.25
l2_c = 1e-4

Game = omok.Omok(L,goal)
MCTS = mcts.MCTS(L,goal,c_puct,tau,epsilon)
Game.Initialize()
MCTS.Initialize()

In [3]:
from collections import deque

replay_memory_size = 10000
replay_memory = deque([], maxlen=replay_memory_size)

def sample_memories(batch_size):
    
#    global replay_memory
    
    indices = rnd.permutation(len(replay_memory))[:batch_size]
    cols = [[], [], [], [], []] #z, v, pi, p, state
    for idx in indices:
        memory = replay_memory[idx]
        for col, value in zip(cols, memory):
            col.append(value)
    cols = [np.array(col) for col in cols]
    return cols[0].reshape(-1,1), cols[1], cols[2], cols[3], cols[4]

In [4]:
def Input_Converter(nowBoard):
    
    data = []
    for i in nowBoard:
        conv_i = -1
        if (i==0):
            conv_i = 0
        elif (i==1):
            conv_i = 2
        else:
            conv_i = 1
        data.append([i,conv_i])

    return data

#print(Input_Converter(Game.Board))

def Result_Converter(reward,policy):
    data = []
    data.extend(policy)
    data.extend(reward)
    
    return data

def Calculate_Loss(z,v,pi,p):
        
    loss = (z-v)**2
    tmp = 0
    for e_pi,e_p in zip(pi,p):
        tmp = tmp + v*e_pi*e_p
    loss = loss + tmp
    
    return loss

#print (Input_Converter(Game.Board))

In [5]:
import tensorflow as tf

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    
print (int(L/2), int(L/4), int(L/8))
z = 0
v = 0
pi = []
p = []

1 0 0


In [6]:
reset_graph()

input_height = L
input_width = L
input_channels = 2

conv_n_maps = [8,8,8]
conv_kernel_sizes = [3,2,1]
conv_strides = [1,1,1]
conv_paddings = ["SAME"] * 3
conv_activation = [tf.nn.relu] * 3

n_hidden_inputs = L * L * conv_n_maps[-1]
n_hidden = 256
hidden_activation = tf.nn.relu
n_outputs = N+1
#initializer = tf.contrib.layers.variance_scaling_initializer()
initializer = tf.contrib.layers.xavier_initializer()

learning_rate = 0.01

def Network(X_state, scope):
    prev_layer = X_state
    conv_layers = []
    with tf.variable_scope(scope) as scope:
        for n_maps, kernel_size, strides, padding, activation in zip(conv_n_maps, conv_kernel_sizes, conv_strides, conv_paddings, conv_activation):
            prev_layer = tf.layers.conv2d(prev_layer, 
                                          filters=n_maps, 
                                          kernel_size=kernel_size, 
                                          strides=strides, 
                                          padding=padding, 
                                          activation=activation,
                                          kernel_initializer=initializer)
            conv_layers.append(prev_layer)
        last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_inputs])
        hidden = tf.layers.dense(last_conv_layer_flat, n_hidden, activation=hidden_activation, kernel_initializer=initializer)
        outputs = tf.layers.dense(hidden, n_outputs)
    trainable_vars = {var.name[len(scope.name):]: var for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}
    return outputs, trainable_vars

X = tf.placeholder(tf.float32, shape=[None, input_height*input_width, input_channels])
X_state = tf.reshape(X, shape=[-1, input_height, input_width, input_channels])


actor_values, actor_vars = Network(X_state, scope="Networks/actor")    # acts
#critic_values, critic_vars = Network(X_state, scope="Networks/critic")# learns

#copy_ops = [actor_var.assign(critic_vars[var_name])
#            for var_name, actor_var in actor_vars.items()]
#copy_critic_to_actor = tf.group(*copy_ops)

In [7]:
with tf.variable_scope("train"):

    Y = tf.placeholder(tf.float32, shape=[None, 1])
    term_loss = Calculate_Loss(z,v,pi,p)
    
    vars_in_network = tf.trainable_variables()
    l2_loss = tf.add_n([tf.nn.l2_loss(var) for var in vars_in_network]) * l2_c
    loss_total = term_loss + l2_loss
    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
    training_op = optimizer.minimize(loss_total, global_step = global_step)

In [8]:
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [9]:
n_steps = 3000  # total number of training steps
training_start = 0  # start training after 1,000 game iterations
training_interval = 3  # run a training step every 3 game iterations
save_steps = 50  # save the model every 50 training steps
copy_steps = 3  # copy the critic to the actor every 25 training steps
discount_rate = 0.95
batch_size = 10
find_next_iter = 20
iteration = -1  # game iterations
checkpoint_path = "./checkpoint/my_omok.ckpt"
done = True # env needs to be reset
replay_memory = deque([], maxlen=replay_memory_size)

with tf.Session() as sess:
    if os.path.isfile(checkpoint_path):
        saver.restore(sess, checkpoint_path)
        print("yes")
    else:
        init.run()
        print("no")
    
    while True:
        step = global_step.eval()
        
        if iteration >= n_steps:
            break
        iteration += 1
        if (iteration % 50 == 0):
            print("\rIteration {}".format(iteration))

        if done: # game over, start again
            
            Game.Initialize()
            MCTS.Initialize()
            
            next_values = actor_values.eval(feed_dict={X: [Input_Converter(Game.Board)]})
            leaf_v = next_values[0][-1]
            leaf_p = next_values[0][:-1]
           
            MCTS.MCTS_EXPAND(Game.Board,leaf_p,leaf_v)
            reward = 0
            tmp_memory = []
           
        
        while(1):
            for i in range(find_next_iter):
                A_t = MCTS.MCTS_SELECT(c_puct,Game.Board)
                #print (A_t)
                if (A_t):
                    next_values = actor_values.eval(feed_dict={X: [Input_Converter(A_t)]})
                    leaf_v = next_values[0][-1]
                    leaf_p = next_values[0][:-1]
                    backup_v = MCTS.MCTS_EXPAND(A_t,leaf_p,leaf_v)
                    MCTS.MCTS_BACKUP(backup_v,A_t)
                    
                    #print (next_values)
                    #input()
            
                else:
                    state = Game.Board

                    next_values = actor_values.eval(feed_dict={X: [Input_Converter(state)]})
                    leaf_v = next_values[0][-1]
                    leaf_p = next_values[0][:-1]

                    backup_v = MCTS.MCTS_EXPAND(Game.Board,leaf_p,leaf_v)
                    MCTS.MCTS_BACKUP(backup_v,state)


            for_memory = actor_values.eval(feed_dict={X:[Input_Converter(Game.Board)]})
            memory_v = for_memory[-1]
            memory_p = for_memory[:-1]
            
            tmp_memory.append([memory_v,MCTS.Policy,memory_p,Input_Converter(Game.Board)])
            
            #print (MCTS.Nt[MCTS.Tree.index(Game.Board)])
            #Game.PrintBoard()
            # Actor plays
            plyer, move_x, move_y= MCTS.MCTS_PLAY(tau,Game.Board)
            Game.Playing(plyer,move_x,move_y)

            
            #print (move_x,move_y, iteration)
            if(iteration % 1000 == 0):
                print (Game.PrintBoard())
            
            if (move_y == -1):
                print (MCTS.Policy)
                print (MCTS.Proport)
                print (len(MCTS.Policy), len(MCTS.Proport))
                print (MCTS.Nt)
                Game.PrintBoard()
                print(MCTS.Tree)
                print(MCTS.Tree_Net)
                print(MCTS.Nt)
                
                input()
                
                
            # For showing            
            if (Game.BFF_Bingo(plyer)):
                #print(plyer,"win")
                #reward = 1
                ur_win = -1
                #print (plyer,"win")
                break
            
            if (not Game.Board.count(0)):
                ur_win = 0
                #print ("Draw")
                break
        
        player1 = Game.Board.count(1)
        player2 = Game.Board.count(2)
        num_move = player1 + player2
        
        for i in range(num_move):
            ur_win = ur_win * -1
            tmp_memory[i].insert(0,ur_win)
            replay_memory.append(tmp_memory[i])
            
        if iteration < training_start or iteration % training_interval != 0:
            continue
        
        # Critic learns
        
        if (iteration % 5 == 0):
            z,v,pi,p,state = sample_memories(batch_size)
            #print (z)

            training_op.run(feed_dict={X:state, Y: z})

        # Regularly copy critic to actor
        #if iteration % 10 == 0:
            #print(MCTS.Policy)
            #copy_critic_to_actor.run()
            #Game.PrintBoard()

        # And save regularly
        #if iteration % save_steps == 0:
            #saver.save(sess, checkpoint_path)

no
Iteration 0
- - - 
- - O 
- - - 
None
- - - 
- - O 
- X - 
None
O - - 
- - O 
- X - 
None
O - - 
X - O 
- X - 
None
O - O 
X - O 
- X - 
None
O - O 
X - O 
- X X 
None
O - O 
X O O 
- X X 
None
O - O 
X O O 
X X X 
None
Iteration 50
Iteration 100
Iteration 150
Iteration 200
Iteration 250
Iteration 300
Iteration 350
Iteration 400
Iteration 450
Iteration 500
Iteration 550
Iteration 600
Iteration 650
Iteration 700
Iteration 750
Iteration 800
Iteration 850
Iteration 900
Iteration 950
Iteration 1000
O - - 
- - - 
- - - 
None
O - - 
- - - 
- X - 
None
O - - 
O - - 
- X - 
None
O - - 
O - - 
X X - 
None
O - O 
O - - 
X X - 
None
O - O 
O X - 
X X - 
None
O - O 
O X O 
X X - 
None
O - O 
O X O 
X X X 
None
Iteration 1050
Iteration 1100
Iteration 1150
Iteration 1200
Iteration 1250
Iteration 1300
Iteration 1350
Iteration 1400
Iteration 1450
Iteration 1500
Iteration 1550
Iteration 1600
Iteration 1650
Iteration 1700
Iteration 1750
Iteration 1800
Iteration 1850
Iteration 1900
Iteration 1950
Iter

In [10]:
print (Input_Converter(A_t))

[]


In [11]:
goal = 3
L = 3
start = 1
match = 0
for i in range(0,goal):
    print (start,i,start+(i+(L*i)),start+(L*(i+1)),start+i+1)
    if (Game.Board[start+(i+(L*i))] == 0):
        match += 1
        #if (start+(L*(i+1)) > L*(L-1)) or 
        if ((start+i+1)%L == 0) :
            print ("break")
            break


1 0 1 4 2
1 1 5 7 3
1 2 9 10 4


IndexError: list index out of range

In [None]:
Game.Initialize()

reward = -1
leaf_p = [float("{0:.2f}".format(np.random.rand())) for i in range(N)]
leaf_v = float("{0:.2f}".format(np.random.rand()))
print (Game.Board)
MCTS.MCTS_EXPAND(Game.Board,leaf_p,leaf_v)
#print(MCTS.Tree.index(Game.Board))

for j in range(50):
    for i in range(10):
        A_t = MCTS.MCTS_SELECT(c_puct,Game.Board)
        #print (A_t)
        if (A_t):
            leaf_p = [float("{0:.2f}".format(np.random.rand())) for i in range(N)]
            leaf_v = float("{0:.2f}".format(np.random.rand()))
            backup_v = MCTS.MCTS_EXPAND(A_t,leaf_p,leaf_v)
            MCTS.MCTS_BACKUP(backup_v,A_t)
        else:
            break
    """
    print ("Tree")
    print (MCTS.Tree)
    print ("Tree_Net")
    print (MCTS.Tree_Net)
    print ("Nt")
    print (MCTS.Nt)
    print ("Qt")
    print (MCTS.Qt)
    print ("Wt")
    print (MCTS.Wt)
    print ("Pt")    
    print (MCTS.Pt)
    print ("Trajectory")
    print (MCTS.Trajectory)
    print ("Trajectory_A")
    print (MCTS.Trajectory_A)
    """
    plyer, x, y= MCTS.MCTS_PLAY(tau,Game.Board)
    Game.Playing(plyer,x,y)
    if (Game.BFF_Bingo(plyer)):
        print(plyer,"win")
        reward = 1
        break
    print(Game.PrintBoard())
    #input()