In [5]:
import numpy as np
import tensorflow as tf

from game import play_step
from utils import generate_board
from players import trained_player
from game_display import print_board

from IPython.display import clear_output

In [6]:
# --- Training hyperparameters -------------------------------------------
# Optimizer step size; handed to both Adam optimizers in a later cell.
learning_rate = 0.001
# Presumably the reward discount factor -- not referenced in the visible
# cells, TODO confirm where it is consumed.
gamma = 0.99

# Training-loop sizing; neither value is referenced in the visible cells.
num_epochs = 1_000
batch_size = 32

# Forwarded to `trained_player` as its last argument; presumably an
# epsilon-greedy exploration rate -- TODO confirm against players.py.
eps = 0.2

# Actor (policy) network: 90 input features -> 64 -> 32 -> softmax over 81
# outputs. Presumably one output per cell of the 9x9 board -- TODO confirm
# against the state encoding used by `trained_player`.
actor_model = tf.keras.Sequential()
actor_model.add(
    tf.keras.layers.Dense(units=64, activation='relu', input_shape=(90,))
)
actor_model.add(tf.keras.layers.Dense(units=32, activation='relu'))
actor_model.add(tf.keras.layers.Dense(units=81, activation='softmax'))

# Critic (value) network: same 90-feature input and hidden sizes as the
# actor, ending in a single linear unit (one scalar estimate per state).
critic_model = tf.keras.Sequential()
critic_model.add(
    tf.keras.layers.Dense(units=64, activation='relu', input_shape=(90,))
)
critic_model.add(tf.keras.layers.Dense(units=32, activation='relu'))
critic_model.add(tf.keras.layers.Dense(units=1))

In [7]:
# One Adam instance per network so each keeps its own optimizer state;
# both share the `learning_rate` defined in the hyperparameter cell.
actor_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)
critic_optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)

def actor_loss(action_probabilities, advantages, epsilon=1e-8):
    """Policy-gradient loss: the sum of ``-log(p) * advantage``.

    Args:
        action_probabilities: Tensor (or array-like) of probabilities.
            Presumably the actor's probabilities for the actions actually
            taken -- TODO confirm against the training loop.
        advantages: Tensor broadcastable against ``action_probabilities``.
        epsilon: Lower bound applied to the probabilities before taking the
            log. Guards against ``log(0) = -inf`` (and the resulting NaN
            loss/gradients) when a softmax output underflows to zero.

    Returns:
        Scalar tensor holding the summed loss.
    """
    # Floor the probabilities so the log stays finite; values >= epsilon
    # pass through unchanged, so well-behaved inputs give the same loss.
    safe_probabilities = tf.maximum(action_probabilities, epsilon)
    negative_log_prob = -tf.math.log(safe_probabilities)
    return tf.reduce_sum(negative_log_prob * advantages)

def critic_loss(values, returns):
    """Mean squared error between ``returns`` and ``values``.

    Args:
        values: Value estimates (tensor or array-like).
        returns: Target returns, broadcastable against ``values``.
            NOTE(review): a shape mismatch such as (N,) vs (N, 1) would
            broadcast silently -- verify shapes at the call site.

    Returns:
        Scalar tensor: the mean of the squared residuals.
    """
    residual = returns - values
    return tf.reduce_mean(tf.square(residual))

In [10]:
# Play one interactive game: player 0 is driven by the actor network,
# player 1 replies, and the board is redrawn after every full round.
all_board, global_board = generate_board()
i = j = None
# The loop below keeps going only while both rewards are equal to 1.
reward = other_reward = 1

while True:
    # Player 0: `trained_player` picks the move from the current boards.
    new_state = trained_player(all_board, global_board, i, j, actor_model, eps)
    all_board, global_board, reward, i, j = play_step(
        all_board, global_board, new_state, player=0
    )
    if reward != 1:
        break

    # Player 1: plays the (i, j) pair returned by the previous step.
    all_board, global_board, other_reward, i, j = play_step(
        all_board, global_board, (i, j), player=1
    )

    clear_output()
    print_board(all_board, global_board)
    input()  # block until the user presses Enter before the next round

    if other_reward != 1:
        break

# Show the final position once the game is over.
clear_output()
print_board(all_board, global_board)


 X |   | X ||   |   | O ||   OOOOO   
---+---+---||---+---+---||  O     O  
   |   | O || X |   |   ||  O     O  
---+---+---||---+---+---||  O     O  
   |   | X ||   | X | X ||   OOOOO   
   OOOOO   ||   | O | X ||   |   | O 
  O     O  ||---+---+---||---+---+---
  O     O  || X | O |   ||   | X |   
  O     O  ||---+---+---||---+---+---
   OOOOO   || X |   |   || X | X |   
   OOOOO   ||   OOOOO   ||   OOOOO   
  O     O  ||  O     O  ||  O     O  
  O     O  ||  O     O  ||  O     O  
  O     O  ||  O     O  ||  O     O  
   OOOOO   ||   OOOOO   ||   OOOOO   


In [9]:
# Re-render the current `all_board` / `global_board` with the project's
# `print_board` helper.
# NOTE(review): this cell's execution count (9) is lower than the game-loop
# cell's (10), so its stored output comes from an earlier kernel state --
# re-run the notebook top to bottom before relying on it.
print_board(all_board, global_board)

   X   X   ||   X   X   ||   OOOOO   
    X X    ||    X X    ||  O     O  
     X     ||     X     ||  O     O  
    X X    ||    X X    ||  O     O  
   X   X   ||   X   X   ||   OOOOO   
   OOOOO   ||   X   X   || O |   | O 
  O     O  ||    X X    ||---+---+---
  O     O  ||     X     ||   | X |   
  O     O  ||    X X    ||---+---+---
   OOOOO   ||   X   X   ||   | X | O 
   OOOOO   || X | X | O ||   X   X   
  O     O  ||---+---+---||    X X    
  O     O  || O | X |   ||     X     
  O     O  ||---+---+---||    X X    
   OOOOO   || X | O | O ||   X   X   
