In [132]:
import gym
from gym import spaces
import numpy as np
import random
import time
from collections import deque
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import math
from functools import partial

In [279]:
COLS = 7
ROWS = 6
EPISODES = 10000
GAMES_PER_BATCH = 6
UPDATE_EVRY = 600
ESILON_DECAY = 0.99
epsilon = 1

In [134]:
class Connect4Env(gym.Env):
    def __init__(self):
        self.action_space = spaces.Discrete(COLS)
        self.observation_space = spaces.Box(low=-1, high=1, shape=(ROWS,COLS), dtype=np.int32)
    def step(self, col):
        if self.done:
            print('Game is over must. Reset enviorment!!!')
            return
        if self.board[0][col] != 0:
            self.done = True
            self.reward = -2
            return [self.board, self.reward, self.done]

        self.board = self.board * -1
        
        row = None
        for r in range(ROWS-1,-1,-1):
            if self.board[r][col] == 0:
                self.board[r][col] = 1
                row = r 
                break
        #Horizontal
        connected = 0
        for c in range(col+1, COLS):
            if self.board[row][c] != 1:
                break
            connected += 1
        for c in range(col-1, -1, -1):
            if self.board[row][c] != 1:
                break
            connected += 1
        if connected >= 3:
            self.done = True
            self.reward = 1
            return [self.board, self.reward, self.done]
        
        #Vertical
        connected = 0
        for r in range(row+1, ROWS):
            if self.board[r][col] != 1:
                    break
            connected += 1
        for r in range(row-1, -1, -1):
            if self.board[r][col] != 1:
                break
            connected += 1
        if connected >= 3:
            self.done = True
            self.reward = 1
            return [self.board, self.reward, self.done] 
        
        #Positive Diagonal
        connected = 0
        for x in range(1, min(ROWS,COLS)):
            if row + x > ROWS-1 or col + x > COLS-1:
                break
            if self.board[row+x][col+x] != 1:
                break
            connected += 1
        for x in range(1, min(ROWS,COLS)):
            if row - x < 0 or col - x < 0:
                break
            if self.board[row-x][col-x] != 1:
                break
            connected += 1
        if connected >= 3:
            self.done = True
            self.reward = 1
            return [self.board, self.reward, self.done]
        
        #Negative Digaonal
        connected = 0
        for x in range(1, min(ROWS,COLS)):
            if row + x > ROWS-1 or col - x < 0:
                break
            if self.board[row+x][col-x] != 1:
                break
            connected += 1
        for x in range(1, min(ROWS,COLS)):
            if row - x < 0 or col + x > COLS-1:
                break
            if self.board[row-x][col+x] != 1:
                break
            connected += 1
        if connected >= 3:
            self.done = True
            self.reward = 1
            return [self.board, self.reward, self.done]
        
        #Full board
        self.done = True
        for i in range(COLS):
            if self.board[0][i] == 0:
                self.done = False
                break
        return [self.board, self.reward, self.done]
        
        
    def reset(self):
        self.done = False
        self.reward = 0
        self.turn = 1
        self.board = np.zeros((ROWS,COLS))
        return self.board

In [135]:
class ActorCriticNetwork(tf.keras.Model):
    def __init__(self):
        super(ActorCriticNetwork, self).__init__()
        self.i = tf.keras.layers.Input(shape=(ROWS,COLS,1))
        mask = np.asarray([c != c for c in range(COLS)], dtype=bool)
        
        self.shared_layers = tf.keras.Sequential([
            tf.keras.layers.Conv2D(512,(3,3),padding ='same'),
            tf.keras.layers.Normalization(axis=None),
            tf.keras.layers.Activation(partial(tf.nn.leaky_relu, alpha=0.01)),

            tf.keras.layers.Conv2D(512,(3,3),padding ='same'),
            tf.keras.layers.Normalization(axis=None),
            tf.keras.layers.Activation(partial(tf.nn.leaky_relu, alpha=0.01)),

            tf.keras.layers.Conv2D(512,(3,3)),
            tf.keras.layers.Normalization(axis=None),
            tf.keras.layers.Activation(partial(tf.nn.leaky_relu, alpha=0.01)),

            tf.keras.layers.Conv2D(512,(3,3)),
            tf.keras.layers.Normalization(axis=None),
            tf.keras.layers.Activation(partial(tf.nn.leaky_relu, alpha=0.01)),
            
            tf.keras.layers.Flatten(),

            tf.keras.layers.Dense(3072, activation=partial(tf.nn.leaky_relu, alpha=0.01)),
            tf.keras.layers.Dense(1024, activation=partial(tf.nn.leaky_relu, alpha=0.01))
        ])

        self.value_head = tf.keras.Sequential([
            tf.keras.layers.Dense(512, activation=partial(tf.nn.leaky_relu, alpha=0.01)),
            tf.keras.layers.Dense(1, activation='linear')
        ])

        self.policy_head = tf.keras.Sequential([
            tf.keras.layers.Dense(512, activation=partial(tf.nn.leaky_relu, alpha=0.01)),
            tf.keras.layers.Dense(COLS)
        ])
        
        self.masking = tf.keras.layers.Softmax()
        
        z = self.shared_layers(self.i)
        value = self.value_head(z)
        p = self.policy_head(z)
        policy = self.masking(p, mask = mask)
        self.model = tf.keras.models.Model(inputs=self.i, outputs=[policy,value])
        
    def value(self, state):
        z = self.shared_layers(state)
        value = self.value_head(z)
        return value
    
    def policy(self, state):
        mask = IllegalMoveMask(state)
        z = self.shared_layers(state)
        p = self.policy_head(z)
        policy = self.masking(p, mask = mask)
        return policy
    
    def un_masked_policy(self, state):
        z = self.shared_layers(state)
        p = self.policy_head(z)
        policy = self.masking(p)
        return policy
    
    def value_policy(self, state):
        mask = IllegalMoveMask(state)
        z = self.shared_layers(state)
        value = self.value_head(z)
        p = self.policy_head(z)
        policy = self.masking(p, mask = mask)
        return value, policy

In [136]:
def IllegalMoveMask(states):
    m = []
    for i in range(len(states)):
        m.append([(states[i][0][c])[0] == 0 for c in range(COLS)])
    m = np.asarray(m, dtype=bool)
    return m

In [137]:
#Make this use a training and testing model and store training data
def rollout(env, model_train, model_test):
    
    training_data = [[], [], [], [],  []]
    
    done = False
    board = env.reset()
    
    #half of the time the test model goes first
    if random.uniform(0, 1) > 0.5:
        probs = model_test.policy(board.reshape(-1, *board.shape, 1))
        action_dist = tfp.distributions.Categorical(probs=probs)
        opp_action = action_dist.sample()[0]
        board, opp_reward, done = env.step(opp_action)
        
    while not done:
        val, probs = model_train.value_policy(board.reshape(-1, *board.shape, 1))
        val = val.numpy()[0][0]
        
        #Training action
        print(probs)
        action_dist = tfp.distributions.Categorical(probs=probs)
        
        if random.uniform(0, 1) > epsilon:
            action = action_dist.sample()
            action = action.numpy()[0]
        else:
            action = random.choice([c for c in range(COLS) if board[0][c] == 0])
            
        action_prob = action_dist.prob(action)
        action_prob = action_prob.numpy()
        save_board = board
        board, reward, done = env.step(action)
        
        #if the game is not over opponent gets to move
        if not done:
            probs = model_test.policy(board.reshape(-1, *board.shape, 1))
            action_dist = tfp.distributions.Categorical(probs=probs)
            opp_action = action_dist.sample()[0]
            board, opp_reward, done = env.step(opp_action)
            
        #if opponent won we lost
        if opp_reward == 1:
            reward = -1 
        #if opponent made an illegal move we win
        if opp_reward == -2:
            reward = 1
            
            
        for i, item in enumerate((save_board, action, reward, val, action_prob)):
            training_data[i].append(item)
            
    training_data[3] = calculate_gaes(training_data[2],training_data[3])
    training_data.append(discounted_reward(training_data[2]))

    return training_data

In [138]:
def calculate_gaes(rewards, values, gamma=0.99, decay=0.97):
    next_values = np.concatenate([values[1:],[0]])
    deltas = [r+gamma*n_v - v for r,v,n_v in zip(rewards, values, next_values)]
    gaes = [deltas[-1]]
    for i in reversed(range(len(deltas)-1)):
        gaes.append(deltas[i] + gamma * decay * gaes[-1])
    return(gaes[::-1])

def discounted_reward(rewards, gamma=0.99):
    new_rewards = [float(rewards[-1])]
    for i in reversed(range(len(rewards)-1)):
        new_rewards.append(float(rewards[i]) + gamma * new_rewards[-1])
    return new_rewards[::-1]

In [162]:
#Trainer
def Train_PPO(model, boards, actions, old_probs, gaes, disc_rewards, lr=0.0000005, epsilon=0.2, delta = 0.001, max_iter = 80):
    
    boards = tf.convert_to_tensor(boards)
    gaes = tf.convert_to_tensor(gaes, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions) 
    old_probs = tf.convert_to_tensor(old_probs, dtype=tf.float32)
    old_probs = tf.maximum(old_probs,1e-10) #prvent div by 0
    old_probs = tf.squeeze(old_probs,1)
    
    mask = IllegalMoveMask(tf.expand_dims(boards, -1))
    
    for i in range(max_iter):
        with tf.GradientTape(persistent = True) as tape:
            probs = model.un_masked_policy(tf.expand_dims(boards, -1))
            probs = tf.maximum(probs*mask,1e-10)
            dist = tfp.distributions.Categorical(probs=probs)
            new_probs = dist.prob(actions)
            new_probs = tf.maximum(new_probs,1e-10) #prvent div by 0
            policy_ratio = new_probs/old_probs
            ploicy_loss = -tf.math.minimum(policy_ratio*gaes,
                                           tf.clip_by_value(policy_ratio, 1-epsilon, 1+epsilon)*gaes)
            policy_loss = tf.math.reduce_mean(ploicy_loss)
            
            
            new_vals = model.value(tf.expand_dims(boards, -1))
            new_vals = tf.squeeze(new_vals,1)
            value_loss = tf.keras.losses.MSE(new_vals, disc_rewards)
            total_loss = policy_loss + value_loss
                
        gradients = tape.gradient(total_loss, tape.watched_variables())
        tf.keras.optimizers.Adam(lr).apply_gradients(zip(gradients, tape.watched_variables()))
        
        #Check if the kl divergence is too much before applying change
        r = tf.maximum(policy_ratio, 1e-10) #prvent log of 0
        kl_div = tf.math.reduce_mean((r-1)-tf.math.log(r))      
        print(kl_div)
        if kl_div >= delta:
            break

In [140]:
#Initialization
ac = ActorCriticNetwork()
ac_opp = ActorCriticNetwork()
ac_opp.set_weights(ac.get_weights())
env = Connect4Env()
ep_i = 0
epsilon = 1

In [281]:
#Training loop
for _ in range(EPISODES):
    training_data = [[],[],[],[],[],[]]
    for _ in range(GAMES_PER_BATCH):
        new_data = rollout(env, ac, ac_opp)
     
        for i in range(len(training_data)):
            training_data[i] = training_data[i] + new_data[i]
            
    epsilon = epsilon*ESILON_DECAY 
    
    #Shuffle the training data
    s = random.randint(0,100000)#Any big number
    training_data = [random.Random(s).sample(x,len(training_data[0])) for x in training_data]


    #Train PPO
    Train_PPO(ac, training_data[0], training_data[1], training_data[4], training_data[3], training_data[5])

    ep_i+=1
    if not ep_i % UPDATE_EVRY:     
        ac_opp.set_weights(ac.get_weights())
        ac.save_weights("Episode-{}-model-weights".format(ep_i))
        epsilon = 1

tf.Tensor(
[[1.0128384e-13 2.1800549e-01 2.6325852e-02 6.5245187e-01 1.0321674e-01
  7.7829991e-12 2.2539524e-17]], shape=(1, 7), dtype=float32)
tf.Tensor(
[[1.7581080e-14 1.9245337e-01 2.3465682e-02 7.0393693e-01 8.0143988e-02
  1.7612478e-12 2.4941847e-18]], shape=(1, 7), dtype=float32)
tf.Tensor(
[[1.4581213e-13 3.9339980e-01 4.9212296e-02 4.7504917e-01 8.2338721e-02
  1.1907458e-11 3.3682798e-17]], shape=(1, 7), dtype=float32)
tf.Tensor(
[[4.2411773e-12 4.1564646e-01 7.0482858e-02 4.3176973e-01 8.2100891e-02
  2.0803397e-10 2.4310998e-15]], shape=(1, 7), dtype=float32)
tf.Tensor(
[[7.0287508e-12 4.3981269e-01 8.1903346e-02 4.1716504e-01 6.1118897e-02
  3.2753225e-10 4.6908655e-15]], shape=(1, 7), dtype=float32)
tf.Tensor(
[[1.3422461e-09 6.8792230e-01 1.9615321e-01 0.0000000e+00 1.1592444e-01
  2.9177182e-08 3.7061612e-12]], shape=(1, 7), dtype=float32)
tf.Tensor(
[[8.9347525e-09 6.1524612e-01 2.1552716e-01 0.0000000e+00 1.6922657e-01
  1.4499597e-07 4.2309194e-11]], shape=(1, 7), 

KeyboardInterrupt: 

In [277]:
#Show game in terminal
done = False
board = env.reset()
t = 1
while not done:
    print(board*t)
    probs = ac.policy(board.reshape(-1, *board.shape, 1))
    print(probs)
    action = np.argmax(probs)
    board, reward, done = env.step(action)
    if done:
        print(board)
    t = t * -1

[[0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
tf.Tensor(
[[1.2814886e-13 4.0082127e-01 8.5881921e-03 5.8045435e-01 1.0136227e-02
  6.8818085e-12 2.3772584e-17]], shape=(1, 7), dtype=float32)
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]]
tf.Tensor(
[[2.1340839e-14 3.5317177e-01 6.7706197e-03 6.3308042e-01 6.9771996e-03
  1.4755902e-12 2.4656325e-18]], shape=(1, 7), dtype=float32)
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.]]
tf.Tensor(
[[6.7487011e-14 4.1808000e-01 9.5635783e-03 5.6674594e-01 5.6105088e-03
  4.1028973e-12 1.0488740e-17]], shape=(1, 7), dtype=float32)
[[ 0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  

In [75]:
#load saved weights
ep_i = 300
ac.load_weights("Episode-{}-model-weights".format(ep_i))
ac_opp.load_weights("Episode-{}-model-weights".format(ep_i))
epsilon = 1

In [251]:
#play against the model
done = False
board = env.reset()

In [265]:
probs = ac.policy(board.reshape(-1, *board.shape, 1))
print(probs)
action = np.argmax(probs)
board, reward, done = env.step(action)
print(board)

tf.Tensor(
[[4.9855975e-10 6.1719155e-01 8.1421666e-02 3.0138665e-01 0.0000000e+00
  1.6651690e-07 5.5534349e-12]], shape=(1, 7), dtype=float32)
[[ 0.  0.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0. -1.  0.  0.]
 [ 0. -1.  0.  0.  1.  0.  0.]
 [-1. -1.  0.  0.  1.  0.  0.]
 [-1. -1. -1.  0.  1.  0.  0.]]


In [None]:
board, reward, done = env.step(2)
print(board)