In [None]:
import numpy as np
import tensorflow as tf
# keras model approach
from tensorflow.keras import Model,Sequential
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,BatchNormalization,Dense, Input
from tensorflow.keras.activations import relu
from tqdm import tqdm
from collections import deque
import random
import gym

In [None]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all.
# On a CPU-only machine the device list is empty, so the loop simply does
# nothing (indexing [0] unconditionally would raise IndexError there).
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
tf.keras.backend.set_floatx('float32')

In [None]:
from kaggle_environments import evaluate, make, utils

# Create a "connectx" environment with debug=True and draw its current
# board inline in the notebook.
env = make("connectx", debug=True)
env.render(mode="ipython", width=500, height=450)

In [None]:
# Scratch cell: build a fresh environment and inspect its API and state.
env = make("connectx", debug=True)
print(dir(env))
#print(env.state[0]['observation']['board'])
# The triple-quoted string below is a disabled experiment that wrote pieces
# straight into the board list. As a bare string literal in the middle of the
# cell it has no effect.
'''env.state[0]['observation']['board'][0] = 1
env.state[0]['observation']['board'][1] = 1
env.state[0]['observation']['board'][2] = 1
env.state[0]['observation']['board'][3] = 1'''
print(env.done)
# Step three times with the action list [1, 1] and keep the last return value
# so it can be printed next to the resulting board.
env.step([1,1])
env.step([1,1])
a = env.step([1,1])
print("A")
print(a)
print(env.state[0]['observation']['board'])
env.render(mode="ipython", width=500, height=450)

In [None]:
# Number of selectable actions, read from the 'columns' entry of the
# environment configuration.
action_space = env.configuration['columns']
obs = env.reset()
# The model is a plain dense network (no CNN): every board square is fed to it
# directly. The input length is the number of board cells times 3 because
# encode() turns the board into three binary vectors -- one marking empty
# squares, one marking player 1's pieces and one marking player 2's pieces --
# and concatenates them into a single vector.

nn_input = len(obs[0]['observation']['board'])*3
print(nn_input)
env.render(mode="ipython", width=500, height=450)

In [None]:
# Single Adam optimizer instance shared by the notebook: model_keras() passes
# it to compile(), and batch_train() calls adam.apply_gradients() on
# model_1's trainable weights.
adam = tf.keras.optimizers.Adam(learning_rate = 0.001)

In [None]:
def model_keras():
    """Build and compile the dense Q-value network.

    The network takes an encoded board of length ``nn_input`` and produces one
    linear output per action (``action_space`` values). The hidden stack is a
    1000-unit dense layer followed by four 500-unit dense layers; every hidden
    layer uses ReLU and is followed by batch normalisation.

    Returns:
        The compiled ``Model`` named "RL_Value_Function" (MSE loss, the
        module-level ``adam`` optimizer).
    """
    inputs = Input(shape=(nn_input,))
    x = Dense(1000,activation='relu',kernel_initializer="glorot_uniform")(inputs)
    x = BatchNormalization()(x)
    # Chain from the previous layer's output. Wiring this layer to `inputs`
    # would silently drop the 1000-unit layer above from the model graph.
    x = Dense(500,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    x = Dense(500,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    x = Dense(500,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    x = Dense(500,activation='relu',kernel_initializer="glorot_uniform")(x)
    x = BatchNormalization()(x)
    output = Dense(action_space,activation='linear',kernel_initializer="glorot_uniform")(x)
    model = Model(inputs=inputs, outputs=output, name="RL_Value_Function")

    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that would add a stray "None" line).
    model.summary()

    model.compile(optimizer=adam,loss='mean_squared_error',metrics=['mean_squared_error'])

    return model
# converts observation into NN input type
def encode(board):
    """Turn a flat board into three concatenated 0/1 indicator vectors.

    Args:
        board: iterable of cell values, where 0 is an empty square, 1 is a
            piece of player 1 and 2 is a piece of player 2.

    Returns:
        A 1-D numpy array three times as long as ``board``: first the
        "is empty" flags, then the "is player 1" flags, then the
        "is player 2" flags, each in board order.
    """
    cells = list(board)
    planes = [
        np.array([1 if cell == marker else 0 for cell in cells])
        for marker in (0, 1, 2)
    ]
    return np.concatenate(planes, axis=0)
# Two networks with the same architecture. In batch_train() model_1 is the one
# that receives gradient updates, while model_2 predicts the next-state values;
# update_target_network() copies model_1's weights into model_2.
model_1 = model_keras()
model_2 = model_keras()

In [None]:
# Disabled code: the gym.Env wrapper below is enclosed in a string literal, so
# this cell defines no ConnectX class. It is kept only as a reference sketch.
'''class ConnectX(gym.Env):
    def __init__(self, switch_prob=0.0):
        self.env = make('connectx', debug=False)
        self.pair = [None, 'random']
        self.trainer = self.env.train(self.pair)
        self.switch_prob = switch_prob

        # Define required gym fields (examples):
        config = self.env.configuration
        self.action_space = gym.spaces.Discrete(config.columns)
        self.observation_space = gym.spaces.Discrete(config.columns * config.rows)

    def switch_trainer(self):
        self.pair = self.pair[::-1]
        self.trainer = self.env.train(self.pair)

    def step(self, action):
        return self.trainer.step(action)
    
    def reset(self):
        if np.random.random() < self.switch_prob:
            self.switch_trainer()
        return self.trainer.reset()
    
    def render(self, **kwargs):
        return self.env.render(**kwargs)'''

In [None]:
def custom_loss(y_true, y_pred):
    """Loss used for the Q-network update: Keras mean squared error.

    Args:
        y_true: target Q-values.
        y_pred: Q-values predicted by the network.

    Returns:
        Whatever ``tf.keras.losses.mean_squared_error`` returns for the pair.
    """
    mse_fn = tf.keras.losses.mean_squared_error
    return mse_fn(y_true, y_pred)

In [None]:
# Replay memory: transitions are stored here and sampled at random so that the
# TD-learning updates run on decorrelated minibatches.
#
# Only the Q-value of the action that was actually taken should move towards a
# new target. batch_train() therefore copies the network's own predictions and
# overwrites just the entry of the chosen action with
# reward + gamma * max(next-state Q), or with the bare reward when the episode
# has ended. All other outputs then contribute zero error.

replay_batch = deque(maxlen = 3000)

warmup = 10 # the training loop only calls batch_train() while its episode index i satisfies i > warmup

# Exploration schedule, read by policy() and updated in place by batch_train():
#   "epsilon"       - current probability of picking a random action
#   "epsilon_decay" - factor epsilon is multiplied by on each batch_train() call
#   "epsilon_min"   - decay stops once epsilon is no longer above this value
epsilon = {
"epsilon" : 1.0,
"epsilon_decay": 0.99999,
"epsilon_min":0.01,
}

In [None]:
import random
# Seed both random sources used by the agent: `random` drives replay sampling
# and random moves, while `np.random` drives the epsilon test in policy().
# Seeding only one of them leaves exploration non-reproducible.
random.seed(2020)
np.random.seed(2020)

#@tf.function  (left disabled: the body converts tensors to numpy arrays)
def batch_train(model_1,model_2,gamma,batch_size,epsilon):
    """Run one Q-learning gradient step on a random minibatch of ``replay_batch``.

    Args:
        model_1: network being trained; gradients are applied to its weights
            with the module-level ``adam`` optimizer.
        model_2: network used to evaluate the next states.
        gamma: discount factor applied to the best next-state value.
        batch_size: number of stored transitions to sample.
        epsilon: dict with keys "epsilon", "epsilon_decay" and "epsilon_min";
            "epsilon" is decayed in place on every training step.

    Returns:
        None. When the replay memory holds fewer than ``batch_size``
        transitions the call returns immediately without touching epsilon or
        the weights (sampling would otherwise raise ValueError).
    """
    if len(replay_batch) < batch_size:
        return

    # decaying the exploration
    if epsilon['epsilon'] > epsilon['epsilon_min']:
        epsilon['epsilon'] = epsilon['epsilon'] * epsilon['epsilon_decay']

    batch = random.sample(replay_batch, batch_size)

    # Unpack (state, action, reward, next_state, done) tuples into arrays.
    states = np.zeros((batch_size, nn_input), dtype=np.float32)
    next_states = np.zeros((batch_size, nn_input), dtype=np.float32)
    actions = np.empty(batch_size, dtype=np.int64)
    rewards = np.empty(batch_size, dtype=np.float32)
    dones = np.empty(batch_size, dtype=bool)
    for row, (state, action, reward, next_state, done) in enumerate(batch):
        # Stored states carry a leading batch axis of 1; flatten to one row.
        states[row] = np.ravel(state)
        next_states[row] = np.ravel(next_state)
        actions[row] = action
        rewards[row] = reward
        dones[row] = done

    # Best next-state value according to model_2. Calling the model directly
    # avoids the per-call overhead and progress output of predict() for a
    # single small batch.
    next_q = np.array(model_2(next_states, training=False))
    max_q = next_q.max(axis=1)

    # Q-learning target. For a transition that ended the episode the target is
    # the observed reward alone: there is no next state to bootstrap from, and
    # bootstrapping anyway would teach the network its own guess instead of
    # the true outcome.
    targets = np.where(dones, rewards, rewards + gamma * max_q).astype(np.float32)

    with tf.GradientTape() as tape:
        # logits is the forward pass
        logits = model_1(states, training=True)

        # Start from the network's own predictions (a detached numpy copy) and
        # replace only the entry of the action actually taken, so the other
        # actions contribute zero error.
        q_target = np.array(logits)
        q_target[np.arange(batch_size), actions] = targets

        # calculating the loss
        loss_value = custom_loss(q_target, logits)

    # we retrieve the gradients
    grads = tape.gradient(loss_value, model_1.trainable_weights)

    # one step of gradient descent (minimises the loss)
    adam.apply_gradients(zip(grads, model_1.trainable_weights))

def policy(q_vals,turn):
    """Epsilon-greedy action choice from a batch of Q-values.

    With probability ``epsilon['epsilon']`` a random column index below
    ``action_space`` is returned. Otherwise the first row of ``q_vals`` is
    used: its argmax when ``turn == True``, its argmin in every other case.

    Args:
        q_vals: indexable whose first element holds the per-action values.
        turn: flag selecting between the maximising and minimising choice.

    Returns:
        The chosen action index.
    """
    explore = np.random.rand() <= epsilon['epsilon']
    if explore:
        return random.randrange(action_space)

    pick = np.argmax if turn == True else np.argmin
    return pick(q_vals[0])
def update_target_network():
    """Copy the current weights of ``model_1`` into ``model_2``."""
    online_weights = model_1.get_weights()
    model_2.set_weights(online_weights)
# this is the custom reward function
def get_reward(rew):
    """Map the environment's raw reward onto the reward used for training.

    Args:
        rew: raw reward value; may be ``None``.

    Returns:
        -5 when ``rew`` is ``None``, 1 when it equals 1, -1 when it equals -1,
        and 0 for every other value.
    """
    # Identity test: None is a singleton, so `is` is the correct comparison.
    if rew is None:
        return -5
    if rew == 1:
        return 1
    if rew == -1:
        return -1
    return 0

In [None]:
# Sanity check of the environment: reset, submit the action list [0, 1] seven
# times, then show the done flag, the last step result, the encoded board and
# the rendered board.
env.reset()

for step_idx in range(7):
    x = env.step([0,1])

print(env.done)
print(x)
final_board = x[0]['observation']['board']
print(encode(final_board))
env.render(mode="ipython", width=300, height=300)

In [None]:
# THE TRAINING LOOP
# Self-play: model_1 picks the moves for both seats. policy() takes the argmax
# of its Q-values when `turn` is True and the argmin otherwise.
global_steps = 0
# start with identical weights in both networks
update_target_network()

for i in tqdm(range(1000)):
    # Encode the freshly reset board and add a leading batch axis of size 1.
    observation = env.reset()[0]['observation']['board']
    observation = encode(observation)
    observation = np.expand_dims(observation, axis=0)
    done = False
    turn = True
    total_reward = 0
    while not done:        
        # remember the state the action is chosen in
        state_1 = observation
        # Q-values of the current state
        q_state = model_1(observation,training = False)
        
        action = policy(q_state,turn) # current action
        
        # env.step() takes one action per agent. The chosen move goes into the
        # slot of the player whose turn it is and 0 fills the other slot:
        #   first player:  [action, 0]
        #   second player: [0, action]
        # `turn` is flipped right after stepping.
        st = None
        if turn == True:
            st = env.step([int(action),0])
            turn = False
        else:
            st = env.step([0,int(action)])
            turn = True
        
        # Board and reward are both read from the first agent's entry st[0].
        next_state = st[0]['observation']['board']
        next_state = encode(next_state)
        next_state = np.expand_dims(next_state, axis=0)
        reward = get_reward(st[0]['reward'])
        done = env.done
        
        # running sum of rewards for this episode
        total_reward = total_reward + reward

        state_2 = next_state
        state_reward = reward
        # Original note: "only store for player 1".
        # NOTE(review): `turn` has already been flipped above, so it is True
        # here exactly when the move just played went through the else-branch
        # (the second action slot). Confirm which player's transitions are
        # meant to be stored.
        if turn == True:
            replay_batch.append((state_1,action,state_reward,state_2,done))
        
        observation = next_state
        
        # One gradient step per move once the episode index exceeds `warmup`.
        if i>warmup:
            batch_train(model_1,model_2,0.99,64,epsilon)
            global_steps = global_steps + 1
            
        # Sync model_2 with model_1 at the end of every episode.
        if done:
            update_target_network()
            
    print(total_reward)

In [None]:
# Directory the trained networks are written to. It is a raw string so the
# backslashes stay literal path separators; in a normal string "\D", "\R" and
# "\C" are invalid escape sequences that newer Python versions warn about.
MODEL_DIR = r"G:\Data Science\Reinforcement Learning\Connect X kaggle"
model_1.save(MODEL_DIR + "/model_1_1000ep_DQN_ONLY.h5")
model_2.save(MODEL_DIR + "/model_2_1000ep_DQN_ONLY.h5")

In [None]:
# JUST TESTING THE NETWORK
# Plays 50 self-play episodes with model_1 and prints the Q-values of every
# visited state. Nothing is stored or trained in this cell.
# NOTE(review): actions still come from policy(), which picks a random move
# with probability epsilon['epsilon'], so these games are not purely greedy.

for i in tqdm(range(50)):
    # Encode the freshly reset board and add a leading batch axis of size 1.
    observation = env.reset()[0]['observation']['board']
    observation = encode(observation)
    observation = np.expand_dims(observation, axis=0)
    done = False
    turn = True
    total_reward = 0
    while not done:        
        # current state (state_1 is not used further in this cell)
        state_1 = observation
        # Q-values of the current state
        q_state = model_1(observation,training = False)
        action = policy(q_state,turn) # current action
        print("q state")
        print(q_state)
        # env.step() takes one action per agent. The chosen move goes into the
        # slot of the player whose turn it is and 0 fills the other slot:
        #   first player:  [action, 0]
        #   second player: [0, action]
        st = None
        if turn == True:
            st = env.step([int(action),0])
            turn = False
        else:
            st = env.step([0,int(action)])
            turn = True
        
        # Board and reward are both read from the first agent's entry st[0].
        next_state = st[0]['observation']['board']
        next_state = encode(next_state)
        next_state = np.expand_dims(next_state, axis=0)
        reward = get_reward(st[0]['reward'])

        done = env.done
        
        # running sum of rewards for this episode
        total_reward = total_reward + reward

        # state_2 and state_reward are assigned but never read in this cell.
        state_2 = next_state
        state_reward = reward
        # advance to the next state
        observation = next_state
    # Show the final board and the episode's reward sum.
    env.render(mode="ipython", width=300, height=300)
    print(total_reward)
    print("episode ended")

In [None]:
# Start the environment's play mode with None in the first agent slot and the
# agent named "negamax" in the second.
# NOTE(review): presumably None marks the slot controlled interactively by the
# user -- confirm against the kaggle_environments documentation.
env.play([None, "negamax"], width=500, height=450)