In [1]:
import numpy as np
import tensorflow as tf
# keras model approach
from tensorflow.keras import Model,Sequential
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,BatchNormalization,Dense, Input
from tensorflow.keras.activations import relu
from tqdm import tqdm
from collections import deque
import random
import gym

import time

In [2]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.keras.backend.set_floatx('float32')

In [3]:
# making the connectx environment
from kaggle_environments import evaluate, make, utils

env = make("connectx", debug=True)
env.render(mode="ipython", width=500, height=450)

Failed: football: No module named 'gfootball'


In [4]:
tempx = np.arange(4)
ran = [2,3]
print(tempx[ran])

[2 3]


### Defining Nodes the custom nodes


In [53]:
import time
import math
import random

def randomPolicy(state):
    #env = make("connectx", debug=True)
        while not state.isTerminal():
            try:
                action = random.choice(state.getPossibleActions())
            except IndexError:
                raise Exception("Non-terminal state has no possible actions: " + str(state))
            state = state.takeAction(action)
        return state.getReward()

def learning_policy(state,epsilon):
    while not state.isTerminal():
        if epsilon['epsilon'] > epsilon['epsilon_min']:
            epsilon['epsilon'] =  epsilon['epsilon'] * epsilon['epsilon_decay']
        # This is an epsilon greedy policy and so will play random moves sometimes
        if np.random.rand() <= epsilon['epsilon']:
            try:
                action = random.choice(state.getPossibleActions())
            except IndexError:
                raise Exception("Non-terminal state has no possible actions: " + str(state))
        # if non random move has to be played
        else:
            try:
                position = state.position
                # converting into NN readable format
                position = encode(position)
                position = np.expand_dims(position, axis=0)

                q_vals = model_1(position,training = False)
                q_vals = np.array(q_vals)
                # these are all the valid moves and we have to take the max from all the valid moves only
                # different policy for different players
                action = None
                
                if state.mark == 1:  # policy for the first player
                    valid = []
                    # we use this to remove the numbers that would not be valid
                    for i in range(state.columns):
                        if state.position[i] != 0:
                            q_vals[0][i] = float('-inf')
        
                    best_action = np.argmax(q_vals)
                    action = (best_action,0)
                    # print("QVALS P1")
                    # print(q_vals)
                    # print("ACTION P1")
                    # print(action)
                    
                    # DEBUGGING

                    # print("WE HAVE ENTERED THE LOOOP")
                    # for i in range(6):
                    #     for j in range(7):
                    #         if (i*7+j)>42:
                    #             print("INDEX")
                    #             print(i*7+j)
                    #         print(state.position[i*7+j], end = " ")
                    #     print()
                
                else: # policy for the second player
                    valid = []
                    for i in range(state.columns):
                        if state.position[i] != 0:
                            q_vals[0][i] = float("inf")

                    best_action = np.argmin(q_vals)
                    #print("BEST ACTION TYPE")
                    #print(type(best_action))
                    action = (0,best_action)
                    # print("QVALS P2")
                    # print(q_vals)
                    # print("ACTION P2")
                    # print(action)

                    # DEBUGGING

                    # print("WE HAVE ENTERED THE LOOOP")
                    # for i in range(6):
                    #     for j in range(7):
                    #         if (i*7+j)>42:
                    #             print("INDEX")
                    #             print(i*7+j)
                    #         print(state.position[i*7+j], end = " ")
                    #     print()
            except Exception as e:
                raise Exception("Non-terminal state has no possible actions: " )
        state = state.takeAction(action)
    # DEBUGGING LOOP
    # for i in range(6):
    #     for j in range(7):
    #         if (i*7+j)>42:
    #             print("INDEX")
    #             print(i*7+j)
    #         print(state.position[i*7+j], end = " ")
    #     print()    
    return state.getReward(),epsilon

class treeNode():
    def __init__(self, state, parent):
        self.state = state
        self.isTerminal = state.isTerminal()
        self.isFullyExpanded = self.isTerminal
        self.parent = parent
        self.numVisits = 0
        self.totalReward = 0
        self.children = {}


class mcts():
    def __init__(self, timeLimit=None, iterationLimit=None, explorationConstant=1 / math.sqrt(2),
                 rolloutPolicy=learning_policy,epsilon = None):
        if timeLimit != None:
            if iterationLimit != None:
                raise ValueError("Cannot have both a time limit and an iteration limit")
            # time taken for each MCTS search in milliseconds
            self.timeLimit = timeLimit
            self.limitType = 'time'
        else:
            if iterationLimit == None:
                raise ValueError("Must have either a time limit or an iteration limit")
            # number of iterations of the search
            if iterationLimit < 1:
                raise ValueError("Iteration limit must be greater than one")
            self.searchLimit = iterationLimit
            self.limitType = 'iterations'
        self.explorationConstant = explorationConstant
        self.rollout = rolloutPolicy

    def search(self, initialState):
        self.root = treeNode(initialState, None)

        if self.limitType == 'time':
            timeLimit = time.time() + self.timeLimit / 1000
            count = 0
            while time.time() < timeLimit:
                count = count + 1
                self.executeRound()
            print("rollout done ",count)
        else:
            for i in range(self.searchLimit):
                self.executeRound()
        bestChild = self.getBestChild(self.root, 0)
        return self.getAction(self.root, bestChild)

    def executeRound(self):
        node = self.selectNode(self.root)
        reward,self.epsilon = self.rollout(node.state,epsilon)
        self.backpropogate(node, reward)


    def selectNode(self, node):
        while not node.isTerminal:
            if node.isFullyExpanded:
                node = self.getBestChild(node, self.explorationConstant)
            else:
                return self.expand(node)
        return node

    def expand(self, node):
        actions = node.state.getPossibleActions()
        for action in actions:
            if action not in node.children:
                newNode = treeNode(node.state.takeAction(action), node)
                node.children[action] = newNode
                if len(actions) == len(node.children):
                    node.isFullyExpanded = True
                return newNode

        raise Exception("Should never reach here")

    def backpropogate(self, node, reward):
        while node is not None:
            node.numVisits += 1
            node.totalReward += reward
            node = node.parent

    def getBestChild(self, node, explorationValue):
        bestValue = float("-inf")
        bestNodes = []
        for child in node.children.values():
            nodeValue = node.state.getCurrentPlayer() * child.totalReward / child.numVisits + explorationValue * math.sqrt(
                2 * math.log(node.numVisits) / child.numVisits)
            if nodeValue > bestValue:
                bestValue = nodeValue
                bestNodes = [child]
            elif nodeValue == bestValue:
                bestNodes.append(child)
        return random.choice(bestNodes)

    def getAction(self, root, bestChild):
        for action, node in root.children.items():
            if node is bestChild:
                return action

In [40]:
# env.state has the current environment state

# env.state[0]["status"] has the game status
# INVALID Means that the game has ended due to invalid move
# ACTIVE means player 1's move
# INACTIVE means player 2's move
# DONE means that the game is over and there is a winner
env.reset()
env.step([3,1])


'''env.step([0,1])
env.step([0,1])
env.step([0,1])
env.step([0,1])
env.step([0,1])
env.step([0,1])'''
print(env.state)
win = is_win(env.state[0]['observation']['board'],env.configuration)
print(win)
if win:
    print("yes")
else:
    print("is win is bugged")


env.render(mode="ipython", width=500, height=450)

[{'action': 3, 'reward': 0, 'info': {}, 'observation': {'board': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], 'mark': 1}, 'status': 'INACTIVE'}, {'action': 1, 'reward': 0, 'info': {}, 'observation': {'mark': 2}, 'status': 'ACTIVE'}]
False
is win is bugged


In [41]:
action_space = env.configuration['columns']
obs = env.reset()
# creating a non CNN model with directly feeding all the squares to the NN
# The input has *3 because we want each piece to have a different class,
# so basically 3 matrices will be formed 1 for empty squares, 1 matrice for 
# player 1's coin, 1 matrice for player 2's coin 
# all will be concatenated and then fed into the NN

nn_input = len(obs[0]['observation']['board'])*3
print(nn_input)
env.render(mode="ipython", width=500, height=450)

126


In [42]:
# setting the NN model and making the model function
adam = tf.keras.optimizers.Adam(learning_rate = 0.001)

def model_keras():
    
    inputs = Input(shape=(nn_input,))
    x = Dense(1000,activation='relu',kernel_initializer="glorot_uniform")(inputs)
    x = BatchNormalization()(x)
    x = Dense(500,activation='relu',kernel_initializer="glorot_uniform")(inputs)
    x = BatchNormalization()(x)
    output = Dense(action_space,activation='linear',kernel_initializer="glorot_uniform")(x)
    model = Model(inputs=inputs, outputs=output, name="RL_Value_Function")
    
    print(model.summary())
    
    model.compile(optimizer=adam,loss='mean_squared_error',metrics=['mean_squared_error'])
    
    return model
# converts observation into NN input type
def encode(board):
    empty = []
    player_1 = []
    player_2 = []
    
    for i in board:
        if i == 0:
            empty.append(1)
        else:
            empty.append(0)
        if i == 1:
            player_1.append(1)
        else:
            player_1.append(0)
        if i == 2:
            player_2.append(1)
        else:
            player_2.append(0)
            
    output = np.concatenate((np.array(empty),np.array(player_1),np.array(player_2)),axis=0)
    return output
model_1 = model_keras()
model_2 = model_keras()
def custom_loss(y_true,y_pred):
    return tf.keras.losses.categorical_crossentropy(y_true,y_pred)

Model: "RL_Value_Function"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 126)]             0         
_________________________________________________________________
dense_7 (Dense)              (None, 500)               63500     
_________________________________________________________________
batch_normalization_5 (Batch (None, 500)               2000      
_________________________________________________________________
dense_8 (Dense)              (None, 7)                 3507      
Total params: 69,007
Trainable params: 68,007
Non-trainable params: 1,000
_________________________________________________________________
None
Model: "RL_Value_Function"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 126)]             0         
_________

In [71]:
import random
random.seed(2020)

#@tf.function
def batch_train(model_1,model_2,gamma,batch_size,epsilon):
    
    #decaying the exploration
    if epsilon['epsilon'] > epsilon['epsilon_min']:
         epsilon['epsilon'] =  epsilon['epsilon'] * epsilon['epsilon_decay']
   
    batch = random.sample(replay_batch,batch_size)
    
    batch_reward = []
    batch_action = []
    batch_done = []
    
    batch_current_state = np.zeros((batch_size, nn_input))
    batch_next_state = np.zeros((batch_size, nn_input))

    for i in range(batch_size):
        batch_reward.append(batch[i][2])
        batch_action.append(batch[i][1])
        batch_current_state[i] = batch[i][0]
        batch_next_state[i] = batch[i][3]
        batch_done.append(batch[i][4])
    #lets calculate the next state value as the current value will be calculated in 
    # in gradient tape
    
    next_q = model_2.predict(batch_next_state)
    
    max_q = []
    for i in next_q:
        max_q.append(max(i))
    max_q = np.array(max_q,dtype = 'float32')
    
    target = batch_reward + gamma*max_q # this is the Q learning Target
    
    
    with tf.GradientTape() as tape:
        # logits is the forward pass
        logits = model_1(batch_current_state, training=True)
        
        q_target = np.array(logits)
        print("Q TARGET")
        print(q_target.shape)
        print(q_target)
        
        # VERY IMPORTANT NOTE, IF THE EPISODE ENDS THE DONE VALUE BECOMES TRUE
        # IT IS VERY IMPORTANT THAT THE NN UPDATES TOWARDS THIS TRUE VALUE RATHER THAN
        # ITS OWN THINKING VALUE (r + gamma*max(action)) THAT WE USE FOR ALL
        # NON TERMINAL REWARDS 
        # THIS MAKES OR BREAKS THE NETWORK VERY VERY IMPORTANT
        for i in range(batch_size):
            q_target[i][batch_action[i]] = target[i]
            if batch_done[i]:
                q_target[i][batch_action[i]] = batch_reward[i]

        # calculating the loss
        loss_value = custom_loss(q_target,logits)
    
    #we retrieve the gradients
    grads = tape.gradient(loss_value, model_1.trainable_weights)
    
    #THIS IS ONE STEP OF GRAD DESCENT (Minimizes the loss)
    adam.apply_gradients(zip(grads, model_1.trainable_weights))

def policy(q_vals,turn):
    # lets implement a policy which decays
    if np.random.rand() <= epsilon['epsilon']:  
        return random.randrange(action_space)
    elif turn == True:
        action = np.argmax(q_vals[0])
        return action
    else:
        action = np.argmin(q_vals[0])
        return action
def update_target_network():
    model_2.set_weights(model_1.get_weights())
# this is the custom reward function
def get_reward(rew):
    reward = None
    if rew == 1:
        reward = 1
    elif rew == -1:
        reward = -1
    elif rew == None:
        reward = -5
    else:
        reward = 0
    return reward

    

In [44]:
def play(board, column, mark, config):
    """ Plays a move. Taken from the Kaggle environment. """
    columns = config.columns
    rows = config.rows
    row = max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
    board[column + (row * columns)] = mark

def is_win(board, config):
    has_played=True
    columns = config.columns
    rows = config.rows
    inarow = config.inarow - 1
    output = False
    for column in range(columns):
        for mark in [1,2]:
            try:
                row = (
                    min([r for r in range(rows) if board[column + (r * columns)] == mark])
                    if has_played
                    else max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
                )

                def count(offset_row, offset_column):
                    for i in range(1, inarow + 1):
                        r = row + offset_row * i
                        c = column + offset_column * i
                        if (
                            r < 0
                            or r >= rows
                            or c < 0
                            or c >= columns
                            or board[c + (r * columns)] != mark
                        ):
                            return i - 1
                    return inarow

                output = (
                    count(1, 0) >= inarow  # vertical.
                    or (count(0, 1) + count(0, -1)) >= inarow  # horizontal.
                    or (count(-1, -1) + count(1, 1)) >= inarow  # top left diagonal.
                    or (count(-1, 1) + count(1, -1)) >= inarow  # top right diagonal.
                )
                if output == True:
                    return output
            except:
                pass
    return False
def find_winner(board, config):
    has_played=True
    columns = config.columns
    rows = config.rows
    inarow = config.inarow - 1
    output = False
    for column in range(columns):
        for mark in [1,2]:
            try:
                row = (
                    min([r for r in range(rows) if board[column + (r * columns)] == mark])
                    if has_played
                    else max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
                )

                def count(offset_row, offset_column):
                    for i in range(1, inarow + 1):
                        r = row + offset_row * i
                        c = column + offset_column * i
                        if (
                            r < 0
                            or r >= rows
                            or c < 0
                            or c >= columns
                            or board[c + (r * columns)] != mark
                        ):
                            return i - 1
                    return inarow

                output = (
                    count(1, 0) >= inarow  # vertical.
                    or (count(0, 1) + count(0, -1)) >= inarow  # horizontal.
                    or (count(-1, -1) + count(1, 1)) >= inarow  # top left diagonal.
                    or (count(-1, 1) + count(1, -1)) >= inarow  # top right diagonal.
                )
                if output == True:
                    return mark
            except:
                pass
    return False

def opponent_mark(mark):
    """ The mark indicates which player is active - player 1 or player 2. """
    return 3 - mark    
def is_tie(board):
        """ Checks if a tie occured. """
        return not(any(mark == EMPTY for mark in board))
class State():
    def __init__(self,pos,playa,cols,mark,config):
        self.position = pos
        self.config = config
        #self.curr_env = env
        #self.move = move # this is the move which when played created the current state of the position
        self.columns = cols
        self.mark = mark
        self.player = playa # this identifies for which player to solve the game
        # 1 means solve for player 1
        # 2 means solve for player 2
        # the reward function changes for this
    
    def getCurrentPlayer(self):
        return self.mark
    
    def takeAction(self,action):
        
        temp_pos = self.position.copy()
        if action[0] == 0:
            play(temp_pos,action[1],self.mark,self.config)
        else:
            play(temp_pos,action[0],self.mark,self.config)
#        output = None
#        if self.mark == 1:
        output = State(temp_pos,self.player,self.columns,opponent_mark(self.mark),self.config)
        # for player 2
#         if self.mark == 2:
#             output = State(temp_pos,self.player,self.columns,opponent_mark(self.mark),self.config)
        return output

        
    def getPossibleActions(self):
        
        valid = []
        for i in range(self.columns):
            if self.position[i] == 0:
                if self.mark == 1:
                    valid.append((i,0))
                else:
                    valid.append((0,i))
        return valid
    
    def isTerminal(self):
        if is_win(self.position,self.config) or is_tie(self.position):
            return True
        else:
            return False
    def getReward(self):
#         '''
#         reward will change for when player 1 will play and when player 2 will play
#         The MCTS TREE For the first player will have reward 1 when player 1 wins and 0 when player 2 wins
        
#         The MCTS Tree for the second player will have reward 1 when player 2 wins and 0 when player 1 wins
        
#         #hence it should be mentioned in the start of the tree, for which player is the tree solving
    
        
#         '''
        reward = 0
        if is_win(self.position, self.config):
            if self.mark == 2 and self.player == 1:
                return 1
            elif self.mark == 1 and self.player == 1:
                return -1
            if self.mark == 1 and self.player == 2:
                return 1
            elif self.mark == 2 and self.player == 2:
                return -1
        elif is_tie(self.position):
            return 0.5
        else:
            return 0
'''class Action():
    def __init__(self, player, act):
        self.player = player
        self.x = act[0]
        self.y = act[1]
        
    def __str__(self):
        return str((self.x, self.y))

    def __repr__(self):
        return str(self)

    def __eq__(self, other):
        return self.__class__ == other.__class__ and self.x == other.x and self.y == other.y and self.player == other.player

    def __hash__(self):
        return hash((self.x, self.y, self.player))'''

'class Action():\n    def __init__(self, player, act):\n        self.player = player\n        self.x = act[0]\n        self.y = act[1]\n        \n    def __str__(self):\n        return str((self.x, self.y))\n\n    def __repr__(self):\n        return str(self)\n\n    def __eq__(self, other):\n        return self.__class__ == other.__class__ and self.x == other.x and self.y == other.y and self.player == other.player\n\n    def __hash__(self):\n        return hash((self.x, self.y, self.player))'

In [73]:
# lets build some memory into the model to perform decorrelated batch updates
# this is TD learning

# so apparantly the NN has to be adjusted only according to the action taken by it
# for example if action 2 is taken then only the weights for action 2 should be changed
# for this reason we should only update the q_2 vector with regarding the chosen action
# rewards will also be added to that action only
# q_2 is what we thought the value of the state will be after doing action
# we also add the reward and make this the target for the NN

replay_batch = deque(maxlen = 3000)

warmup = 10 #will start training after these many episodes have passed

# to balance exploration
epsilon = {
"epsilon" : 1.0,
"epsilon_decay": 0.99999,
"epsilon_min":0.01,
}
# for the second player
epsilon2 = {
"epsilon" : 1.0,
"epsilon_decay": 0.99999,
"epsilon_min":0.01,
}

In [74]:
print("length of replay batch",len(replay_batch))
print(replay_batch[33])

length of replay batch 0


IndexError: deque index out of range

### LOOP TO PLAY AGAINST SELF

In [76]:
EMPTY = 0
for i in tqdm(range(11)):
    env.reset()
    done = False
    while done == False:
        
        if env.state[0]['status'] == "ACTIVE":#first players move
            # adding the current state of the board into the memory
            state_1 = env.state[0]['observation']['board']
            print("STATE")
            print(state_1)
            state_1_encoded = encode(state_1)
            state_1_encoded = np.expand_dims(state_1_encoded, axis=0)
            # this is the current q values
            q_state = model_1(state_1_encoded,training = False)
            print("Q STATE",q_state)
            # feeding the relavant information about the state
            initialState = State(env.state[0]['observation']['board'],1,env.configuration['columns'],1,env.configuration)
            # creating a new tree
            tree = mcts(timeLimit=1000,epsilon = epsilon)
            # searching the tree and getting the action
            action = tree.search(initialState=initialState)
            print(action)
            env.step(action)
            
            # adding to memory deque
            # also I add only player 1s moves as the better the algorithm
            # gets at identifying best moves for player 1, it will
            # automatically learn to identify whats not good for player 1
            # this is the next state of the board
            next_state = env.state[0]['observation']['board']
            next_state = encode(next_state)
            next_state = np.expand_dims(next_state, axis=0)
            reward = get_reward(env.state[0]['reward'])
            done = env.done
            
            replay_batch.append((state_1_encoded,action[0],reward,next_state,done)) # taking only the first players action
            print("replay batch reached")
        else: # second players move
            initialState = State(env.state[0]['observation']['board'],2,env.configuration['columns'],2,env.configuration)
            tree = mcts(timeLimit=1000,epsilon = epsilon2)
            action = tree.search(initialState=initialState)
            print(action)
            env.step(action)
        if env.state[0]['status'] == "INVALID" or env.state[0]['status'] == "DONE":
            done = True
            print("REWARD ",env.state[0]['reward'])
        if i>warmup:
            batch_train(model_1,model_2,0.99,32,epsilon)
        # update the target network after 5 episodes
        if done and i%5 == 0:
            update_target_network()
    env.render(mode="ipython", width=300, height=300)











  0%|          | 0/11 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  72
(2, 0)
replay batch reached
rollout done  71
(0, 4)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0]
Q STATE tf.Tensor(
[[ 0.01495143  0.11691239 -0.14166883 -0.11526838  0.05130466  0.08030687
  -0.09339388]], shape=(1, 7), dtype=float32)
rollout done  70
(5, 0)
replay batch reached
rollout done  78
(0, 2)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 0]
Q STATE tf.Tensor(
[[ 0.01001045  0.3842517  -0.3415078  -0.17029986 -0.04458886  0.21498534
  -0.15135044]











  9%|▉         | 1/11 [00:21<03:32, 21.23s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  49
(0, 0)
replay batch reached
rollout done  56
(0, 1)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.0199259   0.22219583 -0.04424613  0.16446596  0.0267263   0.02854325
  -0.3989215 ]], shape=(1, 7), dtype=float32)
rollout done  50
(2, 0)
replay batch reached
rollout done  51
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.08282551  0.2869066   0.02694859  0.05317976  0.21949834  0.05580776
  -0.2











 18%|█▊        | 2/11 [00:32<02:43, 18.21s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  55
(5, 0)
replay batch reached
rollout done  60
(0, 6)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2]
Q STATE tf.Tensor(
[[ 0.03234153  0.1906206  -0.20147601  0.20518164 -0.02839212  0.1223596
  -0.26436174]], shape=(1, 7), dtype=float32)
rollout done  62
(3, 0)
replay batch reached
rollout done  63
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2]
Q STATE tf.Tensor(
[[-0.07989972  0.16907299 -0.33973902  0.29681787  0.00812722  0.12144408
  -0.15











 27%|██▋       | 3/11 [00:39<01:58, 14.87s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  63
(3, 0)
replay batch reached
rollout done  76
(0, 1)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.00907733  0.11872948  0.06741091  0.2804301   0.10668278  0.19014633
  -0.29684094]], shape=(1, 7), dtype=float32)
rollout done  83
(3, 0)
replay batch reached
rollout done  52
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.11090369  0.15447557 -0.09269878  0.36644414  0.09697799  0.08626826
  -0.3











 36%|███▋      | 4/11 [00:53<01:42, 14.67s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  56
(0, 0)
replay batch reached
rollout done  59
(0, 6)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2]
Q STATE tf.Tensor(
[[-0.03928614  0.09052043 -0.11948735  0.23155318 -0.02713599 -0.03502783
  -0.39898175]], shape=(1, 7), dtype=float32)
rollout done  70
(6, 0)
replay batch reached
rollout done  71
(0, 6)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2]
Q STATE tf.Tensor(
[[ 0.01603876 -0.01897942 -0.40053195  0.19693516  0.00590134 -0.17604053
  -0.3











 45%|████▌     | 5/11 [01:10<01:32, 15.42s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  56
(3, 0)
replay batch reached
rollout done  58
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.17115456  0.13014089 -0.12472722  0.2871839   0.07524746  0.12415605
  -0.22655255]], shape=(1, 7), dtype=float32)
rollout done  56
(2, 0)
replay batch reached
rollout done  68
(0, 6)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2]
Q STATE tf.Tensor(
[[-0.17655699  0.18464684 -0.26858497  0.21101925  0.0695736   0.00633394
  -0.1











 55%|█████▍    | 6/11 [01:22<01:12, 14.44s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  55
(3, 0)
replay batch reached
rollout done  56
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.17115456  0.13014089 -0.12472722  0.2871839   0.07524746  0.12415605
  -0.22655255]], shape=(1, 7), dtype=float32)
rollout done  55
(6, 0)
replay batch reached
rollout done  46
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]
Q STATE tf.Tensor(
[[-0.17935759  0.05840684 -0.18446161  0.28779963 -0.08929516  0.05305733
  -0.1











 64%|██████▎   | 7/11 [01:32<00:51, 12.84s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  54
(2, 0)
replay batch reached
rollout done  58
(0, 2)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.13578945  0.27505338 -0.33443183 -0.03344307 -0.02907015  0.1115097
  -0.36448538]], shape=(1, 7), dtype=float32)
rollout done  60
(1, 0)
replay batch reached
rollout done  66
(0, 4)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 0]
Q STATE tf.Tensor(
[[ 0.05052113  0.3575942  -0.26161364 -0.10242753 -0.01102938  0.23953465
  -0.27











 73%|███████▎  | 8/11 [01:52<00:45, 15.07s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  53
(3, 0)
replay batch reached
rollout done  55
(0, 1)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.00907733  0.11872948  0.06741091  0.2804301   0.10668278  0.19014633
  -0.29684094]], shape=(1, 7), dtype=float32)
rollout done  66
(3, 0)
replay batch reached
rollout done  46
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0]
Q STATE tf.Tensor(
[[-0.11090369  0.15447557 -0.09269878  0.36644414  0.09697799  0.08626826
  -0.3











 82%|████████▏ | 9/11 [02:07<00:30, 15.10s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  65
(6, 0)
replay batch reached
rollout done  61
(0, 0)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1]
Q STATE tf.Tensor(
[[ 0.03097456  0.02802237  0.06374481  0.15973349 -0.08202696  0.09041633
  -0.22093096]], shape=(1, 7), dtype=float32)
rollout done  60
(0, 0)
replay batch reached
rollout done  62
(0, 6)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1]
Q STATE tf.Tensor(
[[ 0.19104294  0.07628708 -0.08641186  0.27556467 -0.18962395  0.1580914
  -0.26











 91%|█████████ | 10/11 [02:31<00:17, 17.83s/it][A[A[A[A[A[A[A[A[A[ASTATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Q STATE tf.Tensor(
[[ 0.00887358  0.07665836 -0.09803591  0.16125256  0.01515654  0.13212682
  -0.29089096]], shape=(1, 7), dtype=float32)
rollout done  68
(6, 0)
replay batch reached
rollout done  58
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1]
Q STATE tf.Tensor(
[[-0.05822991 -0.13245799 -0.00479392  0.21817552  0.04287502  0.10965221
  -0.31738186]], shape=(1, 7), dtype=float32)
rollout done  59
(3, 0)
replay batch reached
rollout done  49
(0, 3)
STATE
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1]
Q STATE tf.Tensor(
[[-0.18195823 -0.07209609 -0.12500852  0.33752826  0.0782667  -0.08135983
  -0.











100%|██████████| 11/11 [02:40<00:00, 14.62s/it]


In [None]:
env.state[0]['observation']

## To manually play against the agent

In [10]:
# play against the agent

EMPTY = 0
for i in tqdm(range(1)):
    env.reset()
    done = False
    while done == False:
        
        if env.state[0]['status'] == "ACTIVE":#first players move
            initialState = State(env.state[0]['observation']['board'],1,env.configuration['columns'],1,env.configuration)
            tree = mcts(timeLimit=3000)
            action = tree.search(initialState=initialState)
            print(action)
            env.step(action)
            env.render(mode="ipython", width=300, height=300)
        else:
            print("Make your move : ")
            move = input()
            env.step([0,int(move)])
            env.render(mode="ipython", width=300, height=300)
                
        if env.state[0]['status'] == "INVALID" or env.state[0]['status'] == "DONE":
            done = True
            print("REWARD ",env.state[0]['reward'])
    env.render(mode="ipython", width=300, height=300)



  0%|          | 0/1 [00:00<?, ?it/s][A[Arollout done  19584
(3, 0)


Make your move : 
  0%|          | 0/1 [00:03<?, ?it/s]


ValueError: invalid literal for int() with base 10: ''

In [None]:
## NEGAMAX ALGORITHM FROM KAGGLE
from random import choice

def play_negamax(board, column, mark, config):
    columns = config.columns
    rows = config.rows
    row = max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
    board[column + (row * columns)] = mark


def is_win_negamax(board, column, mark, config, has_played=True):
    columns = config.columns
    rows = config.rows
    inarow = config.inarow - 1
    row = (
        min([r for r in range(rows) if board[column + (r * columns)] == mark])
        if has_played
        else max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
    )

def negamax_agent(obs, config):
    columns = config.columns
    rows = config.rows
    size = rows * columns

    # Due to compute/time constraints the tree depth must be limited.
    max_depth = 4

    def negamax(board, mark, depth):
        moves = sum(1 if cell != EMPTY else 0 for cell in board)

        # Tie Game
        if moves == size:
            return (0, None)

        # Can win next.
        for column in range(columns):
            if board[column] == EMPTY and is_win_negamax(board, column, mark, config, False):
                return ((size + 1 - moves) / 2, column)

        # Recursively check all columns.
        best_score = -size
        best_column = None
        for column in range(columns):
            if board[column] == EMPTY:
                # Max depth reached. Score based on cell proximity for a clustering effect.
                if depth <= 0:
                    row = max(
                        [
                            r
                            for r in range(rows)
                            if board[column + (r * columns)] == EMPTY
                        ]
                    )
                    score = (size + 1 - moves) / 2
                    if column > 0 and board[row * columns + column - 1] == mark:
                        score += 1
                    if (
                        column < columns - 1
                        and board[row * columns + column + 1] == mark
                    ):
                        score += 1
                    if row > 0 and board[(row - 1) * columns + column] == mark:
                        score += 1
                    if row < rows - 2 and board[(row + 1) * columns + column] == mark:
                        score += 1
                else:
                    next_board = board[:]
                    play_negamax(next_board, column, mark, config)
                    (score, _) = negamax(next_board,
                                         1 if mark == 2 else 2, depth - 1)
                    score = score * -1
                if score > best_score or (score == best_score and choice([True, False])):
                    best_score = score
                    best_column = column

        return (best_score, best_column)

    _, column = negamax(obs.board[:], obs.mark, max_depth)
    if column == None:
        column = choice([c for c in range(columns) if obs.board[c] == EMPTY])
    return column

In [26]:
# play against NEGAMAX

EMPTY = 0
for i in tqdm(range(1)):
    env.reset()
    done = False
    while done == False:
        if env.state[0]['status'] == "ACTIVE":#first players move
            initialState = State(env.state[0]['observation']['board'],1,env.configuration['columns'],1,env.configuration)
            tree = mcts(timeLimit=100)
            action = tree.search(initialState=initialState)
            print(action)
            env.step(action)
        else:
            action = negamax_agent(env.state[0]['observation'],env.configuration)
            env.step([0,action])
            
        if env.state[0]['status'] == "INVALID" or env.state[0]['status'] == "DONE":
            done = True
            print("REWARD ",env.state[0]['reward'])
    env.render(mode="ipython", width=300, height=300)

  0%|          | 0/1 [00:00<?, ?it/s]


NameError: name 'State' is not defined

In [None]:
env.play([None,"negamax"])

## TO PLAY AGAINS MY ALGO JUST RUN THE CELL BELOW

In [36]:
from kaggle_environments import evaluate, make, utils
import numpy as np
import tensorflow as tf
# keras model approach
from tensorflow.keras import Model,Sequential
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Flatten,BatchNormalization,Dense, Input
from tensorflow.keras.activations import relu
from tqdm import tqdm
from collections import deque
import random
import gym

import time
EMPTY  = 0
def final_function_MCTS(obs,config):
    import time
    import math
    import random

    def randomPolicy(state):
        #env = make("connectx", debug=True)
        while not state.isTerminal():
            try:
                action = random.choice(state.getPossibleActions())
            except IndexError:
                raise Exception("Non-terminal state has no possible actions: " + str(state))
            state = state.takeAction(action)
        return state.getReward()


    class treeNode():
        def __init__(self, state, parent):
            self.state = state
            self.isTerminal = state.isTerminal()
            self.isFullyExpanded = self.isTerminal
            self.parent = parent
            self.numVisits = 0
            self.totalReward = 0
            self.children = {}


    class mcts():
        def __init__(self, timeLimit=None, iterationLimit=None, explorationConstant=1 / math.sqrt(2),
                     rolloutPolicy=randomPolicy):
            if timeLimit != None:
                if iterationLimit != None:
                    raise ValueError("Cannot have both a time limit and an iteration limit")
                # time taken for each MCTS search in milliseconds
                self.timeLimit = timeLimit
                self.limitType = 'time'
            else:
                if iterationLimit == None:
                    raise ValueError("Must have either a time limit or an iteration limit")
                # number of iterations of the search
                if iterationLimit < 1:
                    raise ValueError("Iteration limit must be greater than one")
                self.searchLimit = iterationLimit
                self.limitType = 'iterations'
            self.explorationConstant = explorationConstant
            self.rollout = rolloutPolicy

        def search(self, initialState):
            self.root = treeNode(initialState, None)

            if self.limitType == 'time':
                timeLimit = time.time() + self.timeLimit / 1000

                while time.time() < timeLimit:

                    self.executeRound()
  
            else:
                for i in range(self.searchLimit):
                    self.executeRound()

            bestChild = self.getBestChild(self.root, 0)
            return self.getAction(self.root, bestChild)

        def executeRound(self):
            node = self.selectNode(self.root)
            reward = self.rollout(node.state)
            self.backpropogate(node, reward)

        def selectNode(self, node):
            while not node.isTerminal:
                if node.isFullyExpanded:
                    node = self.getBestChild(node, self.explorationConstant)
                else:
                    return self.expand(node)
            return node

        def expand(self, node):
            actions = node.state.getPossibleActions()
            for action in actions:
                if action not in node.children:
                    newNode = treeNode(node.state.takeAction(action), node)
                    node.children[action] = newNode
                    if len(actions) == len(node.children):
                        node.isFullyExpanded = True
                    return newNode

            raise Exception("Should never reach here")

        def backpropogate(self, node, reward):
            while node is not None:
                node.numVisits += 1
                node.totalReward += reward
                node = node.parent

        def getBestChild(self, node, explorationValue):
            bestValue = float("-inf")
            bestNodes = []
            for child in node.children.values():
                nodeValue = node.state.getCurrentPlayer() * child.totalReward / child.numVisits + explorationValue * math.sqrt(
                    2 * math.log(node.numVisits) / child.numVisits)
                if nodeValue > bestValue:
                    bestValue = nodeValue
                    bestNodes = [child]
                elif nodeValue == bestValue:
                    bestNodes.append(child)
            return random.choice(bestNodes)

        def getAction(self, root, bestChild):
            for action, node in root.children.items():
                if node is bestChild:
                    return action
    def play(board, column, mark, config):

        columns = config.columns
        rows = config.rows
        row = max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
        board[column + (row * columns)] = mark

    def is_win(board, config):
        has_played=True
        columns = config.columns
        rows = config.rows
        inarow = config.inarow - 1
        output = False
        for column in range(columns):
            for mark in [1,2]:
                try:
                    row = (
                        min([r for r in range(rows) if board[column + (r * columns)] == mark])
                        if has_played
                        else max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
                    )

                    def count(offset_row, offset_column):
                        for i in range(1, inarow + 1):
                            r = row + offset_row * i
                            c = column + offset_column * i
                            if (
                                r < 0
                                or r >= rows
                                or c < 0
                                or c >= columns
                                or board[c + (r * columns)] != mark
                            ):
                                return i - 1
                        return inarow

                    output = (
                        count(1, 0) >= inarow  # vertical.
                        or (count(0, 1) + count(0, -1)) >= inarow  # horizontal.
                        or (count(-1, -1) + count(1, 1)) >= inarow  # top left diagonal.
                        or (count(-1, 1) + count(1, -1)) >= inarow  # top right diagonal.
                    )
                    if output == True:
                        return output
                except:
                    pass
        return False
    def find_winner(board, config):
        has_played=True
        columns = config.columns
        rows = config.rows
        inarow = config.inarow - 1
        output = False
        for column in range(columns):
            for mark in [1,2]:
                try:
                    row = (
                        min([r for r in range(rows) if board[column + (r * columns)] == mark])
                        if has_played
                        else max([r for r in range(rows) if board[column + (r * columns)] == EMPTY])
                    )

                    def count(offset_row, offset_column):
                        for i in range(1, inarow + 1):
                            r = row + offset_row * i
                            c = column + offset_column * i
                            if (
                                r < 0
                                or r >= rows
                                or c < 0
                                or c >= columns
                                or board[c + (r * columns)] != mark
                            ):
                                return i - 1
                        return inarow

                    output = (
                        count(1, 0) >= inarow  # vertical.
                        or (count(0, 1) + count(0, -1)) >= inarow  # horizontal.
                        or (count(-1, -1) + count(1, 1)) >= inarow  # top left diagonal.
                        or (count(-1, 1) + count(1, -1)) >= inarow  # top right diagonal.
                    )
                    if output == True:
                        return mark
                except:
                    pass
        return False

    def opponent_mark(mark):
        """ The mark indicates which player is active - player 1 or player 2. """
        return 3 - mark    
    def is_tie(board):
            """ Checks if a tie occured. """
            return not(any(mark == EMPTY for mark in board))
    class State():
        def __init__(self,pos,playa,cols,mark,config):
            self.position = pos
            self.config = config
            #self.curr_env = env
            #self.move = move # this is the move which when played created the current state of the position
            self.columns = cols
            self.mark = mark
            self.player = playa # this identifies for which player to solve the game
            # 1 means solve for player 1
            # 2 means solve for player 2
            # the reward function changes for this

        def getCurrentPlayer(self):
            return self.mark

        def takeAction(self,action):

            temp_pos = self.position.copy()
            if action[0] == 0:
                play(temp_pos,action[1],self.mark,self.config)
            else:
                play(temp_pos,action[0],self.mark,self.config)
    #        output = None
    #        if self.mark == 1:
            output = State(temp_pos,self.player,self.columns,opponent_mark(self.mark),self.config)
            # for player 2
    #         if self.mark == 2:
    #             output = State(temp_pos,self.player,self.columns,opponent_mark(self.mark),self.config)
            return output


        def getPossibleActions(self):

            valid = []
            for i in range(self.columns):
                if self.position[i] == 0:
                    if self.mark == 1:
                        valid.append((i,0))
                    else:
                        valid.append((0,i))
            return valid

        def isTerminal(self):
            if is_win(self.position,self.config) or is_tie(self.position):
                return True
            else:
                return False
        def getReward(self):
    #         '''
    #         reward will change for when player 1 will play and when player 2 will play
    #         The MCTS TREE For the first player will have reward 1 when player 1 wins and 0 when player 2 wins

    #         The MCTS Tree for the second player will have reward 1 when player 2 wins and 0 when player 1 wins

    #         #hence it should be mentioned in the start of the tree, for which player is the tree solving


    #         '''
            reward = 0
            if is_win(self.position, self.config):
                if self.mark == 2 and self.player == 1:
                    return 1
                elif self.mark == 1 and self.player == 1:
                    return -1
                if self.mark == 1 and self.player == 2:
                    return 1
                elif self.mark == 2 and self.player == 2:
                    return -1
            elif is_tie(self.position):
                return 0.5
            else:
                return 0
    player = obs.mark
    initialState = State(obs['board'],player,config['columns'],obs.mark,config)
    tree = mcts(timeLimit=(config.timeout - 1) * 1000)
    action = tree.search(initialState=initialState)
    return int(max(action))

In [22]:
from kaggle_environments import evaluate, make, utils

env = make("connectx", debug=True)
env.render(mode="ipython", width=500, height=450)

In [23]:
#evaluate("connectx", [final_function_MCTS, "negamax"], num_episodes=5)

In [24]:
#evaluate("connectx", [final_function_MCTS, "random"], num_episodes=5)

In [25]:
env.play([final_function_MCTS, None], width=500, height=450)

In [30]:
env.play([final_function_MCTS,None], width=500, height=450)