In [72]:
import numpy as np
import random
import torch
from torch import nn

In [212]:
## Game Engine

class Mancala:
    def __init__(self, disp = False, seed = 0):
        import random
        # set seed 
        random.seed(seed)
        
        # Initialize Mancala game board
        self.board = [[4]*6 for i in range(2)]
        
        # Choose which player goes first (probably change to random initialization)
        self.turn = random.randint(0, 1)    # 0 for TOP player
                                            # 1 for BOTTOM player
        
        # Game End indicator
        self.game_end = False
        
        # Should prints be displayed?
        self.disp = disp
        
    
    ## def output
    def __str__(self):
        return str(self.board[0]) + '\n' + str(self.board[1]) +  'End = ' +  str(self.game_end)
        
    def seed(self, seed=0):
        random.seed(seed)
        
    def valid_action(self, action, turn):
        return self.board[turn][abs(5*(1 - turn) - action%6)] != 0
    
    ## act on the game
    def action(self, action):
        # Action meanings:
        # 0-5:  distribute beans from bowl 0-5 WITHOUT placing one in the Score
        # 6-11: distribute beans from bowl 0-5 AND place one in the Score
        # 12: End game
        
        if action == 12:
            self.game_end = True
            return 0
        
        # interpret input
        bowl = action % 6      # take beans from bowl
        toScore = action // 6  # 0 -> skip Score, 1 -> do not skip Score
        
        # position
        x = bowl if self.turn else 5 - bowl
        y = self.turn
        
        # Init reward
        reward = 0
        
        # How many beans are in chosen bowl
        beans = self.board[y][x]
        
        # Who will be next?
        next_turn = 1 - self.turn
        
        # If chosen bowl is empty
        if beans == 0:
            if self.disp: print("Error! You chose an empty bowl! -10 Points Penalty")
            reward -= 2
            next_turn = 1 - next_turn
            
        # Take beans out of bowl
        self.board[y][x] = 0
        
        # Beans are distributed counterclock-wise
        direction = 2*self.turn - 1   # +1 -> right, -1 -> left
        
        # While there are beans left, distribute them counterclock-wise
        while beans != 0:
            
            # If beans < 0 then there has been some error
            if beans < 0:
                if self.disp: print("Error nr of beans negative!")
                
            # move to next bowl
            x += direction  
            
            # if end is reached check if toScore is true and continue on other side
            if x < 0 or x > 5:
                # if toScore is true, put one bean in score (only if it is the correct Score)
                if toScore and y == self.turn:
                    reward += 1
                    beans -= 1
                    
                    # if this was the last bean the same player will play again
                    if beans == 0:
                        next_turn = self.turn
                        break
                # change row and direction
                y = 1 - y
                direction = -direction
                x += direction
            
            # if this was the last bean, check opposite bowl
            if beans == 1 and self.board[y][x] == 0 and y == self.turn:
                reward += 1 + self.board[1 - y][x]
                self.board[1 - y][x] = 0
            else:
                self.board[y][x] += 1
            beans -= 1
            
        # set next player
        self.turn = next_turn
        
        return reward
       
    def end_check(self):
        if self.game_end or self.board[self.turn] == [0]*6:
            if self.disp: print(f'No more moves for {"TOP" if self.turn else "BOTTOM"}!')
            self.__str__()
            self.game_end = True
            return True
        else:
            return False
    
    def final_reward(self):
        if not self.game_end:
            if self.disp: print("The game is not over?!")
            return 0
        
        if self.disp: print(f'No more moves for {"TOP" if self.turn else "BOTTOM"} Player! \nGame End\n ==============================')
        
        return sum(self.board[1 - self.turn])
        
            
    def copy(self):
        cop = Mancala()
        cop.board = np.array(self.board)
        cop.game_end = bool(self.game_end)
        cop.turn = int(self.turn)
        return cop
        
    def reset(self, disp = False):
        # Initialize Mancala game board
        self.board = [[4]*6 for i in range(2)]
        
        # Choose which player goes first (probably change to random initialization)
        self.turn = random.randint(0, 1)    # 0 for TOP player
                                         # 1 for BOTTOM player
            
        # Game End indicator
        self.game_end = False
        
        # Should prints be displayed?
        self.disp = disp
        
    def flip(self):
        self.board = [self.board[1][::-1], self.board[0][::-1]]

In [213]:
## For Deep-Q-Learning we need a replay memory
# This memory is for the Top player!
class Memory:
    def __init__(self, maxlen = 1e5):
        import random
        self.size = 0
        self.memory = []
        print("Memory Initialized")
        self.maxlen = maxlen
        self.current_pos = 0
        
    
    def __getitem__(self, idx=-1):
        if idx < 0 or idx >= self.size:
            print(f'Index {idx} is too large for memory of length {self.size}. \nInstead return random entry')
            return self.memory[random.randint(0, self.size - 1)]
        else:
            return self.memory[idx]
    
    def draw(self):
        if self.size: # self.size should not be 0
            return self.__getitem__(random.randint(0, self.size - 1))
        else:
            print(f'Memory {turn} is not yet filled')
            return []
        
    def draw_batch(self, batch_size):
        batch_size = min(batch_size, self.size)
        return [self.draw() for i in range(batch_size)]
    
    def add(self, previousState, action, reward, state, game_end):
        if self.size < self.maxlen:
            # relevant quantities: state0, action, reward, state1, game_end
            self.memory.append([previousState, 
                                action, 
                                reward, 
                                state,
                                game_end])
            self.size += 1
            self.current_pos += 1
        else:
            self.current_pos = self.current_pos % self.maxlen
            self.memory.append([previousState, 
                                action, 
                                reward, 
                                state,
                                game_end])
            self.current_pos += 1
        
        
    def add_random_game(self):
        # Initialize game
        game = Mancala(False)  
        
        # Start game Loop
        while not game.game_end:
            
            # state t
            game0 = game.copy()
            
            # random action
            ac = random.randint(0, 11)
            
            while game.board[game.turn][abs(5*(1 - game.turn) - ac%6)] == 0:
                #print(game.turn, game.board)
                ac = random.randint(0, 11)
            reward = game.action(ac)
            
            # save state(t), action, state(t+1)
            self.add(game0.turn, game0, ac, reward, game.copy())  
            
    def __str__(self):
        return f'Size: {self.size} \nFirst Entry Player 0:\n {self.memory[0][0]} \n... \nLast Entry Player 1:\n {self.memory[1][self.size[1] - 1]}'

In [283]:
class Player():
    
    def __init__(self, pos, name):
        
        import numpy as np
        import random
        
        self.score = 0
        self.reward = 0
        self.name = name
        
        self.pos = pos
        self.position = ["TOP", "BOTTOM"][pos]
        
        self.previousState = [[4]*6, [4]*6]
        self.action = -1
        
        self.memory = Memory()
        
    def reset(self):
        self.score = 0
        self.reward = 0
        
        self.previousState = [[4]*6, [4]*6]
        self.action = -1
        
        
        
class RandomPlayer(Player):
    def __init__(self, pos=1, name="Random"):
        Player.__init__(self, pos, name)
        self.memory.maxlen = 2
    
    def think(self, game):
        action = random.randint(0, 11)
        while not game.valid_action(action, game.turn):
            action = random.randint(0, 11)
        self.action = action
        
        return action
        
class GreedyPlayer(Player):
    def __init__(self, pos=1, name="Greedy"):
        Player.__init__(self, pos, name)
        self.reward_list = np.zeros(12, dtype=np.int64)
        self.memory.maxlen = 2
    
    def think(self, game):
        reward_list = np.zeros(12, dtype=np.int64)
        #if 0 not in game.board[self.pos]:
        #    if 12 not in game.board[self.pos] and 13 not in game.board[self.pos]:
        #        return random.randint(0, 11)
                
        if True: #0
            for idx, beans in enumerate(game.board[self.pos][::(game.turn*2-1)]):
                # If there are no beans, skip
                if beans == 0:
                    continue
                    
                # if beans reach 'score', and action >= 6, put one in 'score'
                if beans + idx >= 5:
                    reward_list[idx + 6] += 1
                    if beans + idx > 17:
                        reward_list[idx + 6] += 1
                
                # if beans 13 it cannot land in an empty bowl
                if beans > 13:
                    continue
                # if beans=13 then action idx+6 will be good
                elif beans == 13:
                    reward_list[idx + 6] += 2 + game.board[1 - self.pos][abs(5*(1-game.turn)-idx)]
                # if beans=12 then action idx will be good
                elif beans == 12:
                    reward_list[idx] += 2 + game.board[1 - self.pos][abs(5*(1-game.turn)-idx)]
                # if beans do not cross 'score'
                elif beans + idx <= 5:
                    if game.board[self.pos][abs(5*(1-game.turn)-idx-beans)] == 0:
                        reward_list[idx] += 1 + game.board[1 - self.pos][abs(5*(1-game.turn)-idx-beans)]
                        reward_list[idx + 6] += 1 + game.board[1 - self.pos][abs(5*(1-game.turn)-idx-beans)]
                # if beans cross 'score' of both players
                elif beans + idx == 12:
                    if game.board[self.pos][abs(5*game.turn)] == 0:
                        reward_list[idx] += 2 + game.board[1 - self.pos][abs(5*game.turn)]
                elif beans + idx > 12:
                    if game.board[self.pos][abs(5*(1-game.turn)-(beans + idx - 12))] == 0:
                        reward_list[idx] += 2 + game.board[1 - self.pos][abs(5*(1-game.turn)-(beans + idx - 12))]
                    if game.board[self.pos][abs(5*(1-game.turn)-(beans + idx - 13))] == 0:
                        reward_list[idx + 6] += 2 + game.board[1 - self.pos][abs(5*(1-game.turn)-(beans + idx - 13))]
            
        else:
            for idx, beans in enumerate(game.board[self.pos]):
                if beans == 0:
                    continue
                
                if beans > idx:
                    reward_list[idx + 6] += 1
                    if beans > 12 + idx:
                        reward_list[idx + 6] += 1
                        
                if beans > 13:
                    continue
                elif beans == 13:
                    reward_list[idx + 6] = 2 + game.board[1 - self.pos][idx]
                elif beans == 12:
                    reward_list[idx] = 2 + game.board[1 - self.pos][idx]
                elif beans + idx <= 5:
                    if game.board[self.pos][idx - beans] == 0:
                        reward_list[idx] = 1 + game.board[1 - self.pos][idx - beans]
                        reward_list[idx + 6] = 1 + game.board[1 - self.pos][idx - beans]
                elif beans + idx == 12:
                    if game.board[self.pos][beans + (5-idx) - 12] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans - idx - 7]
                elif beans+ idx > 12:
                    if game.board[self.pos][beans + (5-idx) - 12] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans - idx - 7]
                    if game.board[self.pos][beans + (5-idx) - 13] == 0:
                        reward_list[idx + 6] = 2 + game.board[1 - self.pos][beans - idx - 8]
                        
        #print(reward_list)
        action = np.random.choice(np.where(reward_list == np.max(reward_list))[0])
        while not game.valid_action(action, game.turn):
            #print(reward_list, "invalid action", action)
            reward_list[action] -= 1000
            action = np.random.choice(np.where(reward_list == np.max(reward_list))[0])
        #print(action)
        return action
    

class HumanPlayer(Player):
    def __init__(self, pos=1, name='Human'):
        Player.__init__(self, pos, name)
        self.memory.maxlen = 2
        
    def think(self, game):
        #print(game)
        
        inp = int(input(f'It is {self.name}s turn. Whats your next move? (input 0-11, end: 12)'))
        if inp in range(13):
            action = inp
        else:
            print(f'{inp} is an invalid input! Valid inputs are numbers 0-11 for actions and 12 to end the game.')
            action = self.think(game)
            
        self.action = action
        
        return action
        
class DQN(nn.Module):

    def __init__(self, state_space_dim, action_space_dim):
        from torch import nn
        
        super().__init__()

        self.linear = nn.Sequential(
                nn.Linear(state_space_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 128),
                nn.ReLU(),
                nn.Linear(128, action_space_dim)
                )

    def forward(self, x):
        return self.linear(x)
    
        
class DQNPlayer(Player):
    def __init__(self, 
                 gamma=0.97, 
                 learning_rate = 1e-2,
                 batch_size = 256,
                 target_net_update_steps = 1000,
                 replay_memory_capacity = 1e5, 
                 epsilon_start = 100,
                 epsilon_stop = 3e3,
                 epsilon_min = 0.05,
                 pos=0, 
                 name='AI'):
        import torch
        import random
        import numpy as np
        from torch import nn
        
        Player.__init__(self, pos, name)
        
        ### Set Memory length
        self.memory.maxlen = replay_memory_capacity
        
        ### Initialize the policy network
        self.net = DQN(state_space_dim=12, action_space_dim=12)
        
        ### Initialize the target network with the same weights of the policy network
        self.target_net = DQN(state_space_dim=12, action_space_dim=12)
        self.target_net_update_steps = target_net_update_steps
        
        ### Initialize the optimizer
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate) # The optimizer will update ONLY the parameters of the policy network
        
        ### Initialize the loss function (Huber loss)
        self.loss_fn = nn.SmoothL1Loss()
        
        self.idx = 0
        
        self.final = False
        
        torch.manual_seed(0)
        np.random.seed(0)
        random.seed(0)
        
        self.gamma = gamma
        self.memory.maxlen = replay_memory_capacity
        
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        
        self.epsilon_start = epsilon_start
        self.epsilon_stop = epsilon_stop
        self.epsilon_min = epsilon_min
        
    def epsilon(self):
        return max(self.epsilon_min, 
                   1 if self.idx < self.epsilon_start 
                   else 1 - (1 - self.epsilon_min) * (self.idx - self.epsilon_start)  / (self.epsilon_stop - self.epsilon_start))
        
    def update_step(self):
        
        if self.idx < self.epsilon_start:
            self.idx += 1
            pass
        
        #replay_mem = self.memory
        #policy_net = self.net
        #target_net = self.target_net
        #gamma = self.gamma
        #optimizer = self.optimizer
        #loss_fn = self.loss_fn
        #batch_size = self.batch_size
        
        # Sample the data from the replay memory
        batch = self.memory.draw_batch(self.batch_size)
        batch_size = len(batch)

        # Create tensors for each element of the batch
        states      = torch.tensor([s[0][0] + s[0][1] for s in batch], dtype=torch.float32)
        actions     = torch.tensor([s[1] for s in batch], dtype=torch.int64)
        rewards     = torch.tensor([s[2] for s in batch], dtype=torch.int64)
        next_states = torch.tensor([s[3][0] + s[3][1] if not s[4] else [0]*12 for s in batch], dtype=torch.float32)
        game_end    = torch.tensor([s[4] for s in batch], dtype=torch.bool)


        # Compute a mask of non-final states (all the elements where the next state is not None)
        non_final_next_states = torch.tensor([s[2] for s in batch if s[2] is not None], dtype=torch.int64) # the next state can be None if the game has ended
        non_final_mask = torch.tensor([s[2] is not None for s in batch], dtype=torch.bool)


        # Compute all the Q values (forward pass)
        self.net.train()
        q_values = self.net(states)
        # Select the proper Q value for the corresponding action taken Q(s_t, a)
        state_action_values = q_values.gather(1, actions.unsqueeze(1))

        # Compute the value function of the next states using the target network V(s_{t+1}) = max_a( Q_target(s_{t+1}, a)) )
        with torch.no_grad():
            self.target_net.eval()
            q_values_target = self.target_net(next_states)
        next_state_max_q_values = torch.zeros(batch_size)
        next_state_max_q_values[game_end] = q_values_target.max(dim=1)[0][game_end]

        # Compute the expected Q values
        expected_state_action_values = rewards + (next_state_max_q_values * self.gamma)
        expected_state_action_values = expected_state_action_values.unsqueeze(1) # Set the required tensor shape

        # Compute the Huber loss
        loss = self.loss_fn(state_action_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Apply gradient clipping (clip all the gradients greater than 2 for training stability)
        nn.utils.clip_grad_norm_(self.net.parameters(), 2)
        self.optimizer.step()
        
        self.idx += 1
        
        # Update the target network every target_net_update_steps episodes
        if self.idx == 100 or self.idx % self.target_net_update_steps == 0:
            print('Updating target network...')
            self.target_net.load_state_dict(self.net.state_dict()) # This will copy the weights of the policy network to the target network
    
    def think(self, game):
        
        if not self.final and random.random() < self.epsilon():
            action = random.randint(0, 11)
            while not game.valid_action(action, game.turn):
                action = random.randint(0, 11)
            
        else:
            with torch.no_grad():
                self.net.eval()
                net_out = self.net(torch.tensor([game.board[0] + game.board[0]], dtype=torch.float32))
                action = int(torch.argmax(net_out))
                
            
            while not game.valid_action(action, game.turn):
                #print(reward_list, "invalid action", action)
                net_out[0][action] -= 1000
                action = int(torch.argmax(net_out))
        
        
        return int(action)
        
        
        

In [284]:
 class Arena():
    def __init__(self, game, aPlayer, bPlayer, disp = False):
        
        import random
        import numpy as np
        
        self.game = game
        
        if aPlayer.pos == bPlayer.pos:
            print("Both Players have the same position!")
            self.player = [aPlayer, bPlayer]
            self.game.game_end = True
        else:
            self.player = [aPlayer, bPlayer][::bPlayer.pos - aPlayer.pos]
        
        
        self.disp = disp
        
        if self.disp:
            print("Game initialized!")
            print(self.__str__())
            
    def __str__(self):
        out = f'Top Player:    {self.player[0].name}. Score: {self.player[0].score}\n'
        out += f'Bottom Player: {self.player[1].name}. Score: {self.player[1].score}\n'
        out += str(self.game)
        out += f'\nNext Player: {[self.player[0].name, self.player[1].name][self.game.turn]}'
        return out if self.disp else ''
        
    
    def reset(self):
        self.game.reset()
        for p in self.player:
            p.reset()
        
        
        
    def play(self):
        
        # Whose turn is it?
        player = self.player[self.game.turn]
        
        # Put what happened since the last move into Player's Memory
        if player.action != -1:
            player.memory.add(player.previousState, 
                              player.action, 
                              player.reward, 
                              list(self.game.board),
                              self.game.game_end)
        # Reset reward
        player.reward = 0
        
        # Save current board for future memory
        player.previousState = list(self.game.board)
        
        # Let player choose action
        player.action = player.think(self.game)
        
        # Display choice
        if self.disp: print(f'{player.name} chooses {player.action}')
            
        # Perform action and get reward from Game
        reward = self.game.action(player.action)
        
        # Display reward
        if self.disp: print(f'Reward: {reward}')
        
        # player gets positive reward
        player.reward += reward
        player.score += reward
        
        # opposite player gets negative reward
        self.player[1 - player.pos].reward -= reward
        
        # check if game is over: 
        if self.game.end_check():
            reward = self.game.final_reward()
            self.player[self.game.turn].reward -= reward
            self.player[1 - self.game.turn].reward += reward
            self.player[1 - self.game.turn].score += reward
            
            self.player[self.game.turn].memory.add(self.player[self.game.turn].previousState,
                                                   self.player[self.game.turn].action,
                                                   self.player[self.game.turn].reward,
                                                   None,
                                                   self.game.game_end)
            
            self.player[1 - self.game.turn].memory.add(self.player[1 - self.game.turn].previousState,
                                                   self.player[1 - self.game.turn].action,
                                                   self.player[1 - self.game.turn].reward,
                                                   None,
                                                   self.game.game_end)
            if self.disp: print('Game Over')
            
        else:
            if self.disp: print(self.__str__())
            self.play()
    

In [285]:
T = DQNPlayer(gamma=0.97, 
              learning_rate = 1e-2,
              batch_size = 128,
              target_net_update_steps = 1000,
              replay_memory_capacity = 1e6, 
              epsilon_start = 50,
              epsilon_stop = 8e4,
              epsilon_min = 0.05,
              pos = 0,
              name = 'AI')
B = GreedyPlayer(pos=1)
G = Mancala()
A = Arena(G, T, B, False)

num_iterations = int(2e5)

res = [[],[]]

for i in range(num_iterations):
    A.reset()
    A.play()
    res[0].append(T.score)
    res[1].append(B.score)
    if (i+1) % 1000 == 0:
        print(f'Game {i+1} / {int(num_iterations)}. Score: AI - {sum(res[0])/len(res[0])}, Greedy: {sum(res[1])/len(res[1])}. Epsilon: {T.epsilon()}')
        res = [[],[]]
    T.update_step()
    


Memory Initialized
Memory Initialized
Updating target network...
Updating target network...
Game 1000 / 200000. Score: AI - 14.683, Greedy: 33.317
Updating target network...
Game 2000 / 200000. Score: AI - 14.587, Greedy: 33.413
Updating target network...
Game 3000 / 200000. Score: AI - 14.436, Greedy: 33.564
Updating target network...
Game 4000 / 200000. Score: AI - 14.588, Greedy: 33.412
Updating target network...
Game 5000 / 200000. Score: AI - 14.626, Greedy: 33.374
Updating target network...
Game 6000 / 200000. Score: AI - 14.479, Greedy: 33.521
Updating target network...
Game 7000 / 200000. Score: AI - 14.894, Greedy: 33.106
Updating target network...
Game 8000 / 200000. Score: AI - 14.959, Greedy: 33.041
Updating target network...
Game 9000 / 200000. Score: AI - 14.933, Greedy: 33.067
Updating target network...
Game 10000 / 200000. Score: AI - 15.118, Greedy: 32.882
Updating target network...
Game 11000 / 200000. Score: AI - 15.016, Greedy: 32.984
Updating target network...
Game

KeyboardInterrupt: 

In [279]:
T.target_net(torch.tensor([[4]*12], dtype=torch.float32))

tensor([[-1.1140, -0.8573, -0.6934, -0.9955, -0.7917, -1.5093, -0.8375, -0.3664,
         -0.1282, -0.0140,  0.0112,  0.1426]], grad_fn=<AddmmBackward0>)

In [264]:
T.final = True
A.disp = False
res = [[],[]]

for i in range(500):
    A.reset()
    A.play()
    res[0].append(T.score)
    res[1].append(B.score)
    
print(f'AVERAGE SCORE: AI {sum(res[0])/len(res[0])}, Greedy {sum(res[1])/len(res[1])}')

AVERAGE SCORE: AI 21.858, Greedy 26.142


In [174]:
T.memory.memory

[[[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  8,
  -5,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  4,
  -1,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  5,
  -1,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  3,
  -1,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  4,
  -1,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  1,
  -5,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  4,
  -1,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  8,
  1,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  9,
  -7,
  [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
  False],
 [[[0, 0, 0, 0, 0, 0

In [156]:
T = GreedyPlayer(pos=0)
B = RandomPlayer(pos=1)
G = Mancala()
A = Arena(G, T, B, False)

res = [[],[]]

for i in range(200):
    A.play()
    
    res[0].append(A.player[T.pos].score)
    res[1].append(A.player[B.pos].score)
    A.reset()
    
print(f'Average Score: Greedy: {sum(res[0])/len(res[0])}, Random: {sum(res[1])/len(res[1])}')

Memory Initialized
Memory Initialized
Average Score: Greedy: 33.95, Random: 14.05


In [7]:
T = GreedyPlayer(pos=0)
B = HumanPlayer(pos=1)

G = Mancala()
print(G.turn)

A = Arena(G, T, B, True)

A.play()

Memory Initialized
Memory Initialized
1
Game initialized!
Top Player:    Greedy Bastard. Score: 0
Bottom Player: Human. Score: 0
[4, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)0
Human chooses 0
Human 0 0
Reward: 0
Top Player:    Greedy Bastard. Score: 0
Bottom Player: Human. Score: 0
[4, 4, 4, 4, 4, 4]
[0, 5, 5, 5, 5, 4]End = False
Next Player: Greedy Bastard
Greedy Bastard chooses 7
Greedy Bastard 7 0
Reward: 0
Top Player:    Greedy Bastard. Score: 0
Bottom Player: Human. Score: 0
[5, 5, 5, 5, 0, 4]
[0, 5, 5, 5, 5, 4]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)1
Human chooses 1
Human 1 0
Reward: 0
Top Player:    Greedy Bastard. Score: 0
Bottom Player: Human. Score: 0
[5, 5, 5, 5, 0, 5]
[0, 0, 6, 6, 6, 5]End = False
Next Player: Greedy Bastard
Greedy Bastard chooses 8
Greedy Bastard 8 0
Reward: 1
Top Player:    Greedy Bastard. Score: 1
Bottom Player: Human

[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 4
Greedy Bastard 4 -2405
Reward: -10
Top Player:    Greedy Bastard. Score: -2415
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 5
Greedy Bastard 5 -2415
Reward: -10
Top Player:    Greedy Bastard. Score: -2425
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 8
Greedy Bastard 8 -2425
Reward: -10
Top Player:    Greedy Bastard. Score: -2435
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Ba

Top Player:    Greedy Bastard. Score: -4745
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 2
Greedy Bastard 2 -4745
Reward: -10
Top Player:    Greedy Bastard. Score: -4755
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 8
Greedy Bastard 8 -4755
Reward: -10
Top Player:    Greedy Bastard. Score: -4765
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 4
Greedy Bastard 4 -4765
Reward: -10
Top Player:    Greedy Bastard. Scor

Greedy Bastard 10 -7075
Reward: -10
Top Player:    Greedy Bastard. Score: -7085
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
[    0     0     0 -1000     0     0     0  -999     0  -999     0     0]
Greedy Bastard chooses 8
Greedy Bastard 8 -7085
Reward: -10
Top Player:    Greedy Bastard. Score: -7095
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
[-1000     0     0     0     0     0     0  -999     0  -999     0     0]
[-1000     0     0 -1000     0     0     0  -999     0  -999     0     0]
Greedy Bastard chooses 2
Greedy Bastard 2 -7095
Reward: -10
Top Player:    G

[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 2
Greedy Bastard 2 -9405
Reward: -10
Top Player:    Greedy Bastard. Score: -9415
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 4
Greedy Bastard 4 -9415
Reward: -10
Top Player:    Greedy Bastard. Score: -9425
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Bastard chooses 4
Greedy Bastard 4 -9425
Reward: -10
Top Player:    Greedy Bastard. Score: -9435
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0

Reward: -10
Top Player:    Greedy Bastard. Score: -11735
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Bastard chooses 5
Greedy Bastard 5 -11735
Reward: -10
Top Player:    Greedy Bastard. Score: -11745
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 2
Greedy Bastard 2 -11745
Reward: -10
Top Player:    Greedy Bastard. Score: -11755
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 5
Greedy Bastard 5 -11755
Reward: -10
Top Player:    Gr

[   0    0    0    0    0    0    0 -999    0 -999    0    0]
[    0 -1000     0     0     0     0     0  -999     0  -999     0     0]
Greedy Bastard chooses 4
Greedy Bastard 4 -14045
Reward: -10
Top Player:    Greedy Bastard. Score: -14055
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 5
Greedy Bastard 5 -14055
Reward: -10
Top Player:    Greedy Bastard. Score: -14065
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Bastard chooses 5
Greedy Bastard 5 -14065
Reward: -10
Top Player:    Greedy Bastard. Score: -14075
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Play

Greedy Bastard 10 -16345
Reward: -10
Top Player:    Greedy Bastard. Score: -16355
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 2
Greedy Bastard 2 -16355
Reward: -10
Top Player:    Greedy Bastard. Score: -16365
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 8
Greedy Bastard 8 -16365
Reward: -10
Top Player:    Greedy Bastard. Score: -16375
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 10
Greedy Bastard 10 -16375
Reward: -10
Top Player:    Greedy Bastard. Score: -16385
Bottom 

Reward: -10
Top Player:    Greedy Bastard. Score: -18675
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 2
Greedy Bastard 2 -18675
Reward: -10
Top Player:    Greedy Bastard. Score: -18685
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Bastard chooses 10
Greedy Bastard 10 -18685
Reward: -10
Top Player:    Greedy Bastard. Score: -18695
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
Greedy Bastard chooses 5
Greedy Bastard 5 -18695
Reward: -10
Top Player:    

[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
[    0     0     0     0     0     0 -1000  -999     0  -999     0     0]
Greedy Bastard chooses 4
Greedy Bastard 4 -21015
Reward: -10
Top Player:    Greedy Bastard. Score: -21025
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 10
Greedy Bastard 10 -21025
Reward: -10
Top Player:    Greedy Bastard. Score: -21035
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 4
Greedy Bastard 4 -2

Greedy Bastard 5 -23335
Reward: -10
Top Player:    Greedy Bastard. Score: -23345
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Bastard chooses 11
Greedy Bastard 11 -23345
Reward: -10
Top Player:    Greedy Bastard. Score: -23355
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
[    0     0     0     0     0     0 -1000  -999     0  -999     0     0]
Greedy Bastard chooses 8
Greedy Bastard 8 -23355
Reward: -10
Top Player:    Greedy Bastard. Score: -23365
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999

Greedy Bastard 10 -25655
Reward: -10
Top Player:    Greedy Bastard. Score: -25665
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Bastard chooses 8
Greedy Bastard 8 -25665
Reward: -10
Top Player:    Greedy Bastard. Score: -25675
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 10
Greedy Bastard 10 -25675
Reward: -10
Top Player:    Greedy Bastard. Score: -25685
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0    1    0 -999    0    0]
[   0    0    0    0    0    0    0 -999    0 -999 

Greedy Bastard 11 -27985
Reward: -10
Top Player:    Greedy Bastard. Score: -27995
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
Greedy Bastard chooses 10
Greedy Bastard 10 -27995
Reward: -10
Top Player:    Greedy Bastard. Score: -28005
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999    0    0]
Greedy Bastard chooses 4
Greedy Bastard 4 -28005
Reward: -10
Top Player:    Greedy Bastard. Score: -28015
Bottom Player: Human. Score: 10
[0, 0, 4, 0, 8, 1]
[0, 0, 5, 2, 2, 1]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 0 1 0 1 0 0]
[   0    0    0    0    0    0    0 -999    0    1    0    0]
[   0    0    0    0    0    0    0 -999    0 -999 

RecursionError: maximum recursion depth exceeded while calling a Python object

In [8]:
11%6

5

In [31]:
[0,0,0,0,0,0] == [0]*6

True

In [80]:
concatenate([[1,2,3],[4,5,6]])

NameError: name 'concatenate' is not defined

In [83]:
np.array([1,2,3,4])[[True,False,True,True]]

array([1, 3, 4])