In [61]:
import numpy as np
import random

In [62]:
## Game Engine

class Mancala:
    import random
    def __init__(self, disp = False, seed = 0):
        # set seed 
        random.seed(seed)
        
        # Initialize Mancala game board
        self.board = [[4]*6 for i in range(2)]
        
        # Choose which player goes first (probably change to random initialization)
        self.turn = random.randint(0, 1)    # 0 for TOP player
                                            # 1 for BOTTOM player
        
        # Game End indicator
        self.game_end = False
        
        # Should prints be displayed?
        self.disp = disp
        
    
    ## def output
    def __str__(self):
        return str(self.board[0]) + '\n' + str(self.board[1]) +  'End = ' +  str(self.game_end)
        
    def seed(self, seed=0):
        random.seed(seed)
        
    def valid_action(self, action, turn):
        return self.board[turn][action%6] != 0
    
    ## act on the game
    def action(self, action):
        # Action meanings:
        # 0-5:  distribute beans from bowl 0-5 WITHOUT placing one in the Score
        # 6-11: distribute beans from bowl 0-5 AND place one in the Score
        # 12: End game
        
        if action == 12:
            self.game_end = True
            return 0
        
        # interpret input
        bowl = action % 6      # take beans from bowl
        toScore = action // 6  # 0 -> skip Score, 1 -> do not skip Score
        
        # position
        x = abs(5*(1 - self.turn) - bowl)
        y = self.turn
        
        # Init reward
        reward = 0
        
        # How many beans are in chosen bowl
        beans = self.board[y][x]
        
        # Who will be next?
        next_turn = 1 - self.turn
        
        # If chosen bowl is empty
        if beans == 0:
            if self.disp: print("Error! You chose an empty bowl! -10 Points Penalty")
            reward -= 10
            next_turn = 1 - next_turn
            
        # Take beans out of bowl
        self.board[y][x] = 0
        
        # Beans are distributed counterclock-wise
        direction = 2*self.turn - 1   # +1 -> right, -1 -> left
        
        # While there are beans left, distribute them counterclock-wise
        while beans != 0:
            
            # If beans < 0 then there has been some error
            if beans < 0:
                if self.disp: print("Error nr of beans negative!")
                
            # move to next bowl
            x += direction  
            
            # if end is reached check if toScore is true and continue on other side
            if x < 0 or x > 5:
                # if toScore is true, put one bean in score (only if it is the correct Score)
                if toScore and y == self.turn:
                    reward += 1
                    beans -= 1
                    
                    # if this was the last bean the same player will play again
                    if beans == 0:
                        next_turn = self.turn
                        break
                # change row and direction
                y = 1 - y
                direction = -direction
                x += direction
            
            # if this was the last bean, check opposite bowl
            if beans == 1 and self.board[y][x] == 0 and y == self.turn:
                reward += 1 + self.board[1 - y][x]
                self.board[1 - y][x] = 0
            else:
                self.board[y][x] += 1
            beans -= 1
            
        # set next player
        self.turn = next_turn
        
        return reward
       
    def end_check(self):
        if self.game_end or self.board[self.turn] == [0]*6:
            print(f'No more moves for {"TOP" if self.turn else "BOTTOM"}!')
            self.__str__()
            self.game_end = True
            return True
        else:
            return False
    
    def final_reward(self):
        if not self.game_end:
            print("The game is not over?!")
            return 0
        
        if self.disp: print(f'No more moves for {"TOP" if self.turn else "BOTTOM"} Player! \nGame End\n ==============================')
        
        return sum(self.board[1 - self.turn])
        
            
    def copy(self):
        cop = Mancala()
        cop.board = np.array(self.board)
        cop.game_end = bool(self.game_end)
        cop.turn = int(self.turn)
        return cop
        
    def reset(self, disp = False):
        # Initialize Mancala game board
        self.board = np.ones((2,6), dtype=int) * 4
        
        # Choose which player goes first (probably change to random initialization)
        self.turn = random.randint(0, 1)    # 0 for TOP player
                                         # 1 for BOTTOM player
            
        # Game End indicator
        self.game_end = False
        
        # Should prints be displayed?
        self.disp = disp
        
    def flip(self):
        self.board = [self.board[1][::-1], self.board[0][::-1]]

In [63]:
## For Deep-Q-Learning we need a replay memory
# This memory is for the Top player!
class Memory:
    import random
    def __init__(self, maxlen = 1e5):
        self.size = 0
        self.memory = []
        print("Memory Initialized")
        self.maxlen = maxlen
        self.current_pos = 0
        
    
    def __getitem__(self, idx=-1):
        if idx < 0 or idx >= self.size:
            print(f'Index {idx} is too large for memory of length {self.size}. \nInstead return random entry')
            return self.memory[random.randint(0, self.size - 1)]
        else:
            return self.memory[idx]
    
    def draw(self):
        if self.size: # self.size should not be 0
            return self.__getitem__(random.randint(0, self.size - 1))
        else:
            print(f'Memory {turn} is not yet filled')
            return []
        
    def draw_batch(self, batch_size):
        batch_size = min(batch_size, self.size)
        return [self.draw() for i in range(batch_size)]
    
    def add(self, previousState, action, reward, state, game_end):
        if self.size < self.maxlen:
            # relevant quantities: state0, action, reward, state1, game_end
            self.memory.append([previousState, 
                                action, 
                                reward, 
                                state,
                                game_end])
            self.size += 1
            self.current_pos += 1
        else:
            self.current_pos = self.current_pos % self.maxlen
            self.memory.append([previousState, 
                                action, 
                                reward, 
                                state,
                                game_end])
            self.current_pos += 1
        
        
    def add_random_game(self):
        # Initialize game
        game = Mancala(False)  
        
        # Start game Loop
        while not game.game_end:
            
            # state t
            game0 = game.copy()
            
            # random action
            ac = random.randint(0, 11)
            
            while game.board[game.turn][abs(5*(1 - game.turn) - ac%6)] == 0:
                #print(game.turn, game.board)
                ac = random.randint(0, 11)
            reward = game.action(ac)
            
            # save state(t), action, state(t+1)
            self.add(game0.turn, game0, ac, reward, game.copy())  
            
    def __str__(self):
        return f'Size: {self.size} \nFirst Entry Player 0:\n {self.memory[0][0]} \n... \nLast Entry Player 1:\n {self.memory[1][self.size[1] - 1]}'

In [73]:
class Player():
    import numpy as np
    import random
    
    def __init__(self, pos, name):
        self.score = 0
        self.reward = 0
        self.name = name
        
        self.pos = pos
        self.position = ["TOP", "BOTTOM"][pos]
        
        self.reward = 0
        self.previousState = [[4]*6, [4]*6]
        self.action = -1
        
        self.memory = Memory()
        
    def reset(self):
        self.__init__(self.pos, self.name)
        
        
        
class RandomPlayer(Player):
    def __init__(self, pos=1, name="Random"):
        Player.__init__(self, pos, name)
        self.memory.maxlen = 2
    
    def think(self, game):
        action = random.randint(0, 11)
        while not game.valid_action(action, game.turn):
            action = random.randint(0, 11)
        
        return action
        
class GreedyPlayer(Player):
    import numpy as np
    def __init__(self, pos=1, name="Greedy Bastard"):
        Player.__init__(self, pos, name)
        self.reward_list = np.zeros(12, dtype=np.int64)
        self.memory.maxlen = 2
    
    def think(self, game):
        reward_list = np.zeros(12, dtype=np.int64)
        #if 0 not in game.board[self.pos]:
        #    if 12 not in game.board[self.pos] and 13 not in game.board[self.pos]:
        #        return random.randint(0, 11)
                
        if self.pos == 0:
            for idx, beans in enumerate(game.board[self.pos][::-1]):
                if beans == 0:
                    continue
                    
                if beans + idx >= 5:
                    reward_list[idx + 6] += 1
                    if beans + idx > 17:
                        reward_list[idx + 6] += 1
                
                if beans > 13:
                    continue
                elif beans == 13:
                    reward_list[idx + 6] = 2 + game.board[1 - self.pos][idx]
                elif beans == 12:
                    reward_list[idx] = 2 + game.board[1 - self.pos][idx]
                elif beans <= 5-idx:
                    if game.board[self.pos][5 - idx - beans] == 0:
                        reward_list[idx] = 1 + game.board[1 - self.pos][5 - idx - beans]
                        reward_list[idx + 6] = 1 + game.board[1 - self.pos][5 - idx - beans]
                        
                elif beans == 5 - idx + 1 + 6:
                    if game.board[self.pos][beans + idx - 12] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans + idx - 12]
                elif beans > 5 - idx + 1 + 6:
                    if game.board[self.pos][beans + idx - 12] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans + idx - 12]
                    if game.board[self.pos][beans + idx - 13] == 0:
                        reward_list[idx + 6] = 2 + game.board[1 - self.pos][beans + idx - 13]
            
        else:
            for idx, beans in enumerate(game.board[self.pos]):
                if beans == 0:
                    continue
                
                if beans > idx:
                    reward_list[idx + 6] += 1
                    if beans > 12 + idx:
                        reward_list[idx + 6] += 1
                        
                if beans > 13:
                    continue
                elif beans == 13:
                    reward_list[idx + 6] = 2 + board[1 - self.pos][idx]
                elif beans == 12:
                    reward_list[idx] = 2 + board[1 - self.pos][idx]
                elif beans <= idx:
                    if game.board[self.pos][idx - beans] == 0:
                        reward_list[idx] = 1 + game.board[1 - self.pos][idx - beans]
                        reward_list[idx + 6] = 1 + game.board[1 - self.pos][idx - beans]
                elif beans == idx + 1 + 6:
                    if game.board[self.pos][beans - idx - 7] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans - idx - 7]
                elif beans > idx + 1 + 6:
                    if game.board[self.pos][beans - idx - 7] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans - idx - 7]
                    if game.board[self.pos][beans - idx - 8] == 0:
                        reward_list[idx + 6] = 2 + game.board[1 - self.pos][beans - idx - 8]
                        
        print(reward_list)
        action = np.random.choice(np.where(reward_list == np.max(reward_list))[0])
        while not game.valid_action(action, game.turn):
            reward_list[action] -= 1000
            action = np.random.choice(np.where(reward_list == np.max(reward_list))[0])
        print(action)
        return action
    

class HumanPlayer(Player):
    def __init__(self, pos=1, name='Human'):
        Player.__init__(self, pos, name)
        self.memory.maxlen = 2
        
    def think(self, game):
        #print(game)
        
        inp = int(input(f'It is {self.name}s turn. Whats your next move? (input 0-11, end: 12)'))
        if inp in range(13):
            action = inp
        else:
            print(f'{inp} is an invalid input! Valid inputs are numbers 0-11 for actions and 12 to end the game.')
            action = self.action(game)
        
        return action
        

In [74]:
 class Arena():
    import random
    import numpy as np
    def __init__(self, game, topPlayer, bottomPlayer, disp = False):
        self.game = game
        self.player = [topPlayer, bottomPlayer]
        
        if topPlayer.pos == bottomPlayer.pos:
            print("Both Players have the same position!")
            self.game.game_end = True
        
        self.disp = disp
        
        if self.disp:
            print("Game initialized!")
            print(self.__str__())
            
    def __str__(self):
        out = f'Top Player:    {self.player[0].name}. Score: {self.player[0].score}\n'
        out += f'Bottom Player: {self.player[1].name}. Score: {self.player[1].score}\n'
        out += str(self.game)
        out += f'\nNext Player: {[self.player[0].name, self.player[1].name][self.game.turn]}'
        return out if self.disp else ''
        
    
    def reset(self):
        self.game.reset()
        self.topScore = 0
        self.bottomScore = 0
        
        
        
    def play(self):
        
        # Whose turn is it?
        player = self.player[self.game.turn]
        
        # Put what happened since the last move into Player's Memory
        if player.action != -1:
            player.memory.add(player.previousState, 
                              player.action, 
                              player.reward, 
                              list(self.game.board),
                              self.game.game_end)
        # Reset reward
        player.reward = 0
        
        # Save current board for future memory
        player.previousState = list(self.game.board)
        
        # Let player choose action
        player.action = player.think(self.game)
        
        # Display choice
        if self.disp: print(f'{player.name} chooses {player.action}')
            
        # Perform action and get reward from Game
        reward = self.game.action(player.action)
        
        # Display reward
        if self.disp: print(f'Reward: {reward}')
        
        # player gets positive reward
        player.reward += reward
        player.score += reward
        
        # opposite player gets negative reward
        self.player[1 - player.pos].reward -= reward
        
        # check if game is over: 
        if self.game.end_check():
            reward = self.game.final_reward()
            self.player[self.game.turn].reward -= reward
            self.player[1 - self.game.turn].reward += reward
            self.player[1 - self.game.turn].score += reward
            
            self.player[self.game.turn].memory.add(self.player[self.game.turn].previousState,
                                                   self.player[self.game.turn].action,
                                                   self.player[self.game.turn].reward,
                                                   None,
                                                   self.game.game_end)
            
            self.player[1 - self.game.turn].memory.add(self.player[1 - self.game.turn].previousState,
                                                   self.player[1 - self.game.turn].action,
                                                   self.player[1 - self.game.turn].reward,
                                                   None,
                                                   self.game.game_end)
            if self.disp: print('Game Over')
            
        else:
            if self.disp: print(self.__str__())
            self.play()
        
    
    

In [75]:
T = GreedyPlayer(pos=0)
B = HumanPlayer(pos=1)

G = Mancala()
print(G.turn)

A = Arena(G, T, B, True)

A.play()

Memory Initialized
Memory Initialized
1
Game initialized!
Top Player:    Greedy Bastard. Score: 0
Bottom Player: Human. Score: 0
[4, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)5
Human chooses 5
Reward: 0
Top Player:    Greedy Bastard. Score: 0
Bottom Player: Human. Score: 0
[4, 4, 5, 5, 5, 5]
[4, 4, 4, 4, 4, 0]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 1 1 1 1 1 1]
7
Greedy Bastard chooses 7
Reward: 1
Top Player:    Greedy Bastard. Score: 1
Bottom Player: Human. Score: 0
[5, 5, 6, 6, 0, 5]
[4, 4, 4, 4, 4, 0]End = False
Next Player: Greedy Bastard
[0 0 0 0 0 0 1 0 1 1 1 1]
6
Greedy Bastard chooses 6
Reward: 0
Top Player:    Greedy Bastard. Score: 1
Bottom Player: Human. Score: 0
[6, 6, 7, 7, 1, 0]
[4, 4, 4, 4, 4, 0]End = False
Next Player: Human
It is Humans turn. Whats your next move? (input 0-11, end: 12)1
Human chooses 1
Reward: 1
Top Player:    Greedy Bastard. Score: 1
Bottom Player: Human. 

In [67]:
np.where(np.array([1,2,3,4,4,3,2,4]) == np.max([1,2,3,4,4,3,2,4]))[0]

array([3, 4, 7], dtype=int64)

In [68]:
random.randint(0, 1)

1

In [69]:
G = Mancala()
G.turn

1