In [1]:
import numpy as np
import random

In [2]:
## Game Engine

class Mancala:
    import random
    def __init__(self, disp = False):
        # Initialize Mancala game board
        self.board = [[4]*6 for i in range(2)]
        
        # Choose which player goes first (probably change to random initialization)
        self.turn = random.randint(0, 1)    # 0 for TOP player
                                            # 1 for BOTTOM player
            
            
        self.score = [0, 0]
        
        # Game End indicator
        self.game_end = False
        
        # Should prints be displayed?
        self.disp = disp
        
    
    ## def output
    def __str__(self):
        return str(self.board[0]) + '\n' + str(self.board[1]) +  'End = ' +  str(self.game_end)
        
    def seed(self, seed=0):
        random.seed(seed)
        
    def valid_action(self, action, turn):
        return self.board[turn][action%6] != 0
    
    ## act on the game
    def action(self, action):
        # Action meanings:
        # 0-5:  distribute beans from bowl 0-5 WITHOUT placing one in the Score
        # 6-11: distribute beans from bowl 0-5 AND place one in the Score
        
        # interpret input
        bowl = action % 6      # take beans from bowl
        toScore = action // 6  # 0 -> skip Score, 1 -> do not skip Score
        
        # position
        x = abs(5*(1 - self.turn) - bowl)
        y = self.turn
        
        # Init reward
        reward = 0
        
        # How many beans are in chosen bowl
        beans = self.board[y][x]
        
        # Who will be next?
        next_turn = 1 - self.turn
        
        # If chosen bowl is empty
        if beans == 0:
            if self.disp: print("Error! You chose an empty bowl! -10 Points Penalty")
            reward -= 10
            next_turn = 1 - next_turn
            
        # Take beans out of bowl
        self.board[y][x] = 0
        
        # Beans are distributed counterclock-wise
        direction = 2*self.turn - 1   # +1 -> right, -1 -> left
        
        # While there are beans left, distribute them counterclock-wise
        while beans != 0:
            
            # If beans < 0 then there has been some error
            if beans < 0:
                if self.disp: print("Error nr of beans negative!")
                
            # move to next bowl
            x += direction  
            
            # if end is reached check if toScore is true and continue on other side
            if x < 0 or x > 5:
                # if toScore is true, put one bean in score (only if it is the correct Score)
                if toScore and y == self.turn:
                    reward += 1
                    self.score[self.turn] += 1
                    beans -= 1
                    
                    # if this was the last bean the same player will play again
                    if beans == 0:
                        next_turn = self.turn
                        break
                # change row and direction
                y = 1 - y
                direction = -direction
                x += direction
            
            # if this was the last bean, check opposite bowl
            if beans == 1 and self.board[y][x] == 0 and y == self.turn:
                reward+= 1 + self.board[1 - y, x]
                self.score[self.turn] += 1 + self.board[1 - y, x]
                self.board[1 - y][x] = 0
            else:
                self.board[y][x] += 1
            beans -= 1
            
        # add reward to Score of current Player
        #self.score[self.turn] += reward
        
        # end_check
        #reward += self.end_check(next_turn)
        
        # check if this move was bad
        #if self.turn != next_turn:
        #    reward[self.] -= self.bad_move_check(next_turn)
        
        # set next player
        self.turn = next_turn
        
        return reward
       
    def end_check(self, next_turn):
        reward = [0, 0]
        if self.board[next_turn] == [0]*6:
            if self.disp: print(f'No more moves for {"TOP" if self.turn else "BOTTOM"} Player! \nGame End\n ==============================')
            if turn == self.turn:
                self.score[self.turn] -= sum(self.board[next_turn])
                reward[next_turn] += sum(self.board[next_turn])
            else:
                self.score[self.turn] += sum(self.board[next_turn])
                reward[self.turn] += sum(self.board[next_turn])
            #loose_penalty = +sum(self.board[self.turn])
            self.game_end = True
            return reward
        return reward
        
            
    def copy(self):
        cop = Mancala()
        cop.board = np.array(self.board)
        cop.score = list(self.score)
        cop.game_end = bool(self.game_end)
        cop.turn = int(self.turn)
        return cop
        
    def reset(self, disp = False):
        # Initialize Mancala game board
        self.board = np.ones((2,6), dtype=int) * 4
        
        # Initialize Scores to 0
        self.score = [0, 0]
        
        # Choose which player goes first (probably change to random initialization)
        self.turn = random.randint(0, 1)    # 0 for TOP player
                                         # 1 for BOTTOM player
            
        # Game End indicator
        self.game_end = False
        
        # Should prints be displayed?
        self.disp = disp

In [3]:
## For Deep-Q-Learning we need a replay memory
# This memory is for the Top player!
class Memory:
    import random
    def __init__(self, maxlen = 1e5):
        self.size = 0
        self.memory = []
        print("Memory Initialized")
        self.maxlen = maxlen
        self.current_pos = 0
        
    
    def __getitem__(self, idx=-1):
        if idx < 0 or idx >= self.size:
            print(f'Index {idx} is too large for memory of length {self.size}. \nInstead return random entry')
            return self.memory[random.randint(0, self.size - 1)]
        else:
            return self.memory[idx]
    
    def draw(self):
        if self.size: # self.size should not be 0
            return self.__getitem__(random.randint(0, self.size - 1))
        else:
            print(f'Memory {turn} is not yet filled')
            return []
        
    def draw_batch(self, batch_size):
        batch_size = min(batch_size, self.size)
        return [self.draw() for i in range(batch_size)]
    
    def add(self, game0, action, reward, game1, game_end):
        if self.size < self.maxlen:
            # relevant quantities: state0, action, reward, state1, game_end
            self.memory.append([game0.board.reshape(1,12), action, reward, game1.board.reshape(1,12), game1.game_end])
            self.size += 1
            self.current_pos += 1
        else:
            self.current_pos = self.current_pos % self.maxlen
            self.memory[self.current_pos[turn]] = [game0.board.reshape(1,12), action, reward, game1.board.reshape(1,12), game1.game_end]
            self.current_pos += 1
        
        
    def add_random_game(self):
        # Initialize game
        game = Mancala(False)  
        
        # Start game Loop
        while not game.game_end:
            
            # state t
            game0 = game.copy()
            
            # random action
            ac = random.randint(0, 11)
            
            while game.board[game.turn][abs(5*(1 - game.turn) - ac%6)] == 0:
                #print(game.turn, game.board)
                ac = random.randint(0, 11)
            reward = game.action(ac)
            
            # save state(t), action, state(t+1)
            self.add(game0.turn, game0, ac, reward, game.copy())  
            
    def __str__(self):
        return f'Size: {self.size} \nFirst Entry Player 0:\n {self.memory[0][0]} \n... \nLast Entry Player 1:\n {self.memory[1][self.size[1] - 1]}'

In [4]:
class Player():
    import numpy as np
    import random
    
    def __init__(self, pos, name):
        self.score = 0
        self.reward = 0
        self.name = name
        
        self.pos = pos
        self.position = ["TOP", "BOTTOM"][pos]
        
        
class RandomPlayer(Player):
    def __init__(self, pos=1, name="Random"):
        Player.__init__(self, pos, name)
    
    def action(self, game):
        action = random.randint(0, 11)
        while not game.valid_action(action, game.turn):
            action = random.randint(0, 11)
        
        return action
        
class GreedyPlayer(Player):
    import numpy as np
    def __init__(self, pos=1, name="Greedy Bastard"):
        Player.__init__(self, pos, name)
        self.reward_list = np.zeros(12, dtype=np.int64)
    
    def action(self, game):
        reward_list = np.zeros(12, dtype=np.int64)
        if 0 not in game.board[self.pos]:
            if 12 not in game.board[self.pos] and 13 not in game.board[self.pos]:
                return random.randint(0, 11)
                
        if self.pos == 0:
            for idx, beans in enumerate(game.board[self.pos]):
                if beans > 13 or beans == 0:
                    continue
                elif beans == 13:
                    reward_list[idx + 6] = 3 + game.board[1 - self.pos][idx]
                elif beans == 12:
                    reward_list[idx] = 2 + game.board[1 - self.pos][idx]
                elif beans <= 5-idx:
                    if game.board[self.pos][5 - idx - beans] == 0:
                        reward_list[idx] = 1 + game.board[1 - self.pos][5 - idx - beans]
                        reward_list[idx + 6] = 1 + game.board[1 - self.pos][5 - idx - beans]
                        
                elif beans == 5 - idx + 1 + 6:
                    if game.board[self.pos][beans + idx - 12] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans + idx - 12]
                elif beans > 5 - idx + 1 + 6:
                    if game.board[self.pos][beans + idx - 12] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans + idx - 12]
                    if game.board[self.pos][beans + idx - 13] == 0:
                        reward_list[idx + 6] = 3 + game.board[1 - self.pos][beans + idx - 13]
            
        else:
            for idx, beans in enumerate(game.board[self.pos]):
                if beans > 13 or beans == 0:
                    continue
                elif beans == 13:
                    reward_list[idx + 6] = 3 + board[1 - self.pos][idx]
                elif beans == 12:
                    reward_list[idx] = 2 + board[1 - self.pos][idx]
                elif beans <= idx:
                    if game.board[self.pos][idx - beans] == 0:
                        reward_list[idx] = 1 + game.board[1 - self.pos][idx - beans]
                        reward_list[idx + 6] = 1 + game.board[1 - self.pos][idx - beans]
                elif beans == idx + 1 + 6:
                    if game.board[self.pos][beans - idx - 7] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans - idx - 7]
                elif beans > idx + 1 + 6:
                    if game.board[self.pos][beans - idx - 7] == 0:
                        reward_list[idx] = 2 + game.board[1 - self.pos][beans - idx - 7]
                    if game.board[self.pos][beans - idx - 8] == 0:
                        reward_list[idx + 6] = 3 + game.board[1 - self.pos][beans - idx - 8]
                        
        print(reward_list)
        action = np.random.choice(np.where(reward_list == np.max(reward_list))[0])
        while not game.valid_action(action, game.turn):
            reward_list[action] -= 1000
            action = np.random.choice(np.where(reward_list == np.max(reward_list))[0])
        
        return action
    

class HumanPlayer(Player):
    def __init__(self, pos=1, name='Human'):
        Player.__init__(self, pos, name)
        
    def action(self, game):
        #print(game)
        
        inp = input(f'It is {self.name}s turn. Whats your next move? (input 0-11, end: 12)')
        if inp == '12':
            print(f'GAME ENDS')
            game.game_end = True
        elif inp in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']:
            action = int(inp)
            while not game.valid_action(action, game.turn):
                print(f'The chosen bowl is empty. Please choose another action.')
                action = self.action(game)
        else:
            print(f'{inp} is an invalid input! Valid inputs are numbers 0-11 for actions and 12 to end the game.')
            action = self.action(game)
        
        return action
        

In [5]:
 class Arena():
    import random
    import numpy as np
    def __init__(self, Game, TopPlayer, BottomPlayer, TopMemory, disp = False):
        self.Game = Game
        self.TopPlayer = TopPlayer
        self.BottomPlayer = BottomPlayer
        
        self.TopScore = 0
        self.BottomScore = 0
        
        self.TopMemory = TopMemory
        
        self.disp = disp
        
        if self.disp:
            print("Game has started!")
            print(self.__str__())
            
    def __str__(self):
        out = f'Top Player:    {self.TopPlayer.name}. Score: {self.Game.score[0]}\n'
        out += f'Bottom Player: {self.BottomPlayer.name}. Score: {self.Game.score[1]}\n'
        out += str(self.Game)
        out += f'\nNext Player: {[self.TopPlayer.name, self.BottomPlayer.name][self.Game.turn]}'
        return out if self.disp else ''
        
    
    def reset(self):
        self.Game.reset()
        self.TopScore = 0
        self.BottomScore = 0
        
        
    def play(self):
            
        if self.Game.turn == 1:
            
            action = self.BottomPlayer.action(self.Game)
            
            if self.disp: print(f'{self.BottomPlayer.name} chooses {action}')
            
            reward = self.Game.action(action)
            
            if self.disp: print(f'Reward: {reward}')
            
            
        while not self.Game.game_end:

            reward = 0
            
            previous_state = self.Game.copy()

            action = self.TopPlayer.action(self.Game)
            
            if self.disp: print(f'{self.TopPlayer.name} chooses {action}')
                
            reward = self.Game.action(action)
                
            end_reward = self.Game.end_check(self.Game.turn)
            
            if self.Game.game_end:
                reward += end_reward[0] - end_reward[1]
            
            if self.disp: 
                print(f'Reward: {reward}')
                print(self.Game)
                
            
            if self.Game.turn == 1:

                actionBottom = self.BottomPlayer.action(self.Game)

                if self.disp: print(f'{self.BottomPlayer.name} chooses {actionBottom}')

                rewardBottom = self.Game.action(action)
                reward -= rewardBottom
                                    
                end_reward = self.Game.end_check(self.Game.turn)
                
                if self.Game.game_end:
                    reward += end_reward[0] - end_reward[1]
                    reward_Bottom += end_reward[1]
                
                if self.disp: 
                    print(f'Reward: {rewardBottom}')
                    print(self.Game)
            

            self.TopMemory.add(previous_state, action, reward, self.Game.copy(), self.Game.game_end)

In [6]:
T = GreedyPlayer(pos=0)
B = HumanPlayer(pos=1)

M = Memory()

G = Mancala()

A = Arena(G, T, B, M, True)

A.play()

Memory Initialized
Game has started!
Top Player:    Greedy Bastard. Score: 0
Bottom Player: Human. Score: 0
[4, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4]End = False
Next Player: Greedy Bastard
Greedy Bastard chooses 3
Reward: 0
[5, 5, 0, 4, 4, 4]
[5, 5, 4, 4, 4, 4]End = False
It is Humans turn. Whats your next move? (input 0-11, end: 12)2
Human chooses 2
Reward: 0
[5, 5, 0, 4, 5, 5]
[5, 5, 4, 0, 5, 5]End = False
[0 0 0 0 0 0 0 0 0 0 0 0]
Greedy Bastard chooses 11
Reward: 1
[0, 5, 0, 4, 5, 5]
[6, 6, 5, 1, 5, 5]End = False


KeyboardInterrupt: Interrupted by user

In [None]:
np.where(np.array([1,2,3,4,4,3,2,4]) == np.max([1,2,3,4,4,3,2,4]))[0]