In [5]:
# state = {'board': [1, 0, 2, 1, 0, 1, 3, 1, 0, 2, 1, 0], 'current_player': 1, 'player_territory': (0, 6)}
# action = 6
# play(state, action, True)

# New Game Play

In [6]:
import copy
import random 
from json.encoder import INFINITY
import pandas as pd
import math

seed = 37

randint = random.randint
random.seed(seed)

In [7]:
def end_game(state):
    
    return sum(state['board']) == 0

In [8]:
def assign_reward(reward, new_reward):
    return [reward[0] + new_reward[0], reward[1] + new_reward[1], new_reward[2]]

In [9]:
def terminate_loop(state):
    state['board'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    reward = (0,0,0)

    return (state, reward)

In [10]:
def print_game_play(state, reward, new_starting_position):
    print('new_starting_position: ', new_starting_position)
    print('state: ', state)
    print('reward: ', reward)
    print('--------------------')

In [11]:
def four_left(state, reward):
    state['board'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    if reward[2] == 0: 
        return  (4,0,0)
    
    if reward[2] == 1: 
        return  (0,4,1)

In [12]:
def get_valid_actions(arr):
    new_arr = []
    for i,a in enumerate(arr):
        if a != 0:
            new_arr.append(i)
    return new_arr

In [13]:
def generate_action(state,agent_1, agent_2):
   if state['current_player'] == 0:
      func = agent_1['func']
      arg = agent_1['arg']
      return func(state, arg)
    
   if state['current_player'] == 1:
      func = agent_2['func']
      arg = agent_2['arg']
      return func(state, arg)


In [14]:
def is_valid_actions(state):

    if state['current_player'] == 0:
          val = state['board'][0:state['player_territory'][1]]
          return sum(val)
    
    if state['current_player'] == 1:
          val = state['board'][state['player_territory'][1]:12]
          return sum(val)
    

In [15]:
def get_reward(stones_in_pit, board, position, stone, current_player, player_territory): 
    board[position] = 0
    if stone == stones_in_pit-1 and position < player_territory[1] and current_player == 1:
        return (0,4,1)

    if stone == stones_in_pit-1 and position >= player_territory[1] and current_player == 0:
        return  (4,0,0)

    if position < player_territory[1]:
        return  (4,0,0)
        # return (0,4,1)

    else:
        return (0,4,1)


In [16]:
def is_illegal_move(state, action):
    board = state['board']
    current_player = state['current_player']
    player_territory = state['player_territory']
    stones_in_pit = board[action]

    # is pit empty
    if stones_in_pit == 0:
        # print('pit is empty')
        return True
    
    # is pit not in player one's territory
    if current_player == 0 and action >= player_territory[1]:
        # print("pit is not in player 1's territory")
        return True
    
    # is pit not in player two's territory
    if current_player == 1 and action <  player_territory[1]:
        # print("pit is not in player 2's territory")
        return True
    
    return False


In [17]:
def session(state, starting_position,latest_winner ):
    new_state = copy.deepcopy(state)

    board = new_state['board']
    current_player = new_state['current_player']
    player_territory = new_state['player_territory']
    stones_in_pit = board[starting_position]
    board[starting_position] = 0
    reward = [0,0,latest_winner]


    for stone in range(stones_in_pit):
        future_position = (stone + starting_position + 1) % 12
        board[future_position] += 1

        if board[future_position] == 4:
             new_reward = get_reward(stones_in_pit, board, future_position, stone, current_player, player_territory)
             reward = assign_reward(reward, new_reward)

    return (new_state, reward, future_position)


In [18]:
def play(state, action, show=False):
    reward = [0,0,0]
    max_rez = 100
    rez = 0
    # print(is_illegal_move(state, action))
    if is_illegal_move(state, action):
        new_state = {
        'board' : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'current_player': state['current_player'],
        'player_territory': (0,6)}
        r = new_state['current_player']
        reward = [r*400,((r+1)%2)*400,0]
        return (new_state, reward)

    state, new_reward, new_starting_position = session(state, action, reward[2])
    reward = assign_reward(reward, new_reward)
    if show:
        print_game_play(state, reward, new_starting_position)

    board = state['board']
    stones_in_pit = board[new_starting_position]
    
    while stones_in_pit > 1:
        state, new_reward, new_starting_position = session(state,new_starting_position,reward[2])
        reward = assign_reward(reward, new_reward)
        if show:
            print_game_play(state, reward, new_starting_position)

        board = state['board']
        stones_in_pit = board[new_starting_position]

        if rez > max_rez:
            state, reward = terminate_loop(state)
            return (state, reward)

        rez += 1

    
    state['current_player'] = +(not state['current_player'])

    if sum(state['board']) <= 4:
        new_reward = four_left(state, reward)
        reward = assign_reward(reward, new_reward)

    if not is_valid_actions(state):
        state['current_player'] = +(not state['current_player'])

    return (state, reward)

## Simulation

In [19]:
def simulate_game(state,  agent_1, agent_2,show=False):
    state = copy.deepcopy(state)
    reward = [0,0,0]
    path = []

    while True:
        action = generate_action(state, agent_1, agent_2)

        if is_illegal_move(state, action):
            continue
        
        state, new_reward = play(state, action)
        reward = assign_reward(reward, new_reward)
        if show:
            print_game_play(state, reward, action)

        if end_game(state):
            break
        path.append(action)
    
    return  (reward, path)

In [20]:
# state = {
# 'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
#   'current_player': 0,
#   'player_territory': (0,6)}

# simulate_game(state)

In [21]:
def get_valid_actions_mct(state):
    board = state['board']
    territory = state['player_territory'][1]
    current_player = state['current_player']
    valid_actions = []
    for i,a in enumerate(board):
        if current_player == 0 and i < territory and a != 0:
                    valid_actions.append(i)
        if current_player == 1 and i >= territory and a != 0:
                    valid_actions.append(i)
    return valid_actions

## Agents

### Random Agent

In [22]:
# def random_agent(state, arg=None):
#     if state['current_player'] == 0:
#         arr = state['board'][0:state['player_territory'][1]]
#         # print('arr:', arr)
#         valid_actions = get_valid_actions(arr)
#         # print(valid_actions)
#         return valid_actions[randint(0,len(valid_actions)-1)]
    

#     if state['current_player'] == 1:
#         arr = state['board'][state['player_territory'][1]:12]
#         # print('arr:', arr)
#         valid_actions = get_valid_actions(arr)
#         # print(valid_actions)
#         return valid_actions[randint(0,len(valid_actions)-1)] + state['player_territory'][1]

def random_agent(state, arg=None):
        valid_actions = get_valid_actions_mct(state)
        return valid_actions[randint(0,len(valid_actions)-1)]

### Minimax Agent

In [23]:

def minimax_agent(state, arg):
    # print(state)
    max_dept = arg['max_dept']
    reward = [0,0,0]
    all_game_state = []
    all_game_state.append((state, reward))

    if state['current_player'] == 0:
        best_score = -INFINITY
        for action in range(state['player_territory'][1]):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            # temp_state['path'].append(child)
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, False, all_game_state, reward, max_dept, -INFINITY, INFINITY)
            all_game_state.pop()

            if score > best_score:
                best_score = score
                move = action

    else:
        best_score = INFINITY
        for action in range(state['player_territory'][1],12):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, True,all_game_state,reward,max_dept,-INFINITY,INFINITY)
            all_game_state.pop()

            if score < best_score:
                best_score = score
                move = action
    # print('best_score',best_score)
    return move



def minimax(game_state, is_maximizing,all_game_state,reward,max_dept,alpha,beta ):
    max_dept -= 1
    # game_state['current_player'] = current_player

    if(max_dept <= 0 or  end_game(game_state)):
        return reward[0]-reward[1]

    if is_maximizing:
        best_score = -INFINITY

        for action in range(game_state['player_territory'][1]):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, False,all_game_state,reward,max_dept,alpha,beta)
            all_game_state.pop()
            best_score = max(score, best_score)
            alpha = max(alpha,score)

            if beta <= alpha:
                break

        return best_score

    else:
        best_score = INFINITY

        for action in range(game_state['player_territory'][1],12):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            # print(temp_state)
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, True,all_game_state,reward,max_dept,alpha,beta)
            all_game_state.pop()
            best_score = min(score, best_score)
            beta = min(beta,score)

            if beta <= alpha:
                break

        return best_score


In [24]:
# state =  {'board': [0, 8, 8, 0, 0, 2, 7, 2, 7, 7, 1, 2], 'current_player': 0, 'player_territory': (0, 6)}

# arg= {
#       'max_dept': 2
#    }

# minimax_agent(state,arg)

In [84]:
class Node():
    def __init__(self,state, action=None,parent_node=None):
        self.parent_node = parent_node
        self.action = action
        self.legal_actions = []
        self.total_reward = 0
        self.visit_count = 0
        self.expanded = False
        self.children = None
        self.state = state

    def update_parent(self, reward, visit):
        self.parent_node.total_reward += reward
        self.parent_node.visit_count += visit

In [89]:
def expand(node):
    state = node.state
    actions = get_valid_actions_mct(state)
    node.action = actions
    node.expanded = True
    node.children = {}

    for action in actions:
        new_state, reward = play(state, action)
        node.children[action] = Node(new_state,action,node)
    return node

In [88]:
state = {
   # 'board' :[6, 6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1],
   'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
   'current_player': 0,
   'player_territory': (0,6)
}
n = Node(state)
l = expand(n)
l.state

{'board': [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
 'current_player': 0,
 'player_territory': (0, 6)}

In [27]:
def resources_left( max_iterations, iterations):
    return max_iterations > iterations

In [28]:
def ucb(constant = 2, total_score = 0, number_of_parent_visits = 0, number_of_visits = 0):
    if number_of_visits == 0:
        return INFINITY
    avg_score = total_score / number_of_visits
    
    return (avg_score +(constant*math.sqrt(math.log(number_of_parent_visits)/number_of_visits)))

In [75]:
def select_action(node):
    actions = node['actions']
    best_score = -INFINITY
    for action in actions:
        total_score = node['children'][action]['total_reward']
        number_of_parent_visits = node['number_visits']
        number_of_visits = node['children'][action]['number_visits']
        # print(total_score, number_of_parent_visits, number_of_visits )
        score = ucb(2,total_score, number_of_parent_visits, number_of_visits )
        if score >= best_score:
            best_score = score
            best_action = action
    node['children'][best_action]['number_visits'] += 1
    return best_action

In [62]:
# function for the result of the simulation
def rollout(node, action):

    child = node['children'][action]
    state = child['state']
    player_1 = {
    'func': random_agent,
    'arg': {}
    }

    player_2 = {
    'func': random_agent,
    'arg': {}
    }

    if end_game(state):
            return 0.5
    reward, path = simulate_game(state, player_1, player_2)

    if reward[state['current_player']] > 24:
        results = 1
    if reward[state['current_player']] < 24:
        results = 0.5
    if reward[state['current_player']] == 24:
        results = 0


    child['total_reward'] += results
    return  reward[state['current_player']]



In [46]:
def best_child(node):
    actions = node['actions']
    best_value = -INFINITY
    for action in actions:
        number_visits = node['children'][action]['number_visits']
        total_reward = node['children'][action]['total_reward']
        value = total_reward/number_visits
        print("value", value)
        print("node",action, node['children'][action])
        if value >= best_value:
            best_value = value
            best_action = action
    return best_action

## Monte Carlo Tree Search (MCTS) Agent

In [44]:
def mcts_agent(state,arg):
    max_iterations = arg['max_iterations']
    root = {
    'state': state,
    'actions': [],
    'total_reward': 0,
    'number_visits': 0,
     'is_expandable': True,
    'children': None
}
    
    i = 0
    while resources_left(max_iterations, i):
        if root['is_expandable']:
            expansion(root)
        # print('root', root['state'])
        action = select_action(root)
        rollout(root, action)
        root['number_visits'] += 1
        i += 1
    # print(root)
    return best_child(root)


In [78]:
state = {
   # 'board' :[6, 6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1],
   'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
   'current_player': 0,
   'player_territory': (0,6)
}

mcts_agent(state,{'max_iterations': 500})

value 0.5723684210526315
node 0 {'state': {'board': [2, 7, 1, 6, 1, 6, 6, 6, 0, 1, 6, 6], 'current_player': 1, 'player_territory': (0, 6)}, 'actions': [], 'total_reward': 43.5, 'number_visits': 76, 'reward': [0, 0, 0], 'is_expandable': True, 'children': None}
value 0.6601941747572816
node 1 {'state': {'board': [6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1, 6], 'current_player': 1, 'player_territory': (0, 6)}, 'actions': [], 'total_reward': 68.0, 'number_visits': 103, 'reward': [0, 0, 0], 'is_expandable': True, 'children': None}
value 0.6634615384615384
node 2 {'state': {'board': [6, 6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1], 'current_player': 1, 'player_territory': (0, 6)}, 'actions': [], 'total_reward': 69.0, 'number_visits': 104, 'reward': [0, 0, 0], 'is_expandable': True, 'children': None}
value 0.5616438356164384
node 3 {'state': {'board': [1, 6, 6, 2, 7, 1, 6, 1, 6, 6, 6, 0], 'current_player': 1, 'player_territory': (0, 6)}, 'actions': [], 'total_reward': 41.0, 'number_visits': 73, 'reward': [0, 0, 0],

2

In [None]:
state = {
   'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
   'current_player': 0,
   'player_territory': (0,6)
}

number_of_games = 1
simulation_results = []
lol = []

player_1 = {
   'func': mcts_agent,
   'arg': {
    #   'max_dept': 10,
      'max_iterations': 100,
   }
}

player_2 = {
   'func': random_agent,
  'arg': {
    #   'max_dept': 10,
      'max_iterations': 100,
   }
}

for j in range(2,11):
# for j in range(1):
   state['player_territory'] = (0,j)
   print('player_territory',0,j)
   results = {
         'player_1_pits': 0,
         'player_2_pits': 0,
         'player_1_wins': 0,
         'player_2_wins': 0,
         'ties': 0,
      }
   for i in range(number_of_games):
      reward, path = simulate_game(state, player_1, player_2)
      lol.append([reward,path])
      if reward[0] > reward[1]:
         results['player_1_wins'] += 1 
      elif reward[0] < reward[1]:
         results['player_2_wins'] += 1 
      elif reward[0] == reward[1]:
         results['ties'] += 1 
   results['player_1_pits'] = j
   results['player_2_pits'] = 12-j
   results['player_1_wins'] = results['player_1_wins']/number_of_games * 100
   results['player_2_wins'] = results['player_2_wins']/number_of_games * 100
   results['ties'] = results['ties']/number_of_games * 100

   simulation_results.append(results)
   print('player_1_wins: ', results['player_1_wins'])
   print('player_2_wins: ',results['player_2_wins'])
   print('ties: ',results['ties'])

In [35]:
df = pd.DataFrame(simulation_results)
df.to_csv('mcts_agent_100_vs__mcts_agent_100__100')
df

Unnamed: 0,player_1_pits,player_2_pits,player_1_wins,player_2_wins,ties
0,2,10,0.0,100.0,0.0
1,3,9,0.0,100.0,0.0
2,4,8,0.0,100.0,0.0
3,5,7,0.0,100.0,0.0
4,6,6,0.0,100.0,0.0
5,7,5,0.0,100.0,0.0
6,8,4,0.0,0.0,100.0
7,9,3,100.0,0.0,0.0
8,10,2,100.0,0.0,0.0
