# New Game Play

In [2]:
import copy
import random
from json.encoder import INFINITY
import pandas as pd
import math

seed = 37

randint = random.randint
random.seed(seed)

In [3]:
def end_game(state):
    return sum(state['board']) == 0

In [4]:
def assign_reward(reward, new_reward):
    return [reward[0] + new_reward[0], reward[1] + new_reward[1], new_reward[2]]

In [5]:
def terminate_loop(state):
    state['board'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    reward = (0,0,0)

    return (state, reward)

In [6]:
def print_game_play(state, reward, new_starting_position):
    print('new_starting_position: ', new_starting_position)
    print('state: ', state)
    print('reward: ', reward)
    print('--------------------')

In [7]:
def four_left(state, reward):
    state['board'] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    if reward[2] == 0:
        return  (4,0,0)

    if reward[2] == 1:
        return  (0,4,1)

In [8]:
def get_valid_actions(arr):
    new_arr = []
    for i,a in enumerate(arr):
        if a != 0:
            new_arr.append(i)
    return new_arr

In [9]:
def generate_action(state,agent_1, agent_2):
   if state['current_player'] == 0:
      func = agent_1['func']
      arg = agent_1['arg']
      return func(state, arg)

   if state['current_player'] == 1:
      func = agent_2['func']
      arg = agent_2['arg']
      return func(state, arg)


In [10]:
def is_valid_actions(state):

    if state['current_player'] == 0:
          val = state['board'][0:state['player_territory'][1]]
          return sum(val)

    if state['current_player'] == 1:
          val = state['board'][state['player_territory'][1]:12]
          return sum(val)


In [11]:
def get_reward(stones_in_pit, board, position, stone, current_player, player_territory):
    board[position] = 0
    if stone == stones_in_pit-1 and position < player_territory[1] and current_player == 1:
        return (0,4,1)

    if stone == stones_in_pit-1 and position >= player_territory[1] and current_player == 0:
        return  (4,0,0)

    if position < player_territory[1]:
        return  (4,0,0)
        # return (0,4,1)

    else:
        return (0,4,1)


In [12]:
def is_illegal_move(state, action):
    board = state['board']
    current_player = state['current_player']
    player_territory = state['player_territory']
    stones_in_pit = board[action]

    # is pit empty
    if stones_in_pit == 0:
        # print('pit is empty')
        return True

    # is pit not in player one's territory
    if current_player == 0 and action >= player_territory[1]:
        # print("pit is not in player 1's territory")
        return True

    # is pit not in player two's territory
    if current_player == 1 and action <  player_territory[1]:
        # print("pit is not in player 2's territory")
        return True

    return False


In [13]:
def session(state, starting_position,latest_winner ):
    new_state = copy.deepcopy(state)

    board = new_state['board']
    current_player = new_state['current_player']
    player_territory = new_state['player_territory']
    stones_in_pit = board[starting_position]
    board[starting_position] = 0
    reward = [0,0,latest_winner]


    for stone in range(stones_in_pit):
        future_position = (stone + starting_position + 1) % 12
        board[future_position] += 1

        if board[future_position] == 4:
             new_reward = get_reward(stones_in_pit, board, future_position, stone, current_player, player_territory)
             reward = assign_reward(reward, new_reward)

    return (new_state, reward, future_position)


In [14]:
def play(state, action, show=False):
    reward = [0,0,0]
    max_rez = 100
    rez = 0
    # print(is_illegal_move(state, action))
    if is_illegal_move(state, action):
        new_state = {
        'board' : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'current_player': state['current_player'],
        'player_territory': (0,6)}
        r = new_state['current_player']
        reward = [r*400,((r+1)%2)*400,0]
        return (new_state, reward)

    state, new_reward, new_starting_position = session(state, action, reward[2])
    reward = assign_reward(reward, new_reward)
    if show:
        print_game_play(state, reward, new_starting_position)

    board = state['board']
    stones_in_pit = board[new_starting_position]

    while stones_in_pit > 1:
        state, new_reward, new_starting_position = session(state,new_starting_position,reward[2])
        reward = assign_reward(reward, new_reward)
        if show:
            print_game_play(state, reward, new_starting_position)

        board = state['board']
        stones_in_pit = board[new_starting_position]

        if rez > max_rez:
            state, reward = terminate_loop(state)
            return (state, reward)

        rez += 1


    state['current_player'] = +(not state['current_player'])

    if sum(state['board']) <= 4:
        new_reward = four_left(state, reward)
        reward = assign_reward(reward, new_reward)

    if not is_valid_actions(state):
        state['current_player'] = +(not state['current_player'])

    return (state, reward)

## Simulation

In [15]:
def simulate_game(state,  agent_1, agent_2,show=False):
    state = copy.deepcopy(state)
    reward = [0,0,0]
    path = []

    while True:
        action = generate_action(state, agent_1, agent_2)

        if is_illegal_move(state, action):
            continue

        state, new_reward = play(state, action)
        reward = assign_reward(reward, new_reward)
        if show:
            print_game_play(state, reward, action)

        if end_game(state):
            break
        path.append(action)

    return  (reward, path)

In [16]:
def get_valid_actions_mct(state):
    board = state['board']
    territory = state['player_territory'][1]
    current_player = state['current_player']
    valid_actions = []
    for i,a in enumerate(board):
        if current_player == 0 and i < territory and a != 0:
                    valid_actions.append(i)
        if current_player == 1 and i >= territory and a != 0:
                    valid_actions.append(i)
    return valid_actions

## Agents

### Random Agent

In [17]:
def random_agent(state, arg=None):
        valid_actions = get_valid_actions_mct(state)
        return valid_actions[randint(0,len(valid_actions)-1)]

### Minimax Agent

In [18]:

def minimax_agent(state, arg):
    # print(state)
    max_dept = arg['max_dept']
    reward = [0,0,0]
    all_game_state = []
    all_game_state.append((state, reward))

    if state['current_player'] == 0:
        best_score = -INFINITY
        for action in range(state['player_territory'][1]):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            # temp_state['path'].append(child)
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, False, all_game_state, reward, max_dept, -INFINITY, INFINITY)
            all_game_state.pop()

            if score > best_score:
                best_score = score
                move = action

    else:
        best_score = INFINITY
        for action in range(state['player_territory'][1],12):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, True,all_game_state,reward,max_dept,-INFINITY,INFINITY)
            all_game_state.pop()

            if score < best_score:
                best_score = score
                move = action
    # print('best_score',best_score)
    return move



def minimax(game_state, is_maximizing,all_game_state,reward,max_dept,alpha,beta ):
    max_dept -= 1
    # game_state['current_player'] = current_player

    if(max_dept <= 0 or  end_game(game_state)):
        return reward[0]-reward[1]

    if is_maximizing:
        best_score = -INFINITY

        for action in range(game_state['player_territory'][1]):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, False,all_game_state,reward,max_dept,alpha,beta)
            all_game_state.pop()
            best_score = max(score, best_score)
            alpha = max(alpha,score)

            if beta <= alpha:
                break

        return best_score

    else:
        best_score = INFINITY

        for action in range(game_state['player_territory'][1],12):
            temp_state, reward =  copy.deepcopy(all_game_state[-1])
            # print(temp_state)
            game_state, new_reward = play(temp_state, action)
            reward = assign_reward(reward, new_reward)
            all_game_state.append((game_state, reward))
            score = minimax(game_state, True,all_game_state,reward,max_dept,alpha,beta)
            all_game_state.pop()
            best_score = min(score, best_score)
            beta = min(beta,score)

            if beta <= alpha:
                break

        return best_score


## Monter Carlo

In [19]:
class Node():
    def __init__(self,state,root_current_player, action=None,parent_node=None):
        self.parent_node = parent_node
        self.action = action
        self.legal_actions = []
        self.total_score = 0
        self.visit_count = 0
        self.expanded = False
        self.children = None
        self.state = state
        self.root_current_player = root_current_player

    def update_result(self, reward):
        self.total_score += reward
        self.visit_count += 1

    def printer(self):
        print('parent_node: ',self.parent_node)
        print('action: ',self.action)
        print('legal_actions: ',self.legal_actions)
        print('total_score: ',self.total_score)
        print('visit_count: ',self.visit_count)
        print('expanded: ',self.expanded)
        print('children: ',self.children)
        print('root_current_player: ',self.root_current_player)
        print('state: ',self.state)

In [20]:
def expand(node):
    state = node.state
    actions = get_valid_actions_mct(state)
    node.legal_actions = actions
    node.expanded = True
    root_current_player = node.root_current_player
    node.children = {}

    for action in actions:
        new_state, reward = play(state, action)
        node.children[action] = Node(new_state,root_current_player,action,node)
    return node

In [21]:
def resources_left( max_iterations, iterations):
    return max_iterations > iterations

In [22]:
def ucb(constant = 2, total_score = 0, number_of_parent_visits = 0, number_of_visits = 0):
    if number_of_visits == 0:
        return INFINITY
    avg_score = total_score / number_of_visits

    return (avg_score +(constant*math.sqrt(math.log(number_of_parent_visits)/number_of_visits)))

In [23]:
def select(node):
    actions = node.legal_actions

    if len(actions) == 0:
        return None

    best_score = -INFINITY
    for action in actions:
        child = node.children[action]
        total_score = child.total_score
        parent_visit_count = node.visit_count
        visit_count = child.visit_count
        score = ucb(2,total_score, parent_visit_count, visit_count )

        if score > best_score:
            best_score = score
            best_action = action

    return best_action

In [24]:
# function for the result of the simulation
def rollout(node):

    state = node.state

    player_1 = {
    'func': random_agent,
    'arg': {}
    }

    player_2 = {
    'func': random_agent,
    'arg': {}
    }

    if end_game(state):
            return 0
#     print('current_player',state['current_player'])
#     print('root_current_player',node.root_current_player)
    reward, path = simulate_game(state, player_1, player_2)
#     print(reward[node.root_current_player])

    if reward[node.root_current_player] > 24:
        results = 1
    if reward[node.root_current_player] < 24:
        results = 0
    if reward[node.root_current_player] == 24:
        results = -1

    return  results
    # return  reward[node.root_current_player]



In [25]:
def best_child(node):
    actions = node.legal_actions
    highest_visit = -INFINITY
    for action in actions:
        child = node.children[action]
        visit_count = child.visit_count
        if visit_count > highest_visit:
            highest_visit = visit_count
            best_action = action
    return best_action

In [26]:
def back_propagation(node, result):
    node.update_result(result)
    parent_node =node.parent_node

    if parent_node == None:
        return

    back_propagation(parent_node, result)

## Monte Carlo Tree Search (MCTS) Agent

In [27]:
def mcts_agent(state,arg):
    max_iterations = arg['max_iterations']
    root = Node(state,state['current_player'])
    expand(root)
    node = root
    i = 0

    while resources_left(max_iterations, i):
        i += 1
        while node.children:
            action = select(node)

            if action == None:
                break
            child = node.children[action]
            node = child
            # node.printer()

        if node.visit_count == 0:
            result = rollout(node)
            back_propagation(node, result)
            node = root
        else:
            expand(node)
            action = select(node)
            if action == None:
                break
            child = node.children[action]
            node = child
            result = rollout(node)
            back_propagation(node, result)
            node = root

    return best_child(root)


In [27]:
state = {
   # 'board' :[6, 6, 2, 7, 1, 6, 1, 6, 6, 6, 0, 1],
   'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
   'current_player': 0,
   'player_territory': (0,6)
}

t = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0}
for i in range(10):
   action, node = mcts_agent(state,{'max_iterations': 1000})
   t[action] += 1


TypeError: cannot unpack non-iterable int object

In [None]:
t

In [None]:
ns = node.children[7]
for child in ns.children:
    print(ns.children[child].printer())

In [99]:
def format_state(state, action):
    board = state['board']
    data = []
    for i,pit in enumerate(board):
        num = str(i)
        data[f'pit {num}'] = pit
    data['current_player'] = state['current_player']
    data['player_territory'] = state['player_territory'][1]
    data['action'] = action
    return [data]

In [None]:
state = {
   'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
   'current_player': 0,
   'player_territory': (0,6)
}

action = 0
format_state(state, action)

## Elo

In [101]:
def update_score(actual_result, expected_result):
    k = 20
    return k * (actual_result-expected_result)

def expected_score(elo_1,elo_2):
    diff = elo_2 - elo_1
    a = diff/400
    d = 1 + 10**a
    return 1/d

def calculate_elo(elo_0,elo_1,result):
    expected_result = expected_score(elo_0,elo_1)
    change = update_score(result, expected_result)

    new_elo_0 = elo_0 + change
    new_elo_1 = elo_1 - change

    return (new_elo_0, new_elo_1)

## Test

### Agents Details

In [139]:
random_agent_details = {
   'func': random_agent,
   'arg': {},
   'name': 'random_agent',
   'elo': 1200
}

mcts_agent_details_10 = {
   'func': mcts_agent,
   'arg': {
      'max_iterations': 10,
   },
   'name': 'mcts_agent_10',
   'elo': 1200
}

mcts_agent_details_100 = {
   'func': mcts_agent,
   'arg': {
      'max_iterations': 100,
   },
   'name': 'mcts_agent_100',
   'elo': 1200
}

mcts_agent_details_1000 = {
   'func': mcts_agent,
   'arg': {
      'max_iterations': 1000,
   },
   'name': 'mcts_agent_1000',
   'elo': 1200
}

minimax_agent_details_3 = {
   'func': minimax_agent,
   'arg': {
      'max_dept': 3,
   },
   'name': 'minimax_agent_dept_3',
   'elo': 1200
}

minimax_agent_details_6 = {
   'func': minimax_agent,
   'arg': {
      'max_dept': 6,
   },
   'name': 'minimax_agent_dept_6',
   'elo': 1200
}

minimax_agent_details_9 = {
   'func': minimax_agent,
   'arg': {
      'max_dept': 9,
   },
   'name': 'minimax_agent_dept_9',
   'elo': 1200
}

In [152]:
def match_up(origin_state,number_of_games, player_1, player_2 ):
   results = {
         'player_1_name': player_1['name'],
         'player_2_name': player_2['name'],
         'player_1_wins': 0,
         'player_2_wins': 0,
         'ties': 0,
         'player_1_elo': player_1['elo'],
         'player_2_elo': player_2['elo'],
      }
    
   for i in range(number_of_games):
      reward, path = simulate_game(origin_state, player_1, player_2)
      # paths.append([reward,path])
      
      if reward[0] > reward[1]:
         results['player_1_wins'] += 1
         elo_0, elo_1 = calculate_elo(player_1['elo'],player_2['elo'],1)
      elif reward[0] < reward[1]:
         results['player_2_wins'] += 1
         elo_0, elo_1 = calculate_elo(player_1['elo'],player_2['elo'],0)
      elif reward[0] == reward[1]:
         results['ties'] += 1
         elo_0, elo_1 = calculate_elo(player_1['elo'],player_2['elo'],0.5)
      
      player_1['elo'],player_2['elo'] = (elo_0, elo_1)

   results['player_1_elo'] = player_1['elo']
   results['player_2_elo'] = player_2['elo']
   results['player_1_wins'] = results['player_1_wins']/number_of_games * 100
   results['player_2_wins'] = results['player_2_wins']/number_of_games * 100
   results['ties'] = results['ties']/number_of_games * 100

   return results

In [None]:
state = {
   'board' :[4,4,4,4,4,4,4,4,4,4,4,4],
   'current_player': 0,
   'player_territory': (0,6)
}

number_of_games = 200
simulation_results = []
paths = []

agent_match_up =  [random_agent_details, minimax_agent_details_3, minimax_agent_details_6,minimax_agent_details_9, mcts_agent_details_10, mcts_agent_details_100,mcts_agent_details_1000]


for agent_1 in agent_match_up:
   for agent_2 in agent_match_up:
      if agent_1['name'] == agent_2['name']:
         continue
      
      results = match_up(state,number_of_games, agent_1, agent_2 )
      simulation_results.append(results)
      df = pd.DataFrame(simulation_results)
      df.to_csv('matches')

      print(agent_1['name'],'wins:',results['player_1_wins'])
      print(agent_2['name'],'wins:', results['player_2_wins'])
      print('ties: ',results['ties'])
      print(agent_1['name'],'elo: ',agent_1['elo'])
      print(agent_2['name'],'elo: ', agent_2['elo'])
      print('-----------------------------------------')

for agent in agent_match_up:
   print(agent['name'])
   print(agent['elo'])

random_agent wins: 0.0
minimax_agent_dept_3 wins: 99.0
ties:  1.0
random_agent elo:  813.690005735087
minimax_agent_dept_3 elo:  1598.9917338925795
-----------------------------------------
random_agent wins: 0.0
minimax_agent_dept_6 wins: 99.0
ties:  1.0
random_agent elo:  753.8850457240397
minimax_agent_dept_6 elo:  1489.12930468121
-----------------------------------------


In [127]:
df = pd.DataFrame(simulation_results)
df.to_csv('mcts_agent_100_vs__mcts_agent_100__100')
df

### Policy Network

In [89]:

    player_1, player_2, player_1_wins, player_2_wins, ties, elo_1, elo_2
elo 1000    1020
wins 9%     91%



(1481.0648043040405, 1018.9351956959596)