# Lab 3: Policy Search

## Task3.3: An agent using minmax

Code based on an example presented here: https://realpython.com/python-minimax-nim/#play-a-simplified-game-of-nim


In [1]:
from typing import Callable
from copy import deepcopy
from operator import xor
from itertools import accumulate
import random
from collections import namedtuple
import logging
from tqdm import tqdm
from evolution_strategy import *
from functools import cache


## The _Nim_ and _Nimply_ classes


In [2]:
from Nim import Nimply
from Nim import Nim


## Sample (and silly) startegies


In [3]:
# pure_random from the lecture
def pure_random(state: Nim) -> Nimply:
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    if state.k == None:
        num_objects = random.randint(1, state.rows[row])
    elif state.rows[row] < state.k:
        num_objects = random.randint(1, state.rows[row])
    else:
        num_objects = random.randint(1, state.k)
    return Nimply(row, num_objects)

## Optimal strategy

In [4]:
# optimal strategy using nim sum (a bit different implementation than the lecture)
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result


def optimal_strategy(state: Nim) -> Nimply:
    # retrieve the possible moves
    possible_moves = [(r, o) for r, c in enumerate(state.rows)
                      for o in range(1, c + 1)]
    if state.k != None:
        possible_moves = [p for p in possible_moves if p[1] <= state.k]

    # check the values of nim_sum after all possible moves
    possible_moves_optimal = list()

    for move in possible_moves:
        temp_state = deepcopy(state)
        temp_state.nimming(Nimply(move[0], move[1]))
        if nim_sum(temp_state) == 0:
            possible_moves_optimal.append(move)

    if possible_moves_optimal == []:
        chosen_move = random.choice(possible_moves)
    else:
        chosen_move = random.choice(possible_moves_optimal)

    return Nimply(chosen_move[0], chosen_move[1])

## MinMax

First player will be min (-1, equivalent to max_turn = FALSE), second max(1, equivalent to max_turn = TRUE).

In [5]:
def possible_moves(state: Nim):
    # retrieve the possible moves
    possible_moves = [(r, o) for r, c in enumerate(state.rows)
                      for o in range(1, c + 1)]
    # possible moves if k was implemented
    if state.k != None:
        possible_moves = [p for p in possible_moves if p[1] <= state.k]

    return possible_moves

def possible_new_states(state: Nim):
    # returns a list of outcome of all possible moves
    new_states_lists = []
    for p in possible_moves(state):
        temp_state = deepcopy(state)
        temp_state.nimming(Nimply(p[0], p[1]))
        new_states_lists.append(temp_state)
    return new_states_lists


In [6]:
def ending_position(state: Nim):
    return (sum(r for r in state.rows) == 0)


In [7]:
@cache
def minmax(state: Nim, max_turn):
    if ending_position(state):
        # end of a game
        return -1 if max_turn else 1

    possible_states = possible_new_states(state)

    if max_turn:
        scores = [minmax(new_state, max_turn=False)
                  for new_state in possible_states]
        return max(scores)
    else:
        scores = [minmax(new_state, max_turn=True)
                  for new_state in possible_states]
        return min(scores)


In [8]:
def minmax_best_move(state: Nim):
    for move in possible_moves(state):
        temp_nim = deepcopy(state)
        temp_nim.nimming(Nimply(move[0], move[1]))
        score = minmax(temp_nim, max_turn=False)
        if score > 0:
            break
    return Nimply(move[0], move[1])
    

## Oversimplified match


In [9]:
logging.getLogger().setLevel(logging.DEBUG)

# strategy = (pure_random, minmax_best_move)
# strategy = (minmax_best_move, pure_random)
strategy = (optimal_strategy, minmax_best_move)
# strategy = (minmax_best_move, optimal_strategy)

nim = Nim(3)

In [10]:
logging.debug(f"status: Initial board  -> {nim}")
player = 0
while nim:
    ply = strategy[player](nim)
    nim.nimming(ply)
    logging.debug(f"status: After player {player} -> {nim}")
    player = 1 - player
winner = 1 - player
logging.info(f"status: Player {winner} won!")

DEBUG:root:status: Initial board  -> <1 3 5>
DEBUG:root:status: After player 0 -> <1 3 2>
DEBUG:root:status: After player 1 -> <1 3 0>
DEBUG:root:status: After player 0 -> <1 1 0>
DEBUG:root:status: After player 1 -> <1 0 0>
DEBUG:root:status: After player 0 -> <0 0 0>
INFO:root:status: Player 0 won!


## Evaluate

In [11]:
# games are ran NUM_MATCHES times to check the average result
def evaluate(NUM_MATCHES: int, NIM_SIZE: int, strategy0: Callable, strategy1: Callable, k=None) -> float:
    opponent = (strategy0, strategy1)
    won = 0

    for m in range(NUM_MATCHES):
        nim = Nim(NIM_SIZE, k)
        player = 0
        while nim:
            ply = opponent[player](nim)
            nim.nimming(ply)
            player = 1 - player
        if player == 0:
            won += 1
    return won / NUM_MATCHES


In [12]:
NUM_MATCHES = 100
NIM_SIZE = 3
k = None


In [13]:
# minmax against pure_random
# minmax starts
print("MinMax against pure_random")
print("MinMax starts, win rate:", evaluate(NUM_MATCHES, NIM_SIZE, pure_random, minmax_best_move, k))
# pure random starts
print("pure_random starts, win rate:", 1 - evaluate(NUM_MATCHES, NIM_SIZE, minmax_best_move, pure_random, k))

MinMax against pure_random
MinMax starts, win rate: 0.99
pure_random starts, win rate: 1.0


In [14]:
# minmax against optimal_strategy
# minmax starts
print("MinMax against optimal_strategy")
print("MinMax starts, win rate:", evaluate(NUM_MATCHES, NIM_SIZE, optimal_strategy, minmax_best_move, k))
# pure random starts
print("optimal_strategy starts, win rate:", 1 - evaluate(NUM_MATCHES, NIM_SIZE, minmax_best_move, optimal_strategy, k))

MinMax against optimal_strategy
MinMax starts, win rate: 0.0
optimal_strategy starts, win rate: 1.0


## Evolution vs MinMax

#### make_strategy without nim-sum

#### Initial population

In [15]:
POPULATION_SIZE = 10
NUM_GENERATIONS = 100
OFFSPRING_SIZE = 5

NUM_MATCHES = 100
NIM_SIZE = 3
EVALUATION_STRATEGY = optimal_strategy
k = None

In [16]:
population = list()
Individual = namedtuple("Individual", ["genome", "fitness"])

i = 0
while i < POPULATION_SIZE:
    # genome is a tuple of probabilities of using: longest_row, shortest_row, take_one,
    # gabriele_strategy, pure_random_strategy or krzysztof_strategy strategies
    genome = tuple([round(random.random(), 2) for _ in range(6)])

    # prevents from creating duplicates
    if check_duplicates(genome, population):
        i -= 1
    else:
        population.append(
            Individual(
                genome,
                fitness(
                    genome,
                    NUM_MATCHES,
                    NIM_SIZE,
                    EVALUATION_STRATEGY,
                    make_strategy(
                        {"longest_row": genome[0], "shortest_row": genome[1], "take_one": genome[2],
                         "gabriele_strategy": genome[3], "pure_random_strategy": genome[4], "krzysztof_strategy": genome[5]}),
                    k
                ),
            )
        )
    i += 1

population = sorted(population, key=lambda i: -i.fitness)[:POPULATION_SIZE]

print_order_of_params(1)

for p in population:
    print(p)

genome=(longest_row, shortest_row, take_one, gabriele_strategy, pure_random_strategy, krzysztof_strategy)
Individual(genome=(0.36, 0.35, 0.01, 0.06, 0.23, 0.27), fitness=0.18)
Individual(genome=(0.59, 0.89, 0.25, 0.33, 0.95, 0.55), fitness=0.11)
Individual(genome=(0.86, 0.19, 0.39, 0.91, 0.73, 0.75), fitness=0.08)
Individual(genome=(0.95, 0.6, 0.89, 0.5, 0.18, 0.38), fitness=0.08)
Individual(genome=(0.88, 0.34, 0.47, 0.61, 0.86, 0.1), fitness=0.06)
Individual(genome=(0.89, 0.78, 0.32, 0.76, 0.3, 0.52), fitness=0.06)
Individual(genome=(0.08, 0.73, 0.91, 0.51, 0.21, 0.14), fitness=0.05)
Individual(genome=(0.2, 0.27, 0.32, 0.34, 0.36, 0.13), fitness=0.03)
Individual(genome=(0.1, 0.94, 0.53, 0.49, 0.31, 0.13), fitness=0.03)
Individual(genome=(0.17, 0.84, 0.62, 0.15, 0.61, 0.13), fitness=0.01)


In [17]:
for g in tqdm(range(NUM_GENERATIONS)):
    offspring = list()
    i = 0
    while i < OFFSPRING_SIZE:

        p1 = tournament(population)
        p2 = tournament(population)

        # randomly choose form of crossover
        if random.random() < 0.4:
            o = average_cross_over(p1.genome, p2.genome)
        else:
            o = cross_over(p1.genome, p2.genome)

        # mutate
        if random.random() < 0.5:
            o = mutation(o)

        # prevents from creating duplicates
        if check_duplicates(o, population) or check_duplicates(o, offspring):
            i -= 1
        else:
            f = fitness(
                o,
                NUM_MATCHES,
                NIM_SIZE,
                EVALUATION_STRATEGY,
                make_strategy(
                    {"longest_row": o[0], "shortest_row": o[1], "take_one": o[2],
                     "gabriele_strategy": o[3], "pure_random_strategy": o[4], "krzysztof_strategy": o[5]}),
                k
            )
            offspring.append(Individual(o, f))
        i += 1

    population += offspring
    population = sorted(population, key=lambda i: -i.fitness)[:POPULATION_SIZE]


100%|██████████| 100/100 [00:10<00:00,  9.10it/s]


In [18]:
print_order_of_params(1)

for p in population:
    print(p)


genome=(longest_row, shortest_row, take_one, gabriele_strategy, pure_random_strategy, krzysztof_strategy)
Individual(genome=(0.2232, 0.0695, 0.002, 0.0138, 0.23, 0.9332), fitness=0.61)
Individual(genome=(0.2089, 0.0695, 0.0474, 0.0138, 0.23, 0.9332), fitness=0.59)
Individual(genome=(0.0488, 0.0107, 0.1045, 0.06, 0.23, 0.8884), fitness=0.59)
Individual(genome=(0.1658, 0.2458, 0.0809, 0.06, 0.23, 0.9235), fitness=0.58)
Individual(genome=(0.2089, 0.0695, 0.002, 0.0138, 0.23, 0.9332), fitness=0.58)
Individual(genome=(0.2429, 0.2914, 0.01, 0.06, 0.23, 0.8884), fitness=0.57)
Individual(genome=(0.2232, 0.0107, 0.002, 0.0138, 0.23, 0.9332), fitness=0.57)
Individual(genome=(0.2232, 0.0107, 0.002, 0.06, 0.23, 0.943), fitness=0.56)
Individual(genome=(0.2232, 0.0107, 0.002, 0.06, 0.23, 0.8884), fitness=0.56)
Individual(genome=(0.2232, 0.0107, 0.1045, 0.06, 0.23, 0.8884), fitness=0.55)


## MinMax with Alpha-Beta pruning

In [19]:
@cache
def minmax_alphabeta(state: Nim, max_turn, alpha=-1, beta=1):
    # minmax with alphabeta pruning
    # if (score := evaluate_alphabeta(state, max_turn)) is not None:
    #     return score
    if ending_position(state):
        # end of a game
        return -1 if max_turn else 1

    scores = []
    for new_state in possible_new_states(state):
        scores.append(
            score := minmax_alphabeta(new_state, not max_turn, alpha, beta)
        )
        if max_turn:
            alpha = max(alpha, score)
        else:
            beta = min(beta, score)
        if beta <= alpha:
            break
    return (max if max_turn else min)(scores)

# possible_new_states and ending_move are the same for both versions of minmax

def minmax_best_move_alphabeta(state: Nim):
    for move in possible_moves(state):
        temp_nim = deepcopy(state)
        temp_nim.nimming(Nimply(move[0], move[1]))
        score = minmax_alphabeta(temp_nim, max_turn=False)
        if score > 0:
            break
    return Nimply(move[0], move[1])


## Oversimplified match
#### Alpha-Beta pruning

In [20]:
logging.getLogger().setLevel(logging.DEBUG)

# strategy = (pure_random, minmax_best_move_alphabeta)
# strategy = (minmax_best_move_alphabeta, pure_random)
strategy = (optimal_strategy, minmax_best_move_alphabeta)
# strategy = (minmax_best_move_alphabeta, optimal_strategy)

nim = Nim(3)

In [21]:
logging.debug(f"status: Initial board  -> {nim}")
player = 0
while nim:
    ply = strategy[player](nim)
    nim.nimming(ply)
    logging.debug(f"status: After player {player} -> {nim}")
    player = 1 - player
winner = 1 - player
logging.info(f"status: Player {winner} won!")

DEBUG:root:status: Initial board  -> <1 3 5>
DEBUG:root:status: After player 0 -> <1 3 2>
DEBUG:root:status: After player 1 -> <1 3 0>
DEBUG:root:status: After player 0 -> <1 1 0>
DEBUG:root:status: After player 1 -> <1 0 0>
DEBUG:root:status: After player 0 -> <0 0 0>
INFO:root:status: Player 0 won!


## Evaluate
#### Alpha-Beta pruning

In [22]:
NUM_MATCHES = 100
NIM_SIZE = 3
k = None

In [23]:
# minmax against pure_random
# minmax starts
print("MinMax against pure_random")
print("MinMax starts, win rate:", evaluate(NUM_MATCHES, NIM_SIZE, pure_random, minmax_best_move, k))
# pure random starts
print("pure_random starts, win rate:", 1 - evaluate(NUM_MATCHES, NIM_SIZE, minmax_best_move, pure_random, k))

MinMax against pure_random
MinMax starts, win rate: 0.99
pure_random starts, win rate: 1.0


In [24]:
# minmax against optimal_strategy
# minmax starts
print("MinMax against optimal_strategy")
print("MinMax starts, win rate:", evaluate(NUM_MATCHES, NIM_SIZE, optimal_strategy, minmax_best_move, k))
# pure random starts
print("optimal_strategy starts, win rate:", 1 - evaluate(NUM_MATCHES, NIM_SIZE, minmax_best_move, optimal_strategy, k))

MinMax against optimal_strategy
MinMax starts, win rate: 0.0
optimal_strategy starts, win rate: 1.0


#### Evolution of Alpha-Beta pruned version of MinMax is not continued because time of execusion is exactly the same as in basic MinMax
The reason for that may be obviosly also wrong implementation