# Lab 3: Policy Search

## Task3.4: An agent using Reinforcement Learning

Code based on a Maze example presented during CI lecture


In [1]:
from typing import Callable
from copy import deepcopy
from operator import xor
from itertools import accumulate
import random
from collections import namedtuple
import logging
from tqdm import tqdm
from evolution_strategy import *
from functools import cache
import numpy as np


## The _Nim_ and _Nimply_ classes


In [2]:
from Nim import Nimply
from Nim import Nim

# update_nim() is same as nimming()
# adding get_state_and_reward(), give_reward() and is_game_over


def is_game_over(self):
    # check if robot in the final position
    return (sum(r for r in self._rows) == 0)


def get_reward(self):
    # returning state will be just saving the nim
    return self.give_reward()


def give_reward(self):
    # if at end give 0 reward
    # if not at end give -1 reward
    return -1 * int(not is_game_over(self))


Nim.is_game_over = is_game_over
Nim.get_reward = get_reward
Nim.give_reward = give_reward


## Sample (and silly) startegies


In [3]:
# pure_random from the lecture
def pure_random(state: Nim) -> Nimply:
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    if state.k == None:
        num_objects = random.randint(1, state.rows[row])
    elif state.rows[row] < state.k:
        num_objects = random.randint(1, state.rows[row])
    else:
        num_objects = random.randint(1, state.k)
    return Nimply(row, num_objects)


## Optimal strategy


In [4]:
# optimal strategy using nim sum (a bit different implementation than the lecture)
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result


def optimal_strategy(state: Nim) -> Nimply:
    # retrieve the possible moves
    possible_moves = [(r, o) for r, c in enumerate(state.rows)
                      for o in range(1, c + 1)]
    if state.k != None:
        possible_moves = [p for p in possible_moves if p[1] <= state.k]

    # check the values of nim_sum after all possible moves
    possible_moves_optimal = list()

    for move in possible_moves:
        temp_state = deepcopy(state)
        temp_state.nimming(Nimply(move[0], move[1]))
        if nim_sum(temp_state) == 0:
            possible_moves_optimal.append(move)

    if possible_moves_optimal == []:
        chosen_move = random.choice(possible_moves)
    else:
        chosen_move = random.choice(possible_moves_optimal)

    return Nimply(chosen_move[0], chosen_move[1])


## Agent


In [5]:
def possible_moves(state: Nim):
    # retrieve the possible moves
    possible_moves = [(r, o) for r, c in enumerate(state.rows)
                      for o in range(1, c + 1)]
    # possible moves if k was implemented
    if state.k != None:
        possible_moves = [p for p in possible_moves if p[1] <= state.k]

    return possible_moves


def possible_new_states(state: Nim):
    # returns a list of outcome of all possible moves
    new_states_lists = []
    for p in possible_moves(state):
        temp_state = deepcopy(state)
        temp_state.nimming(Nimply(p[0], p[1]))
        new_states_lists.append(temp_state)
    return new_states_lists


In [6]:
def ending_position(state: Nim):
    return (sum(r for r in state.rows) == 0)


In [7]:
class Agent(object):
    def __init__(self, state: Nim, alpha=0.15, random_factor=0.2):  # 80% explore, 20% exploit
        self.state_history = []  # state, reward
        self.state = state
        self.alpha = alpha
        self.random_factor = random_factor
        # self.moves = possible_moves(state)
        self.G = {}
        self.init_reward(state)

    def init_reward(self, state: Nim):
        # retrieve the possible moves
        possible_moves = [(r, o) for r, c in enumerate(state.rows)
                          for o in range(1, c + 1)]
        # possible moves if k was implemented
        if state.k != None:
            possible_moves = [p for p in possible_moves if p[1] <= state.k]

        for m in possible_moves:
            self.G[m] = np.random.uniform(low=1.0, high=0.1)

    def choose_action(self, allowedMoves):
        maxG = -10e15
        next_move = None
        if np.random.random() < self.random_factor:
            # if random number below random factor, choose random action
            next_move = random.choice(allowedMoves)
        else:
            # if exploiting, gather all possible actions and choose one with the highest G (reward)
            for move in allowedMoves:
                if self.G[move] >= maxG:
                    next_move = move
                    maxG = self.G[move]

        return Nimply(next_move[0], next_move[1])

    def update_state_history(self, state, reward):
        self.state_history.append((deepcopy(state), reward))

    def learn(self):
        target = 0

        for prev, reward in reversed(self.state_history):
            self.G[prev] = self.G[prev] + self.alpha * (target - self.G[prev])
            target += reward

        self.state_history = []

        self.random_factor -= 10e-5  # decrease random factor each episode of play


## Oversimplified match


In [8]:
logging.getLogger().setLevel(logging.DEBUG)

eval_strategy = pure_random  # player 0

nim = Nim(3)
robot = Agent(nim, alpha=0.1, random_factor=0.4)

logging.debug(f"status: Initial board  -> {nim}")
player = 0
while nim:
    if player == 0:
        ply = eval_strategy(nim)
        nim.nimming(ply)
        logging.debug(f"status: After player {player} -> {nim}")
    else:
        # current state is just nim
        # choose an action (explore or exploit)
        action = robot.choose_action(possible_moves(nim))
        nim.nimming(action)  # update the maze according to the action
        reward = nim.get_reward()  # get the new state and reward
        # update the robot memory with state and reward
        robot.update_state_history((action[0], action[1]), reward)
        logging.debug(f"status: After player {player} -> {nim}")
    player = 1 - player
winner = 1 - player
logging.info(f"status: Player {winner} won!")


DEBUG:root:status: Initial board  -> <1 3 5>
DEBUG:root:status: After player 0 -> <1 0 5>
DEBUG:root:status: After player 1 -> <1 0 2>
DEBUG:root:status: After player 0 -> <0 0 2>
DEBUG:root:status: After player 1 -> <0 0 1>
DEBUG:root:status: After player 0 -> <0 0 0>
INFO:root:status: Player 0 won!


## Evaluate


In [9]:
# games are ran NUM_MATCHES times to check the average result
# who_starts = 0 - recurrent learning starts, who_starts = 1 - recurrent learning goes second
def evaluate(NUM_LEARNING_STEPS: int, NUM_MATCHES: int, NIM_SIZE: int, eval_strategy: Callable, who_starts: int, k=None) -> float:

    nim = Nim(NIM_SIZE, k)
    robot = Agent(nim, alpha=0.1, random_factor=0.2)

    for i in range(NUM_LEARNING_STEPS):
        won = 0

        for m in range(NUM_MATCHES):
            nim = Nim(NIM_SIZE, k)
            player = 0
            while nim:
                if player == who_starts:
                    # current state is just nim
                    # choose an action (explore or exploit)
                    action = robot.choose_action(possible_moves(nim))
                    # update the maze according to the action
                    nim.nimming(action)
                    reward = nim.get_reward()  # get the new state and reward
                    # update the robot memory with state and reward
                    robot.update_state_history((action[0], action[1]), reward)
                else:
                    ply = eval_strategy(nim)
                    nim.nimming(ply)
                player = 1 - player
            robot.learn()
            if player == who_starts:
                won += 1
        print("After", (i + 1)*NUM_MATCHES, "win rate:", won / NUM_MATCHES)


In [10]:
evaluate(30, 100, 3, pure_random, 0)


After 100 win rate: 0.5
After 200 win rate: 0.34
After 300 win rate: 0.45
After 400 win rate: 0.4
After 500 win rate: 0.43
After 600 win rate: 0.38
After 700 win rate: 0.41
After 800 win rate: 0.36
After 900 win rate: 0.39
After 1000 win rate: 0.32
After 1100 win rate: 0.34
After 1200 win rate: 0.32
After 1300 win rate: 0.33
After 1400 win rate: 0.39
After 1500 win rate: 0.32
After 1600 win rate: 0.28
After 1700 win rate: 0.39
After 1800 win rate: 0.46
After 1900 win rate: 0.41
After 2000 win rate: 0.32
After 2100 win rate: 0.28
After 2200 win rate: 0.25
After 2300 win rate: 0.29
After 2400 win rate: 0.3
After 2500 win rate: 0.23
After 2600 win rate: 0.37
After 2700 win rate: 0.37
After 2800 win rate: 0.35
After 2900 win rate: 0.31
After 3000 win rate: 0.3


My algorithm seems to learn how not to play Nim, maybe I messed up the evaluation, maybe the algorithm is wrong. 

If you see the mistake please let me know champ, otherwise just write in the review that the solution for RL is wrong.

At this point I'm giving up cause it's been too much Nim for the last two weeks and I don't know what's wrong and what's right anymore :(
