# Lab 3: Policy Search

## Task

Write agents able to play [*Nim*](https://en.wikipedia.org/wiki/Nim), with an arbitrary number of rows and an upper bound $k$ on the number of objects that can be removed in a turn (a.k.a., *subtraction game*).

The player **taking the last object wins**.

* Task3.1: An agent using fixed rules based on *nim-sum* (i.e., an *expert system*)
* Task3.2: An agent using evolved rules
* Task3.3: An agent using minmax
* Task3.4: An agent using reinforcement learning


In [1331]:
import logging
from itertools import permutations
from collections import namedtuple
import random

from copy import deepcopy,copy
from functools import reduce
import numpy as np

In [1332]:
Nimply = namedtuple("Nimply", "row, num_objects")

In [1333]:
class Nim:
    """Board state for Nim with an optional per-turn removal cap.

    Row ``i`` starts with ``2 * i + 1`` objects (1, 3, 5, 7, ...). When ``k``
    is not ``None`` a single move may remove at most ``k`` objects.
    """

    def __init__(self, num_rows: int, k: int = None) -> None:
        self.num_rows = num_rows
        # Number of objects in each row: [1, 3, 5, 7, ...]
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        """Return True while at least one object is left on the board."""
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    def play(self, ply: "Nimply") -> None:
        """Remove ``ply.num_objects`` objects from row ``ply.row`` in place.

        Raises:
            AssertionError: if the row holds fewer objects than requested or
                the request exceeds the cap ``k``.
        """
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

    def possible_plays(self) -> "list[Nimply]":
        """Return every legal move, ordered by row and then by object count.

        An empty board (or a board with no rows at all) yields an empty list.
        """
        moves = []
        for row, count in enumerate(self._rows):
            # A move can take at most the whole row, and at most k objects.
            limit = count if self._k is None else min(count, self._k)
            moves.extend(Nimply(row, taken) for taken in range(1, limit + 1))
        return moves
        

## Task 3.1 Expert System

In [1334]:
def pure_random(state: Nim) -> Nimply:
    """Return a uniformly random legal move.

    A non-empty row is picked at random, then a random number of objects to
    remove from it. The count is capped by the game's per-turn limit ``k``
    (when set) so the move always passes the checks in ``Nim.play``.

    Raises:
        IndexError: if every row is empty (there is no row to choose from).
    """
    row = random.choice([r for r, c in enumerate(state._rows) if c > 0])
    limit = state._rows[row]
    if state._k is not None:
        limit = min(limit, state._k)
    num_objects = random.randint(1, limit)
    return Nimply(row, num_objects)

In [1335]:
Game = Nim(4)  # fresh 4-row board (1, 3, 5, 7 objects) used by the game loop at the end of this cell
def calculate_nim_sum(rows):
    """Return the nim-sum (bitwise XOR) of all row sizes.

    An empty sequence has nim-sum 0.
    """
    # The explicit initial value keeps reduce() from raising on empty input.
    return reduce(lambda acc, count: acc ^ count, rows, 0)


def expertSystem(Game: Nim) -> Nimply:
    """Choose a move that leaves the opponent a zero nim-sum position.

    Each row is valued by its Grundy number: the row size itself when there
    is no cap, or ``size % (k + 1)`` when at most ``k`` objects may be taken
    per turn. A move is winning when the XOR of those values is 0 afterwards.
    One winning move is picked at random; if none exists, a random legal move
    is returned.

    Raises:
        IndexError: if the game has no legal moves.
    """
    moves = Game.possible_plays()
    if not moves:
        raise IndexError("expertSystem called on a game with no legal moves")

    rows = Game._rows
    k = Game._k

    def grundy(count):
        return count if k is None else count % (k + 1)

    current = calculate_nim_sum([grundy(count) for count in rows])
    winning = []
    for ply in moves:
        # Swap the changed row's old value for its new one instead of
        # copying the whole game for every candidate move.
        before = grundy(rows[ply.row])
        after = grundy(rows[ply.row] - ply.num_objects)
        if current ^ before ^ after == 0:
            winning.append(ply)
    return random.choice(winning if winning else moves)


# Player 0 uses the nim-sum expert, player 1 moves at random.
strategy=[expertSystem,pure_random]

player = 0
# Alternate turns until the board is empty (Nim.__bool__ is False when no objects remain).
while Game:
    ply = strategy[player](Game)
    # print(Game)
    Game.play(ply)
    print(f"status: After player {player} -> {Game}")
    player = 1 - player
# `player` was flipped after the last move, so flip it back to name whoever took the last object.
winner = 1 - player
print(f"status: Player {winner} won!")
# print(ply) 

status: After player 0 -> <1 3 5 4>
status: After player 1 -> <1 3 5 1>
status: After player 0 -> <1 3 3 1>
status: After player 1 -> <1 3 0 1>
status: After player 0 -> <1 0 0 1>
status: After player 1 -> <1 0 0 0>
status: After player 0 -> <0 0 0 0>
status: Player 0 won!


## Task 3.2: Evolved Rules
### Base-Nim Strategy 

In [1336]:
def decimal_to_base3(decimal_number):
    """Return the base-3 digits of ``decimal_number`` as a string.

    Zero maps to ``'0'``. The digits are produced least-significant first and
    prepended, so the result reads most-significant digit first. A negative
    input never enters the loop and yields an empty string.
    """
    if decimal_number == 0:
        return '0'

    digits = ''
    remaining = decimal_number
    while remaining > 0:
        digits = str(remaining % 3) + digits
        remaining //= 3
    return digits

def convert_to_base_nim(rows):
    """Return the XOR of the row sizes after a base-3 re-encoding.

    Each size is written in base 3 and that digit string is then read back as
    an ordinary decimal integer (e.g. 5 -> '12' -> 12) before being XOR-ed
    into the running total. An empty ``rows`` gives 0.
    """
    return reduce(
        lambda acc, size: acc ^ int(decimal_to_base3(size)),
        rows,
        0,
    )


In [1337]:

def Base_Nim(Game: Nim) -> Nimply:
    """Pick a move using the base-3 "nim" score of each resulting board.

    Every legal move is tried on a copy of the game and scored with
    ``convert_to_base_nim``. If the lowest score is non-zero, a move with a
    non-zero score is chosen at random; otherwise any move is chosen at random.

    NOTE(review): scores are XORs of non-negative integers, so when the lowest
    score is non-zero every score is non-zero. Both branches therefore select
    uniformly from all moves -- confirm whether that is intended.
    """
    scored = []
    for move in Game.possible_plays():
        board = deepcopy(Game)
        board.play(move)
        scored.append((move, convert_to_base_nim(board._rows)))

    # Stable ascending sort by score; ties keep possible_plays() order.
    scored.sort(key=lambda pair: pair[1])

    if scored[0][1] != 0:
        candidates = [move for move, score in scored if score != 0]
        return random.choice(candidates)
    return random.choice(scored)[0]

In [1338]:
Game = Nim(4)  # fresh 4-row board: 1, 3, 5, 7 objects
    
# Player 0 moves at random, player 1 uses the Base_Nim rule.
strategy=[pure_random,Base_Nim]

player = 0
# Alternate turns until the board is empty (Nim.__bool__ is False when no objects remain).
while Game:
    ply = strategy[player](Game)
    Game.play(ply)
    print(f"status: After player {player} -> {Game}")
    player = 1 - player
# `player` was flipped after the last move, so flip it back to name whoever took the last object.
winner = 1 - player
print(f"status: Player {winner} won!")

status: After player 0 -> <0 3 5 7>
status: After player 1 -> <0 3 5 4>
status: After player 0 -> <0 3 5 1>
status: After player 1 -> <0 0 5 1>
status: After player 0 -> <0 0 5 0>
status: After player 1 -> <0 0 2 0>
status: After player 0 -> <0 0 0 0>
status: Player 0 won!


## Task 3.3: minmax

In [1339]:
def eval_terminal(Game):
    """Score a board: 0 when its nim-sum is 0, else the total object count.

    Only ``Game._rows`` is read. A board with no rows scores 0.
    """
    rows = Game._rows
    # The initial value 0 makes the fold safe for an empty row list; no copy
    # is needed because the rows are only read.
    nim_sum = reduce(lambda x, y: x ^ y, rows, 0)
    return 0 if nim_sum == 0 else sum(rows)
    

In [1340]:
# Sanity check: the starting 4-row board (1, 3, 5, 7) has nim-sum 0, so eval_terminal gives 0.
game = Nim(4)
eval_terminal(game)
# o =reduce(lambda x, y: x ^ y, game._rows)
# # print(game._rows)
# print(sum(game._rows))
# print(o)
# 0 if not o else sum(game._rows)

0

In [1341]:
def _minmax_value(rows, k, memo):
    """Return +1 if the player to move from ``rows`` can force a win, else -1.

    ``rows`` is a tuple of row sizes, ``k`` the optional per-turn cap and
    ``memo`` a dict caching results. Boards are cached by their sorted
    non-empty rows, since row order and empty rows do not affect the outcome.
    """
    key = tuple(sorted(count for count in rows if count > 0))
    if not key:
        # Nothing left to take: the previous player took the last object.
        return -1
    if key not in memo:
        best = -1
        for idx, count in enumerate(key):
            limit = count if k is None else min(count, k)
            for taken in range(1, limit + 1):
                child = key[:idx] + (count - taken,) + key[idx + 1:]
                if _minmax_value(child, k, memo) == -1:
                    best = 1  # this move leaves the opponent in a lost position
                    break
            if best == 1:
                break
        memo[key] = best
    return memo[key]


def minmax(Game: "Nim") -> "Nimply":
    """Return a best move for the player to move, found by exhaustive search.

    Every legal move is scored by the negated game value of the board it
    leads to (+1 = forced win for the mover, -1 = forced loss). One of the
    top-scoring moves is returned at random.

    If the board is already empty or there is no legal move, the tuple
    ``(None, 0)`` is returned instead of a move.
    """
    rows = tuple(Game._rows)
    moves = Game.possible_plays()
    if sum(rows) == 0 or not moves:
        return None, 0

    memo = {}
    scored = []
    for ply in moves:
        row, taken = ply
        child = rows[:row] + (rows[row] - taken,) + rows[row + 1:]
        # The opponent moves next, so their value is negated.
        scored.append((ply, -_minmax_value(child, Game._k, memo)))

    best = max(value for _, value in scored)
    return random.choice([ply for ply, value in scored if value == best])
    

In [1342]:
Game = Nim(3)  # fresh 3-row board: 1, 3, 5 objects
# Player 0 uses the nim-sum expert, player 1 uses minmax.
strategy=[expertSystem,minmax]

player = 0
# Alternate turns until the board is empty (Nim.__bool__ is False when no objects remain).
while Game:
    ply = strategy[player](Game)
    # print(ply)
    Game.play(ply)
    print(f"status: After player {player} -> {Game}")
    player = 1 - player
# `player` was flipped after the last move, so flip it back to name whoever took the last object.
winner = 1 - player
print(f"status: Player {winner} won!")


status: After player 0 -> <1 3 2>
status: After player 1 -> <0 3 2>
status: After player 0 -> <0 2 2>
status: After player 1 -> <0 1 2>
status: After player 0 -> <0 1 1>
status: After player 1 -> <0 0 1>
status: After player 0 -> <0 0 0>
status: Player 0 won!


## Task 3.4: Reinforcement Learning

In [1343]:
class RL:
    """Tabular value-learning agent for Nim.

    ``G`` maps a board (tuple of row sizes) to a value estimate. Unseen
    boards are given a random estimate in ``[0.1, 1.0)``. ``state_history``
    collects ``(board, reward)`` pairs for the current game and is consumed
    by ``learn``.
    """

    def __init__(self, Game, alpha=0.15, random_factor=0.2):  # 20% explore, 80% exploit
        self.Game = Game
        self.state_history = [(tuple(Game._rows), 0)]
        self.alpha = alpha  # learning rate used in learn()
        self.random_factor = random_factor  # probability of a random move
        self.G = {}
        self.G[tuple(Game._rows)] = np.random.uniform(low=0.1, high=1.0)

    def _value(self, state):
        """Return the estimate for ``state``, initializing it if unseen."""
        # Membership test, not truthiness: a learned value of 0.0 is valid.
        if state not in self.G:
            self.G[state] = np.random.uniform(low=0.1, high=1.0)
        return self.G[state]

    def choose_action(self):
        """Return the next move for ``self.Game``.

        With probability ``random_factor`` a random legal move is returned.
        Otherwise the move leading to the highest-valued board is returned;
        on ties the last such move wins.

        Raises:
            IndexError: if the game has no legal moves.
        """
        allowedMoves = self.Game.possible_plays()
        next_ply = allowedMoves[0]
        if np.random.random() < self.random_factor:
            return random.choice(allowedMoves)

        rows = tuple(self.Game._rows)
        maxG = float("-inf")
        for ply in allowedMoves:
            row, num_objects = ply
            # Board after the move, built directly instead of deep-copying the game.
            after = rows[:row] + (rows[row] - num_objects,) + rows[row + 1:]
            # Unseen boards are initialized and then compared like any other.
            value = self._value(after)
            if value >= maxG:
                next_ply = ply
                maxG = value
        return next_ply

    def update_state_history(self, rows, reward):
        """Record the board reached after a move and the reward received."""
        self.state_history.append((tuple(rows), reward))

    def learn(self):
        """Update value estimates from the finished game, then reset history.

        The history is walked backwards. Each known board is moved a fraction
        ``alpha`` towards ``target``, the sum of rewards recorded after it;
        unknown boards are only initialized. The exploration rate is then
        reduced by 1e-4, never dropping below 0.
        """
        target = 0
        for prev, reward in reversed(self.state_history):
            if prev in self.G:
                self.G[prev] = self.G[prev] + self.alpha * (target - self.G[prev])
            else:
                self.G[prev] = np.random.uniform(low=0.1, high=1.0)

            target += reward  # cumulative reward from this point to the end

        self.state_history = []

        # Decrease the random factor after each episode, clamped at zero.
        self.random_factor = max(0.0, self.random_factor - 1e-4)
    


In [1344]:
# Train the RL agent over 5000 games against expertSystem; RL always moves first.
Game = Nim(5)
rl = RL(Game)
count = 0  # number of games in which RL took the last object

for i in range(5000):
    print ('**************New Game****************')
    # Start every game from a fresh 5-row board and point the agent at it.
    Game = Nim(5)
    rl.Game = Game
    while Game:
        action = rl.choose_action()
        Game.play(action)
        # Reward is -1 while objects remain after RL's move, 0 when RL's move emptied the board.
        reward = -1 if Game else 0
        state = Game._rows
        rl.update_state_history(state,reward)
        print(f"status: After RL  {Game}")
        if not Game:
            count += 1
            print('RL won')
            break
        ply = expertSystem(Game)
        Game.play(ply)
        print(f"status: After expert  {Game}")  
        if not Game:
            print('Expert Won')
        
    # Update the value table from this game's history (learn() also clears the history).
    rl.learn()
print( f'The number of times that RL won is {count} out of 5000 times')

        

**************New Game****************
status: After RL  <0 3 5 7 9>
status: After expert  <0 3 5 7 1>
status: After RL  <0 2 5 7 1>
status: After expert  <0 2 4 7 1>
status: After RL  <0 1 4 7 1>
status: After expert  <0 1 4 4 1>
status: After RL  <0 0 4 4 1>
status: After expert  <0 0 4 4 0>
status: After RL  <0 0 3 4 0>
status: After expert  <0 0 3 3 0>
status: After RL  <0 0 1 3 0>
status: After expert  <0 0 1 1 0>
status: After RL  <0 0 0 1 0>
status: After expert  <0 0 0 0 0>
Expert Won
**************New Game****************
status: After RL  <1 3 5 3 9>
status: After expert  <1 3 5 3 4>
status: After RL  <0 3 5 3 4>
status: After expert  <0 2 5 3 4>
status: After RL  <0 1 5 3 4>
status: After expert  <0 1 5 0 4>
status: After RL  <0 0 5 0 4>
status: After expert  <0 0 4 0 4>
status: After RL  <0 0 4 0 0>
status: After expert  <0 0 0 0 0>
Expert Won
**************New Game****************
status: After RL  <1 3 5 7 8>
status: After expert  <1 3 5 7 0>
status: After RL  <1 3 5 0 0>