In [1]:
import logging
from collections import namedtuple
import random
from numpy.random import choice
import functools

In [2]:
# A single move: remove `num_objects` objects from the row at index `row`.
Nimply = namedtuple("Nimply", "row, num_objects")

class Nim:
    """Board of a Nim game: a list of rows, each holding a number of objects.

    Row ``i`` starts with ``2 * i + 1`` objects (1, 3, 5, ...).  ``k`` is an
    optional upper bound on how many objects one move may remove; ``None``
    means there is no bound.
    """

    def __init__(self, num_rows: int, k: int = None) -> None:
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        """Return True while at least one object is left on the board."""
        return sum(self._rows) > 0

    def __str__(self):
        """Render the row sizes as ``<a b c ...>``."""
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        """Immutable snapshot of the current row sizes."""
        return tuple(self._rows)

    def nimming(self, ply: "Nimply") -> None:
        """Apply ``ply`` (a ``(row, num_objects)`` pair) to the board in place.

        Raises:
            AssertionError: if the ply is illegal -- negative row index, fewer
                than one object, more objects than the row holds, or more
                than ``k`` objects when a limit is set.
            IndexError: if ``row`` is past the last row.
        """
        row, num_objects = ply
        # A negative index would silently address rows from the end of the list.
        assert row >= 0, "row index must not be negative"
        # Taking zero objects is a pass and a negative amount would add objects.
        assert num_objects >= 1, "a move must remove at least one object"
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

In [3]:
def _nimsum(state):
    return functools.reduce(lambda a,b : a^b,state)

def pure_random(state: Nim) -> Nimply:
    """Pick a random legal move.

    A non-empty row is chosen uniformly at random, then the number of objects
    to remove is drawn uniformly from 1 up to the row size, capped by the
    board's per-move limit ``k`` when one is set so that the ply is accepted
    by ``Nim.nimming``.

    Raises:
        IndexError: if every row is empty (``random.choice`` on an empty list).
    """
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    max_take = state.rows[row]
    limit = getattr(state, "_k", None)
    if limit is not None:
        max_take = min(max_take, limit)
    num_objects = random.randint(1, max_take)
    return Nimply(row, num_objects)

def gabriele(state: Nim) -> Nimply:
    """Take as many objects as allowed from the first non-empty row.

    "First" means lowest row index.  The amount is the whole row, capped by
    the board's per-move limit ``k`` when one is set.

    Raises:
        ValueError: if every row is empty, i.e. there is no legal move.
    """
    limit = getattr(state, "_k", None)
    for row, count in enumerate(state.rows):
        if count > 0:
            take = count if limit is None else min(count, limit)
            return Nimply(row, take)
    raise ValueError("no legal move: every row is empty")

def optimal(state: Nim) -> Nimply:
    """Play the nim-sum strategy for normal play (taking the last object wins).

    Each row is valued at its size, or at ``size % (k + 1)`` when the board
    limits a move to ``k`` objects.  If the XOR of these values is non-zero,
    the move returned brings it to zero; rows are scanned from the highest
    index down and the first suitable one is used.  If the XOR is already
    zero there is no winning move and the choice is delegated to
    ``pure_random``.
    """
    limit = getattr(state, "_k", None)
    rows = state.rows
    values = rows if limit is None else tuple(c % (limit + 1) for c in rows)
    nimsum = _nimsum(values)
    if nimsum == 0:
        return pure_random(state)
    for row in reversed(range(len(rows))):
        target = values[row] ^ nimsum
        if target < values[row]:
            # values[row] - target never exceeds values[row], hence never
            # exceeds the row size nor (when set) the limit k.
            return Nimply(row, values[row] - target)
    # Defensive only: a non-zero nim-sum always has a row holding its top bit.
    return pure_random(state)

In [4]:
class ReinforcementLearningAgent:
    """Nim player that learns a small weighted rule table per board state.

    ``_rules`` maps a state (tuple of row sizes) to a tuple of rules, each rule
    being ``(row, num_objects, fitness, times_played)``, kept sorted by
    fitness in descending order.  ``_game`` records the ``(state, rule)``
    pairs picked since the last call to ``evaluate_game``.
    """

    MAX_RULES = 5      # rules sampled for a state seen for the first time
    MIN_TRIALS = 5     # a rule must be played more often than this to be judged
    WIN_REWARD = 1     # fitness added to every rule played in a won game
    LOSS_PENALTY = 2   # fitness removed from every rule played in a lost game

    def __init__(self,rows):
        self._rules=dict()
        self._game=[]
        self._num_rows_=rows

    def evaluate_game(self,won):
        """Reward or penalise every rule played in the game just finished.

        Each recorded rule gets its fitness changed by ``+WIN_REWARD`` or
        ``-LOSS_PENALTY`` and its play counter incremented; the state's rules
        are re-sorted by fitness and the game record is cleared.
        """
        delta = self.WIN_REWARD if won else -self.LOSS_PENALTY
        for state, move in self._game:
            updated = [
                (rule[0], rule[1], rule[2] + delta, rule[3] + 1) if rule == move else rule
                for rule in self._rules[state]
            ]
            self._rules[state] = tuple(sorted(updated, key=lambda rule: rule[2], reverse=True))
        self._game = []

    def pickmove(self,state):
        """Choose a move for ``state`` and record it for later evaluation.

        A state seen for the first time gets up to ``MAX_RULES`` randomly
        sampled moves as its rules; a known state first has its bad rules
        replaced.  The move is then drawn with probability proportional to
        ``fitness - min_fitness + 1``, so the weakest rule keeps weight 1.

        Returns:
            Nimply: the ``(row, num_objects)`` of the picked rule.
        """
        if state not in self._rules:
            candidates = self.__possiblemoves(state)
            self._rules[state] = tuple(
                random.sample(candidates, min(self.MAX_RULES, len(candidates)))
            )
        else:
            self._replace_bad_rules(state)
        rules = self._rules[state]
        min_fitness = min(rule[2] for rule in rules)
        weights = [rule[2] - min_fitness + 1 for rule in rules]
        total = sum(weights)
        probabilities = [weight / total for weight in weights]
        picked_index = choice(list(range(len(rules))), 1, p=probabilities)[0]
        picked_move = rules[picked_index]
        self._game.append((state, picked_move))
        return Nimply(picked_move[0], picked_move[1])

    def _is_bad(self, rule):
        """A rule is bad once it has negative fitness after enough plays."""
        return rule[2] < 0 and rule[3] > self.MIN_TRIALS

    def _replace_bad_rules(self, state):
        """Swap the bad rules of ``state`` for moves that were not tried yet.

        Nothing changes when no rule is bad or when no untried move exists.
        Otherwise every bad rule is removed and up to as many fresh rules
        (fitness 0, never played) are added.  The rules removed are exactly
        the ones matching ``_is_bad``, not simply the lowest-ranked entries:
        a rule that is still under evaluation can rank below a bad one and
        must be kept.
        """
        current = self._rules[state]
        bad_count = sum(1 for rule in current if self._is_bad(rule))
        if bad_count == 0:
            return
        untried = self.__newpossiblemoves(state)
        if not untried:
            return
        how_many = min(bad_count, len(untried), self.MAX_RULES)
        replacements = random.sample(untried, how_many)
        kept = [rule for rule in current if not self._is_bad(rule)]
        self._rules[state] = tuple(
            sorted(kept + replacements, key=lambda rule: rule[2], reverse=True)
        )

    def __possiblemoves(self,state):
        """Every legal move in ``state`` as a fresh rule ``(row, take, 0, 0)``."""
        return [(row,toTake+1,0,0) for row in range(self._num_rows_) for toTake in range(state[row])]

    def __newpossiblemoves(self,state):
        """Legal moves of ``state`` that are not among its current rules."""
        known = {(rule[0], rule[1]) for rule in self._rules[state]}
        return [move for move in self.__possiblemoves(state) if (move[0], move[1]) not in known]

    def __lenpossiblemoves(self,state):
        """Number of legal moves in ``state``: one per object on the board."""
        return sum(state)


In [5]:
# The learning agent; its row count must match the boards it is asked to play on.
mp = ReinforcementLearningAgent(rows=11)

In [16]:
# Train/evaluate the agent `mp` against the fixed `gabriele` strategy.
# NOTE: `mp` keeps what it learned between runs of this cell, so the win count
# depends on how many times the cell was executed since `mp` was created.
wins=0
logging.getLogger().setLevel(logging.DEBUG)
starting=random.choice([True,False])
NUM_GAMES=1000
for _ in range(NUM_GAMES):
    nim=Nim(11)
    # Player 0 always moves first.  Resetting the turn here is what makes
    # `starting` actually decide who opens; otherwise the turn carries over
    # from however the previous game ended.
    player=0
    agent_player=0 if starting else 1
    while nim:
        if player==agent_player:
            ply=mp.pickmove(nim.rows)
        else:
            ply=gabriele(nim)
        nim.nimming(ply)
        player=1-player
    # The turn was flipped after the last move, so the previous player won.
    winner=1-player
    won=winner==agent_player
    mp.evaluate_game(won)
    if won:
        wins+=1
    starting=random.choice([True,False])

logging.debug(f"After {NUM_GAMES} my player won {wins} games!")

DEBUG:root:After 1000 my player won 971 games!
