# EASY 21


### 1 - Implement the game


In [1]:
from enum import Enum

from tqdm import trange
import numpy as np


class Color(Enum):
    """Card color; each member's value is the sign applied to a card's face value."""

    # Red cards count negatively (Card.value multiplies by this sign).
    RED = -1
    # Black cards count positively.
    BLACK = 1


class Card:
    """A playing card made of a color and an unsigned face value."""

    def __init__(self, color: Color, value: int):
        # `value_` keeps the raw face value; the signed amount is exposed
        # through the `value` property.
        self.color = color
        self.value_ = value

    @property
    def value(self) -> int:
        """Return the signed amount this card contributes to a sum."""
        sign = self.color.value
        return self.value_ * sign

    @staticmethod
    def draw_card() -> "Card":
        """Draw a random card.

        The face value is sampled first, then the color: red with
        probability 1/3 and black with probability 2/3.
        """
        # Truncating a uniform sample from [1, 11) gives an integer in 1..10.
        face = int(np.random.uniform(1, 11, None))
        palette = [Color.RED, Color.BLACK]
        picked = np.random.choice(palette, p=[1 / 3, 2 / 3])
        return Card(picked, face)

In [2]:
class State:
    """Game state: the player's sum, the dealer's sum and a terminal flag."""

    def __init__(
        self,
        player_sum: int,
        dealer_sum: int,
        is_terminal=False,
    ):
        self.player_sum = player_sum
        self.dealer_sum = dealer_sum
        self.is_terminal = is_terminal

    def __str__(self) -> str:
        """Human-readable summary of both sums."""
        return f"Player sum: {self.player_sum}, Dealer sum: {self.dealer_sum}"

    def __repr__(self) -> str:
        """Same text as ``str(self)``."""
        return str(self)

    def is_bust(self, value: int) -> bool:
        """Return True when ``value`` lies outside the valid range 1..21."""
        return value < 1 or value > 21

    def is_player_bust(self) -> bool:
        """Return True when the player's sum is bust."""
        total = self.player_sum
        return self.is_bust(total)

    def is_dealer_bust(self) -> bool:
        """Return True when the dealer's sum is bust."""
        total = self.dealer_sum
        return self.is_bust(total)

    @staticmethod
    def initial_state() -> "State":
        """Create a non-terminal start state with both sums drawn from 1..10.

        The player's value is sampled before the dealer's.
        """
        # int() truncates a uniform sample from [1, 11) to an integer in 1..10.
        player_start = int(np.random.uniform(1, 11, None))
        dealer_start = int(np.random.uniform(1, 11, None))
        return State(player_start, dealer_start)


class Action(Enum):
    """The two moves available to the player."""

    HIT = 0
    STICK = 1

    @staticmethod
    def from_value(value) -> "Action":
        """Convert a raw value into an action.

        A value equal to ``Action.HIT.value`` maps to ``HIT``; every other
        value maps to ``STICK``.
        """
        return Action.HIT if value == Action.HIT.value else Action.STICK


class Environment:
    """Stateless transition rules of the game."""

    @staticmethod
    def step(state: State, action: Action) -> tuple[State, int]:
        """Apply ``action`` to ``state`` and return ``(next_state, reward)``.

        The input state is never mutated; a fresh ``State`` is returned.

        * ``HIT``: the player draws one card. Going bust ends the game with
          reward -1; otherwise the game continues with reward 0.
        * ``STICK``: the game ends. The dealer draws while its sum is below
          17. A dealer bust gives reward +1; otherwise the higher sum wins
          (+1 for the player, -1 for the dealer) and equal sums give 0.

        Raises:
            ValueError: if ``state`` is terminal, or if ``action`` is neither
                ``Action.HIT`` nor ``Action.STICK``.
        """
        if state.is_terminal:
            raise ValueError("Cannot step in terminal state")

        # Work on a copy so the caller's state object stays untouched.
        next_state = State(state.player_sum, state.dealer_sum)

        if action == Action.HIT:
            next_state.player_sum += Card.draw_card().value
            if next_state.is_player_bust():
                next_state.is_terminal = True
                return next_state, -1
            return next_state, 0

        if action == Action.STICK:
            next_state.is_terminal = True
            while next_state.dealer_sum < 17:
                next_state.dealer_sum += Card.draw_card().value
                # Check after every draw: the dealer can bust low or high.
                if next_state.is_dealer_bust():
                    return next_state, 1

            if next_state.dealer_sum > next_state.player_sum:
                return next_state, -1
            if next_state.dealer_sum < next_state.player_sum:
                return next_state, 1
            return next_state, 0

        # Previously an unrecognised action fell through and returned None,
        # which made callers fail later when unpacking the result.
        raise ValueError(f"Unknown action: {action!r}")

In [3]:
from abc import ABC, abstractmethod


class Agent(ABC):
    """Base class for tabular agents.

    Action values are stored in an array indexed as
    ``[dealer_sum, player_sum, action]``. Each sum axis has one extra slot so
    that sums can be used directly as indices; ``state_value`` leaves index 0
    of both sum axes at zero.
    """

    MAX_DEALER_SUM = 10
    MAX_PLAYER_SUM = 21
    MAX_ACTIONS = 2

    def __init__(self, discount_factor: float = 1) -> None:
        self.action_value = Agent.initialize_vector()
        # Constant available to subclasses for their exploration schedule.
        self.N0 = 100
        self.gamma = discount_factor

    def epsilon_greedy_action(self, state: State, eps: float) -> Action:
        """Pick a uniformly random action with probability ``eps``, else the greedy one.

        Raises:
            ValueError: if ``state`` is terminal.
        """
        if state.is_terminal:
            raise ValueError("Cannot choose action in terminal state")

        if eps > np.random.random():
            return np.random.choice(list(Action))

        action = np.argmax(self.action_value[state.dealer_sum, state.player_sum])
        return Action.from_value(action)

    @property
    def state_value(self):
        """Return the greedy state values, i.e. the max of ``action_value`` over actions.

        The result has shape ``(MAX_DEALER_SUM + 1, MAX_PLAYER_SUM + 1)``.
        Entries with dealer sum 0 or player sum 0 are left at zero.
        """
        state_value = self.initialize_vector()[..., 0]

        # Cover every player sum up to MAX_PLAYER_SUM. The earlier loop bounded
        # the player axis by MAX_DEALER_SUM, leaving sums 11..21 at zero.
        state_value[1:, 1:] = self.action_value[1:, 1:].max(axis=-1)

        return state_value

    def eval(self, n=1000):
        """Play ``n`` greedy games and return ``(win fraction, mean reward)``.

        Raises:
            ZeroDivisionError: if ``n`` is 0.
        """
        result = 0
        wins = 0
        for _ in trange(n):
            s = State.initial_state()
            while not s.is_terminal:
                # eps = 0 means the greedy action is always chosen.
                action = self.epsilon_greedy_action(s, 0)
                s, r = Environment.step(s, action)
            # Only the reward of the final step is accumulated.
            result += r
            if r == 1:
                wins += 1
        return (wins / n, result / n)

    @staticmethod
    def initialize_vector() -> np.ndarray:
        """Return a zero array of shape (dealer sums + 1, player sums + 1, actions)."""
        return np.zeros(
            (
                Agent.MAX_DEALER_SUM + 1,
                Agent.MAX_PLAYER_SUM + 1,
                Agent.MAX_ACTIONS,
            )
        )

    @abstractmethod
    def train(self, episodes: int):
        """Train the agent for ``episodes`` episodes (implemented by subclasses)."""
        pass

### 2 - Monte Carlo Control


In [4]:
class MonteCarloControl(Agent):
    """Every-visit Monte Carlo control.

    Exploration uses ``epsilon = N0 / (N0 + visits(state))`` and each
    state-action pair is updated with step size ``1 / visits(state, action)``.
    """

    def __init__(self, discount_factor=1):
        super().__init__(discount_factor)

    def train(self, episodes: int):
        """Run ``episodes`` episodes, updating ``action_value`` after each one."""
        # Visit counts per (dealer_sum, player_sum, action).
        visits = self.initialize_vector()

        for _ in trange(episodes):
            # Roll out one complete episode with the current policy.
            episode = []
            current = State.initial_state()
            while not current.is_terminal:
                state_visits = visits[current.dealer_sum, current.player_sum].sum()
                epsilon = self.N0 / (self.N0 + state_visits)
                chosen = self.epsilon_greedy_action(current, epsilon)
                following, reward = Environment.step(current, chosen)
                episode.append((current, chosen, reward))
                current = following

            # Discounted returns, filled in from the last step backwards.
            rewards = [reward for _, _, reward in episode]
            returns = [0] * len(rewards)
            returns[-1] = rewards[-1]
            for t in range(len(rewards) - 2, -1, -1):
                returns[t] = rewards[t] + self.gamma * returns[t + 1]

            # Incremental-mean update for every visited state-action pair.
            for (visited, taken, _), episode_return in zip(episode, returns):
                key = (visited.dealer_sum, visited.player_sum, taken.value)
                visits[key] += 1
                alpha = 1 / visits[key]
                self.action_value[key] += alpha * (
                    episode_return - self.action_value[key]
                )

In [5]:
# Train a Monte Carlo control agent for one million episodes.
agent = MonteCarloControl()
agent.train(1_000_000)
# Evaluate greedily over 100,000 games; eval returns (win fraction, mean reward).
# The bare f-string is the cell's last expression, so a notebook displays it.
f"(wins %, avg reward): {agent.eval(100_000)}"

100%|██████████| 1000000/1000000 [01:21<00:00, 12237.99it/s]
100%|██████████| 100000/100000 [00:06<00:00, 14795.19it/s]


'(wins %, avg reward): (0.52814, 0.0594)'