In [40]:
import numpy as np
from tqdm import tqdm

## Hyperparamters

In [41]:
gamma = 0.9
alpha = 0.02
num_episodes = 1000000
eval_episodes = 100000
epsilon = 0.1
deck = [2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 11]
action = [0, 1]

In [42]:
def get_random_state(deck):
    non_ace_sum = 0
    cnt_ace = 0
    while True:
        card = np.random.choice(deck)
        if card == 11:
            cnt_ace += 1
        else:
            non_ace_sum += card
        sum11 = cnt_ace * 11 + non_ace_sum
        sum1 = cnt_ace * 1 + non_ace_sum
        if sum11 >= 12 and sum11 <= 21:
            break
        elif sum1 >= 12 and sum1 <= 21:
            break

    dealer_sum = np.random.choice(deck)
    return (non_ace_sum, cnt_ace, dealer_sum)

In [43]:
def train(s, a, qnew, q):
    """
    Training a natural blackjack model using constant alpha monte carlo method
    """
    non_ace_sum, cnt_ace, dealer_sum = s
    sum11 = cnt_ace * 11 + non_ace_sum
    sum1 = cnt_ace * 1 + non_ace_sum
    if sum11 == 21 or sum1 == 21:
        return 1
    elif sum1 > 21:
        return -1
    else:
        if a == 0:
            sum = sum11 if sum11 >= 12 and sum11 <= 21 else sum1
            dealer_cnt_ace, dealer_non_ace_sum = (
                (1, 0) if dealer_sum == 11 else (0, dealer_sum)
            )
            while True:
                dealer_card = np.random.choice(deck)
                if dealer_card == 11:
                    dealer_cnt_ace += 1
                else:
                    dealer_non_ace_sum += dealer_card
                dsum1 = dealer_cnt_ace * 1 + dealer_non_ace_sum
                dsum11 = dealer_cnt_ace * 11 + dealer_non_ace_sum
                if dsum1 > 21:
                    return 1
                if dsum1 == 21 or dsum11 == 21:
                    return -1
                if dsum1 > 16:
                    return 0 if dsum1 == sum else -1 if dsum1 > sum else 1
                if dsum11 > 16 and dsum11 <= 21:
                    return 0 if dsum11 == sum else -1 if dsum11 > sum else 1
        else:
            card = np.random.choice(deck)
            if card == 11:
                cnt_ace += 1
            else:
                non_ace_sum += card
            snew = (non_ace_sum, cnt_ace, dealer_sum)
            anew = (
                np.random.choice(action)
                if np.random.rand() <= epsilon
                else np.argmax(q[*snew, :])
            )
            G = train(snew, anew, qnew, q)
            qnew[*s, a] = qnew[*s, a] + alpha * (G - qnew[*s, a])
            return G * gamma
    return 0

In [44]:
def eval(s, a, q):
    """
    returns 1(Win), -1(Loss), 0(Draw) when played according the policy based on q
    """
    non_ace_sum, cnt_ace, dealer_sum = s
    sum11 = cnt_ace * 11 + non_ace_sum
    sum1 = cnt_ace * 1 + non_ace_sum
    if sum11 == 21 or sum1 == 21:
        return 1
    elif sum1 > 21:
        return -1
    else:
        if a == 0:
            sum = sum11 if sum11 >= 12 and sum11 <= 21 else sum1
            dealer_cnt_ace, dealer_non_ace_sum = (
                (1, 0) if dealer_sum == 11 else (0, dealer_sum)
            )
            while True:
                dealer_card = np.random.choice(deck)
                if dealer_card == 11:
                    dealer_cnt_ace += 1
                else:
                    dealer_non_ace_sum += dealer_card
                dsum1 = dealer_cnt_ace * 1 + dealer_non_ace_sum
                dsum11 = dealer_cnt_ace * 11 + dealer_non_ace_sum
                if dsum1 > 21:
                    return 1
                if dsum1 == 21 or dsum11 == 21:
                    return -1
                if dsum1 > 16:
                    return 0 if dsum1 == sum else -1 if dsum1 > sum else 1
                if dsum11 > 16 and dsum11 <= 21:
                    return 0 if dsum11 == sum else -1 if dsum11 > sum else 1
        else:
            card = np.random.choice(deck)
            if card == 11:
                cnt_ace += 1
            else:
                non_ace_sum += card
            snew = (non_ace_sum, cnt_ace, dealer_sum)
            anew = np.argmax(q[*snew, :])
            G = eval(snew, anew, q)
            return G
    return 0

## Training

In [49]:
q = np.zeros((32, 22, 12, 2))
for episode in tqdm(range(num_episodes)):
    qnew = q.copy()
    s0 = get_random_state(deck)
    a = (
        np.random.choice(action)
        if np.random.rand() <= epsilon
        else np.argmax(q[*s0, :])
    )
    G = train(s0, a, qnew, q)
    qnew = qnew + alpha * (G - qnew[*s0, a])
    q = qnew

100%|██████████| 1000000/1000000 [03:56<00:00, 4224.06it/s]


## Evaluation

In [50]:
win = 0.0
loss = 0.0
draw = 0.0
for episode in tqdm(range(eval_episodes)):
    s0 = get_random_state(deck)
    a = np.argmax(q[*s0, :])
    score = eval(s0, a, q)
    if score == 1:
        win += 1
    elif score == -1:
        loss += 1
    else:
        draw += 1

print(f"Win% = {(win*100)/eval_episodes:.3f}")
print(f"Draw% = {(draw*100)/eval_episodes:.3f}")
print(f"Loss% = {(loss*100)/eval_episodes:.3f}")

100%|██████████| 100000/100000 [00:21<00:00, 4645.30it/s]

Win% = 42.424
Draw% = 5.889
Loss% = 51.687



