In [1]:
import numpy as np
from itertools import product
from blackjack_env import BlackjackEnv

In [2]:
# Build the Blackjack environment from the local `blackjack_env` module.
# NOTE(review): `natural=True` presumably enables the special payout for a
# natural blackjack -- confirm against BlackjackEnv.
env = BlackjackEnv(natural=True)

In [3]:
# Seed the environment with a fixed value (presumably its card-drawing RNG)
# so the episodes sampled below can be reproduced; the call's return value
# is what the cell displays.
env.seed(1)

[1]

In [4]:
# Inspect the observation space. It is a triple of discrete components with
# 32, 11 and 2 values; those sizes are the (32, 11, 2) shape used for the
# policy array and the return tables further down.
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [5]:
# Take a single step with action 1 to look at what `step` returns: an
# observation triple, a reward, a done flag and a fourth value that is ignored.
# NOTE(review): no `env.reset()` call is visible before this step -- confirm
# that BlackjackEnv starts a game on construction, otherwise this cell fails
# after Restart & Run All.
(player_sum, dealer_sum, usable_ace), reward, done, _ = env.step(1)

In [6]:
# Deterministic policy table indexed by the observation triple:
# action 1 while the first index (player sum) is below 19, action 0 from 19 up.
# (Presumably 1 = hit and 0 = stick -- confirm against BlackjackEnv.)
pi = np.zeros((32, 11, 2), dtype=np.int8)
pi[:19] = 1

In [13]:
def run_episode(env, pi):
    """Play one episode in `env`, choosing actions from the policy table `pi`.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` (returns the first observation) and
        ``step(action)`` (returns ``(observation, reward, done, info)``).
    pi : indexable
        Policy lookup; ``pi[observation]`` is the action taken in
        ``observation``, where the observation is a tuple of ints.

    Returns
    -------
    states : list of tuple of int
        Every state in which an action was chosen, in the order visited.
    rewards : list
        ``rewards[t]`` is the reward received after acting in ``states[t]``,
        so ``len(rewards) == len(states)``.
    """
    observation = tuple(map(int, env.reset()))
    states, rewards = [], []
    done = False
    while not done:
        # Record the state *before* acting. Recording only the initial state
        # and states where pi == 1 would drop every state reached mid-episode
        # in which the policy picks action 0, leaving those states with
        # returns sampled from initial deals only.
        states.append(observation)
        observation, reward, done, _ = env.step(pi[observation])
        observation = tuple(map(int, observation))
        # The terminal observation is never looked up in `pi`; only the
        # reward it came with is kept.
        rewards.append(reward)
    return states, rewards

def update_returns(R, states, rewards, R_all=None):
    """Append one episode's first-visit returns to the return tables.

    Parameters
    ----------
    R : nested list indexed ``R[i][j][k]`` with extents 32 x 11 x 2
        Per-state list of sampled returns. For every state visited in this
        episode, the return following its first visit is appended.
    states : sequence of (i, j, k) int tuples
        States visited during the episode, in order.
    rewards : sequence of numbers
        ``rewards[t]`` is the reward paired with ``states[t]``; the return
        from step ``t`` is the undiscounted sum ``rewards[t:]``.
    R_all : nested list shaped like `R`, optional
        When given, every cell grows by exactly one entry per call: the
        first-visit return for visited states, ``np.nan`` for states the
        episode never reached. When ``None`` it is left alone.

    Returns
    -------
    (R, R_all)
        The same objects that were passed in, updated in place.
    """
    # Time step of the first visit to each state; -1 marks "not visited".
    first_visit = np.full((32, 11, 2), -1, dtype=np.int16)
    for t, state in enumerate(states):
        if first_visit[state] == -1:
            first_visit[state] = t

    # Walk the episode backwards so the return can be accumulated in one pass.
    g = 0
    for t in range(len(states) - 1, -1, -1):
        g += rewards[t]
        i, j, k = states[t]
        if first_visit[states[t]] == t:
            R[i][j][k].append(g)
            if R_all is not None:
                R_all[i][j][k].append(g)

    # Pad the per-episode history of every unvisited state with NaN. The
    # indices come from the unvisited cell itself (not from the last state
    # handled above), and the pass is skipped when no R_all was supplied.
    if R_all is not None:
        for i, j, k in np.argwhere(first_visit == -1):
            R_all[i][j][k].append(np.nan)
    return R, R_all

In [14]:
# Two independent 32 x 11 x 2 tables of empty lists, one list of sampled
# returns per state: `R` for the returns themselves, `R_all` for the
# per-episode history that `update_returns` fills when it is passed in.
R, R_all = (
    [[[[] for _ace in range(2)] for _dealer in range(11)] for _player in range(32)]
    for _table in range(2)
)

In [15]:
# Number of episodes to sample under the fixed policy `pi`.
total_episodes = 100000

# Monte Carlo sampling loop: play one episode, then fold its first-visit
# returns into the tables. `update_returns` appends to `R` and `R_all` in
# place and returns those same objects, so re-running this cell keeps
# accumulating on top of earlier runs -- re-create the tables first when a
# fresh estimate is wanted.
for _ in range(total_episodes):
    states, rewards = run_episode(env, pi)
    R, R_all = update_returns(R, states, rewards, R_all=R_all)
