In [None]:
import collections
import numpy as np
import gymnasium as gym

# Import the necessary libraries
import plotly.graph_objects as go

import plotly.io as pio
pio.renderers.default = 'notebook'


In [None]:
# Blackjack environment created with sab=True.
# Add render_mode="human" to the call to watch the games being played.
env = gym.make(
    'Blackjack-v1',
    sab=True,
)

In [None]:
def play_env(env, agent):
    """Play one full episode of `env` with `agent` and return the last reward.

    The agent is asked for an action on every step (`agent.action`), is shown
    the (observation, action, reward) triple that resulted from it
    (`agent.observe`), and is told to update its estimates once the episode is
    over (`agent.estimating`).

    Args:
        env: environment exposing `reset()` -> (observation, info) and
            `step(action)` -> (observation, reward, terminated, truncated, info).
        agent: object exposing `action(observation)`,
            `observe(observation, action, reward)` and `estimating()`.

    Returns:
        The reward returned by the final `env.step` call of the episode.
    """
    observation, info = env.reset()
    done = False

    while not done:
        action = agent.action(observation)

        new_observation, reward, terminated, truncated, info = env.step(action)

        # The agent is shown the observation the action was chosen from,
        # not the one the step produced.
        agent.observe(observation, action, reward)

        observation = new_observation

        # An episode ends either way; stepping an env after truncation is
        # invalid, so both flags must stop the loop.
        done = terminated or truncated

    agent.estimating()

    return reward

In [None]:
class MCFirstVisit():
    """First-visit Monte Carlo estimation of state values for a fixed policy.

    During an episode the agent records every (state, reward) pair passed to
    `observe`. `estimating` then walks the episode backwards, computes the
    discounted return that followed each step, and averages, per state, the
    return that followed the FIRST time that state appeared in the episode.

    Attributes:
        gamma: discount factor applied to future rewards.
        policy: callable mapping a state to an action.
        state_value: mapping state -> current average of its recorded returns.
        returns: mapping state -> list of every recorded first-visit return.
        states, rewards: buffers for the episode currently being played.
    """

    def __init__(self, gamma, policy):
        self.gamma = gamma
        self.policy = policy

        self.state_value = collections.defaultdict(float)
        self.returns = collections.defaultdict(list)

        self.states = []
        self.rewards = []

    def action(self, state):
        """Return the action the fixed policy selects for `state`."""
        return self.policy(state)

    def observe(self, state, action, reward):
        """Record one step of the current episode (`action` is not stored)."""
        self.states.append(state)
        self.rewards.append(reward)

    def estimating(self):
        """Update the value estimates from the finished episode, then clear it."""
        # Index of the first occurrence of each state in this episode.
        first_visit = {}
        for t, state in enumerate(self.states):
            first_visit.setdefault(state, t)

        g = 0

        for t in reversed(range(len(self.states))):
            # The return must be accumulated on every step, even the ones
            # that are skipped below.
            g = self.gamma * g + self.rewards[t]

            state = self.states[t]
            if first_visit[state] != t:
                # Later re-visit of the state: first-visit MC ignores it.
                continue

            self.returns[state].append(g)

            # Incremental mean: same average as sum(returns) / len(returns)
            # without re-summing the whole history on every update.
            count = len(self.returns[state])
            self.state_value[state] += (g - self.state_value[state]) / count

        self.states = []
        self.rewards = []


In [None]:
def random_policy(state):
    """Return action 0 or 1 uniformly at random, ignoring `state`."""
    # `high` is exclusive in np.random.randint, so it must be 2 for the
    # draw to cover both actions (high=1 can only ever yield 0).
    return np.random.randint(low=0, high=2, size=(1))[0]

def stick_policy(state):
    """Return 0 when the first state component is 20 or 21, otherwise 1."""
    score = state[0]
    return 0 if score in (20, 21) else 1

# Evaluate the fixed stick_policy; gamma=1 means returns are not discounted.
agent = MCFirstVisit(
    gamma=1,
    policy=stick_policy,
)

In [None]:
# Play many episodes so the per-state averages have enough samples.
n_evaluation_episodes = 100_000

for i in range(n_evaluation_episodes):
    play_env(env, agent)

In [None]:
# Number of distinct states that currently hold a value estimate.
len(agent.state_value)

In [None]:
# Plot the estimated state values as a surface over (player sum, dealer card).
# State keys are unpacked as (player sum, dealer card, usable-ace flag), which
# is the Gymnasium Blackjack observation layout. States that differ only by
# the usable-ace flag are different states, so each flag value gets its own
# figure instead of overwriting the other in a single grid.
for usable_ace in (False, True):
    Z = np.full((22, 12), np.nan)  # NaN marks states that were never visited

    for (player_sum, dealer_card, ace), value in agent.state_value.items():
        if bool(ace) == usable_ace:
            Z[player_sum, dealer_card] = value

    sh_0, sh_1 = Z.shape

    # Integer coordinates so that every axis tick equals the array index
    # (i.e. the actual player sum / dealer card) it labels.
    x, y = np.arange(sh_1), np.arange(sh_0)

    fig = go.Figure(data=[go.Surface(z=Z, x=x, y=y)])

    fig.update_layout(title=f'MCFirstVisit (usable ace: {usable_ace})',
                      autosize=False,
                      width=500, height=500,
                      margin=dict(l=65, r=50, b=65, t=90),
                      scene=dict(xaxis_title='Dealer card',
                                 yaxis_title='Player sum',
                                 zaxis_title='State value'))

    fig.show()

In [None]:
class MonteCarloGeneralizePolicyIteration():
    """Monte Carlo control agent that averages sampled returns per (state, action).

    Actions are chosen by applying `policy` to the list of action values of
    the current state. After each episode, `estimating` sets every visited
    (state, action) value to the mean of all returns recorded for that pair.
    """

    def __init__(self, action_space, gamma, policy):
        self.policy = policy
        self.gamma = gamma

        # `action_space` is the zero-argument factory that builds the list of
        # action values for a state seen for the first time.
        self.state_action_values = collections.defaultdict(action_space)
        # Every return recorded so far, keyed by "<state> <action>".
        self.returns = collections.defaultdict(list)

        self._reset_episode()

    def _reset_episode(self):
        """Empty the per-episode buffers."""
        self.states, self.actions, self.rewards = [], [], []

    def action(self, state):
        """Apply the policy to the action values stored for `state`."""
        return self.policy(self.state_action_values[state])

    def observe(self, state, action, reward):
        """Record one step of the current episode."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def estimating(self):
        """Fold the finished episode into the action-value averages."""
        discounted_return = 0
        backwards = zip(reversed(self.states),
                        reversed(self.actions),
                        reversed(self.rewards))

        for state, action, reward in backwards:
            discounted_return = self.gamma * discounted_return + reward

            history = self.returns[" ".join((str(state), str(action)))]
            history.append(discounted_return)
            self.state_action_values[state][action] = sum(history) / len(history)

        self._reset_episode()


In [None]:
def max_policy(state_action_value):
    """Greedy choice: index of the largest action value (first one on ties)."""
    values = np.asarray(state_action_value)
    return values.argmax()

def build_action_space_exploring_start(env):
    """Return a factory producing a fresh list of ones, one per action of `env`."""
    def initial_action_values():
        # The action count is read on every call, not captured once up front.
        return [1] * env.action_space.n

    return initial_action_values

# Create the environment first so the agent's action-value factory is built
# from the environment created in this cell, not from whatever `env` an
# earlier cell happened to leave behind.
env = gym.make('Blackjack-v1', sab=True)

agent = MonteCarloGeneralizePolicyIteration(
    action_space=build_action_space_exploring_start(env),
    gamma=1,
    policy=max_policy,
)

In [None]:
# Train the agent and log the mean final reward of each block of
# `buffer_size` episodes.
# NOTE: the "100" in the variable names is historical; the window actually
# holds `buffer_size` (1000) rewards.
buffer_size = 1000
mean_last_100_rewards = []
last_100_rewards = [0] * buffer_size

for i in range(150_000):
    last_reward = play_env(env, agent)

    last_100_rewards[i % buffer_size] = last_reward

    # Log only once the window has been completely refilled. Testing
    # `i % buffer_size == 0` would also fire at i == 0, when a single slot
    # holds a real reward and the rest are still the initial zeros.
    if (i + 1) % buffer_size == 0:
        mean_last_100_rewards.append(sum(last_100_rewards) / buffer_size)

In [None]:
# Plot the best action value of every visited state as a surface over
# (player sum, dealer card).
# State keys are unpacked as (player sum, dealer card, usable-ace flag), which
# is the Gymnasium Blackjack observation layout. States that differ only by
# the usable-ace flag are different states, so each flag value gets its own
# figure instead of overwriting the other in a single grid.
for usable_ace in (False, True):
    Z = np.full((22, 12), np.nan)  # NaN marks states that were never visited

    for (player_sum, dealer_card, ace), action_values in agent.state_action_values.items():
        if bool(ace) == usable_ace:
            Z[player_sum, dealer_card] = np.max(action_values)

    sh_0, sh_1 = Z.shape

    # Integer coordinates so that every axis tick equals the array index
    # (i.e. the actual player sum / dealer card) it labels.
    x, y = np.arange(sh_1), np.arange(sh_0)

    fig = go.Figure(data=[go.Surface(z=Z, x=x, y=y)])

    fig.update_layout(title=f'MonteCarloExploringStart (usable ace: {usable_ace})',
                      autosize=False,
                      width=500, height=500,
                      margin=dict(l=65, r=50, b=65, t=90),
                      scene=dict(xaxis_title='Dealer card',
                                 yaxis_title='Player sum',
                                 zaxis_title='Max action value'))

    fig.show()

In [None]:
# Learning curve: one point per logged mean of the reward window.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=list(range(len(mean_last_100_rewards))),
    y=mean_last_100_rewards,
))

# Title and axis labels so the figure can be read on its own.
fig.update_layout(title='MonteCarloGeneralizePolicyIteration: mean reward during training',
                  xaxis_title=f'Checkpoint (one every {buffer_size} episodes)',
                  yaxis_title=f'Mean reward over a {buffer_size}-episode window')

fig.show()

In [None]:
# (Exploring Starts)
class MonteCarloES():
    """Monte Carlo control agent with a constant step-size update.

    Every state starts with the action values [1, 1]. After each episode,
    `estimating` moves the value of every visited (state, action) pair a
    fraction `alpha` of the way towards the discounted return that followed it.
    """

    def __init__(self, gamma, alpha, policy):
        self.policy = policy
        self.alpha = alpha
        self.gamma = gamma

        # Two action values per state, both initialised to 1.
        self.state_action_values = collections.defaultdict(lambda: [1, 1])

        self._reset_episode()

    def _reset_episode(self):
        """Empty the per-episode buffers."""
        self.states, self.actions, self.rewards = [], [], []

    def action(self, state):
        """Apply the policy to the action values stored for `state`."""
        return self.policy(self.state_action_values[state])

    def observe(self, state, action, reward):
        """Record one step of the current episode."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def estimating(self):
        """Update the action values from the finished episode, then clear it."""
        discounted_return = 0
        backwards = zip(reversed(self.states),
                        reversed(self.actions),
                        reversed(self.rewards))

        for state, action, reward in backwards:
            discounted_return = self.gamma * discounted_return + reward

            action_values = self.state_action_values[state]
            action_values[action] += self.alpha * (discounted_return - action_values[action])

        self._reset_episode()


In [None]:
def max_policy(state_action_value):
    """Greedy choice: index of the largest action value (first one on ties)."""
    values = np.asarray(state_action_value)
    return values.argmax()

# Fresh environment and a fresh constant step-size agent for this experiment.
env = gym.make(
    'Blackjack-v1',
    sab=True,
)
agent = MonteCarloES(
    gamma=0.9,
    alpha=0.05,
    policy=max_policy,
)

In [None]:
# Train the agent and log, after every episode, the mean final reward of the
# most recent `buffer_size` episodes.
# NOTE: the "100" in the variable names is historical; the window actually
# holds `buffer_size` (1000) rewards.
buffer_size = 1000
mean_last_100_rewards = []
last_100_rewards = [0] * buffer_size

# Running total of the window, so each episode costs O(1) instead of
# re-summing all `buffer_size` slots. For whole-number rewards this is
# exactly the same value as sum(last_100_rewards); with arbitrary float
# rewards it could drift by rounding error.
window_sum = 0

for i in range(500_000):
    last_reward = play_env(env, agent)

    slot = i % buffer_size
    window_sum += last_reward - last_100_rewards[slot]
    last_100_rewards[slot] = last_reward

    # The window is full for the first time right after episode index
    # buffer_size - 1, so start logging there rather than one episode later.
    if i >= buffer_size - 1:
        mean_last_100_rewards.append(window_sum / buffer_size)

In [None]:
import plotly.graph_objects as go

# Plot the greedy action (index of the largest action value) of every visited
# state over (player sum, dealer card).
# State keys are unpacked as (player sum, dealer card, usable-ace flag), which
# is the Gymnasium Blackjack observation layout. States that differ only by
# the usable-ace flag are different states, so each flag value gets its own
# figure instead of overwriting the other in a single grid.
for usable_ace in (False, True):
    Z = np.full((22, 12), np.nan)  # NaN marks states that were never visited

    for (player_sum, dealer_card, ace), action_values in agent.state_action_values.items():
        if bool(ace) == usable_ace:
            Z[player_sum, dealer_card] = np.argmax(action_values)

    sh_0, sh_1 = Z.shape

    # Integer coordinates so that every axis tick equals the array index
    # (i.e. the actual player sum / dealer card) it labels.
    x, y = np.arange(sh_1), np.arange(sh_0)

    fig = go.Figure(data=[go.Surface(z=Z, x=x, y=y)])

    # The title names the agent actually being plotted (the previous
    # 'MCFirstVisit' label was left over from an earlier cell).
    fig.update_layout(title=f'MonteCarloES greedy policy (usable ace: {usable_ace})',
                      autosize=False,
                      width=500, height=500,
                      margin=dict(l=65, r=50, b=65, t=90),
                      scene=dict(xaxis_title='Dealer card',
                                 yaxis_title='Player sum',
                                 zaxis_title='Greedy action index'))

    fig.show()

In [None]:
import plotly.graph_objects as go

# Learning curve: one point per logged mean of the reward window.
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=list(range(len(mean_last_100_rewards))),
    y=mean_last_100_rewards,
))

# Title and axis labels so the figure can be read on its own.
fig.update_layout(title='MonteCarloES: mean reward during training',
                  xaxis_title='Episodes since the reward window first filled',
                  yaxis_title=f'Mean reward over the last {buffer_size} episodes')

fig.show()