In [1]:
import gymnasium as gym

In [2]:
env = gym.make('Blackjack-v1', natural=False, sab=False, render_mode="human")#render_mode="human" para q pete

#Métodos:
#reset(self) : Reinicia el estado del entorno, a su estado inicial, devolviendo una observación de dicho estado.
#step(self, action) : "Avanza" un timestep. Devuelve: observation, reward, done, info.
#render(self) : Muestra en pantalla el entorno.
#close(self) : Finaliza con la instancia del agente.
#seed(self) : Establece la semilla aleatoria del generador de números aleatorios del presente entorno.

#Atributos:
#action_space : El objeto de tipo Space correspondiente al espacio de acciones válidas.
#observation_space : El objeto de tipo Space correspondiente a todos los rangos posibles de observaciones.
#reward_range : Tupla que contiene los valores mínimo y máximo de recompensa posible.


In [3]:
print("Tamaño de espacio de estados", env.observation_space)
print("Estado aleatorio", env.observation_space.sample())
size_estados = env.observation_space[0].n * env.observation_space[1].n * env.observation_space[2].n
print("Hay", size_estados, " estados posibles.")
print("Acciones posibles", env.action_space.n)
print("Acción aleatoria", env.action_space.sample())
size_acciones = env.action_space.n
print("Hay", size_acciones, " acciones posibles.")

Tamaño de espacio de estados Tuple(Discrete(32), Discrete(11), Discrete(2))
Estado aleatorio (13, 6, 1)
Hay 704  estados posibles.
Acciones posibles 2
Acción aleatoria 1
Hay 2  acciones posibles.


In [4]:
# Reseteamos el entorno y obtenemos el estado inicial
estado = env.reset()

# Imprime el estado inicial
print("Estado inicial:", estado)

# Simulamos una acción para ver cómo se actualiza el estado
accion = 1  # Por ejemplo, podemos elegir 'hit'
nuevo_estado, recompensa, hecho, info, a = env.step(accion)

print("Nuevo estado:", nuevo_estado)
print("Recompensa:", recompensa)
print("Terminado:", hecho)
print("Info:", info)

Estado inicial: ((12, 6, 0), {})
Nuevo estado: (20, 6, 0)
Recompensa: 0.0
Terminado: False
Info: False


In [5]:
env = gym.make('Blackjack-v1', natural=True, sab=False, render_mode="human")
done = False
while not done:
    state = env.reset()
    while state[0][0] != 21: #genera un nuevo estado hasta tener un bj natural
        state = env.reset()
    env.render()
    action = env.action_space.sample()
    print("Estado inicial:", state)
    nuevo_estado, recompensa, hecho, info, a = env.step(0)
    print("Nuevo estado:", nuevo_estado)
    print("Recompensa:", recompensa)
    print("Terminado:", hecho)
    print("Info:", info)
    if(recompensa == 1 or recompensa == 1.5): 
        env.close()
        done = True

Estado inicial: ((21, 10, 1), {})
Nuevo estado: (21, 10, 1)
Recompensa: 1.5
Terminado: True
Info: False


In [72]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 1
gamma = 0
epsilon = 1
episodes = 100000

# For plotting metrics
all_epochs = []
all_penalties = []

env = gym.make('Blackjack-v1', natural=True, sab=False)
import numpy as np
n = env.env.observation_space[0].n * env.observation_space[1].n * env.observation_space[2].n
q_table = np.zeros([n, env.action_space.n]) 
#vamos a usar la fórmula: A * 22 + B * 2 + C (da un valor entre 0 y 704)

for i in range(1, episodes+1):
    state,_ = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False

    alpha = 1 - (i/(episodes+1))
    gamma = i/(episodes+1)
    epsilon = 1 - (i/(episodes+1))
    
    while not done:
        A, B, C = state
        discrete_state = A*22+B*2+C
        
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[discrete_state]) # Exploit learned values
        
        next_state, reward, done, info, _ = env.step(action) 
        
        old_value = q_table[discrete_state, action]
        
        next_A, next_B, next_C = next_state
        discrete_next_state = next_A*22+next_B*2+next_C
        next_max = np.max(q_table[discrete_next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        
        q_table[discrete_state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")


Episode: 100000
Training finished.

CPU times: total: 20.1 s
Wall time: 31.4 s


In [73]:
for i in q_table:
    print(i)

[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[0. 0.]
[-1.1290932  -0.23230287]
[0. 0.]
[-0.98330807  0.39520572]
[0. 0.]
[-0.35391215  0.40900974]
[0. 0.]
[-0.16275193  0.64670202]
[0. 0.]
[0.28608832 0.04858727]
[0. 0.]
[-0.36627957  0.26146912]
[0. 0.]
[-0.82268979  0.49916916]
[0. 0.]
[-0.98855504  0.69944438]
[0. 0.]
[-0.509358

In [84]:
total_epochs, total_penalties, total_rewards = 0, 0, 0
episodes = 100

for _ in range(episodes):
    state,_ = env.reset()
    epochs, penalties, reward, rewards = 0, 0, 0, 0
    done = False
    
    while not done:
        A, B, C = state
        discrete_state = A*22+B*2+C
        
        action = np.argmax(q_table[discrete_state])
        state, reward, done, info, _ = env.step(action)

        if reward < 0:
            penalties += reward
        else:
            rewards += reward

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs
    total_rewards += rewards

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")
print(f"Average reward per episode: {total_rewards / episodes}")


Results after 100 episodes:
Average timesteps per episode: 1.88
Average penalties per episode: -0.54
Average reward per episode: 0.405
