Required packages

In [1]:
# The install commands below are wrapped in a string literal, so running this
# cell installs nothing: the notebook only echoes the string as the cell output.
# Remove the surrounding triple quotes to actually run the %pip / %matplotlib magics.
'''
%pip install gymnasium==0.27.0
%pip install matplotlib
%pip install numpy
%pip install tqdm
%matplotlib inline
'''

'\n%pip install gymnasium==0.27.0\n%pip install matplotlib\n%pip install numpy\n%pip install tqdm\n%matplotlib inline\n'

In [2]:
from collections import defaultdict #for accessing keys which are not present in dictionary
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import gymnasium as gym
import sys
import random
from matplotlib.patches import Patch
import seaborn as sns

Create the environment

In [3]:
# Build the Blackjack environment. We deliberately do not follow the
# Sutton and Barto book rules (sab=False); natural=True enables the extra
# reward for winning with a natural blackjack, and render_mode='rgb_array'
# makes env.render() return frames we can display with matplotlib.
env = gym.make(
    'Blackjack-v1',
    sab=False,
    natural=True,
    render_mode='rgb_array',
)

Observe environment

In [4]:
# An observation is a tuple of 3 elements:
#   1. the player's current sum
#   2. the dealer's face-up card
#   3. whether or not the player has a usable ace
# The printed observation space shows how each component is encoded.

done = False
observation, info = env.reset()  # start a new hand and get the first observation

# Print each label/value pair in a fixed order.
for label, value in (
    ("Observation space:", env.observation_space),
    ("Action space:", env.action_space),  # 0: stick, 1: hit
    ("Observation:", observation),        # (player sum, dealer card, usable ace)
    ("Info:", info),                      # auxiliary dictionary returned by reset()
):
    print(label, value)



Observation space: Tuple(Discrete(32), Discrete(11), Discrete(2))
Action space: Discrete(2)
Observation: (17, 10, False)
Info: {}


In [5]:
#env.step(action) returns: observation, reward, terminated, truncated, info

#observation: tuple of 3 elements (player's current sum, dealer's face up card, whether or not the player has a usable ace)

#reward: +1 (win), 0 (draw) or -1 (loss); +1.5 instead of +1 when the player wins with a natural blackjack

#terminated: boolean (True if the episode is over)

#truncated: boolean (True if the episode is over because it reached the maximum number of steps)

#info: dictionary with additional information. We will not use this.


In [6]:
# Play five independent one-step hands, each with a randomly sampled action,
# and print everything env.step() hands back.
print("Random actions:")
for hand in range(5):
    env.reset()  # every hand starts from a freshly reset environment
    action = env.action_space.sample()
    print("Action:", action)

    # Take the sampled action and keep the individual fields available by name.
    step_result = env.step(action)
    observation, reward, terminated, truncated, info = step_result

    step_labels = ("Observation:", "Reward:", "Terminated:", "Truncated:", "Info:")
    for label, value in zip(step_labels, step_result):
        print(label, value)
    print("")
    


Random actions:
Action: 1
Observation: (15, 10, False)
Reward: 0.0
Terminated: False
Truncated: False
Info: {}

Action: 0
Observation: (19, 10, True)
Reward: 1.0
Terminated: True
Truncated: False
Info: {}

Action: 0
Observation: (14, 10, False)
Reward: -1.0
Terminated: True
Truncated: False
Info: {}

Action: 1
Observation: (29, 5, False)
Reward: -1.0
Terminated: True
Truncated: False
Info: {}

Action: 1
Observation: (15, 1, False)
Reward: 0.0
Terminated: False
Truncated: False
Info: {}



  if not isinstance(terminated, (bool, np.bool8)):


Epsilon Greedy Strategy to solve blackjack

In [7]:
class BlackJackAgent:
    """Tabular Q-learning agent that follows an epsilon-greedy policy.

    The agent reads the notebook-level ``env`` to size new Q-value rows and to
    sample random (exploratory) actions, so ``env`` must exist before the
    agent is used.
    """

    def __init__(
        self,
        epsilon:float,
        learning_rate:float,
        initial_epsilon:float,
        epsilon_decay:float,
        final_epsilon:float,
        discount_factor:float = 0.95,
    ):
        """Store the hyperparameters and start with an empty Q-table.

        Args:
            epsilon: current exploration probability used by ``get_action``.
            learning_rate: step size applied to each TD error.
            initial_epsilon: starting exploration probability (stored only).
            epsilon_decay: amount subtracted from ``epsilon`` on every call
                to ``decay_epsilon``.
            final_epsilon: lower bound that ``epsilon`` never goes below.
            discount_factor: gamma, the weight given to the next state's value.
        """
        # Unseen observations get a zero vector with one entry per action.
        # ``env`` is looked up lazily, when a missing key is first accessed.
        self.q_values = defaultdict(lambda: np.zeros(env.action_space.n))

        self.epsilon = epsilon #epsilon value
        self.lr = learning_rate #learning rate
        self.initial_epsilon = initial_epsilon #initial value of epsilon
        self.epsilon_decay = epsilon_decay #per-call decrement of epsilon
        self.final_epsilon = final_epsilon #minimum value of epsilon
        self.discount_factor = discount_factor #gamma

        self.training_error = [] #TD error recorded at every update

    def get_action(self, obs:tuple[int, int, bool])->int:
        """Pick an action for ``obs`` with the epsilon-greedy rule.

        With probability ``epsilon`` a random action is sampled (exploration);
        otherwise the action with the highest Q-value is returned (exploitation).
        """
        if np.random.random() < self.epsilon:
            return env.action_space.sample() #explore
        # int() so the greedy branch returns a plain Python int as annotated
        return int(np.argmax(self.q_values[obs])) #exploit

    def update(
        self, obs:tuple[int, int, bool],
        action:int,
        reward:float,
        next_obs:tuple[int, int, bool],
        terminated:bool
        )->None:
        """Apply one Q-learning update for the transition that was just observed.

        Args:
            obs: observation the action was taken in.
            action: action that was taken.
            reward: reward received for that action.
            next_obs: observation reached after the action.
            terminated: whether the episode ended with this transition.
        """
        # A terminal transition has no future reward, so nothing is bootstrapped.
        future_q_value = 0 if terminated else np.max(self.q_values[next_obs])

        # TD error: (reward + discounted best next value) - current estimate
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[obs][action]
            )

        # Move the estimate a fraction (the learning rate) towards the target.
        self.q_values[obs][action] = (
            self.q_values[obs][action] + self.lr * temporal_difference
            )

        self.training_error.append(temporal_difference)

    def decay_epsilon(self)->None:
        """Linearly anneal epsilon by ``epsilon_decay``, floored at ``final_epsilon``.

        ``epsilon_decay`` is a per-call decrement (the notebook computes it as
        ``initial_epsilon / (n_episodes/2)``), so it must be subtracted.
        Multiplying by it would collapse epsilon to ``final_epsilon`` after a
        single call.
        """
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)
    
        
        
        

Training of the agent

In [8]:
# --- Training hyperparameters ---
n_episodes = 100_000      # number of training episodes
learning_rate = 0.01      # step size of each Q-value update
discount_factor = 0.95    # gamma

# --- Exploration schedule ---
initial_epsilon = 1.0     # start fully exploratory
final_epsilon = 0.1       # lower bound for epsilon
# initial_epsilon divided by half the number of episodes
epsilon_decay = initial_epsilon / (n_episodes/2)

# Build the agent; it starts with epsilon equal to initial_epsilon.
agent = BlackJackAgent(
    learning_rate=learning_rate,
    discount_factor=discount_factor,
    epsilon=initial_epsilon,
    initial_epsilon=initial_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)


In [9]:
from collections import deque
from gymnasium.wrappers import RecordEpisodeStatistics
from IPython.display import clear_output

# Drawing a frame after every step dominates the running time of the loop.
# Set this to False to train without any rendering.
RENDER_TRAINING = True

# Only apply the wrapper once, so re-running this cell does not stack wrappers.
if not isinstance(env, gym.wrappers.RecordEpisodeStatistics):
    # RecordEpisodeStatistics is a wrapper that keeps track of the rewards obtained in the last n episodes
    env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)

for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False
    if RENDER_TRAINING:
        clear_output()  # drop the frames drawn during the previous episode

    # play one episode
    while not done:
        action = agent.get_action(obs)  # epsilon-greedy action
        next_obs, reward, terminated, truncated, info = env.step(action)

        # Keyword arguments keep next_obs and terminated from being swapped:
        # passing them positionally in the wrong order makes the (always
        # truthy) observation tuple act as `terminated`, which silently
        # disables bootstrapping from the next state.
        agent.update(
            obs=obs,
            action=action,
            reward=reward,
            next_obs=next_obs,
            terminated=terminated,
        )

        if RENDER_TRAINING:
            frame = env.render()  # RGB frame of the current table
            plt.imshow(frame)
            plt.axis('off')
            plt.show()
            #plt.pause(0.01)

        obs = next_obs  # continue from the new state
        done = terminated or truncated  # the episode ends on either flag
    agent.decay_epsilon()  # anneal exploration once per episode
        