In [1]:
# @title Imports and installs
'''
%pip install gymnasium==0.27.0
%pip install matplotlib
%pip install numpy
%pip install tqdm
%matplotlib inline
'''
from collections import defaultdict #for accessing keys which are not present in dictionary
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import gymnasium as gym
import sys
import random
from matplotlib.patches import Patch
import seaborn as sns

Let's first create the environment.
We'll use Gymnasium's Blackjack environment with natural blackjacks enabled (`natural=True`) and without the exact Sutton & Barto book rules (`sab=False`).

In [2]:
# Build the Blackjack environment used throughout the notebook.
env = gym.make(
    'Blackjack-v1',
    sab=False,                # do not use the exact Sutton & Barto book rules
    natural=True,             # allow natural blackjacks
    render_mode='rgb_array',  # env.render() hands back a frame instead of opening a window
)

### Understanding and Observing the Environment

In [3]:
# Reset the environment once and print its spaces plus the first observation.
# The observation is a 3-tuple (0-based indices):
#   observation[0]: player's current sum
#   observation[1]: dealer's face-up card
#   observation[2]: whether or not the player has a usable ace

done = False
observation, info = env.reset() # start a new hand and get the first observation
print("Observation space:", env.observation_space) 
print("\nAction space:", env.action_space) # 0: stick, 1: hit
print("\nObservation:", observation) # (player's sum, dealer's face-up card, usable ace)
print("\nInfo:", info) # auxiliary info dict; printed as {} in the recorded run



Observation space: Tuple(Discrete(32), Discrete(11), Discrete(2))

Action space: Discrete(2)

Observation: (16, 1, False)

Info: {}


### Now let's see how the environment responds when we take a step

**env.step(action)** returns: observation, reward, terminated, truncated, info

**observation**: tuple of 3 elements (player's current sum, dealer's face up card, whether or not the player has a usable ace)

**reward**: +1.5 for a win with a natural blackjack, +1 for any other win, 0 for a draw, -1 for a loss

**terminated**: boolean (True if the episode is over)

**truncated**: boolean (True if the episode is over because it reached the maximum number of steps)

**info**: dictionary with additional information. We will not use this.

In [4]:
# Play a single random action in each of five freshly reset hands and show
# everything env.step() hands back.
print("Random actions:")
step_labels = ("Observation:", "Reward:", "Terminated:", "Truncated:", "Info:")
for i in range(5):
    env.reset()  # new hand, so every sampled action is a first move
    action = env.action_space.sample()
    print("Action:", action)
    # observation is (player's sum, dealer's face-up card, usable ace) at indices 0, 1, 2;
    # reward is 1 for a win, 1.5 for a win with a natural blackjack, -1 for a loss, 0 for a draw
    observation, reward, terminated, truncated, info = env.step(action)
    step_values = (observation, reward, terminated, truncated, info)
    for label, value in zip(step_labels, step_values):
        print(label, value)
    print("")
    


Random actions:
Action: 0
Observation: (19, 8, True)
Reward: 1.0
Terminated: True
Truncated: False
Info: {}

Action: 1
Observation: (10, 9, False)
Reward: 0.0
Terminated: False
Truncated: False
Info: {}

Action: 0
Observation: (7, 10, False)
Reward: 1.0
Terminated: True
Truncated: False
Info: {}

Action: 0
Observation: (16, 4, False)
Reward: -1.0
Terminated: True
Truncated: False
Info: {}

Action: 1
Observation: (13, 10, False)
Reward: 0.0
Terminated: False
Truncated: False
Info: {}



  if not isinstance(terminated, (bool, np.bool8)):


Let's create a simple agent with a very naive policy: if its own sum is 20 or more it sticks with its cards; otherwise it hits for another card.

In [5]:
class NaiveBlackjackAgent:
    """Fixed-threshold Blackjack policy: stick on a high enough hand, otherwise hit.

    Args:
        stick_threshold: Smallest player sum at which the agent sticks.
            Defaults to 20, the value that was previously hard-coded.
    """

    # Action codes returned by play(): 0 means stick, 1 means hit.
    STICK = 0
    HIT = 1

    def __init__(self, stick_threshold=20):
        self.stick_threshold = stick_threshold

    def play(self, obs):
        """Choose an action for the given observation.

        Args:
            obs: Indexable observation whose first element is the player's
                current sum; the remaining elements are ignored.

        Returns:
            0 (stick) if ``obs[0] >= stick_threshold``, else 1 (hit).
        """
        return self.STICK if obs[0] >= self.stick_threshold else self.HIT
        

Now we will evaluate the agent

In [6]:
# Hyperparameters: number of episodes (hands) the evaluation loop will play
n_episodes = 100

# Create the naive agent with its default settings
agent = NaiveBlackjackAgent()


In [7]:
from collections import deque
from gymnasium.wrappers import RecordEpisodeStatistics
from IPython.display import clear_output
import wandb
import pygame
import os

# Start a wandb run for this evaluation; the notebook name is passed through the environment.
os.environ['WANDB_NOTEBOOK_NAME'] = 'blackjack_naive_solver.ipynb'
wandb.init(project="blackjack_naive", entity="ai42")

# Play n_episodes hands with the naive agent, logging every rendered frame and
# reward to wandb. The try/finally guarantees the environment is closed and the
# wandb run is finished even if an episode raises part-way through.
try:
    for episode in tqdm(range(n_episodes)):
        obs, info = env.reset()
        done = False
        clear_output()  # keep only the latest episode's printed output
        step = 0

        while not done:
            action = agent.play(obs)  # 0 = stick, 1 = hit, chosen by the agent's policy
            obs, reward, terminated, truncated, info = env.step(action)
            step += 1
            done = terminated or truncated

            # Render the current table, title it, and save it so wandb can upload it.
            frame = env.render()
            plt.imshow(frame)
            plt.axis('off')
            plt.title("Episode: {}, Step: {}".format(episode, step))
            plt.savefig('frame.png')
            plt.close()  # avoid accumulating open figures across steps

            # Log frame and reward together so both share the same wandb step.
            wandb.log({"frame": wandb.Image('frame.png'), "reward": reward})

            print("Reward:", reward)
            print("Done:", done)
            print("Info:", info)
            print("")
finally:
    env.close()
    wandb.finish()  # mark the run as finished so it is not left open in the notebook


[34m[1mwandb[0m: Currently logged in as: [33mneildlf[0m ([33mai42[0m). Use [1m`wandb login --relogin`[0m to force relogin


Problem at: C:\Users\neild\AppData\Local\Temp\ipykernel_21444\2919867372.py 10 <module>


MailboxError: transport failed

In [None]:
wandb login --relogin

SyntaxError: invalid syntax (1271219994.py, line 1)