In [1]:
'''
%pip install wandb
%pip install matplotlib
%pip install numpy
%pip install tqdm
%matplotlib inline
%pip install gymnasium==0.29.1
'''

'\n%pip install wandb\n%pip install matplotlib\n%pip install numpy\n%pip install tqdm\n%matplotlib inline\n%pip install gymnasium==0.29.1\n'

In [2]:
#@title Imports
from collections import defaultdict #for accessing keys which are not present in dictionary
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import gymnasium as gym
import sys
import random
from matplotlib.patches import Patch
import seaborn as sns

Let´s first of all create the environment.
We´ll use the Gymnasium´s Blackjack environment, we´ll allow natural blackjacks as well and the settings won´t follow the Sutton & Barto´s Book´s approach.

In [3]:
env = gym.make('Blackjack-v1',sab=False, natural=True, render_mode='rgb_array') #We are not folllowing the default sutton and barto book settings, which are sab=True, natural=False, render_mode='human'

### Understanding and Observing the Environment

In [4]:
#observation space is a tuple of 3 elements:
#1. player's current sum (1-31)
#2. dealer's face up card (1-10)
#3. whether or not the player has a usable ace (0 or 1)

done = False
observation, info = env.reset() #get the first observation
print("Observation space:", env.observation_space)
print("\nAction space:", env.action_space) #0: stick, 1: hit
print("\nObservation:", observation) #Observation[1] is player's current sum, Observation[2] is dealer's face up card, Observation[3] is whether or not the player has a usable ace
print("\n info:", info)



Observation space: Tuple(Discrete(32), Discrete(11), Discrete(2))

Action space: Discrete(2)

Observation: (16, 10, 0)

 info: {}


### Now let´s see how the agent behaves when making a step

**env.step(action)** returns: observation, reward, terminated, truncated, info

**observation**: tuple of 3 elements (player's current sum, dealer's face up card, whether or not the player has a usable ace)

**reward**: +1.5, +1, 0 or -1 (win, draw or loss), 1.5 if the player wins with a natural blackjack

**terminated**: boolean (True if the episode is over)

**truncated**: boolean (True if the episode is over because it reached the maximum number of steps)

**info**: dictionary with additional information. We will not use this.

In [5]:
#sample random actions from the action space
print("Random actions:")
for i in range(5):
    env.reset() # reset the environment at the beginning of each iteration
    action = env.action_space.sample()
    print("Action:", action)
    observation, reward, terminated, truncated, info = env.step(action) #take a random action and observe the results of the action taken
    print("Observation:", observation) #Observation[1] is player's current sum, Observation[2] is dealer's face up card, Observation[3] is whether or not the player has a usable ace
    print("Reward:", reward) #reward is 1 if the player wins, 1.5 if player wins with natural blackjack (an usable ace and a 10), -1 if the player loses, and 0 if the game is a draw
    print("Terminated:", terminated)
    print("Truncated:", truncated)
    print("info:", info)
    print("")



Random actions:
Action: 0
Observation: (12, 5, 0)
Reward: 1.0
Terminated: True
Truncated: False
info: {}

Action: 0
Observation: (9, 5, 0)
Reward: 1.0
Terminated: True
Truncated: False
info: {}

Action: 1
Observation: (23, 10, 0)
Reward: -1.0
Terminated: True
Truncated: False
info: {}

Action: 1
Observation: (23, 8, 0)
Reward: -1.0
Terminated: True
Truncated: False
info: {}

Action: 1
Observation: (21, 9, 0)
Reward: 0.0
Terminated: False
Truncated: False
info: {}



Let´s create a simple agent, the policy is very naive, if its own sum surpasses 20, sticks with its cards, if not, hits for more.

In [6]:
class NaiveBlackjackAgent:
    def __init__(self):
        pass

    def play(self, obs):
        return 0 if obs[0] >= 20 else 1 #stick if player's current sum is 20 or more, else hit


Now we will evaluate the agent

In [7]:
#defining the hyperparameters
n_episodes = 100

#initialize the agent
agent = NaiveBlackjackAgent()


In [8]:
'''from collections import deque
from gymnasium.wrappers import RecordEpisodeStatistics
from IPython.display import clear_output
import wandb
import pygame


# initialize pygame and wandb
pygame.init()
wandb.init(project="blackjack_naive", entity="ai42")

# Assuming env and agent are defined and initialized here

for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    terminated = False
    truncated = False
    clear_output()
    step = 0

    while not terminated and not truncated:
        action = agent.play(obs)  # Agent's policy
        obs, reward, terminated, truncated, info = env.step(action)

        frame = env.render() # Ensure you're getting an RGB image
        step += 1
        plt.imshow(frame)
        plt.axis('off')
        plt.title(f"Episode: {episode}, Step: {step}")
        plt.savefig('frame.png')
        if terminated or truncated:
            plt.title(f"TERMINATED OR TRUNCATED, Episode: {episode}, Step: {step}")
            plt.savefig('frame.png')
            wandb.log({"frame": wandb.Image('frame.png')})
            plt.close()
            break
        wandb.log({"frame": wandb.Image('frame.png')})
        plt.close()



    print("Reward:", reward)
    print("Done:", terminated or truncated)
    print("info", info)
    wandb.log({"reward": reward})
    print("")

env.close()
'''

'from collections import deque\nfrom gymnasium.wrappers import RecordEpisodeStatistics\nfrom IPython.display import clear_output\nimport wandb\nimport pygame\n\n\n# initialize pygame and wandb\npygame.init()\nwandb.init(project="blackjack_naive", entity="ai42")\n\n# Assuming env and agent are defined and initialized here\n\nfor episode in tqdm(range(n_episodes)):\n    obs, info = env.reset()\n    terminated = False\n    truncated = False\n    clear_output()\n    step = 0\n    \n    while not terminated and not truncated:\n        action = agent.play(obs)  # Agent\'s policy\n        obs, reward, terminated, truncated, info = env.step(action)\n        \n        frame = env.render() # Ensure you\'re getting an RGB image\n        step += 1\n        plt.imshow(frame)\n        plt.axis(\'off\')\n        plt.title(f"Episode: {episode}, Step: {step}")\n        plt.savefig(\'frame.png\')\n        if terminated or truncated:\n            plt.title(f"TERMINATED OR TRUNCATED, Episode: {episode}, S

In [None]:
from collections import deque
from gymnasium.wrappers import RecordEpisodeStatistics
from IPython.display import clear_output
import wandb
import pygame


# Initialize wandb
wandb.init(project="blackjack_naive_100000", entity="ai42")
pygame.init()


n_episodes = 100000  # Define the number of episodes you want to run


win_rate = 0.0
loss_rate = 0.0
draw_rate = 0.0
natural_rate = 0.0

for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    terminated, truncated = False, False
    clear_output(wait=True)
    step = 0
    episode_rewards = 0  # Initialize total rewards for the episode

    while not terminated and not truncated:
        action = agent.play(obs)  # Agent's policy
        obs, reward, terminated, truncated, info = env.step(action)

        # Ensure you're getting an RGB image
        frame = env.render()
        step += 1
        episode_rewards += reward  # Accumulate rewards

        # Plot frame
        plt.imshow(frame)
        plt.axis('off')
        plt.title(f"Episode: {episode}, Step: {step}")
        plt.savefig('frame.png')
        plt.close()

        # Log the frame and rewards to wandb
        wandb.log({
            "episode": episode,
            "step": step,
            "frame": wandb.Image('frame.png'),
            "reward": reward,
            "cumulative_reward": episode_rewards
        })
    if reward == 1 or reward == 1.5:
        win_rate += 1
    elif reward == -1:
        loss_rate += 1
    elif reward == 0:
        draw_rate += 1
    if reward == 1.5:
        natural_rate += 1


env.close()

# Let´s log general statistics of the training
wandb.log({"Win_rate": win_rate / n_episodes, "Loss_rate": loss_rate / n_episodes, "Draw_rate": draw_rate / n_episodes, "Natural_win_rate": natural_rate / n_episodes}) # Log the episode statistics to wandb



  1%|          | 582/100000 [04:08<11:50:26,  2.33it/s]