In [None]:
%pip install wandb
%pip install matplotlib
%pip install numpy
%pip install tqdm
%matplotlib inline
%pip install gymnasium==0.29.1

Collecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.34.0-py2.py3-none-any.whl (243 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.9/243.9 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [None]:
#@title Imports
from collections import defaultdict #for accessing keys which are not present in dictionary
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import gymnasium as gym
import sys
import random
from matplotlib.patches import Patch
import seaborn as sns

In [None]:
class MC_BlackjackAgent:
    """Stochastic Blackjack policy whose hit probability is estimated by Monte Carlo sampling.

    After ``train()`` the agent holds two lookup tables:

    * ``likelihood_of_hit`` -- player total (1..21) -> estimated probability that
      drawing one more card does NOT bust. Totals up to 11 are fixed at 1.
    * ``likelihood_of_hit_based_on_dealer`` -- dealer's visible card (1..10) -> a
      hand-picked factor rising linearly from 0.9 to 0.99.

    ``play()`` hits with probability equal to the product of the two table entries.
    """

    # Card values of one suit; the four 10s stand for 10, J, Q and K.
    _CARD_VALUES = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10)

    def __init__(self, seed=None):
        """Create an untrained agent.

        Args:
            seed: Optional seed for the agent's private random generator. Pass a
                value to make ``train()`` and ``play()`` reproducible.
        """
        # Imported locally and kept as a private generator on purpose: elsewhere in the
        # notebook the global name `random` is rebound to numpy.random, so relying on
        # the global would make this class depend on cell execution order.
        import random as _stdlib_random

        self._rng = _stdlib_random.Random(seed)
        self.likelihood_of_hit = {}
        self.likelihood_of_hit_based_on_dealer = {}

    def train(self, mc_iterations=100000):
        """Estimate, for every player total from 12 to 21, the chance of surviving one hit.

        Args:
            mc_iterations: Number of simulated card draws per player total.

        Raises:
            ValueError: If ``mc_iterations`` is not positive.
        """
        if mc_iterations <= 0:
            raise ValueError("mc_iterations must be a positive integer")

        # Start at 12 because a hit can never bust a total of 11 or less.
        bust_counts = {}
        for player_total in tqdm(range(12, 22)):
            # An ace drawn on a total of 12 or more must count as 1 (counting it as 11
            # would always bust), so the raw card values can be used directly.
            bust_counts[player_total] = sum(
                player_total + self._rng.choice(self._CARD_VALUES) > 21
                for _ in range(mc_iterations)
            )

        self.likelihood_of_hit = {player_total: 1 for player_total in range(1, 12)}
        # Round AFTER subtracting so no float artefacts such as 0.22960000000000003 leak out.
        self.likelihood_of_hit.update(
            {
                player_total: round(1 - busts / mc_iterations, 4)
                for player_total, busts in bust_counts.items()
            }
        )

        # Heuristic factor per dealer card: 0.90 for an ace (1) up to 0.99 for a ten.
        self.likelihood_of_hit_based_on_dealer = {
            dealer_card: round(0.9 + 0.01 * (dealer_card - 1), 2)
            for dealer_card in range(1, 11)
        }

    def play(self, obs):
        """Choose an action for the given observation.

        Args:
            obs: Indexable observation; ``obs[0]`` is the player's current total and
                ``obs[1]`` the dealer's visible card (1..10). Further elements are ignored.

        Returns:
            1 to hit, 0 to stick.

        Raises:
            RuntimeError: If called before ``train()``.
        """
        if not self.likelihood_of_hit:
            raise RuntimeError("MC_BlackjackAgent.play() called before train()")

        player_total, dealer_card = obs[0], obs[1]
        if player_total <= 11:
            return 1  # cannot bust, always hit
        if player_total > 21:
            return 0  # already bust, nothing to gain

        hit_probability = (
            self.likelihood_of_hit[player_total]
            * self.likelihood_of_hit_based_on_dealer[dealer_card]
        )
        return 1 if self._rng.random() < hit_probability else 0


In [None]:
# Build the agent, then estimate its hit probabilities with 100,000 simulated draws per player total.
agent = MC_BlackjackAgent()
agent.train(mc_iterations=100_000)


100%|██████████| 10/10 [00:00<00:00, 14.60it/s]


In [None]:
# Show both lookup tables: hit probability per player total, and the factor per dealer card.
(
    agent.likelihood_of_hit,
    agent.likelihood_of_hit_based_on_dealer,
)

({1: 1,
  2: 1,
  3: 1,
  4: 1,
  5: 1,
  6: 1,
  7: 1,
  8: 1,
  9: 1,
  10: 1,
  11: 1,
  12: 0.6899,
  13: 0.6128,
  14: 0.5395,
  15: 0.4616,
  16: 0.384,
  17: 0.3073,
  18: 0.22960000000000003,
  19: 0.15410000000000001,
  20: 0.07669999999999999,
  21: 0.0},
 {1: 0.9,
  2: 0.91,
  3: 0.92,
  4: 0.93,
  5: 0.94,
  6: 0.95,
  7: 0.96,
  8: 0.97,
  9: 0.98,
  10: 0.99})

In [None]:
from collections import deque
from gymnasium.wrappers import RecordEpisodeStatistics
from IPython.display import clear_output
import wandb
import pygame
# NOTE: this rebinds the notebook-wide name `random` from the stdlib module to numpy.random
# for every cell executed after this one.
from numpy import random

N_RUNS = 10        # number of independent wandb runs
n_episodes = 1000  # number of episodes played per run

# Load the environment. We deliberately do not use the Sutton & Barto variant (sab=True):
# natural=True is requested so a natural blackjack is rewarded with 1.5, and
# render_mode='rgb_array' makes env.render() return a frame instead of opening a window.
env = gym.make('Blackjack-v1', sab=False, natural=True, render_mode='rgb_array')
pygame.init()

try:
    for _ in range(N_RUNS):
        # The context manager finishes the wandb run even if an episode raises, so every
        # iteration gets its own cleanly closed run.
        with wandb.init(project="blackjack_MC_Complex_200", entity="ai42"):
            wins = 0.0
            losses = 0.0
            draws = 0.0
            naturals = 0.0

            for episode in tqdm(range(n_episodes)):
                obs, info = env.reset()
                terminated, truncated = False, False
                clear_output()
                step = 0
                episode_rewards = 0  # total reward collected in this episode

                while not terminated and not truncated:
                    action = agent.play(obs)  # agent's policy
                    obs, reward, terminated, truncated, info = env.step(action)

                    frame = env.render()
                    step += 1
                    episode_rewards += reward

                    # Draw the frame with an annotated title and save it for upload.
                    fig, ax = plt.subplots()
                    ax.imshow(frame)
                    ax.axis('off')
                    ax.set_title(f"Episode: {episode} - Step: {step} - Action Taken: {action} - Reward: {reward} - Terminated: {terminated}")
                    fig.savefig('frame.png')
                    plt.close(fig)  # avoid accumulating open figures across thousands of steps

                    # Log the frame and rewards to wandb
                    wandb.log({
                        "episode": episode,
                        "step": step,
                        "frame": wandb.Image('frame.png'),
                        "reward": reward,
                        "cumulative_reward": episode_rewards
                    })

                # Classify the episode by its final reward (1.5 is a win with a natural blackjack).
                if reward == 1 or reward == 1.5:
                    wins += 1
                elif reward == -1:
                    losses += 1
                elif reward == 0:
                    draws += 1
                if reward == 1.5:
                    naturals += 1

            # Log the aggregate outcome rates of this run
            wandb.log({"Win_rate": wins / n_episodes, "Loss_rate": losses / n_episodes, "Draw_rate": draws / n_episodes, "Natural_win_rate": naturals / n_episodes})
finally:
    # Close the environment once, after ALL runs; closing it inside the run loop left the
    # later runs stepping and rendering an already-closed environment.
    env.close()
