In [1]:
import gym 
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete

import numpy as np
import random
import os

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

from classes import Deck
from classes import Hand

In [2]:
class BJEnv(Env):
    def __init__(self):
        self.action_space = Discrete(2) # 0 = Stay, 1 = Draw
        self.observation_space =Box(low=0,high=22, shape=(2,))
        
        self.Deck = Deck()
        self.Hand = Hand(self.Deck)        
        
        self.repeat_num_list = np.array([0])
        self.repeat_mean = 0 #Var to calculate mean val of repeat
        self.state = [self.Deck.draw_a_card(), self.Hand.sum_up()]
        pass
    
    def step(self, action):

        if action == 1:
            self.Hand.draw(self.Deck)
            self.repeat_num_list[-1] += 1

        self.state[1] = self.Hand.sum_up()
        if self.state[1] < 22:
            reward = 1
        else :
            reward = -1
            done = True

        if action == 0:
            while self.state[0] < self.state[1]:
                self.state[0] += self.Deck.draw_a_card()
            done = True
            if self.state[0] > 21 or self.state[1] > self.state[0]:
                reward = 2
            else:
                reward = -1
        else:
            done = False

        info = {}

        return self.state, reward, done, info
    
    def render(self, mode = "human"):
        print('Dealer sum:{}, Player sum:{}'.format(self.Deck.draw_a_card(), self.Hand.sum_up()))
        print("Accual repeat:{}".format(self.repeat_num_list[-1]))
        print("Mean value of all repeats:{}". format(self.repeat_mean))
        print("Best possible val right now:{}".format(1 + ((self.repeat_mean-1)*2)))
        pass
    
    def reset(self):
        self.Deck = Deck()
        self.Hand = Hand(self.Deck)        
        
        self.state = [self.Deck.draw_a_card(), self.Hand.sum_up()]
        self.repeat_num_list = np.append(self.repeat_num_list, 0)
        self.repeat_mean = self.repeat_num_list.mean()
        return self.state

In [3]:
env = BJEnv()

In [4]:
print(env.state)
print(env.observation_space.sample())

[10, 17]
[ 5.644338 15.257443]


In [5]:
log_path = os.path.join('Trainig', 'Logs')
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
save_path = os.path.join('Training', 'Saved_models', 'BJ_PPO_200000k')

In [7]:
model.learn(total_timesteps=200000)
model.save(save_path)

2022-09-18 23:43:00.715365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-18 23:43:00.935116: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/jakub/.local/lib/python3.10/site-packages/cv2/../../lib64:
2022-09-18 23:43:00.935144: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-18 23:43:00.986208: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already b

Logging to Trainig/Logs/PPO_26
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2.17     |
|    ep_rew_mean     | 0.28     |
| time/              |          |
|    fps             | 460      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.31        |
|    ep_rew_mean          | 0.72        |
| time/                   |             |
|    fps                  | 606         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.023662236 |
|    clip_fraction        | 0.404       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.673      |
|    explained_variance   | -0.0331     |

In [9]:
model = PPO.load(save_path)
evaluate_policy(model, env, n_eval_episodes=50, render=True)

Dealer sum:8, Player sum:14
Accual repeat:1
Mean value of all repeats:1.5294836272040302
Best possible val right now:2.0589672544080604
Dealer sum:6, Player sum:16
Accual repeat:2
Mean value of all repeats:1.5294836272040302
Best possible val right now:2.0589672544080604
Dealer sum:7, Player sum:25
Accual repeat:3
Mean value of all repeats:1.5294836272040302
Best possible val right now:2.0589672544080604
Dealer sum:2, Player sum:16
Accual repeat:0
Mean value of all repeats:1.5295021473281194
Best possible val right now:2.059004294656239
Dealer sum:4, Player sum:20
Accual repeat:1
Mean value of all repeats:1.5295021473281194
Best possible val right now:2.059004294656239
Dealer sum:1, Player sum:26
Accual repeat:2
Mean value of all repeats:1.5295021473281194
Best possible val right now:2.059004294656239
Dealer sum:8, Player sum:11
Accual repeat:0
Mean value of all repeats:1.5295080728445127
Best possible val right now:2.0590161456890255
Dealer sum:4, Player sum:17
Accual repeat:1
Mean va

(2.24, 0.9911609354691094)

PPO:\
25K = (2.02, 1.2726350615946427)\
200K = (1.98, 0.905317623820502)\ 

A2C:\
25K = (1.28, 1.3862178760930766)