In [1]:
import numpy as np
import random
from collections import deque
from blackjack import BlackjackGame
from tensorflow.keras import layers, models
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import os

In [2]:
class DQN:

    def __init__(self, state_size, action_size, alpha=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.alpha = alpha
        self.memory = deque(maxlen=2000)  # Aquí se define la memoria de repetición
        self.gamma = 0.95  # factor de descuento para las recompensas futuras
        self.epsilon = 0.9  # tasa de exploración inicial
        self.epsilon_min = 0.01  # tasa de exploración mínima
        self.epsilon_decay = 0.995  # factor de decaimiento de la tasa de exploración
        self.model = self._build_model()  # construcción del modelo DQN

    def _build_model(self):
        model = models.Sequential()
        model.add(layers.Dense(24, input_dim=self.state_size, activation="relu"))
        model.add(layers.Dense(24, activation="relu"))
        model.add(layers.Dense(self.action_size, activation="linear"))
        custom_optimizer = Adam(learning_rate=0.01)
        model.compile(loss="mse", optimizer=custom_optimizer)
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(
                    self.model.predict(next_state)[0]
                )
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    @staticmethod
    def has_usable_ace(hand):
        """Check if the hand has a usable ace."""
        value, ace = 0, False
        for card in hand:
            card_number = card["number"]
            value += min(
                10, int(card_number) if card_number not in ["J", "Q", "K", "A"] else 11
            )
            ace |= card_number == "A"
        return int(ace and value + 10 <= 21)

    def train(self, episodes, batch_size):
        one_percent = round(episodes / 100)

        for ep in range(episodes):
            print(ep)
            game = BlackjackGame()
            bet = 5
            game.start_game(bet)

            if ep % one_percent == 0:
                progress = (ep / episodes) * 100
                print(f"Training progress: {progress:.2f}%")

            dealer_card = (
                int(game.dealer_hand[0]["number"])
                if game.dealer_hand[0]["number"] not in ["J", "Q", "K", "A"]
                else (10 if game.dealer_hand[0]["number"] != "A" else 11)
            )
            status = ["act", "continue"]

            while status[1] == "continue":
                player_sum = game.hand_value(game.player_hand)
                usable_ace = self.has_usable_ace(game.player_hand)
                state = np.array([player_sum, dealer_card, usable_ace])
                state = np.reshape(state, [1, self.state_size])
                action = self.act(state)
                action_str = ["hit", "stay", "double"][action]
                bet = game.return_bounty(bet, action_str)
                status = game.player_action(action_str)
                new_player_sum = game.hand_value(game.player_hand)
                new_usable_ace = self.has_usable_ace(game.player_hand)
                next_state = np.array([new_player_sum, dealer_card, new_usable_ace])
                next_state = np.reshape(next_state, [1, self.state_size])

                reward = 0  # Intermediate reward, only final matters

                if status[1] == "player_blackjack":
                    reward += bet
                elif status[1] == "player_bust":
                    reward -= bet

                if reward != 0:
                    done = status[1] in ["player_blackjack", "player_bust"]
                    self.remember(state, action, reward, next_state, done)

                if status[0] == "stay":
                    break

            final_result = game.game_result()
            final_reward = (
                bet
                if final_result == "win"
                else (-bet if final_result == "loss" else 0)
            )
            self.remember(state, action, final_reward, next_state, True)

            if len(self.memory) > batch_size:
                self.replay(batch_size)

    def play(self, bet):
        game = BlackjackGame()
        game.start_game(bet)

        print("Dealer shows:", game.format_cards(game.dealer_hand[:1]))
        status = ["act", "continue"]
        print(game.format_cards(game.player_hand), game.hand_value(game.player_hand))
        while status[1] == "continue":
            player_sum = game.hand_value(game.player_hand)
            usable_ace = self.has_usable_ace(game.player_hand)
            dealer_card = (
                int(game.dealer_hand[0]["number"])
                if game.dealer_hand[0]["number"] not in ["J", "Q", "K", "A"]
                else (10 if game.dealer_hand[0]["number"] != "A" else 11)
            )
            state = np.array([player_sum, dealer_card, usable_ace])
            state = np.reshape(state, [1, self.state_size])
            action = self.act(state)
            action_str = ["hit", "stay", "double"][action]
            status = game.player_action(action_str)

            if action_str == "stay":
                break

            print(
                game.format_cards(game.player_hand), game.hand_value(game.player_hand)
            )

        if status[1] == "continue":
            print(
                "Dealer has:",
                game.format_cards(game.dealer_hand),
                game.hand_value(game.dealer_hand),
            )
            game.dealer_action()

        final_result = game.game_result()
        return final_result

In [3]:
EPISODES = 1000
state_size = 3  # player_sum, dealer_card, usable_ace
action_size = 3  # hit, stay, double
agent = DQN(state_size, action_size)
batch_size = 32

agent.train(100, batch_size)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-06-04 16:26:06.690091: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-06-04 16:26:06.690198: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-06-04 16:26:06.690223: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-06-04 16:26:06.690815: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-04 16:26:06.690859: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


0
Training progress: 0.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
Dealer hits and has: A♣ 4♥ 7♦ 12
Dealer hits and has: A♣ 4♥ 7♦ 3♦ 15
Dealer hits and has: A♣ 4♥ 7♦ 3♦ A♥ 16
Dealer hits and has: A♣ 4♥ 7♦ 3♦ A♥ 2♦ 18
1
Training progress: 1.00%
Dealer hits and has: 4♦ 2♦ 6♣ 12
Dealer hits and has: 4♦ 2♦ 6♣ Q♣ 22
2
Training progress: 2.00%
Dealer hits and has: K♦ 3♠ K♠ 23
3
Training progress: 3.00%
Dealer hits and has: 5♦ 2♥ Q♣ 17
4
Training progress: 4.00%
5
Training progress: 5.00%
6
Training progress: 6.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Dealer hits and has: 4♣ 10♠ 9♦ 23
7
Training progress: 7.00%
Dealer hits and has: 9♥ 5♣ 4♦ 18
8
Training progress: 8.00%
9
Training progress: 9.00%
Dealer hits and has: 8♣ 4♣ 3♣ 15
Dealer hits and has: 8♣ 4♣ 3♣ 9♥ 24
10
Training progress: 10.00%
Dealer hits and has: 3♠ A♦ A♠ 15
Dealer hits and has: 3♠ A♦ A♠ Q♠ 15
Dealer hits and has: 3♠ A♦ A♠ Q♠ K♦ 25
11
Training progress: 11.00

2024-06-04 16:26:07.151947: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Dealer hits and has: J♥ 4♣ A♦ 15
Dealer hits and has: J♥ 4♣ A♦ 2♥ 17
19
Training progress: 19.00%
20
Training progress: 20.00%
Dealer hits and has: K♣ 6♣ 5♣ 21
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [17]:
checkpoint_path = "models/v{ver}/training_{trainVer}/cp-{epoch:04d}.weights.h5"
checkpoint_dir = os.path.dirname(checkpoint_path)

print(checkpoint_path)
print(checkpoint_dir)

models/v{ver}/training_{trainVer}/cp-{epoch:04d}.weights.h5
models/v{ver}/training_{trainVer}


In [19]:
agent.model.save_weights(checkpoint_path.format(ver=1, trainVer=1, epoch=1))

In [20]:
os.listdir(checkpoint_dir)

FileNotFoundError: [Errno 2] No such file or directory: 'models/v{ver}/training_{trainVer}'