In [1]:
#!/usr/bin/env python3
from Play.ENV import LuxuryDiceGame
from Play.model import DiceMaster as Net
from collections import deque, namedtuple, OrderedDict
from tensorboardX import SummaryWriter
from lib.configuration import Config
from lib.utils import *

import os
import time
import numpy as np
import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

In [2]:
config = Config(os.path.join(os.getcwd(), "Play/config.yaml"))

---------------------------------- APP CONFIG ----------------------------------
data: 
  data_path: gs://xxxx.csv
  num_gaussian: 5
  time_length: 16
  train_test_split: 0.8
train: 
  batch_size: 4
  epoch: 50
test: 
  batch_size: 2
model: 
  loss: MeanSquaredError
  metrics: MeanSquaredError
optimizer: 
  method: adam
  learning_rate: 1e-5
weights: 
  simulation: weights/best_simulation_model.h5
RL_PARAMETER: 
  MEAN_REWARD_BOUND: 19.5
  INIT_BALANCE: 0
  GAMMA: 0.99
  BATCH_SIZE: 32
  REPLAY_SIZE: 10000
  OBSERVATION_SIZE: 15
  LEARNING_RATE: 1e-4
  SYNC_TARGET_FRAMES: 1000
  REPLAY_START_SIZE: 10000
  EPSILON_DECAY_LAST_EPOCH: 1e5
  EPSILON_START: 1.0
  EPSILON_FINAL: 0.02
overflow: 
  size: 40000000
  sequence: 20
probabilityTable: [{'large': 4730, 'leopard': 510, 'small': 4760}, {'large': 4740, 'leopard': 480, 'small': 4780}, {'large': 4750, 'leopard': 450, 'small': 4800}, {'large': 4760, 'leopard': 430, 'small': 4810}]
index_round: 100
odds: 
  small: 2
  leopard: 20
  large: 2


### Set GPU as available physical device

In [3]:
if gpus := tf.config.experimental.list_physical_devices(device_type='GPU'):
    tf.config.experimental.set_visible_devices(devices=gpus[0], device_type='GPU')

2023-12-26 08:49:33.107542: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-26 08:49:33.120139: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-26 08:49:33.120426: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [4]:
def calc_loss(batch):
    states, actions, rewards, dones, next_states = batch
    states_v = tf.constant(states)
    next_states_v = tf.constant(next_states)
    actions_v = tf.transpose(tf.constant(actions))
    rewards_v = tf.transpose(tf.constant(rewards))

    with tf.GradientTape() as tape:
        state_action_values = net(states_v)
        next_state_values = tgt_net(next_states_v)
        expected_state_action_values = next_state_values * config.RL_PARAMETER.GAMMA + rewards_v[:, tf.newaxis]
        loss_t = tf.keras.losses.MSE(state_action_values, expected_state_action_values)
    gradients = tape.gradient(loss_t, net.trainable_variables)
    optimizer.apply_gradients(zip(gradients, net.trainable_variables))
    return loss_t.numpy()


class Agent:
    def __init__(self, env, exp_buffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self.state = None
        self._reset()

    def _reset(self):
        self.state = self.env._reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon=0.0):
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            state_a = np.array(list(self.env.states["Observation"]) + list(self.env.states["TimeCode"][0]), copy=False)
            state_v = tf.constant(state_a.reshape(1, -1))
            act_v = net(state_v)
            action = {"Bet": tf.reduce_sum(act_v)*1e6, "Category": int(tf.argmax(act_v, axis=1))}

        # do step in the environment
        new_state, reward, is_done, call, prob = self.env.step(action)
        self.total_reward += reward
        exp = Experience(list(self.state["Observation"]) + list(self.state["TimeCode"].numpy()[0]),
                         [float(action["Bet"]), int(action["Category"])],
                         reward,
                         is_done,
                         list(new_state["Observation"]) + list(new_state["TimeCode"].numpy()[0]))
        self.exp_buffer.append(exp)
        self.state = new_state
        if is_done:
            self._reset()
        return net, reward, action, call, prob

In [5]:
# Experience = namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

In [21]:
env = LuxuryDiceGame(config)
net = Net()
tgt_net = Net()

buffer = ExperienceBuffer(config.RL_PARAMETER.REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = config.RL_PARAMETER.EPSILON_START

optimizer = tf.keras.optimizers.Adam(learning_rate=float(config.optimizer.learning_rate))
round_idx = 0
total_reward = 0
best_mean_reward = None

loss_cache = deque([], maxlen=3000)
reward_cache = deque([], maxlen=3000)
while True:
    round_idx += 1
    epsilon = max(config.RL_PARAMETER.EPSILON_FINAL, config.RL_PARAMETER.EPSILON_START - round_idx \
                  / float(config.RL_PARAMETER.EPSILON_DECAY_LAST_EPOCH))

    net, reward, action, call, prob = agent.play_step(net, epsilon)
    reward_cache.append(reward)
    if len(buffer) < config.RL_PARAMETER.REPLAY_START_SIZE:
        continue

    batch = buffer.sample(config.RL_PARAMETER.BATCH_SIZE)
    loss_cache.append(np.mean(calc_loss(batch)))

    if round_idx % 500 == 0:
        if isinstance(action, OrderedDict):
            bet = action["Bet"][0]
        else:
            bet = int(action["Bet"])
        select = selection_map[action["Category"]]
        print(f"Round: {round_idx}, LastAverageLoss: {sum(loss_cache)/len(loss_cache):.3f}, LastAverageReward: {sum(reward_cache)/len(reward_cache):.3f}, CurrReward: {reward}, eps: {epsilon:.2f}, Prob: {prob}, Call: {call}, Bet: {bet}, Select: {select}")

    if total_reward is not None:
        if best_mean_reward is None or best_mean_reward < total_reward:
            net.save(f"weights/best_LuxuaryDiceModel.tf", save_format='tf')
            if best_mean_reward is not None:
                print(f"Best mean reward updated {best_mean_reward:.3f} -> {total_reward:.3f}, model saved")
            best_mean_reward = total_reward

    if round_idx % config.RL_PARAMETER.SYNC_TARGET_FRAMES == 0:
        # tgt_net = tf.keras.models.clone_model(net)
        tgt_net.set_weights(net.get_weights())
    break

Round: 10000, LastAverageLoss: 30113679360.000, LastAverageReward: -3021.415, CurrReward: 7866, eps: 0.90, Prob: {'large': 4750, 'leopard': 450, 'small': 4800}, Call: leopard, Bet: 414.10992431640625, Select: leopard


2023-12-26 09:01:35.277231: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: weights/best_LuxuaryDiceModel.tf/assets
