In [1]:
# -*- coding: utf-8 -*-
from io import BytesIO
from tensorflow.python.lib.io import file_io
from tqdm.auto import tqdm
from configuration import Config
from Simulation.model import LuxuryDiceSimulationMdn as Model
from Simulation.loss import MdnLoss

import os
import numpy as np
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

In [2]:
model = Model()

2023-12-16 15:09:37.724192: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-16 15:09:37.736890: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-16 15:09:37.737203: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-16 15:09:37.738616: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [2]:
#
# Configuration Loading
# ----------------------------------------------------------------------------------------------------------------------
config = Config(os.path.join(os.getcwd(), "Simulation/config.yaml"))

# Set GPU as available physical device
if gpus := tf.config.experimental.list_physical_devices(device_type='GPU'):
    tf.config.experimental.set_visible_devices(devices=gpus[0], device_type='GPU')

---------------------------------- APP CONFIG ----------------------------------
data: 
  data_path: gs://bin_for_aiops/GambleMaster/LuxuryDice/luxury_dice_v2.npz
  time_length: 15
  train_test_split: 0.8
train: 
  batch_size: 256
  epoch: 3000
test: 
  batch_size: 128
model: 
  loss: MDN
  metrics: MDN
  focal_weight: 5
optimizer: 
  method: adam
  learning_rate: 1e-4
weights: 
  simulation: weights/best_luxury_dice_simulation_model.tf
--------------------------------------------------------------------------------


2023-12-16 14:53:05.997865: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-16 14:53:06.010535: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-12-16 14:53:06.010821: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [4]:
if __name__ == "__main__":
    # Data Preparation
    # x = np.random.rand(32, 15, 3)
    # x = tf.cast(x, tf.float32)
    # time_encode = np.random.rand(32, 2)
    # time_encode = tf.cast(time_encode, tf.float32)
    # y = np.random.rand(32, 13)
    # y = tf.cast(y, tf.float32)
    f = BytesIO(file_io.read_file_to_string(config.data.data_path, binary_mode=True))
    data = np.load(f)
    x, time_encode, y = data["record"], data["time_code"], data["y"]
    x = tf.cast(x, tf.float32)
    time_encode = tf.cast(time_encode, tf.float32)
    y = tf.cast(y, tf.float32)

    k = int(config.data.train_test_split * x.shape[0])
    x_train, x_test = x[:k, :, :], x[k:, :, :]
    time_encode_train, time_encode_test = time_encode[:k, :], time_encode[k:, :]
    y_train, y_test = y[:k, :], y[k:, :]

    training_data = tf.data.Dataset.from_tensor_slices((x_train, time_encode_train, y_train))
    training_batch = training_data.batch(config.train.batch_size)
    testing_data = tf.data.Dataset.from_tensor_slices((x_test, time_encode_test, y_test))
    testing_batch = testing_data.batch(config.train.batch_size)

    #
    # Create model (BetSimulation)
    # ----------------------------------------------------------------------------------------------------------------------
    model = Model()

    if config.optimizer.method == 'sgd':
        optimizer = tf.keras.optimizers.SGD(learning_rate=float(config.optimizer.learning_rate))
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=float(config.optimizer.learning_rate))

    #
    # Loss
    # ----------------------------------------------------------------------------------------------------------------------
    # kld = tf.keras.losses.KLDivergence()
    mdn_loss = MdnLoss(reduce=False)
    mse = tf.keras.losses.MeanSquaredError()

    #
    # Train Model
    # ----------------------------------------------------------------------------------------------------------------------
    train_losses = []
    test_losses = []
    best_train_loss = float("inf")
    best_valid_loss = float("inf")
    for e in range(config.train.epoch):
        train_loss_cache = []
        train_mse_cache = []
        test_loss_cache = []
        test_mse_cache = []
        for x, time, y in tqdm(training_batch, desc="Training"):
            with tf.GradientTape() as tape:
                y_mu, y_sigma = model(x, time, training=True)
                neg_log_pdf_normal = mdn_loss(y_mu[:, :1], y_sigma[:, :1], y[:, :1], "Normal")
                neg_log_pdf_beta = mdn_loss(y_mu[:, 1:], y_sigma[:, 1:], y[:, 1:], "Beta")
                neg_log_pdf_normal = tf.math.reduce_mean(neg_log_pdf_normal)
                neg_log_pdf_beta = tf.math.reduce_mean(neg_log_pdf_beta)
                focal_weight = config.model.focal_weight if neg_log_pdf_normal > 0 else 1/config.model.focal_weight
                mse_loss = mse(y[:, :1], y_mu[:, :1])
                train_loss = neg_log_pdf_normal * focal_weight + neg_log_pdf_beta + mse_loss * 1e3

            gradients = tape.gradient(train_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            train_loss_cache.append(train_loss.numpy())
            train_mse_cache.append(mse_loss.numpy())

        for x, time, y in tqdm(testing_batch, desc="Testing"):
            y_mu, y_sigma = model(x, time)
            neg_log_pdf_normal = mdn_loss(y_mu[:, :1], y_sigma[:, :1], y[:, :1], "Normal")
            neg_log_pdf_beta = mdn_loss(y_mu[:, 1:], y_sigma[:, 1:], y[:, 1:], "Beta")
            neg_log_pdf_normal = tf.math.reduce_mean(neg_log_pdf_normal)
            neg_log_pdf_beta = tf.math.reduce_mean(neg_log_pdf_beta)
            focal_weight = config.model.focal_weight if neg_log_pdf_normal > 0 else 1/config.model.focal_weight
            mse_loss = mse(y[:, :1], y_mu[:, :1])
            test_loss = neg_log_pdf_normal * focal_weight + neg_log_pdf_beta + mse_loss * 1e3
            test_loss_cache.append(test_loss.numpy())
            test_mse_cache.append(mse_loss.numpy())

        train_loss_epoch = np.mean(train_loss_cache)
        test_loss_epoch = np.mean(test_loss_cache)
        train_losses.append(train_loss_epoch)
        test_losses.append(test_loss_epoch)
        
        print('Epoch: {}/{}\ttrain_loss: {:.6f}\ttest_loss: {:.6f}'.
              format(e + 1, config.train.epoch, train_loss_epoch, test_loss_epoch))
        # break

Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 1/3000	train_loss: 37770802233344.000000	test_loss: 3785916153856.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 2/3000	train_loss: 3681834237952.000000	test_loss: 2990320123904.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 3/3000	train_loss: 3202255683584.000000	test_loss: 2663453294592.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 4/3000	train_loss: 2745961021440.000000	test_loss: 2311037911040.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 5/3000	train_loss: 2308510580736.000000	test_loss: 1983167594496.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 6/3000	train_loss: 1905829478400.000000	test_loss: 1660243542016.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 7/3000	train_loss: 1541000658944.000000	test_loss: 1370299564032.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 8/3000	train_loss: 1241519095808.000000	test_loss: 1122038317056.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 9/3000	train_loss: 979967344640.000000	test_loss: 859600388096.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 10/3000	train_loss: 792613748736.000000	test_loss: 723215974400.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]

Testing:   0%|          | 0/176 [00:00<?, ?it/s]

Epoch: 11/3000	train_loss: 682038525952.000000	test_loss: 628315914240.000000


Training:   0%|          | 0/703 [00:00<?, ?it/s]


KeyboardInterrupt

