In [1]:
#https://keras.io/examples/generative/vae/
#https://keras.io/examples/generative/molecule_generation/

In [2]:
#!python3 -m pip install --upgrade pip
#!pip install pydot
#!apt-get install -y graphviz

In [3]:
import os,shutil,random
from sklearn.preprocessing import MinMaxScaler
import joblib
from IPython.display import clear_output
import pandas as pd
import h5py
import tensorflow as tf
from IPython.display import clear_output,display, HTML
from sklearn.model_selection import train_test_split
import datetime as dt
import numpy as np
#================== initialization ==================
currentTM=dt.datetime.now().strftime("%Y-%m-%dT%H%M%S")
PROJECT = "testVAEModel"
LATENT_DIM = 32
VAE_LR = 1e-5
EPOCHS = 5
BATCH_SIZE = 32
PARQUET_PATH = './data/OptionsEOD_STG.parquet'
SCALER_PATH = './data/scaler.gz'
UNIQUE_KEYS = ['QUOTE_DATE','SYMBOL','EXPIRE_DATE']
SCALER_COL  = ['DTE','INTRINSIC_VALUE', 'TOTAL_VOLUME',	'C_BID',	'C_ASK', 'C_VOLUME',  'P_BID',	'P_ASK',	'P_VOLUME' ]
MODEL_PATH = "./models/"
H5_PATH = './data/OptTrainData/'
DISPLAY = False
WANDB_LOG = True
RESUME = False

2024-07-05 08:53:50.421002: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
#741d760b304d0be5b18d4ee9682f77156e6967b5
import wandb
from wandb.keras import WandbCallback
CONFIG = {    "latent_dim":LATENT_DIM,
                  "learning_rate": VAE_LR,
                  "epochs": EPOCHS,
                  "batch_size": BATCH_SIZE,
                  "architecture": "VAE",
                  "dataset": "OptionsChaine",
                  "encoder_dense_units":[128,64],
                  "encoder_dropout_rate":0.2,
                  "decoder_dense_units":[64, 128],
                  "decoder_dropout_rate":0.2,
               }
    
if WANDB_LOG :
    wandb.login()
    run = wandb.init(project=PROJECT, name=currentTM, config=CONFIG)

[34m[1mwandb[0m: Currently logged in as: [33mwasan-sinlapa[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
#Example

# from IPython.display import clear_output,display, HTML
# import numpy as np
# #load scaler
# scaler = MinMaxScaler()
# PartitionDate = [ d[-7:] for d in  os.listdir(PARQUET_PATH) if 'PartitionDate' in d]
# random.shuffle(PartitionDate)
# scaler = joblib.load(SCALER_PATH)


# for i,partdate in enumerate(PartitionDate) :
#     df = pd.read_parquet(PARQUET_PATH,engine='pyarrow'
#                                  , filters=[('PartitionDate', '=', partdate)]
#                                 )
#     df['P_VOLUME'] = df['P_VOLUME'].fillna(0)
#     df['C_VOLUME'] = df['C_VOLUME'].fillna(0)
#     DATA  = np.empty((0,) + (20,9) ) 
#     for opt_id in np.unique( df[["OPTIONS_ID"]].values):
#         df_filter  = df[df["OPTIONS_ID"]==opt_id]
#         if len(df_filter) == 20:
#             DATA = np.vstack((DATA ,[scaler.transform(df_filter[SCALER_COL])]))
#         else:
#             #print( len(df_filter) )
#             #display(HTML(df_filter[['STRIKE']+SCALER_COL].to_html()))
#             pass
            
#     ## Save the NumPy array to an HDF5 file
#     # with h5py.File(H5_PATH+f"{partdate}.h5", 'w') as f:
#     #     dset = f.create_dataset(f'{partdate}', data=DATA, chunks=True , compression='gzip')

#     print(f"[Processing] {partdate}, {round(((i+1)/len(PartitionDate))*100,2)}%     ",end='\r')

In [6]:
# # Save the NumPy array to an HDF5 file
# with h5py.File(H5_PATH, 'w') as f:
#     #dset = f.create_dataset('dataset', data=DATA, chunks=True, compression='gzip')
#     #test
#     dset = f.create_dataset('dataset', data=DATA, compression='gzip')

In [7]:
#=====================================================================

In [8]:
from src.model import OptionChainGenerator
from src.layer import encoder, decoder

model = OptionChainGenerator(
    encoder(latent_dim = LATENT_DIM, 
            input_shape= (20,3), 
            dense_units = CONFIG["encoder_dense_units"], 
            dropout_rate= CONFIG["encoder_dropout_rate"]
           ), 
    decoder(latent_dim  = LATENT_DIM , 
            output_shape= (20,1),
            dense_units = CONFIG["decoder_dense_units"],
            dropout_rate= CONFIG["decoder_dropout_rate"]
           )
)

def dummy_loss(y_true, y_pred):
    return 0.0
    
vae_optimizer = tf.keras.optimizers.Adam(learning_rate=VAE_LR)
model.compile(vae_optimizer )#, loss=dummy_loss)

In [9]:
################## show model ######################
if DISPLAY :
    from tensorflow.keras.utils import model_to_dot
    from IPython.display import SVG, display
    
    def display_model(model, width=1024, height=512):
        dot = model_to_dot(model, show_shapes=True, show_layer_names=True)
        svg_data = dot.create(prog='dot', format='svg').decode("utf-8")
        svg_html = f'<div style="width:{width}px;height:{height}px;">{svg_data}</div>'
        display(HTML(svg_html))

In [10]:
## Example usage:
## Display the encoder model with reduced size
if DISPLAY :
    display_model(model.encoder, width=1024, height=512)

In [11]:
if DISPLAY :
    display_model(model.decoder, width=2500, height=512)

In [12]:
#================== loadmodel ====================

In [13]:
from tensorflow.keras.models import load_model
model_path = MODEL_PATH+f'{PROJECT}'
if not RESUME :
    if os.path.exists(model_path) :
        shutil.rmtree(model_path)
if not os.path.exists(model_path):
    os.makedirs(model_path)
    model.encoder.save(model_path+f'/'+f'encoder.keras') 
    model.decoder.save(model_path+f'/'+f'decoder.keras') 
else:
    model.encoder = load_model(model_path+'/'+f'encoder.keras') 
    model.decoder = load_model(model_path+'/'+f'decoder.keras') 

In [14]:
#================== train model ==================
PartitionDate = [ d[-7:] for d in  os.listdir(PARQUET_PATH) if 'PartitionDate' in d]
random.shuffle(PartitionDate)

STOP_MODEL = False
for partdate in PartitionDate[:] :
    clear_output(wait=False)
    #nan problem
    #partdate = '2022-05'
    #normal 
    #partdate = '2011-12'
    
    with h5py.File(H5_PATH+partdate+".h5", 'r') as f:
        DATA = f[partdate][:]
        X = DATA[:, :, :3]  # เลือกข้อมูลแถวแรกถึงแถวที่ 3 สำหรับ X
        Y = DATA[:, :, 3:]  # เลือกข้อมูลแถวที่ 3 เป็นต้นไปสำหรับ Y
        x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)
        if len(X) :
            random.shuffle(PartitionDate)
            tf.keras.backend.clear_session() 
            history = model.fit(x_train , y_train, epochs=5, batch_size=BATCH_SIZE, validation_data=(x_val, y_val) )
            if  np.isnan(  np.average( history.history['kl_loss'] )  ) or np.isnan(  np.average( history.history['val_kl_loss'] )  ):
                STOP_MODEL = True 
                print(x_train)
                print("---")
                print(x_val)
                print("=============")
    if WANDB_LOG :
        LogKeys = history.history.keys()
        LogVal={}
        for k in LogKeys:  
            LogVal[k] = np.average(  history.history[k] )
        wandb.log(LogVal, commit=True)
        
    if STOP_MODEL :
        break
    
            
    model.encoder.save(model_path+f'/'+f'encoder.keras') 
    model.decoder.save(model_path+f'/'+f'decoder.keras') 
if WANDB_LOG : wandb.finish()

Epoch 1/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - kl_loss: 1.5858e-09 - total_loss: 0.0023 - vol_loss: 0.0036 - val_kl_loss: 0.0000e+00 - val_total_loss: 0.0033 - val_vol_loss: 0.0021
Epoch 2/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - kl_loss: 0.0000e+00 - total_loss: 0.0024 - vol_loss: 0.0035 - val_kl_loss: 0.0000e+00 - val_total_loss: 0.0033 - val_vol_loss: 0.0023
Epoch 3/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - kl_loss: 0.0000e+00 - total_loss: 0.0023 - vol_loss: 0.0037 - val_kl_loss: 0.0000e+00 - val_total_loss: 0.0033 - val_vol_loss: 0.0021
Epoch 4/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - kl_loss: 0.0000e+00 - total_loss: 0.0021 - vol_loss: 0.0036 - val_kl_loss: 0.0000e+00 - val_total_loss: 0.0033 - val_vol_loss: 0.0023
Epoch 5/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - kl_loss: 4.3120e-09 - total_loss: 0.00

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
kl_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
total_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_kl_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_total_loss,█▅▃▂▂▂▂▁▁▂▁▂▁▁▁▁▁▁▁▁▁▂▁▂▁▂▁▁▁▁▁▁▁▁▂▁▁▂▁▁
val_vol_loss,█▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
vol_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
kl_loss,0.0
total_loss,0.0
val_kl_loss,0.0
val_total_loss,0.00327
val_vol_loss,0.0022
vol_loss,0.0


In [15]:
`====================================================

SyntaxError: invalid syntax (2094489582.py, line 1)

In [17]:
X[0]

array([[0.00000000e+00, 3.92071562e-01, 1.73585972e-04],
       [0.00000000e+00, 3.92071562e-01, 1.73585972e-04],
       [0.00000000e+00, 3.92071562e-01, 1.73585972e-04],
       [0.00000000e+00, 3.92071562e-01, 1.73585972e-04],
       [0.00000000e+00, 3.92071562e-01, 1.73585972e-04],
       [1.79048797e-01, 3.94499003e-01, 1.73585972e-04],
       [1.79048797e-01, 3.94241586e-01, 1.73585972e-04],
       [1.79048797e-01, 3.93984169e-01, 1.73585972e-04],
       [1.79048797e-01, 3.93726752e-01, 1.73585972e-04],
       [1.79048797e-01, 3.93469335e-01, 1.73585972e-04],
       [1.79048797e-01, 3.93211918e-01, 1.73585972e-04],
       [1.79048797e-01, 3.92954502e-01, 1.73585972e-04],
       [1.79048797e-01, 3.92697085e-01, 1.73585972e-04],
       [1.79048797e-01, 3.92439668e-01, 1.73585972e-04],
       [1.79048797e-01, 3.92182251e-01, 1.73585972e-04],
       [0.00000000e+00, 3.92071562e-01, 1.73585972e-04],
       [0.00000000e+00, 3.92071562e-01, 1.73585972e-04],
       [0.00000000e+00, 3.92071

In [None]:
a=tf.expand_dims(y_val[:, :, 0], axis=-1).numpy()

In [None]:
np.sum(Y)

In [None]:
for i in a:
    if np.isnan( np.sum(i) ) :
        print(i)
        print('-------')
        break

In [None]:
np.sum(i)

In [None]:
print(x_val)
print("---")
print(y_val)
print("=============")

In [None]:
Y=y_val
for i in range ( len( np.transpose( Y[0] ) ) ):
    col = np.transpose( Y[i] ) 
    for c in range(len(col)):
        a = np.transpose( Y[i] )[c]
        if np.sum( a ) == 0 :
            print(f"0 - i:{i},c:{c}")
        if np.isnan( np.sum( a ) ) :
            print(f"nan - i:{i},c:{c}")