In [1]:
#input must be options chain day1 - end

In [3]:
import wandb
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
import joblib

2024-06-09 02:14:49.238270: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
CSV_PATH     = './data/OptionsEOD.csv/'
PARQUET_PATH = './data/OptionsEOD.parquet'
SCALER_COL  = ['DTE','INTRINSIC_VALUE','C_VOLUME',	'C_BID',	'C_ASK',	'P_BID',	'P_ASK',	'P_VOLUME' ]
SCALER_PATH = './data/scaler.gz'

In [14]:
#example
EOD_CSV = pd.read_csv(CSV_PATH+"qqq/qqq_eod_201201.txt", engine='pyarrow')
EOD_CSV.head()
EOD_CSV['[INTRINSIC_VALUE]']

KeyError: '[INTRINSIC_VALUE]'

In [9]:
EOD_CSV.columns

Index(['[QUOTE_UNIXTIME]', ' [QUOTE_READTIME]', ' [QUOTE_DATE]',
       ' [QUOTE_TIME_HOURS]', ' [UNDERLYING_LAST]', ' [EXPIRE_DATE]',
       ' [EXPIRE_UNIX]', ' [DTE]', ' [C_DELTA]', ' [C_GAMMA]', ' [C_VEGA]',
       ' [C_THETA]', ' [C_RHO]', ' [C_IV]', ' [C_VOLUME]', ' [C_LAST]',
       ' [C_SIZE]', ' [C_BID]', ' [C_ASK]', ' [STRIKE]', ' [P_BID]',
       ' [P_ASK]', ' [P_SIZE]', ' [P_LAST]', ' [P_DELTA]', ' [P_GAMMA]',
       ' [P_VEGA]', ' [P_THETA]', ' [P_RHO]', ' [P_IV]', ' [P_VOLUME]',
       ' [STRIKE_DISTANCE]', ' [STRIKE_DISTANCE_PCT]'],
      dtype='object')

In [None]:
#Part I
#TransformData : partition from EXPIRE_DATE 
def TransformData():
    scaler = MinMaxScaler()
    schema = None
    pqwriter = None
    for d in os.listdir(CSV_PATH):
        for f in os.listdir(CSV_PATH+f"{d}/"):
            if f.endswith(".txt"):
                ## load
                print( f"[LOAD] : {CSV_PATH}{d}/{f}        ",end='\r')
                EOD_CSV = pd.read_csv(CSV_PATH+f"{d}/"+f, engine='pyarrow')
                    
                ## rename col.
                for c in EOD_CSV.columns:
                    EOD_CSV = EOD_CSV.rename( columns={ c:c.strip().replace(']','').replace('[','') } )
                
                ## add symbol 
                EOD_CSV['SYMBOL'] = d.upper()
                ## add INTRINSIC_VALUE
                EOD_CSV['INTRINSIC_VALUE'] = EOD_CSV['UNDERLYING_LAST'] - EOD_CSV['STRIKE']
                
                ## fillnafillna
                EOD_CSV['P_VOLUME'] = EOD_CSV['P_VOLUME'].fillna(0)
                EOD_CSV['C_VOLUME'] = EOD_CSV['C_VOLUME'].fillna(0)
                
                # date columns convert to datetime
                for c in ["QUOTE_READTIME","QUOTE_DATE","EXPIRE_DATE"]:
                    EOD_CSV[c] = pd.to_datetime(EOD_CSV[c])
                
                #clean float data
                for c in ['INTRINSIC_VALUE','C_DELTA','C_GAMMA','C_VEGA','C_THETA','C_RHO','C_IV','C_VOLUME','C_LAST','C_BID','C_ASK','STRIKE','P_BID','P_ASK','P_LAST','P_DELTA','P_GAMMA','P_VEGA','P_THETA','P_RHO','P_IV','P_VOLUME','STRIKE_DISTANCE','STRIKE_DISTANCE_PCT']:
                    if EOD_CSV[c].dtype not in ( 'float32','float64'):
                        EOD_CSV[c] = EOD_CSV[c].apply(lambda x: x.strip())
                        EOD_CSV[c] = EOD_CSV[c].replace('', np.nan).fillna(np.nan)
                        EOD_CSV[c] = EOD_CSV[c].astype('float64')
                    if EOD_CSV[c].dtype == 'float32':
                        EOD_CSV[c] = EOD_CSV[c].astype('float64')
                        
                # REMAIN_DAYS(int) =>  use DTE col.
                #partition with QUOTE_DATE
                EOD_CSV['PartitionDate'] = EOD_CSV['QUOTE_DATE'].dt.strftime('%Y-%m')
                EOD_CSV.sort_values(['QUOTE_DATE','EXPIRE_DATE','SYMBOL','STRIKE'],ascending =False ) 

                #scaler(Normalization_
                scaler.partial_fit(EOD_CSV[SCALER_COL])

                # save
                if os.path.exists(parquet_path):
                  EOD_CSV.to_parquet(parquet_path, engine='fastparquet', append=True, partition_cols=['PartitionDate'], index=False )
                else:
                  EOD_CSV.to_parquet(parquet_path, engine='fastparquet' , partition_cols=['PartitionDate'], index=False  )
                    
    joblib.dump(scaler, SCALER_PATH )
    if pqwriter:
        pqwriter.close()
    print( f"[DONE]                                                       ",end='\r')

##-RunCleanData
TransformData()

In [None]:
#Part II 
#Normalization if not have scaler.gz file
scaler = MinMaxScaler()
PartitionDate = [ d[-7:] for d in  os.listdir(parquet_path) if 'PartitionDate' in d]
for i,partdate in enumerate(PartitionDate) :
    df = pd.read_parquet(parquet_path,engine='pyarrow'
                                 , filters=[('PartitionDate', '=', partdate)]
                                )
    df['P_VOLUME'] = df['P_VOLUME'].fillna(0)
    df['C_VOLUME'] = df['C_VOLUME'].fillna(0)
    
    scaler.partial_fit(df[SCALER_COL])

    print(f"[Processing] {round(((i+1)/len(PartitionDate))*100,0)}%   ",end='\r')
    
joblib.dump(scaler, SCALER_PATH )


In [None]:
#=======================================================================================

In [16]:
#example transform(norm)
#load scaler
scaler = MinMaxScaler()
PartitionDate = [ d[-7:] for d in  os.listdir(parquet_path) if 'PartitionDate' in d]
for i,partdate in enumerate(PartitionDate) :
    df = pd.read_parquet(parquet_path,engine='pyarrow'
                                 , filters=[('PartitionDate', '=', partdate)]
                                )
    df['P_VOLUME'] = df['P_VOLUME'].fillna(0)
    df['C_VOLUME'] = df['C_VOLUME'].fillna(0)
    break
scaler = joblib.load(SCALER_PATH)
scaler.transform(df[SCALER_COL])

array([[1.49622950e-03, 6.37199400e-01, 0.00000000e+00, ...,
        0.00000000e+00, 4.99997475e-08, 0.00000000e+00],
       [1.49622950e-03, 6.37121956e-01, 0.00000000e+00, ...,
        0.00000000e+00, 4.99997475e-08, 0.00000000e+00],
       [1.49622950e-03, 6.37044512e-01, 0.00000000e+00, ...,
        0.00000000e+00, 4.99997475e-08, 0.00000000e+00],
       ...,
       [5.24677812e-01, 5.63773764e-01, 0.00000000e+00, ...,
        1.49696479e-01, 5.01442468e-03, 0.00000000e+00],
       [5.24677812e-01, 5.44412710e-01, 0.00000000e+00, ...,
        1.87128244e-01, 6.23851850e-03, 0.00000000e+00],
       [5.24677812e-01, 5.05690601e-01, 0.00000000e+00, ...,
        2.62221135e-01, 8.69395610e-03, 0.00000000e+00]])

In [18]:
df.head()

Unnamed: 0,QUOTE_UNIXTIME,QUOTE_READTIME,QUOTE_DATE,QUOTE_TIME_HOURS,UNDERLYING_LAST,EXPIRE_DATE,EXPIRE_UNIX,DTE,C_DELTA,C_GAMMA,...,P_VEGA,P_THETA,P_RHO,P_IV,P_VOLUME,STRIKE_DISTANCE,STRIKE_DISTANCE_PCT,SYMBOL,INTRINSIC_VALUE,PartitionDate
0,1325624400,2012-01-03 16:00:00,2012-01-03,16.0,56.9,2012-01-06,1325883600,3.0,1.0,0.0,...,0.00035,-0.00408,0.0,0.79359,0.0,10.9,0.192,QQQ,10.9,2012-01
1,1325624400,2012-01-03 16:00:00,2012-01-03,16.0,56.9,2012-01-06,1325883600,3.0,1.0,0.0,...,0.00053,-0.00442,-0.00042,0.72127,0.0,9.9,0.174,QQQ,9.9,2012-01
2,1325624400,2012-01-03 16:00:00,2012-01-03,16.0,56.9,2012-01-06,1325883600,3.0,1.0,0.0,...,0.00095,-0.00435,-0.00043,0.64864,0.0,8.9,0.156,QQQ,8.9,2012-01
3,1325624400,2012-01-03 16:00:00,2012-01-03,16.0,56.9,2012-01-06,1325883600,3.0,1.0,0.0,...,0.00117,-0.00405,-0.00033,0.57777,0.0,7.9,0.139,QQQ,7.9,2012-01
4,1325624400,2012-01-03 16:00:00,2012-01-03,16.0,56.9,2012-01-06,1325883600,3.0,0.97154,0.01636,...,0.00137,-0.00414,0.0,0.50756,0.0,6.9,0.121,QQQ,6.9,2012-01


In [17]:
df[SCALER_COL]

Unnamed: 0,DTE,INTRINSIC_VALUE,C_VOLUME,C_BID,C_ASK,P_BID,P_ASK,P_VOLUME
0,3.0,10.90,0.0,9.68,12.16,0.0,0.01,0.0
1,3.0,9.90,0.0,8.68,10.93,0.0,0.01,0.0
2,3.0,8.90,0.0,7.62,10.16,0.0,0.01,0.0
3,3.0,7.90,0.0,6.62,9.18,0.0,0.01,0.0
4,3.0,6.90,0.0,5.75,8.18,0.0,0.01,0.0
...,...,...,...,...,...,...,...,...
56537,1052.0,-787.21,0.0,0.00,10.01,833.0,856.90,0.0
56538,1052.0,-887.21,0.0,0.00,10.00,930.2,954.09,0.0
56539,1052.0,-937.21,0.0,1.14,10.00,979.0,1002.89,0.0
56540,1052.0,-1187.21,0.0,0.00,9.99,1223.8,1247.71,0.0


In [19]:
outputs = {
        "c_bid":None,
        "c_ask":None,
        "c_volume":None,
        "p_bid":None,
        "p_ask":None,
        "p_volume":None
    }

[None, None, None, None, None, None]