# Imports

In [1]:
import h5py
import json
import numpy
import os
import pandas as pd
import polars as pl
import pickle
import sys
from tqdm import tqdm

In [2]:
# Make modules that live under ../src/ (relative to the notebook's working
# directory) importable. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
if '../src/' not in sys.path:
    sys.path.append('../src/')

In [3]:
import prepare_data
import h5_utils

# Configs

In [4]:
# Load the project settings. Later cells read the MODEL_DIR and RAW_DATA_DIR
# keys from the resulting dict.
settings_path = '../configs/settings.json'
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, so the file parses the same way on every machine.
with open(settings_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

In [5]:
# Index/stock weights referenced by the original note "EDA 3)" -- presumably
# produced by an earlier EDA step; TODO confirm where the pickle comes from.
# NOTE(review): pickle.load can execute arbitrary code, so this file must be
# a trusted project artifact.
index_weights_path = f"../{config['MODEL_DIR']}/index_stock_weights.pkl"
with open(index_weights_path, 'rb') as f:
    index_stock_weights = pickle.load(f)

In [6]:
# Locations of the HDF5 outputs; everything sits below the raw-data directory
# named in the settings file.
raw_data_root = f"../{config['RAW_DATA_DIR']}"
metadata_filename = 'metadata.h5'
metadata_filepath = f"{raw_data_root}/{metadata_filename}"
folder_daily_h5 = f"{raw_data_root}/daily/"

# Create clean training data

## Read

In [7]:
%%time
df_raw = pd.read_csv(f"../{config['RAW_DATA_DIR']}/train.csv")

CPU times: user 8.15 s, sys: 725 ms, total: 8.88 s
Wall time: 9.57 s


In [8]:
# Display (n_rows, n_columns) of the raw training frame as the cell output.
df_raw.shape

(5237980, 17)

In [9]:
# Display the column labels of the freshly loaded raw frame.
df_raw.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

## Scale sizes

In [10]:
# The four size-type columns whose per-stock medians are computed next.
size_col = ['imbalance_size', 'matched_size', 'bid_size', 'ask_size']
# `train` is just a second name for df_raw (same object, no copy is made).
train = df_raw

### Dict for inference

In [11]:
# Nested lookup: feature -> stock_id -> median of that feature over `train`.
scale_dict = {
    feature: train.groupby('stock_id')[feature].median().to_dict()
    for feature in size_col
}

In [12]:
# Persist the per-stock medians next to the other model artifacts so they can
# be reloaded later (the section header says they are meant for inference).
scale_dict_path = f"../{config['MODEL_DIR']}/scale_dict.pkl"
with open(scale_dict_path, 'wb') as f:
    pickle.dump(scale_dict, f)

In [13]:
# Re-display df_raw's columns right before feature generation; the output
# matches the earlier listing, i.e. the median computation added no columns.
df_raw.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

In [14]:
%%time
df, feas_list = prepare_data.generate_features_no_hist_polars(df_raw, 
                                                   index_stock_weights,
                                                   scale_dict)

CPU times: user 3min 40s, sys: 45.2 s, total: 4min 26s
Wall time: 2min 2s


In [15]:
# Report the size of the generated frame and the number of listed features,
# one per line.
print(f'{df.shape=}', f'{len(feas_list)=}', sep='\n')

df.shape=(5237980, 447)
len(feas_list)=157


# Record candidate features for all models

## Daily load trick h5

### Column separation

In [16]:
# Name of the label column.
TARGET_COL = 'target'
# Columns handed to the HDF5 writer as its "integrity" columns -- presumably
# identifiers used to verify row alignment; confirm against h5_utils.
INTEGRITY_COLS = (
    'time_id',
    'row_id',
)

In [17]:
# Writer configured with the integrity columns and the target column; it is
# reused below for the metadata file and for every daily file.
h5_writer = h5_utils.WriterWithIntegrity(
    INTEGRITY_COLS,
    TARGET_COL,
)

In [18]:
# Display the columns the writer treats as training features for the raw
# frame; in the recorded output, date_id, target, time_id and row_id are absent.
h5_writer.get_train_feats_only(df_raw)

['stock_id',
 'seconds_in_bucket',
 'imbalance_size',
 'imbalance_buy_sell_flag',
 'reference_price',
 'matched_size',
 'far_price',
 'near_price',
 'bid_price',
 'bid_size',
 'ask_price',
 'ask_size',
 'wap']

In [19]:
# Make sure the per-day output folder exists. os.makedirs(..., exist_ok=True)
# also creates missing parent directories and avoids the check-then-create
# race of `if not exists: mkdir`, so the cell is safe to re-run.
os.makedirs(folder_daily_h5, exist_ok=True)
# Echo the folder path as the cell output.
folder_daily_h5

'.././data//daily/'

### Metadata h5

In [20]:
# Write the metadata file for the generated feature frame to metadata_filepath.
h5_writer.save_metadata(df, metadata_filepath)

### Daily h5

In [21]:
# Every distinct date_id present in the feature frame, in ascending order.
all_date_ids = list(df['date_id'].unique())
all_date_ids.sort()

In [22]:
# Write one file per date_id: select that day's rows from the feature frame
# and hand them to the writer.
for date_id in tqdm(all_date_ids):
    day_data = df.filter(pl.col('date_id') == date_id)
    # folder_daily_h5 already ends with a separator, so build the path with
    # os.path.join rather than string concatenation (which produced '...//<id>.h5').
    daily_h5_path = os.path.join(folder_daily_h5, f'{date_id}.h5')
    h5_writer.write_daily_hdf5(day_data, daily_h5_path)

100%|██████████| 481/481 [02:45<00:00,  2.91it/s]
