# Imports

In [1]:
import importlib
import json
import os
import pickle
import sys

import h5py
import numpy
import pandas as pd
import polars as pl
from tqdm import tqdm

In [2]:
# Put ../src on the module search path so project-local modules can be imported
# (prepare_data is imported right after this; presumably it lives there).
sys.path.append('../src/')

In [3]:
import prepare_data

# Configs

In [4]:
# Load the project settings; config['RAW_DATA_DIR'] from this file is used to
# build the data paths in the rest of the notebook.
settings_path = '../configs/settings.json'
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, which open() would otherwise use.
with open(settings_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

In [5]:
# EDA 3): load the pickled index stock weights stored in RAW_DATA_DIR
# (presumably produced by an earlier EDA step -- confirm where the file comes from).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by this project.
with open(f"../{config['RAW_DATA_DIR']}/index_stock_weights.pkl", 
          'rb') as f:
    index_stock_weights = pickle.load(f)

# Create clean training data

## Read

In [6]:
%%time
# Read the raw training table from RAW_DATA_DIR into a pandas DataFrame.
df_raw = pd.read_csv(f"../{config['RAW_DATA_DIR']}/train.csv")

CPU times: user 8.02 s, sys: 755 ms, total: 8.77 s
Wall time: 9.47 s


In [7]:
# (rows, columns) of the raw training table.
df_raw.shape

(5237980, 17)

In [8]:
# Column names available in the raw training table.
df_raw.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

## Scale sizes

In [14]:
# `train` is an alias of df_raw (no copy is made), so both names refer to the
# same DataFrame object.
train = df_raw
# Size columns whose per-stock medians are collected into scale_dict below.
size_col = ['imbalance_size','matched_size','bid_size','ask_size']

### Dict for inference

In [10]:
# Nested mapping: feature -> stock_id -> median of that feature for that stock.
scale_dict = {
    col: train.groupby('stock_id')[col].median().to_dict()
    for col in size_col
}

In [11]:
# Persist the per-stock medians as a pickle in RAW_DATA_DIR so they can be
# reloaded later without recomputing them.
scale_dict_path = f"../{config['RAW_DATA_DIR']}/scale_dict.pkl"
with open(scale_dict_path, 'wb') as out_file:
    pickle.dump(scale_dict, out_file)

In [15]:
# Re-check the columns of df_raw after the scaling cells above; those cells
# only read from the frame (`train` is an alias, and only medians are computed).
df_raw.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

In [17]:
%%time

import importlib
importlib.reload(prepare_data)

df, feas_list = prepare_data.generate_features_no_hist_polars(df_raw, 
                                                   index_stock_weights,
                                                   scale_dict)

CPU times: user 3min 39s, sys: 48.7 s, total: 4min 28s
Wall time: 3min 1s


In [18]:
# Report the shape of the frame returned by prepare_data and the number of
# entries in feas_list.
print(f'{df.shape=}')
print(f'{len(feas_list)=}')

df.shape=(5237980, 447)
len(feas_list)=157


# Record candidate features for all models

## Daily load trick h5

### Cols separation

In [19]:
# Bookkeeping columns: excluded from the feature list and written to the
# 'integrity_cols' group of each daily HDF5 file.
C_integrity_cols = ['time_id', 'row_id']
# Target column: excluded from the feature list and written to 'data/target'.
C_target_col = ['target']

In [20]:
def get_train_feats_only(df):
    """Return the columns of ``df`` that are treated as feature columns.

    Every name in ``df.columns`` is kept, in its original order, except
    ``'date_id'``, the names in ``C_integrity_cols`` and the names in
    ``C_target_col``.

    Args:
        df: Any object exposing a ``columns`` iterable of column names.

    Returns:
        list: The remaining column names.
    """
    excluded = ['date_id', *C_integrity_cols, *C_target_col]
    return [name for name in df.columns if name not in excluded]

In [21]:
# Sanity check on the raw frame: every column except date_id, time_id, row_id
# and target should be listed.
get_train_feats_only(df_raw)

['stock_id',
 'seconds_in_bucket',
 'imbalance_size',
 'imbalance_buy_sell_flag',
 'reference_price',
 'matched_size',
 'far_price',
 'near_price',
 'bid_price',
 'bid_size',
 'ask_price',
 'ask_size',
 'wap']

In [22]:
# Directory that receives the per-date HDF5 files.
folder_daily_h5 = f"../{config['RAW_DATA_DIR']}/daily/"
# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of os.path.exists followed by os.mkdir.
os.makedirs(folder_daily_h5, exist_ok=True)
folder_daily_h5

'.././data//daily/'

### metadata h5

In [23]:
def save_metadata(df, filepath):
    """Write dataset-level metadata to an HDF5 file.

    Creates ``filepath`` (opened in ``'w'`` mode, so an existing file is
    replaced) and stores two datasets in it:

    * ``date_ids``     -- the distinct ``date_id`` values of ``df``, ascending.
    * ``column_names`` -- the names returned by ``get_train_feats_only(df)``,
      stored as variable-length UTF-8 strings.

    Args:
        df: Frame with a ``date_id`` column and a ``columns`` attribute.
        filepath: Destination path of the HDF5 file.
    """
    date_ids = sorted(df['date_id'].unique())
    column_names = get_train_feats_only(df)

    with h5py.File(filepath, 'w') as f:
        f.create_dataset('date_ids', data=date_ids)
        # Without an explicit dtype NumPy turns a list of str into a
        # fixed-width Unicode ('U') array, which h5py does not store.
        # Request HDF5 variable-length strings instead.
        f.create_dataset('column_names', data=column_names,
                         dtype=h5py.string_dtype())

In [24]:
# Write the metadata file (sorted date ids and feature column names of `df`)
# into RAW_DATA_DIR.
metadata_filename = 'metadata.h5'

save_metadata(df,
              f"../{config['RAW_DATA_DIR']}/{metadata_filename}")

### Daily h5

In [25]:
def write_daily_hdf5(df, filename):
    """Write ``df`` to an HDF5 file with one dataset per column.

    The file at ``filename`` is opened in ``'w'`` mode and laid out as:

    * ``integrity_cols/<col>`` -- one dataset per name in ``C_integrity_cols``
    * ``data/target``          -- the ``C_target_col[0]`` column
    * ``data/features/<col>``  -- one dataset per name returned by
      ``get_train_feats_only(df)``

    Args:
        df: Frame whose columns support ``df[name].to_numpy()``.
        filename: Path of the HDF5 file to create.
    """
    def column_values(name):
        # Materialise a single column as a NumPy array for h5py.
        return df[name].to_numpy()

    with h5py.File(filename, 'w') as h5_file:
        integrity = h5_file.create_group('integrity_cols')
        for name in C_integrity_cols:
            integrity.create_dataset(name, data=column_values(name))

        h5_file.create_dataset('data/target',
                               data=column_values(C_target_col[0]))

        features = h5_file.create_group('data/features')
        for name in get_train_feats_only(df):
            features.create_dataset(name, data=column_values(name))

In [28]:
# Ascending list of the distinct date_id values present in df.
all_date_ids = sorted(df['date_id'].unique())

In [27]:
# Write one HDF5 file per date_id, named <date_id>.h5, inside folder_daily_h5.
for date_id in tqdm(all_date_ids):
    # df is filtered with a polars expression, so it must be a polars frame here.
    day_data = df.filter(pl.col('date_id') == date_id)
    # folder_daily_h5 already ends with '/', so os.path.join is used to avoid
    # the doubled separator that an f-string with an extra '/' produces.
    write_daily_hdf5(day_data, os.path.join(folder_daily_h5, f'{date_id}.h5'))

100%|██████████| 481/481 [03:48<00:00,  2.11it/s]
