# Imports

In [1]:
import importlib
import json
import pickle
import sys

import numpy
import pandas as pd
import pyarrow
from pyarrow import parquet as pq

In [2]:
# Make the modules under ../src/ importable from this notebook.
# Guarded so that re-running the cell does not pile up duplicate entries in sys.path.
if '../src/' not in sys.path:
    sys.path.append('../src/')

In [3]:
import prepare_data

# Configs

In [4]:
# Load the project settings; later cells read config['RAW_DATA_DIR'] from it.
settings_path = '../configs/settings.json'
# Explicit UTF-8: JSON text is UTF-8 by specification, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open(settings_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

In [5]:
# EDA 3) -- presumably the EDA step that produced this pickle; confirm.
# NOTE: pickle.load can execute arbitrary code, so only load files this
# project generated itself.
weights_path = f"../{config['RAW_DATA_DIR']}/index_stock_weights.pkl"
with open(weights_path, 'rb') as weights_file:
    index_stock_weights = pickle.load(weights_file)

# Create clean training data

## Read

In [6]:
%%time
df_raw = pd.read_csv(f"../{config['RAW_DATA_DIR']}/train.csv")

CPU times: user 8.39 s, sys: 774 ms, total: 9.16 s
Wall time: 9.79 s


In [7]:
# Display the (rows, columns) shape of the freshly loaded raw frame.
df_raw.shape

(5237980, 17)

## Scale sizes

In [8]:
# Size columns for which per-stock medians are computed for scaling.
size_col = [
    'imbalance_size',
    'matched_size',
    'bid_size',
    'ask_size',
]
# NOTE: `train` is a second name for df_raw, not a copy.
train = df_raw

### Dict for inference

In [9]:
# feature -> stock_id -> median, i.e. scale_dict[col][stock_id] is the median
# of `col` over all rows of that stock.
# A single groupby pass covers every size column instead of regrouping the
# full frame once per column. DataFrame.to_dict() with its default
# orient='dict' produces the same {column: {index: value}} nesting that the
# per-column Series.to_dict() calls did.
medians = train.groupby('stock_id')[size_col].median()
scale_dict = medians.to_dict()

In [10]:
# Persist the per-stock medians next to the raw data so they can be reloaded
# at inference time.
scale_dict_path = f"../{config['RAW_DATA_DIR']}/scale_dict.pkl"
with open(scale_dict_path, 'wb') as scale_dict_file:
    pickle.dump(scale_dict, scale_dict_file)

In [11]:
%%time

import importlib
importlib.reload(prepare_data)

df, feas_list = prepare_data.generate_features_no_hist_polars(df_raw, 
                                                   index_stock_weights,
                                                   scale_dict)

CPU times: user 3min 15s, sys: 39.9 s, total: 3min 55s
Wall time: 1min 53s


In [12]:
# Sanity check: report the shape of the generated feature frame.
print(f'{df.shape=}')
# Debug aid: uncomment to also print feas_list (output may be long).
#print(f'{feas_list=}')

df.shape=(5237980, 406)


# Record candidate features for all models

In [21]:
%%time
from pyarrow import parquet as pq

pq.write_table(df.to_arrow(), 
               f"../{config['RAW_DATA_DIR']}/full_features.parquet.gzip", 
               compression='GZIP')