In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
def read_files(directory):
    """Load every data file in ``directory`` into a DataFrame, in sorted name order.

    Entries ending in ``.hdf`` are read with ``pd.read_hdf`` and entries ending
    in ``.parquet`` with ``pd.read_parquet``. For the first file, ``info()`` and
    the first 10 rows are printed; every later file must have exactly the same
    columns, in the same order, as the first one.

    Args:
        directory: Path of the directory to scan (top level only).

    Returns:
        tuple: ``(dfs, names)`` - the loaded DataFrames and the matching file
        names, both ordered by file name. Both lists are empty when the
        directory is empty.

    Raises:
        ValueError: If an entry has neither a ``.hdf`` nor a ``.parquet`` suffix.
        AssertionError: If a file's columns differ from the first file's columns.
    """
    dfs, names = [], []
    columns = None  # schema of the first file; later files are checked against it
    for file in sorted(os.listdir(directory)):
        path = os.path.join(directory, file)
        if file.endswith('.hdf'):
            df = pd.read_hdf(path)
        elif file.endswith('.parquet'):
            df = pd.read_parquet(path)
        else:
            raise ValueError(f'Unknown file format: {file}')

        if columns is None:
            columns = df.columns
            # info() writes its report to stdout itself and returns None, so it
            # must not be wrapped in print() (that would emit a stray "None").
            df.info()
            print()
            print(df.head(10), end='\n\n')
        else:
            # Index.equals copes with a differing number of columns, whereas an
            # element-wise `==` raises an unrelated "Lengths must match" error.
            assert df.columns.equals(columns), (
                f'Columns of {file} differ from those of {names[0]}: '
                f'{list(df.columns)} != {list(columns)}'
            )

        dfs.append(df)
        names.append(file)
    return dfs, names


def get_info(dfs, names):
    """Print a short per-file summary for each DataFrame.

    For every ``(df, name)`` pair this prints the row count, the smallest and
    largest ``TIMESTAMP_VALUE`` (converted with ``unit='ms'``), and the smallest
    and largest ``CHANGE_ID``, followed by a blank line.

    Args:
        dfs: DataFrames that each have ``TIMESTAMP_VALUE`` and ``CHANGE_ID`` columns.
        names: Labels to print, paired positionally with ``dfs``.
    """
    for frame, label in zip(dfs, names):
        print(f'{label}: len={len(frame)}')

        timestamps = frame['TIMESTAMP_VALUE']
        first_ts, last_ts = (
            pd.to_datetime(bound, unit='ms')
            for bound in (timestamps.min(), timestamps.max())
        )
        print(f'{first_ts} -- {last_ts}')

        change_ids = frame['CHANGE_ID']
        print(f'CHANGE_ID: {change_ids.min()} -- {change_ids.max()}')

        print()

In [3]:
# Root of the dataset. The path is relative, so it is resolved against the
# kernel's current working directory. Change this one value to point every
# loader below at a different copy of the data.
DATA_ROOT = '../../../data_test'

# One sub-directory per dataset; each is passed to read_files().
BOOKS_RAW_DIR = f'{DATA_ROOT}/books_raw'
TRADES_DIR = f'{DATA_ROOT}/trades'
BOOKS_PROCESSED_DIR = f'{DATA_ROOT}/books_processed'

In [4]:
# Load every file under BOOKS_RAW_DIR, then print per-file row counts and the
# TIMESTAMP_VALUE / CHANGE_ID ranges.
books_raw_dfs, books_raw_names = read_files(directory=BOOKS_RAW_DIR)
get_info(dfs=books_raw_dfs, names=books_raw_names)

<class 'pandas.core.frame.DataFrame'>
Index: 999999 entries, 0 to 999998
Data columns (total 46 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CHANGE_ID            999999 non-null  int64  
 1   INSTRUMENT_INDEX     999999 non-null  int64  
 2   INSTRUMENT_STRIKE    999999 non-null  float64
 3   INSTRUMENT_MATURITY  999999 non-null  int64  
 4   INSTRUMENT_TYPE      999999 non-null  int64  
 5   TIMESTAMP_VALUE      999999 non-null  int64  
 6   BID_0_PRICE          999999 non-null  float64
 7   BID_0_AMOUNT         999999 non-null  float64
 8   BID_1_PRICE          999999 non-null  float64
 9   BID_1_AMOUNT         999999 non-null  float64
 10  BID_2_PRICE          999999 non-null  float64
 11  BID_2_AMOUNT         999999 non-null  float64
 12  BID_3_PRICE          999999 non-null  float64
 13  BID_3_AMOUNT         999999 non-null  float64
 14  BID_4_PRICE          999999 non-null  float64
 15  BID_4_AMOUNT         9

In [5]:
# Load every file under TRADES_DIR, then print per-file row counts and the
# TIMESTAMP_VALUE / CHANGE_ID ranges.
trades_dfs, trades_names = read_files(directory=TRADES_DIR)
get_info(dfs=trades_dfs, names=trades_names)

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   CHANGE_ID            1000000 non-null  int64  
 1   TIMESTAMP_VALUE      1000000 non-null  int64  
 2   TRADE_ID             1000000 non-null  int64  
 3   PRICE                1000000 non-null  float64
 4   INSTRUMENT_INDEX     1000000 non-null  int64  
 5   INSTRUMENT_STRIKE    1000000 non-null  float64
 6   INSTRUMENT_MATURITY  1000000 non-null  int64  
 7   INSTRUMENT_TYPE      1000000 non-null  int64  
 8   DIRECTION            1000000 non-null  int64  
 9   AMOUNT               1000000 non-null  float64
dtypes: float64(3), int64(7)
memory usage: 83.9 MB
None

   CHANGE_ID  TIMESTAMP_VALUE   TRADE_ID    PRICE  INSTRUMENT_INDEX  \
0   62000000    1726843534430  217605158  2553.15                 1   
1   62000001    1726843534952  217605169  2556.75                 1   

In [6]:
# Load every file under BOOKS_PROCESSED_DIR. get_info() is not called here: per
# the schema printed by read_files(), these frames have no TIMESTAMP_VALUE or
# CHANGE_ID column (the time column is named `timestamp`).
books_processed_dfs, books_processed_names = read_files(directory=BOOKS_PROCESSED_DIR)

<class 'pandas.core.frame.DataFrame'>
Index: 999710 entries, 0 to 999709
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   instrument_id     999710 non-null  int64  
 1   strike            999710 non-null  float64
 2   maturity          999710 non-null  int64  
 3   instrument_type   999710 non-null  int64  
 4   timestamp         999710 non-null  int64  
 5   best_bid_price    999710 non-null  float64
 6   best_ask_price    999710 non-null  float64
 7   bid_amount_total  999710 non-null  float64
 8   ask_amount_total  999710 non-null  float64
 9   bid_vwap          999710 non-null  float64
 10  ask_vwap          999710 non-null  float64
dtypes: float64(7), int64(4)
memory usage: 91.5 MB
None

   instrument_id   strike    maturity  instrument_type      timestamp  \
0              0  48000.0  1726002000                5  1725970692950   
1              0  51500.0  1726002000                6  17259706929