In [6]:
%matplotlib inline
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt

# book_train.parquet, book_test.parquet
### Order book data for the most competitive buy and sell orders placed in the market.
### For details on the data, see
### https://docs.google.com/document/d/16-TzzVW-jQCzK-OKX7UrjH98ahqImFPI-9glhLVUrmM/edit

# trade_train.parquet, trade_test.parquet
### Data on the trades that were actually executed.
### For details on the data, see
### https://docs.google.com/document/d/16-TzzVW-jQCzK-OKX7UrjH98ahqImFPI-9glhLVUrmM/edit

In [3]:
""" 
book_train = pq.read_pandas('data/book_train.parquet').to_pandas()
book_test = pq.read_pandas('data/book_test.parquet').to_pandas()
table_train = pq.read_pandas('data/trade_train.parquet').to_pandas()
table_test = pq.read_pandas('data/trade_test.parquet').to_pandas()
"""

In [None]:
"""
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
"""


In [7]:
# Narrow dtypes for the id columns keep both frames small in memory.
train_test_dtypes = dict(
    stock_id=np.uint8,
    time_id=np.uint16,
    target=np.float64,
)

# The training file is read in full; the test file is read with its two id
# columns only (see `usecols`).
df_train = pd.read_csv(
    'data/train.csv',
    dtype=train_test_dtypes,
)
df_test = pd.read_csv(
    'data/test.csv',
    usecols=['stock_id', 'time_id'],
    dtype=train_test_dtypes,
)

# Report shape and memory footprint (bytes converted to MB) of each frame.
print(f'Training Set Shape: {df_train.shape}')
print(f'Training Set Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Set Shape: {df_test.shape}')
print(f'Test Set Memory Usage: {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

Training Set Shape: (428932, 3)
Training Set Memory Usage: 4.50 MB
Test Set Shape: (3, 2)
Test Set Memory Usage: 0.00 MB


In [None]:
def read_book_data(dataset, stock_id, sort=False, forward_fill=False):
    """Load the order book of one stock, optionally as a dense per-second grid.

    Reads ``data/book_{dataset}.parquet/stock_id={stock_id}`` and casts the
    book columns to compact dtypes.

    Parameters
    ----------
    dataset : str
        Interpolated into the parquet path (``book_{dataset}.parquet``).
    stock_id : int or str
        Interpolated into the ``stock_id=...`` partition directory name.
    sort : bool, default False
        Order the rows by ``time_id`` and then ``seconds_in_bucket``.
    forward_fill : bool, default False
        Expand every ``time_id`` present in the file to the seconds
        ``0..599`` and carry the last observed snapshot forward. Filling is
        done separately for each ``time_id``: seconds that precede the first
        snapshot of a bucket stay NaN instead of inheriting values from the
        previous bucket. The input does not need to be sorted.

    Returns
    -------
    pandas.DataFrame
        The book rows. With ``forward_fill`` the frame has 600 rows per
        ``time_id``, ordered by ``time_id`` and ``seconds_in_bucket``; columns
        that still contain NaN keep a floating dtype, all others use the
        compact dtypes.

    Raises
    ------
    KeyError
        If one of the expected book columns is missing from the file.
    ValueError
        With ``forward_fill``, if a ``(time_id, seconds_in_bucket)`` pair
        occurs more than once (the frame cannot be reindexed).
    """
    book_dtypes = {
        'time_id': np.uint16,
        'seconds_in_bucket': np.uint16,
        'bid_price1': np.float32,
        'ask_price1': np.float32,
        'bid_price2': np.float32,
        'ask_price2': np.float32,
        'bid_size1': np.uint32,
        'ask_size1': np.uint32,
        'bid_size2': np.uint32,
        'ask_size2': np.uint32,
    }
    seconds_per_bucket = 600

    df_book = pd.read_parquet(f'data/book_{dataset}.parquet/stock_id={stock_id}')
    df_book = df_book.astype(book_dtypes)

    if sort:
        df_book = df_book.sort_values(by=['time_id', 'seconds_in_bucket'])

    if forward_fill:
        df_book = df_book.set_index(['time_id', 'seconds_in_bucket'])
        full_grid = pd.MultiIndex.from_product(
            [df_book.index.levels[0], np.arange(0, seconds_per_bucket)],
            names=['time_id', 'seconds_in_bucket'],
        )
        # Reindex without a fill method and pad per time_id afterwards:
        # reindex(method='ffill') pads in lexicographic index order, which
        # pulls the last row of the previous time_id into the leading seconds
        # of the next one, and it rejects a non-monotonic index.
        df_book = (
            df_book
            .reindex(full_grid)
            .groupby(level='time_id')
            .ffill()
            .reset_index()
        )
        # The temporary NaNs upcast the integer columns; restore the compact
        # dtype wherever no gap is left after filling.
        has_gap = df_book.isna().any()
        df_book = df_book.astype({
            column: dtype
            for column, dtype in book_dtypes.items()
            if not has_gap[column]
        })

    return df_book