In [15]:
import os
import glob
import dask

import numpy as np
import pandas as pd
from src.preprocessing import extract_quotes

In [6]:
# Root folder of the project datasets. The original machine-specific location
# stays the default; set the FBD_DATA_PATH environment variable to run the
# notebook on another machine without editing this cell.
DATA_PATH = (
    os.environ.get('FBD_DATA_PATH')
    or '/Users/mac/Desktop/Repos/FBD_Project/datasets/'
)
# Paths below are built by plain string concatenation, so make sure the root
# always ends with a separator.
if not DATA_PATH.endswith(('/', os.sep)):
    DATA_PATH += '/'
RAW_DATA_PATH = DATA_PATH + 'raw/'


def _quote_files(symbol: str) -> list:
    """Return the sorted paths of the gzipped quote CSVs stored for ``symbol``."""
    return sorted(glob.glob(RAW_DATA_PATH + symbol + '/quotes/*.csv.gz'))


# One sorted list of quote files per trading pair.
quote_btc = _quote_files('btcusdt')
quote_eth = _quote_files('ethusdt')
quote_ada = _quote_files('adausdt')

In [19]:
# NOTE(review): this definition shadows the `extract_quotes` imported from
# `src.preprocessing` in the import cell; keep only one of the two.
@dask.delayed
def extract_quotes(trade_file_path: str) -> pd.DataFrame:
    """Load one quotes CSV and derive top-of-book features on a 1-second grid.

    Because of ``@dask.delayed`` a call only builds a lazy task; the
    DataFrame described below is produced when that task is computed.

    Parameters
    ----------
    trade_file_path : str
        Path to a CSV file (anything ``pd.read_csv`` can open) containing the
        columns ``timestamp``, ``ask_price``, ``bid_price``, ``ask_amount``
        and ``bid_amount``. Despite the parameter name, the file is expected
        to hold quotes, not trades.

    Returns
    -------
    pd.DataFrame
        Indexed by ``timestamp`` at 1-second frequency, with the four raw
        price/amount columns plus ``mid_price``, ``ba_spread`` and
        ``imbalance``. Each second carries the last observation seen in it;
        seconds without any observation are forward-filled.

    Raises
    ------
    ValueError
        If one of the required columns is missing from the file.
    """
    quote_columns = ["timestamp", "ask_price", "bid_price", "ask_amount", "bid_amount"]

    # `usecols` avoids parsing columns that are never used; re-indexing with
    # the same list keeps the column order independent of the file layout.
    df = pd.read_csv(trade_file_path, usecols=quote_columns)[quote_columns]

    # calculate mid price, bid-ask spread and bid-side volume imbalance
    df["mid_price"] = (df["ask_price"] + df["bid_price"]) / 2
    df["ba_spread"] = np.round((df["ask_price"] - df["bid_price"]), 5)
    df["imbalance"] = df["bid_amount"] / (df["bid_amount"] + df["ask_amount"])

    # The raw timestamps are microseconds since the epoch. Parse them directly
    # instead of dividing by 1000 and reading float milliseconds: the float
    # round-trip can lose precision and move a quote that sits exactly on a
    # second boundary into the neighbouring 1-second bin.
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="us")
    df = df.set_index("timestamp")

    # resample to 1-second frequency: last quote per second, gaps forward-filled
    df = df.resample("1s").last().ffill()
    return df

In [11]:
# Schedule the first BTC quotes file and materialise it; dask.compute hands
# back one result per argument, so unpack the single DataFrame.
(df,) = dask.compute(extract_quotes(quote_btc[0]))