<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/dataset/notebooks/1.0-mb-data_preprocessing_mem_reduce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!pip install gcsfs==2022.10.0
!pip install modin
!pip install tqdm==4.64.1
!pip install wandb==0.13.4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
import os

import gcsfs
import google.auth
from google.colab import auth

import modin.pandas as pd
import modin.config as cfg
from modin.config import ProgressBar
from distributed import Client
# run modin on the dask engine
cfg.Engine.put("dask")
# turn on modin's progress bar
# NOTE(review): presumably the source of the repeated "Distributing Dataframe"
# lines in the output of the conversion cell - confirm.
ProgressBar.enable()

from tqdm.notebook import tqdm
import wandb

In [13]:
# open a weights and biases run for the dataset-creation job and create the
# artifact that import_data() / df_to_parquet() attach file references to
run = wandb.init(
    entity="fbv",
    project="thesis",
    job_type="dataset-creation",
)
dataset = wandb.Artifact(
    type="raw_data",
    name="raw_data",
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [14]:
# connect to google cloud storage
# 1. authenticate the colab user
auth.authenticate_user()
# 2. fetch the default credentials (second element of the tuple is discarded)
credentials, _ = google.auth.default()
# 3. build a filesystem handle for project "thesis" from these credentials
# NOTE(review): neither `fs` nor `fs_prefix` is referenced by the cells visible
# in this notebook; the read / write calls below receive "gs://" URLs directly.
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)
fs_prefix = "gs://"

In [15]:
# raw csv that is read by import_data()
file_path_input = "gs://thesis-bucket-option-trade-classification/data/raw/matched_ise_quotes.csv"
# target prefix for the parquet parts written by df_to_parquet(); the file name
# is appended to this string, hence the trailing slash
file_path_output = "gs://thesis-bucket-option-trade-classification/data/preprocessed/"

In [16]:
def import_data(input_file:str, chunksize:int=10 ** 6)->pd.DataFrame:
    """Create a dataframe from the raw csv and optimize its memory usage.

    The csv is read in chunks with a fixed column selection and manually
    chosen dtypes, the chunks are concatenated and the date columns are
    parsed. As a side effect the input file is registered as reference
    ``raw_csv`` on the module-level w & b artifact ``dataset``.

    Args:
        input_file: path or url of the csv file.
        chunksize: number of rows read per chunk. Defaults to 10 ** 6.

    Returns:
        dataframe with the selected columns; 'EXPIRATION', 'date' and
        'QUOTE_DATETIME' are converted to datetimes and the columns declared
        as 'category' carry a categorical dtype.
    """
    # apply some optimizations i.e, manual inference of dtypes, pre-selection
    # of unique columns and chunking to enable import.
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/scale.html
    n_levels = 16
    ask_cols = [f"ASK_{level}" for level in range(1, n_levels + 1)]
    bid_cols = [f"BID_{level}" for level in range(1, n_levels + 1)]

    # single source of truth for the columns to load and their dtypes.
    # 'CANCELED_TRADE_CONDITION_ID' and 'EXCHANGE' are not loaded and
    # therefore not listed.
    dtypes = {
        'UNDERLYING_SYMBOL': 'category',
        'QUOTE_DATETIME': 'object',
        'SEQUENCE_NUMBER': 'int64',
        'ROOT': 'category',
        'EXPIRATION': 'object',
        'STRK_PRC': 'float32',
        'OPTION_TYPE': 'category',
        'TRADE_SIZE': 'int64',
        'TRADE_PRICE': 'float32',
        'BEST_BID': 'float32',
        'BEST_ASK': 'float32',
        'order_id': 'int64',
        'ask_ex': 'float32',
        'bid_ex': 'float32',
        'bid_size_ex': 'float32',
        'ask_size_ex': 'float32',
        'date': 'object',
        'price_all_lead': 'float32',
        'price_all_lag': 'float32',
        # NOTE(review): float32 represents integers exactly only up to 2**24;
        # if 'optionid' holds larger integer ids they get rounded - confirm
        # against the data before relying on this column as a key.
        'optionid': 'float32',
        'day_vol': 'float32',
        'price_ex_lead': 'float32',
        'price_ex_lag': 'float32',
        'buy_sell': 'int8',
    }
    dtypes.update({col: 'float32' for col in ask_cols + bid_cols})
    usecols = list(dtypes)

    # log raw file in w & b
    dataset.add_reference(input_file, name='raw_csv')

    with pd.read_csv(input_file, chunksize=chunksize, usecols=usecols, dtype=dtypes) as reader:
        df = pd.concat(list(reader), axis=0)

    # every chunk infers its own categories; concatenating chunks whose
    # categories differ falls back to object dtype, so restore the
    # categorical dtype on the combined frame.
    for col, dtype in dtypes.items():
        if dtype == 'category':
            df[col] = df[col].astype('category')

    date_format = '%d%b%Y'
    for col in ('EXPIRATION', 'date'):
        df[col] = pd.to_datetime(df[col], format=date_format)

    datetime_format = '%d%b%y:%H:%M:%S'
    df['QUOTE_DATETIME'] = pd.to_datetime(df['QUOTE_DATETIME'], format=datetime_format)
    return df

In [17]:
def df_to_parquet(df:pd.DataFrame, target_dir:str, chunk_size:int=1000000, **parquet_wargs)->None:
    """Writes a DataFrame to numbered parquet files of at most ``chunk_size`` rows.

    adapted from: https://stackoverflow.com/a/72010262/5755604

    Every written file is also registered as reference ``raw_parquet_<part>``
    on the module-level w & b artifact ``dataset``.

    Args:
        df: DataFrame
        target_dir: directory or url prefix (e.g. ``gs://bucket/dir/``) the
            parquet files are written to. A missing trailing slash is added.
        chunk_size: number of rows stored in one chunk of parquet file. Defaults to 1000000.
        **parquet_wargs: keyword arguments forwarded to ``to_parquet``.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    # plain string concatenation would yield "<dir>matched_..." for a
    # directory given without trailing slash, so normalize the prefix first.
    if target_dir and not target_dir.endswith("/"):
        prefix = target_dir + "/"
    else:
        prefix = target_dir

    for start in tqdm(range(0, len(df), chunk_size)):
        # integer division keeps the part number exact for any frame size
        part = start // chunk_size
        output_path = f"{prefix}matched_ise_quotes_min_mem_usage_part_{part:04d}.parquet"
        df.iloc[start : start + chunk_size].to_parquet(output_path, **parquet_wargs)

        # log in w & b
        dataset.add_reference(output_path, name=f"raw_parquet_{part:04d}")

In [18]:
# create a dask.distributed client with default settings
client = Client()

# read the raw csv and write it back as parquet parts
df = import_data(input_file=file_path_input)
df_to_parquet(df=df, target_dir=file_path_output)

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

  0%|          | 0/50 [00:00<?, ?it/s]

In [19]:
# Log the artifact to save it as an output of this run; it carries the file
# references added by import_data() and df_to_parquet()
run.log_artifact(dataset)
# end the w & b run
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…