<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/main/notebooks/data_preprocessing_mem_reduce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wandb
!pip install modin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# use gcs fuse to access google cloud storage
# https://stackoverflow.com/a/60450255/5755604
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

# mount google cloud stoarge as drive
!mkdir gcs
!gcsfuse thesis-bucket-option-trade-classification gcs

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0  66763      0 --:--:-- --:--:-- --:--:-- 66763
OK
33 packages can be upgraded. Run 'apt list --upgradable' to see them.
gcsfuse is already the newest version (0.41.7).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 33 not upgraded.
mkdir: cannot create directory ‘gcs’: File exists
2022/10/28 06:04:17.842957 Start gcsfuse/0.41.7 (Go version go1.18.4) for app "" using mount point: /content/gcs
2022/10/28 06:04:17.866473 Opening GCS connection...
2022/10/28 06:04:18.090855 Mounting file system "thesis-bucket-option-trade-classification"...
2022/10/28 06:04:18.091747 File system has been succ

In [12]:
import os
import numpy as np


import modin.pandas as pd
import modin.config as cfg
from modin.config import ProgressBar
from distributed import Client
cfg.Engine.put("dask")
ProgressBar.enable()

from tqdm.notebook import tqdm

import wandb

In [4]:
# raise the column display limit to 80 so wide frames are not truncated
pd.set_option('display.max_columns', 80)

In [5]:
# authenticate the current Colab user
# NOTE(review): presumably required for access to the cloud bucket — confirm
from google.colab import auth
auth.authenticate_user()


In [6]:
# start a Weights & Biases run of job type "dataset-creation" in project
# "thesis" (entity "fbv"); the handle is kept in `run`
run = wandb.init(project="thesis", job_type="dataset-creation", entity="fbv")

[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [11]:
# both paths are relative and live below gcs/, the directory on which the
# bucket is mounted with gcsfuse
file_path_input = "gcs/data/raw/matched_ise_quotes.csv"
file_path_output = "gcs/data/preprocessed/"

In [8]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of a dataframe to smaller dtypes to save memory.

    Signed and unsigned integer columns are cast to the narrowest integer type
    of the same signedness that holds their minimum and maximum. Float columns
    are cast to the narrowest float type whose range holds their minimum and
    maximum. Object columns become 'category'. All other columns (bool,
    datetime, timedelta, category and other extension dtypes) are left
    untouched. A column is never widened.

    Adapted from:
    https://www.kaggle.com/code/gemartin/load-data-reduce-memory-usage/notebook

    Args:
        df: dataframe to optimize. Its columns are reassigned in place.
        use_float16: if True (default) float columns may be cast to float16.
            Only the value range is checked, not the precision, so values can
            be rounded. Pass False to use float32 as the narrowest float type.

    Returns:
        The dataframe that was passed in, with the downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # candidate types per numpy dtype kind, ordered from narrowest to widest
    float_types = (np.float16, np.float32, np.float64)
    if not use_float16:
        float_types = float_types[1:]
    candidates = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
        'f': (np.finfo, float_types),
    }

    for col in tqdm(df.columns):
        col_type = df[col].dtype

        # extension dtypes (category, nullable, string, ...) are not downcast;
        # min()/max() or the range comparison would fail for them
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        # bool, datetime, timedelta, ... must not be turned into floats
        if col_type.kind not in candidates:
            continue

        limits, types = candidates[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in types:
            # candidates are ordered by width: stop before widening the column
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # bounds are inclusive; NaN or infinite extremes never fit, which
            # leaves the column as it is
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # guard against a division by zero for frames that occupy no memory
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def import_data(file, chunksize=10 ** 6):
    """Read the csv in chunks into a single dataframe with compact dtypes.

    Applies some optimizations, i.e., manually specified dtypes, a
    pre-selection of columns and chunked reading to enable the import.
    https://pandas.pydata.org/pandas-docs/stable/user_guide/scale.html

    Args:
        file: path or file-like object that is handed to `pd.read_csv`.
        chunksize: number of rows read per chunk. Defaults to 10 ** 6.

    Returns:
        Dataframe with the selected columns; 'EXPIRATION', 'date' and
        'QUOTE_DATETIME' are parsed into datetimes.
    """
    # one dtype per loaded column; 'CANCELED_TRADE_CONDITION_ID' and 'EXCHANGE'
    # are deliberately not listed and therefore not loaded
    dtypes = {
        'UNDERLYING_SYMBOL': 'category',
        'QUOTE_DATETIME': 'object',
        'SEQUENCE_NUMBER': 'int64',
        'ROOT': 'category',
        'EXPIRATION': 'object',
        'STRK_PRC': 'float32',
        'OPTION_TYPE': 'category',
        'TRADE_SIZE': 'int64',
        'TRADE_PRICE': 'float32',
        'BEST_BID': 'float32',
        'BEST_ASK': 'float32',
        'order_id': 'int64',
        'ask_ex': 'float32',
        'bid_ex': 'float32',
        'bid_size_ex': 'float32',
        'ask_size_ex': 'float32',
        'date': 'object',
        'price_all_lead': 'float32',
        'price_all_lag': 'float32',
        # float32 represents integers exactly only up to 2**24; float64 keeps
        # larger ids from being rounded to a neighbouring value
        'optionid': 'float64',
        'day_vol': 'float32',
        'price_ex_lead': 'float32',
        'price_ex_lag': 'float32',
        'buy_sell': 'int8',
    }
    # 'ASK_1' ... 'ASK_16' and 'BID_1' ... 'BID_16'
    ask_bid_cols = [f'{side}_{i}' for side in ('ASK', 'BID') for i in range(1, 17)]
    dtypes.update(dict.fromkeys(ask_bid_cols, 'float32'))
    usecols = list(dtypes)

    with pd.read_csv(file, chunksize=chunksize, usecols=usecols, dtype=dtypes) as reader:
        dfs = list(reader)

    df = pd.concat(dfs, axis=0)

    # every chunk infers its own categories; concatenating categoricals with
    # differing categories does not keep the category dtype, so restore it
    for col, dtype in dtypes.items():
        if dtype == 'category':
            df[col] = df[col].astype('category')

    date_format = '%d%b%Y'
    df['EXPIRATION'] = pd.to_datetime(df['EXPIRATION'], format=date_format)
    df['date'] = pd.to_datetime(df['date'], format=date_format)

    datetime_format = '%d%b%y:%H:%M:%S'
    df['QUOTE_DATETIME'] = pd.to_datetime(df['QUOTE_DATETIME'], format=datetime_format)
    return df

In [14]:
def df_to_parquet(df, target_dir, chunk_size=1000000, **parquet_wargs):
    """Writes pandas DataFrame to parquet format.
    adapted from: https://stackoverflow.com/a/72010262/5755604

    The frame is split into consecutive row slices; slice number k is written
    to `matched_ise_quotes_min_mem_usage_part_<k, zero-padded to 4 digits>.parquet`
    inside `target_dir`. An empty frame writes no files.

    Args:
        df: DataFrame
        target_dir: local directory where parquet files are written to. It is
            created if it does not exist yet.
        chunk_size: number of rows stored in one chunk of parquet file. Defaults to 1000000.
        **parquet_wargs: keyword arguments passed on to `to_parquet`.

    Raises:
        ValueError: if `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # to_parquet does not create missing parent directories
    os.makedirs(target_dir, exist_ok=True)

    # enumerate yields the chunk number directly instead of deriving it from
    # the row offset with a float division
    for chunk, start in enumerate(tqdm(range(0, len(df), chunk_size))):
        slc = df.iloc[start : start + chunk_size]
        fname = os.path.join(target_dir, f"matched_ise_quotes_min_mem_usage_part_{chunk:04d}.parquet")
        slc.to_parquet(fname, **parquet_wargs)

In [15]:
# create a dask.distributed Client with default arguments
# NOTE(review): presumably picked up by the Modin "dask" engine — confirm
client = Client()

# read the raw csv from the mounted bucket and write it back as chunked
# parquet files into the output directory
df = import_data(file_path_input)
df_to_parquet(df, file_path_output)

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

Distributing Dataframe:   0%           Elapsed time: 00:00, estimated remaining time: ?

In [None]:
# Track the preprocessed files as a reference artifact of the current run.
# add_reference takes a URI, so the reference points at the bucket that is
# mounted at gcs/ (gs://<bucket>/data/preprocessed/) rather than at the
# relative mount path, which only exists inside this Colab session.
dataset_at = wandb.Artifact('data_mem_reduced',type="preprocessed_data")
dataset_at.add_reference("gs://thesis-bucket-option-trade-classification/data/preprocessed/")
# without log_artifact the artifact is only built locally and never recorded
run.log_artifact(dataset_at)