In [1]:
from typing import List
import os
import json
from glob import glob
from datetime import datetime

import numpy as np
import pandas as pd
import datatable as dt
from rich import print as rprint
from nptyping import NDArray
# import numba
import talib
from google.cloud import storage
from google.oauth2.service_account import Credentials
import numba

# Location of the GCP service-account key file. The GCP_SECRET_KEY_PATH
# environment variable takes precedence so the notebook is not tied to a
# single machine; the historical path stays as the default.
SECRET_KEY_PATH = os.environ.get('GCP_SECRET_KEY_PATH', '/Users/jo/gcp_secret_key2.json')
# Bucket name (the identifier spelling is kept because other cells reference it).
BAKET_NAME = 'trading_datas_storage2'
GCS_SAVE_DIR = 'trading_datas'  # object-name prefix inside the bucket
INPUT_DIR = 'trading_datas'     # local directory the downloaded files are stored in
OUTPUT_DIR = 'features'         # local directory created below

# Load the key through a context manager so the file handle is closed
# immediately instead of being left for the garbage collector.
with open(SECRET_KEY_PATH) as key_file:
    cred : Credentials = Credentials.from_service_account_info(json.load(key_file))
gcs_client : storage.Client = storage.Client(credentials=cred, project=cred.project_id)
gcs_bucket : storage.Bucket = gcs_client.get_bucket(BAKET_NAME)

os.makedirs(INPUT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
!pwd

/Users/jo/GitHub/MLtrading/notebook/bybit_ordertable


# データ読み込み

In [3]:
# Download from GCS every file that does not yet exist locally.

local_files = {os.path.basename(path) for path in glob(os.path.join(INPUT_DIR, '*.pkl.bz2'))}
gcs_files = {os.path.basename(blob.name) for blob in gcs_client.list_blobs(BAKET_NAME, prefix=GCS_SAVE_DIR)}
# An object whose name ends with '/' has an empty basename; there is nothing
# to download for it, so drop it before computing the difference.
gcs_files.discard('')

wanted_file_names = gcs_files - local_files  # set difference: on GCS but not local
print('wanted_file_names: ')
print(wanted_file_names)

for filename in wanted_file_names:
    # GCS object names always use '/' as the separator, so build the name
    # explicitly instead of with the platform-dependent os.path.join.
    blob = gcs_bucket.blob(f'{GCS_SAVE_DIR}/{filename}')  # object name inside the bucket
    blob.download_to_filename(os.path.join(INPUT_DIR, filename))  # local destination path

wanted_file_names: 
set()


In [4]:
def sort_by_datetime(pathlist: List[str]) -> List[str]:
    """Return the file paths sorted chronologically by the date in each file name.

    The date is the part of the base name before the first ``_`` and must be
    written as ``%Y%m%d`` (e.g. ``20211020_bybit.pkl.bz2``).

    Args:
        pathlist: File paths to sort; the input list is not modified.

    Returns:
        A new list with the same paths in ascending date order.

    Raises:
        ValueError: If a file-name prefix cannot be parsed as ``%Y%m%d``.
    """

    def get_datetime(filename: str) -> datetime:
        """Parse the date prefix of the file name into a datetime."""
        date_str: str = os.path.basename(filename).split('_')[0]
        return datetime.strptime(date_str, '%Y%m%d')

    return sorted(pathlist, key=get_datetime)

In [6]:
bybit_datas_pathlist = glob(os.path.join(INPUT_DIR, '*_bybit.pkl.bz2'))
bybit_datas_pathlist = sort_by_datetime(bybit_datas_pathlist)

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_bybit_list = [pd.read_pickle(path) for path in bybit_datas_pathlist]
df_bybit = pd.concat(df_bybit_list)

columns : List[str] = list(df_bybit.columns)

# Target dtype per column: 'timestamp' becomes a datetime, 'orderbook' is left
# untouched, and every other column is cast to float64.
dtype_dict = {}
for column in columns:
    if column == 'timestamp':
        # pandas 2.x rejects the unit-less 'datetime64'; the unit must be explicit.
        dtype_dict[column] = 'datetime64[ns]'
    elif column == 'orderbook':
        continue
    else:
        dtype_dict[column] = 'float64'


df_bybit = df_bybit.astype(dtype=dtype_dict)
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
# NOTE(review): back-filling copies later values into earlier rows; confirm
# this is acceptable for the features built from this frame.
df_bybit = df_bybit.bfill()
df_bybit = df_bybit.reset_index(drop=True)

print('created df')
display(df_bybit)

df_bybit.to_pickle('df_bybit.pkl')

created df


Unnamed: 0,timestamp,open,high,low,close,buy_volume,sell_volume,buy_price_avg,sell_price_avg,buy_liq_qty,sell_liq_qty,oi_open,oi_high,oi_low,oi_close,orderbook
0,2021-10-20 23:18:30.001663,65996.5,65997.0,65996.5,65996.5,5112.0,10977.0,129.102113,42.085770,0.0,0.0,1.969185e+09,1.969230e+09,1.969185e+09,1.969230e+09,"{'Buy': [{'price': '65996.50', 'size': 2362959..."
1,2021-10-20 23:18:35.000008,65997.0,65997.0,65996.5,65996.5,141948.0,208937.0,4.184441,2.842811,0.0,0.0,1.969230e+09,1.969259e+09,1.968839e+09,1.969259e+09,"{'Buy': [{'price': '65996.50', 'size': 1076854..."
2,2021-10-20 23:18:40.000002,65997.0,65997.0,65996.5,65997.0,40817.0,63913.0,21.019698,8.260792,0.0,0.0,1.969259e+09,1.969504e+09,1.969259e+09,1.969504e+09,"{'Buy': [{'price': '65996.50', 'size': 1562067..."
3,2021-10-20 23:18:45.000002,65996.5,65997.0,65996.5,65997.0,92649.0,13793.0,5.698669,28.708693,0.0,0.0,1.969504e+09,1.969535e+09,1.969504e+09,1.969535e+09,"{'Buy': [{'price': '65996.50', 'size': 1228344..."
4,2021-10-20 23:18:50.000005,65996.5,65997.0,65996.5,65997.0,14071.0,13244.0,14.070855,34.881871,0.0,0.0,1.969535e+09,1.969544e+09,1.969489e+09,1.969515e+09,"{'Buy': [{'price': '65996.50', 'size': 1290222..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1244463,2021-12-31 23:59:35.186087,47983.5,47983.5,47983.5,47983.5,133.0,0.0,721.556391,21.382799,0.0,0.0,9.836653e+08,9.836654e+08,9.836653e+08,9.836654e+08,"{'Buy': [{'price': '47983.00', 'size': 2571797..."
1244464,2021-12-31 23:59:40.022507,47983.0,47983.5,47983.0,47983.5,10.0,2244.0,4798.350000,21.382799,0.0,0.0,9.836654e+08,9.836654e+08,9.836654e+08,9.836654e+08,"{'Buy': [{'price': '47983.00', 'size': 2219711..."
1244465,2021-12-31 23:59:45.125927,47983.5,47983.5,47983.5,47983.5,7.0,0.0,6854.785714,22.913961,0.0,0.0,9.836654e+08,9.836654e+08,9.836654e+08,9.836654e+08,"{'Buy': [{'price': '47983.00', 'size': 2368680..."
1244466,2021-12-31 23:59:50.126301,47983.0,47983.5,47979.5,47979.5,0.0,0.0,47981.750000,22.913961,0.0,0.0,9.836654e+08,9.836654e+08,9.836654e+08,9.836654e+08,"{'Buy': [{'price': '47983.00', 'size': 2038524..."


In [None]:
def _cal_ask_bit_spread(row):
    best_bit = float(row['Buy'][0]['price'])
    best_ask = float(row['Sell'][0]['price'])
    return best_ask - best_bit

def _create_ask_bit_size_ratio(row):
    best_bit_size = float(row['Buy'][0]['size'])
    best_ask_size = float(row['Sell'][0]['size'])
    return np.log(best_ask_size / best_bit_size)

def cal_ba_spread_mean(df) -> float:
    """Return the mean bid-ask spread over the ``orderbook`` column of ``df``.

    The per-row spread comes from ``_cal_ask_bit_spread``, applied to every
    entry of ``df['orderbook']`` through ``np.vectorize``.
    """
    spread_of = np.vectorize(_cal_ask_bit_spread)
    return np.mean(spread_of(df['orderbook']))

def cal_ba_spread_std(df) -> float:
    """Return the standard deviation of the bid-ask spread over ``df['orderbook']``.

    The per-row spread comes from ``_cal_ask_bit_spread``; the deviation is
    computed by ``np.std`` with its default arguments.
    """
    spread_of = np.vectorize(_cal_ask_bit_spread)
    return np.std(spread_of(df['orderbook']))

def cal_ba_ratio_mean(df) -> float:
    """Return the mean log ask/bid size ratio over the ``orderbook`` column of ``df``.

    The per-row ratio comes from ``_create_ask_bit_size_ratio``, applied to
    every entry of ``df['orderbook']`` through ``np.vectorize``.
    """
    ratio_of = np.vectorize(_create_ask_bit_size_ratio)
    return np.mean(ratio_of(df['orderbook']))

def cal_ba_ratio_std(df) -> float:
    """Return the standard deviation of the log ask/bid size ratio over ``df['orderbook']``.

    The per-row ratio comes from ``_create_ask_bit_size_ratio``; the deviation
    is computed by ``np.std`` with its default arguments.
    """
    ratio_of = np.vectorize(_create_ask_bit_size_ratio)
    return np.std(ratio_of(df['orderbook']))



def convert_timescale(df: pd.DataFrame, timescale: int) -> pd.DataFrame:
    """Aggregate consecutive rows of ``df`` into coarser OHLC-style bars.

    Rows are grouped positionally into windows of ``timescale`` rows (windows
    are defined by row count, not by the timestamps). The last window may
    hold fewer than ``timescale`` rows and is still emitted.

    Per window: ``timestamp``/``close``/``oi_close`` take the last row,
    ``open``/``oi_open`` the first row, ``high``/``oi_high`` the maximum,
    ``low``/``oi_low`` the minimum, the volume and liquidation quantities are
    summed, the average prices are averaged, and four order-book statistics
    (``ba_spread_mean``, ``ba_spread_std``, ``ba_ratio_mean``,
    ``ba_ratio_std``) are computed from the window's ``orderbook`` column.

    Args:
        df: Source frame containing the columns listed above plus ``orderbook``.
            It is not modified.
        timescale: Number of source rows per output row; must be positive.

    Returns:
        A new frame with one row per window and a fresh ``RangeIndex``. The
        input columns come first (columns that are not aggregated, such as
        ``orderbook``, are kept and filled with NaN), followed by the four
        ``ba_*`` columns.

    Raises:
        ValueError: If ``timescale`` is not positive.
    """
    if timescale <= 0:
        raise ValueError(f'timescale must be a positive integer, got {timescale!r}')

    ba_columns = ['ba_spread_mean', 'ba_spread_std', 'ba_ratio_mean', 'ba_ratio_std']

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for start in range(0, len(df), timescale):
        # .iloc makes the positional intent explicit; slicing never raises,
        # it simply yields a shorter final window.
        df_s = df.iloc[start:start + timescale]
        records.append({
            'timestamp': df_s['timestamp'].iloc[-1],
            'open': df_s['open'].iloc[0],
            'high': df_s['high'].max(),
            'low': df_s['low'].min(),
            'close': df_s['close'].iloc[-1],
            'buy_volume': df_s['buy_volume'].sum(),
            'sell_volume': df_s['sell_volume'].sum(),
            'buy_price_avg': df_s['buy_price_avg'].mean(),
            'sell_price_avg': df_s['sell_price_avg'].mean(),
            'buy_liq_qty': df_s['buy_liq_qty'].sum(),
            'sell_liq_qty': df_s['sell_liq_qty'].sum(),
            'oi_open': df_s['oi_open'].iloc[0],
            'oi_high': df_s['oi_high'].max(),
            'oi_low': df_s['oi_low'].min(),
            'oi_close': df_s['oi_close'].iloc[-1],
            'ba_spread_mean': cal_ba_spread_mean(df=df_s),
            'ba_spread_std': cal_ba_spread_std(df=df_s),
            'ba_ratio_mean': cal_ba_ratio_mean(df=df_s),
            'ba_ratio_std': cal_ba_ratio_std(df=df_s),
        })

    # Keep the column layout of the previous implementation: every input
    # column first, then the newly derived order-book statistics.
    result_columns = list(df.columns) + [c for c in ba_columns if c not in df.columns]
    return pd.DataFrame(records, columns=result_columns)

# The source rows are spaced 5 seconds apart (see the timestamps displayed
# above), so 12 rows make up one minute.
BARS_PER_MINUTE = 12

df = pd.read_pickle('df_bybit.pkl')

df_15m = convert_timescale(df=df, timescale=BARS_PER_MINUTE * 15)
# Other resolutions can be built the same way:
# df_1m = convert_timescale(df=df, timescale=BARS_PER_MINUTE)
# df_2m = convert_timescale(df=df, timescale=BARS_PER_MINUTE * 2)
# df_5m = convert_timescale(df=df, timescale=BARS_PER_MINUTE * 5)
df_15m.to_pickle('df_bybit_15m.pkl.bz2')