In [1]:
# This is necessary to recognize the modules
import os
import sys
from decimal import Decimal
import warnings

# Silence all warnings for the whole kernel session (this also hides
# deprecation and pandas warnings raised by later cells).
warnings.filterwarnings("ignore")

# Research directory for this notebook; it is also passed to the candle cache
# loader and used as the base for the data/ and models/ output paths below.
# NOTE: hard-coded absolute path — adjust when running on another machine.
root_path = os.path.abspath('/home/dominhnhat/quants-lab/research_notebooks/bitcoinenaitor')
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path (same pattern as PROJECT_ROOT below).
if root_path not in sys.path:
    sys.path.append(root_path)

# assuming your notebook lives in PROJECT_ROOT/research_notebooks/bitcoinenaitor/
# and the kernel's working directory is the notebook's directory: the project
# root is then two levels above the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if PROJECT_ROOT not in sys.path:
    # Put the project root first so the `core.*` imports in later cells resolve to it.
    sys.path.insert(0, PROJECT_ROOT)

In [2]:
from core.data_sources.clob import CLOBDataSource

# Instantiate the CLOB data source. Nothing is fetched here; the cached
# candles are loaded through this object in the next cell
# (clob.load_candles_cache).
clob = CLOBDataSource()

In [3]:
# Load the candle cache, passing the research directory defined in the first cell.
# NOTE(review): presumably this fills clob.candles_cache from files stored
# under root_path — confirm in CLOBDataSource.
clob.load_candles_cache(root_path)

In [4]:
# Pick the Binance BTC-USDT "1s" candles out of the cache.
# NOTE(review): the key looks like (connector_name, trading_pair, interval),
# matching the attributes read from `candles` when the features are saved — confirm.
candles = clob.candles_cache[("binance", "BTC-USDT", "1s")]

In [5]:
# Candle table used as the input of the labelling step below.
# NOTE(review): presumably this is a reference to the data held by the cached
# candles object rather than a copy — in-place edits would then alter the cache.
df = candles.data

In [6]:
# (rows, columns) of the raw candle table.
df.shape

(259200, 10)

In [7]:
from core.backtesting.triple_barrier_method import triple_barrier_method
# Label every row with side = 1 (presumably "long" — confirm against
# triple_barrier_method) and run the triple-barrier labelling.
#
# The side column is added with assign(), which returns a new frame, instead
# of writing it into `df`: `df` is the cached candle data whose shape was
# displayed above, and mutating it in place would silently change both the
# cache and the meaning of that earlier output.
df_with_side = df.assign(side=1)

# tp / sl / tl / std_span / trade_cost are forwarded unchanged to the project
# helper; see core.backtesting.triple_barrier_method for their exact meaning.
df_with_tbm = triple_barrier_method(
    df_with_side,
    tp=3.5,
    sl=3.5,
    tl=300,
    std_span=200,
    trade_cost=0.0,
)

In [8]:
# Class balance of close_type, the column kept unscaled as the prediction
# target in the feature-engineering cell below.
# NOTE(review): -1 / 0 / 1 presumably stand for stop-loss / time-limit /
# take-profit — confirm in triple_barrier_method.
df_with_tbm.close_type.value_counts()

close_type
 0    119857
-1     72515
 1     66629
Name: count, dtype: int64

In [9]:
# Summary statistics of the `target` column of the labelled frame.
# Despite its name this is not the prediction label (that is `close_type`);
# it is standardised together with the other numeric features further down.
df_with_tbm.target.describe()

count       259001
mean    0.00025326
std     0.00018892
min     0.00000005
25%      0.0001351
50%      0.0002063
75%     0.00031247
max     0.00178846
Name: target, dtype: float64

In [10]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Add technical indicators using pandas_ta
import pandas_ta as ta

# ---------------------------------------------------------------------------
# Feature engineering on top of the triple-barrier labelled frame
# ---------------------------------------------------------------------------

# Indicators are appended to a copy so that df_with_tbm itself stays untouched.
df_with_indicators = df_with_tbm.copy()

# (pandas_ta indicator name, parameters), listed in the order in which their
# columns are appended. The order matters: it fixes the column order of the
# feature frame and therefore the feature order the scaler is fitted on.
indicator_specs = [
    ("bbands", {"length": 20, "std": 2}),             # standard Bollinger Bands
    ("bbands", {"length": 50, "std": 2}),             # longer-term Bollinger Bands
    ("macd", {"fast": 12, "slow": 26, "signal": 9}),  # standard MACD
    ("macd", {"fast": 8, "slow": 21, "signal": 5}),   # faster MACD
    ("rsi", {"length": 14}),                          # standard RSI
    ("rsi", {"length": 21}),                          # longer RSI
    ("sma", {"length": 20}),                          # short simple MA
    ("sma", {"length": 50}),                          # medium simple MA
    ("ema", {"length": 20}),                          # short exponential MA
    ("ema", {"length": 50}),                          # medium exponential MA
    ("atr", {"length": 14}),                          # average true range
    ("stoch", {"k": 14, "d": 3}),                     # stochastic oscillator
    ("adx", {"length": 14}),                          # ADX / directional movement
]
for indicator_name, indicator_params in indicator_specs:
    getattr(df_with_indicators.ta, indicator_name)(append=True, **indicator_params)

# Columns excluded from the feature set.
columns_to_drop = ['timestamp', 'taker_buy_base_volume', 'volume',
                   'close_time', 'real_class', 'ret', 'tp', 'sl',
                   'take_profit_time', 'stop_loss_time', 'tl', 'side']
# Raw OHLC levels are replaced by their one-step percentage changes.
price_columns = ['open', 'high', 'low', 'close']
price_returns = {
    f'{price_col}_ret': df_with_indicators[price_col].pct_change()
    for price_col in price_columns
}

# Build the feature frame in one chain:
#   1. remove the excluded columns
#   2. append the *_ret columns, then remove the raw prices
#   3. append the taker-buy share of quote volume, then remove its numerator
#   4. remove every row that contains a NaN
# Step 4 removes more than the first row of the returns: indicator warm-up
# rows and any undefined ratio (e.g. 0/0 when quote_asset_volume is zero) are
# NaN as well. The recorded run keeps 238315 of the ~259k labelled rows.
df_processed = (
    df_with_indicators
    .drop(columns=columns_to_drop)
    .assign(**price_returns)
    .drop(columns=price_columns)
    .assign(
        buy_volume_ratio=lambda frame: frame['taker_buy_quote_volume'] / frame['quote_asset_volume']
    )
    .drop(columns=['taker_buy_quote_volume'])
    .dropna()
)

# Every float64/int64 column is standardised, except the label.
numeric_columns = list(df_processed.select_dtypes(include=['float64', 'int64']).columns)
numeric_columns.remove('close_type')  # the target keeps its original encoding

# NOTE(review): the scaler is fitted on the complete frame; no train/test
# split is visible before this point — confirm that this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_processed[numeric_columns])
df_processed[numeric_columns] = scaled_values

# Report the final shape and preview the first rows.
print("Processed dataset shape:", df_processed.shape)
df_processed.head()

Processed dataset shape: (238315, 37)


Unnamed: 0_level_0,quote_asset_volume,n_trades,target,close_type,BBL_20_2.0,BBM_20_2.0,BBU_20_2.0,BBB_20_2.0,BBP_20_2.0,BBL_50_2.0,...,STOCHk_14_3_3,STOCHd_14_3_3,ADX_14,DMP_14,DMN_14,open_ret,high_ret,low_ret,close_ret,buy_volume_ratio
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-05-04 08:29:04,-0.15148792,-0.31905545,-0.62987315,1,0.7943657,0.78232764,0.77010068,-0.82203242,0.55026583,0.80556869,...,0.35946062,0.09214339,-1.4939684,-0.32296004,-0.4365209,-0.00058266,-0.00530715,0.00342753,-0.00127716,1.13467609
2025-05-04 08:29:05,0.40151344,1.98234991,-0.62798419,1,0.78803974,0.78167499,0.77512605,-0.45003277,-3.02935123,0.8016671,...,0.35946062,0.37184758,-1.2744241,-1.36552023,1.51948877,-0.00301088,-0.00773518,-2.8066775,-2.80145967,-1.16237301
2025-05-04 08:29:06,-0.14999624,-0.33104194,-0.62755749,1,0.78524738,0.78102235,0.776615,-0.30971399,-2.08945651,0.79994688,...,-0.45202658,0.09214339,-1.07263452,-1.36552629,1.51690458,-2.82702936,-2.83153291,0.00342753,-0.00127716,0.93806513
2025-05-04 08:29:07,-0.14891736,-0.31905545,-0.62932249,1,0.78311004,0.78037083,0.77745063,-0.21226871,-1.65984816,0.79860502,...,-1.26281781,-0.46702511,-0.8870197,-1.36553281,1.51412689,-0.00058266,-0.00530715,0.00342753,0.00112466,-0.21834965
2025-05-04 08:29:08,-0.15363839,-0.33104194,-0.63123873,1,0.78133784,0.77971819,0.7779186,-0.13883616,-1.39816699,0.7974623,...,-1.26281781,-1.0261936,-0.71616113,-1.36553982,1.51114167,-0.00058266,-0.00530715,0.00342753,-0.00367898,-0.97150288


In [12]:
# Persist the processed feature frame as
#   <root_path>/data/features_df/<connector>|<trading_pair>|<interval>.parquet
candles_path = os.path.join(root_path, "data", "features_df")
# Create the output directory on first use: to_parquet does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
os.makedirs(candles_path, exist_ok=True)
filename = os.path.join(candles_path, f"{candles.connector_name}|{candles.trading_pair}|{candles.interval}.parquet")
df_processed.to_parquet(
    filename,
    engine='pyarrow',
    compression='snappy',
    index=True,  # keep the timestamp index in the file
)

In [14]:
# dump the scaler
import joblib

# Store the fitted scaler next to the models so the same standardisation can
# be re-applied later. The directory is created first because joblib.dump
# fails when the parent directory is missing.
models_dir = os.path.join(root_path, "models")
os.makedirs(models_dir, exist_ok=True)
# Kept as the last expression so the cell still displays the written path(s).
joblib.dump(scaler, os.path.join(models_dir, "scaler.pkl"))

['/home/dominhnhat/quants-lab/research_notebooks/bitcoinenaitor/models/scaler.pkl']