In [1]:
# This is necessary to recognize the modules
import os
import sys
from decimal import Decimal
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings from pandas / scikit-learn / torch — consider narrowing it.
warnings.filterwarnings("ignore")

# Project root: two directories above the current working directory.
root_path = os.path.abspath(os.path.join(os.getcwd(), '../..'))
# Register the root on sys.path only once, so re-running this cell does not
# keep appending duplicate entries.
if root_path not in sys.path:
    sys.path.append(root_path)

In [2]:
# Load data
connector_name = "binance"
trading_pair = "BTC-USDT"
interval = "1s"

# Build the dataset location from the project root and the identifiers above
# instead of a machine-specific absolute path.
# NOTE(review): assumes the notebook runs from
# <root>/research_notebooks/bitcoinenaitor, which is what
# root_path (= cwd/../..) implies — confirm.
features_path = os.path.join(
    root_path,
    "research_notebooks",
    "bitcoinenaitor",
    "data",
    "features_df",
    f"{connector_name}|{trading_pair}|{interval}.parquet",
)
df_with_features = pd.read_parquet(features_path)
print(df_with_features.columns)
print(len(df_with_features.columns))

# Prepare features and target: every column that is not listed here is used
# as a model input.
non_feature_columns = ['timestamp', 'tl', 'stop_loss_time',
                       'take_profit_time', 'close_time', 'close_type',
                       'real_class', 'ret']
feature_columns = [col for col in df_with_features.columns if col not in non_feature_columns]
# NOTE(review): a column named 'target' is not excluded above and therefore
# stays in the feature set — confirm it carries no label information.

print(len(feature_columns))

X = df_with_features[feature_columns]
y = df_with_features['close_type']

# Print initial class distribution
print("Initial class distribution:")
print(y.value_counts().sort_index())
print("\n")


# Down-sample class 0 to half the number of non-zero rows, i.e. the average
# size of the -1 and +1 classes; those two classes are kept in full.
close_type_col = df_with_features['close_type']
target_size = int((close_type_col != 0).sum()) // 2

df_neg = df_with_features.loc[close_type_col == -1]
df_pos = df_with_features.loc[close_type_col == 1]
df_mid = df_with_features.loc[close_type_col == 0].sample(n=target_size, random_state=42)

# Stack the three classes in the order -1, 0, +1 (rows are not shuffled here)
balanced_df = pd.concat((df_neg, df_mid, df_pos))

X_balanced = balanced_df.loc[:, feature_columns]
y_balanced = balanced_df['close_type']

# Report the per-class row counts after balancing
print("Balanced class distribution:", y_balanced.value_counts().sort_index(), "\n", sep="\n")



# Hold out 40% of the balanced rows for evaluation (shuffled, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.4,
    shuffle=True,
    random_state=42,
)

# Configure a random forest of 500 shallow (depth-3) trees
model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=3,
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training part
print("Training model...")
model.fit(X_train, y_train)

# Predict the held-out rows and summarise per-class precision / recall / F1
y_pred = model.predict(X_test)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Persist the fitted estimator held in `model`. No scaler is created anywhere
# in this cell, so only the model is written.
print("\nSaving model...")
# NOTE: the "_xgb_model" suffix is kept unchanged so that anything loading the
# artifact by this file name keeps working; the object stored is whatever
# `model` currently refers to.
model_path = os.path.join(root_path, "models", f"{connector_name}_{trading_pair}_{interval}_xgb_model.joblib")

# Create models directory if it doesn't exist
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialise the model to disk
joblib.dump(model, model_path)

print(f"Model saved to: {model_path}")

Index(['quote_asset_volume', 'n_trades', 'target', 'close_type', 'BBL_20_2.0',
       'BBM_20_2.0', 'BBU_20_2.0', 'BBB_20_2.0', 'BBP_20_2.0', 'BBL_50_2.0',
       'BBM_50_2.0', 'BBU_50_2.0', 'BBB_50_2.0', 'BBP_50_2.0', 'MACD_12_26_9',
       'MACDh_12_26_9', 'MACDs_12_26_9', 'MACD_8_21_5', 'MACDh_8_21_5',
       'MACDs_8_21_5', 'RSI_14', 'RSI_21', 'SMA_20', 'SMA_50', 'EMA_20',
       'EMA_50', 'ATRr_14', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'ADX_14',
       'DMP_14', 'DMN_14', 'open_ret', 'high_ret', 'low_ret', 'close_ret',
       'buy_volume_ratio'],
      dtype='object')
37
36
Initial class distribution:
close_type
-1     66271
 0    110564
 1     61480
Name: count, dtype: int64


Balanced class distribution:
close_type
-1    66271
 0    63875
 1    61480
Name: count, dtype: int64


Training model...

Classification Report:
              precision    recall  f1-score   support

          -1       0.48      0.52      0.50     26337
           0       0.48      0.80      0.61     25498
  

In [3]:
# Second name for the class-balanced frame (same object, no copy)
df_balanced = balanced_df

print(balanced_df.columns)

# Stratified 90/10 split on close_type with a fixed seed
train_df, test_df = train_test_split(
    df_balanced,
    stratify=df_balanced['close_type'],
    test_size=0.1,
    random_state=42,
)

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
print("Train class counts:", train_df['close_type'].value_counts().to_dict())
print("Test class counts:", test_df['close_type'].value_counts().to_dict())

# Feature matrices and copied label series; these rebind X_train / X_test /
# y_train / y_test.
X_train, X_test = train_df[feature_columns], test_df[feature_columns]
y_train, y_test = train_df['close_type'].copy(), test_df['close_type'].copy()

# Shift labels from {-1, 0, 1} to class indices {0, 1, 2} for PyTorch
label_mapping = {-1: 0, 0: 1, 1: 2}
y_train_mapped = y_train.map(label_mapping).to_numpy()
y_test_mapped = y_test.map(label_mapping).to_numpy()

Index(['quote_asset_volume', 'n_trades', 'target', 'close_type', 'BBL_20_2.0',
       'BBM_20_2.0', 'BBU_20_2.0', 'BBB_20_2.0', 'BBP_20_2.0', 'BBL_50_2.0',
       'BBM_50_2.0', 'BBU_50_2.0', 'BBB_50_2.0', 'BBP_50_2.0', 'MACD_12_26_9',
       'MACDh_12_26_9', 'MACDs_12_26_9', 'MACD_8_21_5', 'MACDh_8_21_5',
       'MACDs_8_21_5', 'RSI_14', 'RSI_21', 'SMA_20', 'SMA_50', 'EMA_20',
       'EMA_50', 'ATRr_14', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'ADX_14',
       'DMP_14', 'DMN_14', 'open_ret', 'high_ret', 'low_ret', 'close_ret',
       'buy_volume_ratio'],
      dtype='object')
Train size: 172463, Test size: 19163
Train class counts: {-1: 59644, 0: 57487, 1: 55332}
Test class counts: {-1: 6627, 0: 6388, 1: 6148}


In [4]:
import torch
import torch.nn as nn

# Define a simple MLP model
class MLP(nn.Module):
    """Fully connected classifier producing raw class logits.

    Every hidden stage is ``Linear -> ReLU -> Dropout``; the final layer is a
    plain ``Linear``. Sub-modules are registered as ``fc{i}``, ``relu{i}`` and
    ``drop{i}`` (1-based) and the output layer as ``fc{len(hidden_dims) + 1}``.
    With the default arguments this gives ``fc1`` ... ``fc5`` in the same
    registration order as the original fixed-size network, so ``state_dict``
    keys and the printed module layout are unchanged.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Output width of each hidden ``Linear`` layer, in order.
        num_classes: Number of logits produced by the output layer.
        dropout: Dropout probability used after every hidden activation.
    """

    def __init__(self, input_dim, hidden_dims=(512, 256, 128, 64), num_classes=3, dropout=0.2):
        super().__init__()

        # Attribute names of the (linear, activation, dropout) triple of each
        # hidden stage, in forward order. A plain list of strings, so it is
        # not part of the state_dict.
        self._hidden_stages = []

        in_features = input_dim
        for idx, out_features in enumerate(tuple(hidden_dims), start=1):
            stage = (f"fc{idx}", f"relu{idx}", f"drop{idx}")
            # setattr on an nn.Module registers the value as a sub-module
            setattr(self, stage[0], nn.Linear(in_features, out_features))
            setattr(self, stage[1], nn.ReLU())
            setattr(self, stage[2], nn.Dropout(dropout))
            self._hidden_stages.append(stage)
            in_features = out_features

        # Output layer continues the fc numbering (fc5 with the defaults)
        self._output_name = f"fc{len(self._hidden_stages) + 1}"
        setattr(self, self._output_name, nn.Linear(in_features, num_classes))

    def forward(self, x):
        """Apply every hidden stage in order, then the output layer.

        Args:
            x: Input tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor of unnormalised logits with last dimension ``num_classes``.
        """
        for fc_name, relu_name, drop_name in self._hidden_stages:
            x = getattr(self, fc_name)(x)
            x = getattr(self, relu_name)(x)
            x = getattr(self, drop_name)(x)

        return getattr(self, self._output_name)(x)

# Seed PyTorch's global random generator before any weights are created so
# that weight initialisation repeats across fresh "Restart & Run All" runs.
# Later steps that draw from the same default generator follow a repeatable
# sequence as well.
# NOTE(review): some CUDA kernels may still be nondeterministic — confirm if
# bit-exact reruns matter.
torch.manual_seed(42)

# Initialize the model
input_dim = X_train.shape[1]  # number of feature columns
model = MLP(input_dim)
print(input_dim)

36


In [5]:
# Use the GPU when CUDA is available, otherwise stay on the CPU, and move the
# model's parameters there. The trailing expression displays the module layout.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

MLP(
  (fc1): Linear(in_features=36, out_features=512, bias=True)
  (relu1): ReLU()
  (drop1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (relu2): ReLU()
  (drop2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (relu3): ReLU()
  (drop3): Dropout(p=0.2, inplace=False)
  (fc4): Linear(in_features=128, out_features=64, bias=True)
  (relu4): ReLU()
  (drop4): Dropout(p=0.2, inplace=False)
  (fc5): Linear(in_features=64, out_features=3, bias=True)
)

In [6]:
from torch.utils.data import TensorDataset, DataLoader

# Feature frames become float32 tensors; mapped labels become int64 (long) tensors
X_train_tensor, X_test_tensor = (
    torch.tensor(frame.values, dtype=torch.float32) for frame in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.long) for labels in (y_train_mapped, y_test_mapped)
)

print(X_test_tensor.shape)

# Pair each feature tensor with its labels
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Mini-batches of 1024 rows; only the training batches are shuffled
bsize = 1024
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=bsize)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=bsize)

torch.Size([19163, 36])


In [8]:
# Multi-class cross-entropy on the raw logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

# Train for a fixed number of passes over the training loader
epochs = 150
for epoch in range(1, epochs + 1):
    model.train()  # training mode (dropout active)
    epoch_loss_total = 0.0

    for batch_X, batch_y in train_loader:
        # Move the mini-batch to the same device as the model
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()                       # clear gradients of the previous step
        loss = criterion(model(batch_X), batch_y)   # forward pass + loss
        loss.backward()                             # backpropagation
        optimizer.step()                            # parameter update

        epoch_loss_total += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_loss = epoch_loss_total / len(train_loader)
    print(f"Epoch {epoch}/{epochs}, Training Loss: {avg_loss:.4f}")

Epoch 1/150, Training Loss: 0.9372
Epoch 2/150, Training Loss: 0.9313
Epoch 3/150, Training Loss: 0.9275
Epoch 4/150, Training Loss: 0.9228
Epoch 5/150, Training Loss: 0.9212
Epoch 6/150, Training Loss: 0.9171
Epoch 7/150, Training Loss: 0.9142
Epoch 8/150, Training Loss: 0.9086
Epoch 9/150, Training Loss: 0.9068
Epoch 10/150, Training Loss: 0.9019
Epoch 11/150, Training Loss: 0.8993
Epoch 12/150, Training Loss: 0.8964
Epoch 13/150, Training Loss: 0.8933
Epoch 14/150, Training Loss: 0.8902
Epoch 15/150, Training Loss: 0.8877
Epoch 16/150, Training Loss: 0.8820
Epoch 17/150, Training Loss: 0.8823
Epoch 18/150, Training Loss: 0.8781
Epoch 19/150, Training Loss: 0.8743
Epoch 20/150, Training Loss: 0.8727
Epoch 21/150, Training Loss: 0.8691
Epoch 22/150, Training Loss: 0.8667
Epoch 23/150, Training Loss: 0.8661
Epoch 24/150, Training Loss: 0.8645
Epoch 25/150, Training Loss: 0.8588
Epoch 26/150, Training Loss: 0.8578
Epoch 27/150, Training Loss: 0.8551
Epoch 28/150, Training Loss: 0.8544
E

In [9]:
from sklearn.metrics import classification_report
import numpy as np
model.eval()  # set model to evaluation mode
pred_batches = []
true_batches = []

# Inference only, so gradient tracking is switched off
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X.to(device))
        preds = outputs.argmax(dim=1)  # predicted class index for each sample
        pred_batches.append(preds.cpu().numpy())
        # Labels are only collected, never fed to the model, so they are not
        # moved to the device first.
        true_batches.append(batch_y.cpu().numpy())

# Concatenate all batches into flat arrays of class indices
y_pred = np.concatenate(pred_batches)
y_true = np.concatenate(true_batches)

# Map class indices 0/1/2 back to the original labels -1/0/1 with one
# vectorised lookup; position i of the table holds the label for index i.
inv_mapping = {0: -1, 1: 0, 2: 1}
index_to_label = np.array([inv_mapping[idx] for idx in sorted(inv_mapping)])
y_pred_orig = index_to_label[y_pred].tolist()
y_true_orig = index_to_label[y_true].tolist()

# Print classification report
print(classification_report(y_true_orig, y_pred_orig, labels=[-1, 0, 1], target_names=["-1", "0", "1"]))

              precision    recall  f1-score   support

          -1       0.72      0.64      0.68      6627
           0       0.71      0.76      0.73      6388
           1       0.67      0.70      0.68      6148

    accuracy                           0.70     19163
   macro avg       0.70      0.70      0.70     19163
weighted avg       0.70      0.70      0.70     19163



In [11]:
# Save only the network's learned parameters (its state_dict).
# The location is derived from the project root instead of a machine-specific
# absolute path.
# NOTE(review): assumes root_path is the repository root that contains
# research_notebooks/bitcoinenaitor — confirm.
model_path = os.path.join(root_path, "research_notebooks", "bitcoinenaitor", "models", "mlp.pth")
# Make sure the target directory exists before writing
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /home/dominhnhat/quants-lab/research_notebooks/bitcoinenaitor/models/mlp.pth


In [None]:
# Display the (rows, columns) shape of the training feature matrix
X_train.shape 

(172463, 36)