In [1]:
import pandas as pd

# Load the raw candle table (timestamp, open, high, low, close, volume) and
# display it as the cell's output.
df = pd.read_csv(
    filepath_or_buffer="/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
)
df

Unnamed: 0,timestamp,open,high,low,close,volume
0,2018-01-01,13707.91,13818.55,12750.00,13380.00,8607.15640
1,2018-01-02,13382.16,15473.49,12890.02,14675.11,20078.16540
2,2018-01-03,14690.00,15307.56,14150.00,14919.51,15905.48210
3,2018-01-04,14919.51,15280.00,13918.04,15059.54,25224.41500
4,2018-01-05,15059.56,17176.24,14600.00,16960.39,23251.35200
...,...,...,...,...,...,...
1599,2022-05-19,28715.33,30545.18,28691.38,30319.23,67877.36415
1600,2022-05-20,30319.22,30777.33,28730.00,29201.01,60517.25325
1601,2022-05-21,29201.01,29656.18,28947.28,29445.06,20987.13124
1602,2022-05-22,29445.07,30487.99,29255.11,30293.94,36158.98748


In [2]:
import sys
from pathlib import Path

# Current notebook location (the kernel's working directory).
notebook_path = Path().resolve()

# Make the parent folder importable so project packages such as `add_ons`
# resolve. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
project_root = str(notebook_path.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Imported here on purpose: the import only resolves after sys.path is extended.
from add_ons.candle_proportion import add_candle_proportions

# Derive candle-shape features and attach them to the raw OHLCV frame.
df_with_props = add_candle_proportions(
    df,
    period=50,
    additional_props=True,        # add ratios
    include_candle_color=True,    # add 1=red, 2=green
    separatable="no"              # merge into df
)
df_with_props

Unnamed: 0,timestamp,open,high,low,close,volume,upper_shadow,body,lower_shadow,candle_color,upper_body_ratio,lower_body_ratio,upper_lower_body_ratio
0,2018-01-01,13707.91,13818.55,12750.00,13380.00,8607.15640,0.125974,-0.373357,0.717315,1,0.337410,1.921259,0.175619
1,2018-01-02,13382.16,15473.49,12890.02,14675.11,20078.16540,0.875053,1.417120,0.539403,2,0.617487,0.380633,1.622262
2,2018-01-03,14690.00,15307.56,14150.00,14919.51,15905.48210,0.423043,0.250206,0.588695,2,1.690776,2.352839,0.718611
3,2018-01-04,14919.51,15280.00,13918.04,15059.54,25224.41500,0.238032,0.151192,1.081295,2,1.574377,5.000000,0.220136
4,2018-01-05,15059.56,17176.24,14600.00,16960.39,23251.35200,0.225037,1.981729,0.479119,2,0.113556,0.241768,0.469688
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599,2022-05-19,28715.33,30545.18,28691.38,30319.23,67877.36415,0.109006,0.773779,0.011554,2,0.140875,0.014932,5.000000
1600,2022-05-20,30319.22,30777.33,28730.00,29201.01,60517.25325,0.221063,-0.539597,0.227288,1,0.409682,0.421218,0.972612
1601,2022-05-21,29201.01,29656.18,28947.28,29445.06,20987.13124,0.103235,0.119338,0.124071,2,0.865069,1.039664,0.832066
1602,2022-05-22,29445.07,30487.99,29255.11,30293.94,36158.98748,0.095648,0.418411,0.093632,2,0.228598,0.223780,1.021531


In [3]:
# Build the label table in a single chain:
#   1. keep only timestamp / close / candle_color,
#   2. rename candle_color -> labels,
#   3. render timestamps as "YYYY-MM-DD HH:MM:SS" strings,
#   4. translate the colour codes (1 -> "r", 2 -> "g"),
#   5. keep the first 1000 rows.
df_labels = (
    df_with_props[["timestamp", "close", "candle_color"]]
    .rename(columns={"candle_color": "labels"})
    .assign(
        timestamp=lambda frame: pd.to_datetime(frame["timestamp"]).dt.strftime("%Y-%m-%d %H:%M:%S"),
        labels=lambda frame: frame["labels"].map({1: "r", 2: "g"}),
    )
    .head(1000)
)

In [4]:
# Display the label table (timestamp, close, labels) built in the previous cell.
df_labels

Unnamed: 0,timestamp,close,labels
0,2018-01-01 00:00:00,13380.00,r
1,2018-01-02 00:00:00,14675.11,g
2,2018-01-03 00:00:00,14919.51,g
3,2018-01-04 00:00:00,15059.54,g
4,2018-01-05 00:00:00,16960.39,g
...,...,...,...
995,2020-09-22 00:00:00,10529.61,g
996,2020-09-23 00:00:00,10241.46,r
997,2020-09-24 00:00:00,10736.32,g
998,2020-09-25 00:00:00,10686.67,r


# Model

In [4]:
# models/cnn1d_classifier.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl


class CNN1DClassifier(pl.LightningModule):
    """Minimal 1-D CNN followed by a two-layer classifier head (no global pooling).

    Pipeline: Conv1d (padding="same") -> ReLU -> flatten -> fc1 -> ReLU -> fc2.
    """

    def __init__(
        self,
        input_dim,
        num_classes,
        num_filters=2,
        kernel_size=1,
        hidden_dim=10,
        dropout=0.3,
        lr=0.001,
    ):
        """
        Args:
            input_dim (int): Number of features per time step (Conv1d in_channels).
            num_classes (int): Number of output classes.
            num_filters (int): Number of convolutional filters. Default=2.
            kernel_size (int): Convolution kernel size. Default=1.
            hidden_dim (int): Hidden dimension of the fully connected layer. Default=10.
            dropout (float): Stored in ``self.hparams`` only; no dropout layer is
                applied by this model. Default=0.3.
            lr (float): Learning rate for Adam. Default=0.001.
        """
        super().__init__()
        self.save_hyperparameters()

        self.conv = nn.Conv1d(
            in_channels=input_dim,
            out_channels=num_filters,
            kernel_size=kernel_size,
            padding="same",
        )

        # fc1's input width is num_filters * seq_len, which is only known once
        # the first batch arrives. It must nevertheless be registered *here*:
        # Lightning calls configure_optimizers() before the first forward pass,
        # so a layer created later inside forward() would never reach the
        # optimizer (its weights would stay at their random initialisation) and
        # would be absent when a checkpoint is loaded into a fresh instance.
        # LazyLinear registers its parameters now and infers in_features on the
        # first call.
        self.fc1 = nn.LazyLinear(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """
        Args:
            x: (batch_size, seq_len, input_dim)
        Returns:
            logits: (batch_size, num_classes)
        """
        # Conv1d expects (batch, channels, seq_len)
        x = x.permute(0, 2, 1)          # (batch, input_dim, seq_len)
        x = F.relu(self.conv(x))        # (batch, num_filters, seq_len)

        # Flatten across filters + sequence
        x = x.flatten(start_dim=1)      # (batch, num_filters * seq_len)

        x = F.relu(self.fc1(x))
        logits = self.fc2(x)
        return logits

    # ---------- training ----------
    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        x, y_cls = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y_cls)
        self.log("train_loss", loss, prog_bar=True, on_epoch=True)
        return loss

    # ---------- validation ----------
    def validation_step(self, batch, batch_idx):
        """Log cross-entropy loss and accuracy for one validation batch."""
        x, y_cls = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y_cls)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == y_cls).float().mean()
        self.log("val_loss", loss, prog_bar=True, on_epoch=True)
        self.log("val_acc", acc, prog_bar=True, on_epoch=True)

    # ---------- optimizer ----------
    def configure_optimizers(self):
        """Adam over all registered parameters, using the stored learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


# Train

In [5]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
from itertools import islice
from preprocess.classification_pre_dict import preprocess_csv
# from models.LSTM.lstm_classifier import LSTMClassifier
import pandas as pd 
from add_ons.featue_pipeline2 import FeaturePipeline
from add_ons.drop_column import drop_columns
from utils.print_batch import print_batch
from utils.to_address import to_address

def evaluate_model(model, val_loader, label_encoder):
    """
    Evaluate a trained classifier on validation data and print the results.

    Args:
        model (torch.nn.Module): Trained classifier returning logits of shape
            (batch, num_classes).
        val_loader (DataLoader): Yields (X_batch, y_batch) pairs.
        label_encoder (LabelEncoder): Fitted label encoder; ``classes_`` supplies
            the readable class names. Assumes labels are encoded as
            0..len(classes_)-1 (the LabelEncoder convention).

    Prints:
        - Classification report (precision, recall, f1, support).
        - Confusion matrix.
    """
    model.eval()  # switch to evaluation mode

    # Run on whatever device the model currently lives on, so a model that is
    # still on the GPU does not fail on CPU batches.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_preds, all_labels = [], []
    with torch.no_grad():  # disable gradient tracking for speed
        for X_batch, y_batch in val_loader:
            logits = model(X_batch.to(device))     # forward pass
            preds = torch.argmax(logits, dim=1)    # predicted class index
            all_preds.extend(preds.cpu().numpy())  # move to CPU, store
            all_labels.extend(y_batch.cpu().numpy())

    # Pin the label set explicitly: without it, a validation split that happens
    # to miss a class makes classification_report raise because target_names
    # no longer matches the number of observed classes.
    class_ids = list(range(len(label_encoder.classes_)))

    print("\n📊 Validation Report:")
    print(
        classification_report(
            all_labels,
            all_preds,
            labels=class_ids,
            target_names=label_encoder.classes_,
        )
    )

    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    print("Confusion Matrix:")
    print(cm)


def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    seq_len=1,
    hidden_dim=5,
    num_layers=1,
    lr=0.005,
    batch_size=1,
    max_epochs=10,
    save_model=False,
    return_val_accuracy=True,
    test_mode = False
):
    """
    Train a CNN1DClassifier on candle features using PyTorch Lightning.

    Args:
        data_csv (str): Path to candles CSV file.
        labels_csv (str): Path to labels CSV file (class labels).
        model_out_dir (str, optional): Directory where model & metadata are saved.
        do_validation (bool, optional): If True, split data into train/val sets.
        seq_len (int, optional): Number of candles per sequence; forwarded to
            `preprocess_csv` as `n_candles` and recorded in the saved metadata.
        hidden_dim (int, optional): Hidden dimension of the classifier head.
        num_layers (int, optional): Not used by the CNN model; only written to
            the saved metadata (kept for interface compatibility).
        lr (float, optional): Learning rate for optimizer.
        batch_size (int, optional): Batch size for DataLoader.
        max_epochs (int, optional): Number of training epochs.
        save_model (bool, optional): If True, save model checkpoint & metadata.
        return_val_accuracy (bool, optional): If True, return validation accuracy.
        test_mode (bool, optional): If True, stores one inspected training batch
            in the module-level global `df_seq` (via `print_batch`) and runs the
            trainer with `fast_dev_run=True`.

    Returns:
        dict | None: {"accuracy": float | None} if return_val_accuracy=True
        (the value is None when do_validation=False), else None.

    Notes:
        - With `do_validation=True`, `preprocess_csv(..., val_split=True)` returns
          (train_ds, val_ds, label_encoder, df, feature_cols); otherwise it
          returns (full_dataset, label_encoder, df, feature_cols).
        - Each dataset element is an (X_i, y_i) pair; `input_dim` is read from
          `X_i.shape[1]`, i.e. the number of features per time step.
        - The output file names keep their historical "lstm_" prefix so paths
          remain compatible with previously saved artifacts.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_class_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_class_{timestamp}.pkl"

    # --- Define Feature Pipeline ---
    # Drop the raw OHLCV columns; the remaining columns are fed to the model.
    pipeline = FeaturePipeline(
        steps=[lambda df: drop_columns(df, ["open", "high", "close", "volume", "low"])],
    )

    # --- Get dataset(s) ---
    # seq_len is passed through as the window length. Previously a hard-coded 1
    # was used here, so the argument was silently ignored while still being
    # written to the metadata file.
    if do_validation:
        train_ds, val_ds, label_encoder, df, feature_cols = preprocess_csv(
            data_csv, labels_csv,
            n_candles=seq_len,
            val_split=True,
            feature_pipeline=pipeline,
            debug_sample=[0,4,5]
        )
    else:
        full_dataset, label_encoder, df, feature_cols = preprocess_csv(
            data_csv, labels_csv,
            n_candles=seq_len,
            val_split=False,
            feature_pipeline=pipeline
        )

    # --- Model config ---
    # Determine input dimension (#features per time step)
    input_dim = train_ds[0][0].shape[1] if do_validation else full_dataset[0][0].shape[1]
    num_classes = len(label_encoder.classes_)

    # Initialize model
    model = CNN1DClassifier(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_classes=num_classes,
        lr=lr
    )

    # --- DataLoaders ---
    if do_validation:
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader   = DataLoader(val_ds, batch_size=batch_size)
    else:
        train_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)
        val_loader   = None

    # --- Debug: inspect one batch being fed to the model ---
    if test_mode:
        # Exposed as a global so a later notebook cell can display it.
        global df_seq
        df_seq = print_batch(train_loader, feature_cols, batch_idx=2)

    # --- Trainer setup ---
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",   # automatically picks "gpu" if available, else "cpu"
        devices=1,            # use 1 device (GPU if available)
        fast_dev_run=test_mode,    # ✅ runs 1 batch for train + 1 batch for val, no full training
    )

    # Train model
    trainer.fit(model, train_loader, val_loader)

    # --- Save model & metadata ---
    if save_model:
        # Make sure the target directory exists before anything is written.
        Path(model_out_dir).mkdir(parents=True, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
            'input_dim': input_dim,
            'hidden_dim': hidden_dim,
            'num_layers': num_layers,
            'num_classes': num_classes,
            'seq_len': seq_len,
            'lr': lr,
            'label_classes': label_encoder.classes_
        }, meta_out)
        print(f"\n✅ Model saved to {model_out}")
        print(f"✅ Meta saved to {meta_out}")

    # --- Optional evaluation ---
    val_acc = None
    if do_validation:
        # Prints the classification report and confusion matrix.
        evaluate_model(model, val_loader, label_encoder)

        # Second pass to obtain the overall accuracy as a plain float.
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                # Feed the batch on the model's own device.
                logits = model(X_batch.to(model.device))
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y_batch.cpu().numpy())
        val_acc = (torch.tensor(all_preds) == torch.tensor(all_labels)).float().mean().item()

    if return_val_accuracy:
        return {"accuracy": val_acc}


if __name__ == "__main__":
    # Train with a validation split on the precomputed candle-proportion CSV;
    # the in-memory label table is handed over via to_address().
    train_model(
        data_csv="/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles_prop.csv",
        labels_csv=to_address(df_labels),
        do_validation=True,
        test_mode=False,
    )


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



=== DEBUG SAMPLE CHECK ===

--- Sample index 0 ---
Features (sequence):
   timestamp  upper_shadow      body  lower_shadow  Candle_Color  \
0 2018-01-01      0.076003 -0.225254      0.432772             1   

   upper_body_ratio  lower_body_ratio  upper_lower_body_ratio  
0           0.33741          1.921259                0.175619  

Corresponding label:
timestamp    2018-01-01 00:00:00
label                          r
Name: 0, dtype: object
Encoded label: 0

--- Sample index 4 ---
Features (sequence):
   timestamp  upper_shadow      body  lower_shadow  Candle_Color  \
4 2018-01-05       0.14469  1.274181      0.308056             2   

   upper_body_ratio  lower_body_ratio  upper_lower_body_ratio  
4          0.113556          0.241768                0.469688  

Corresponding label:
timestamp    2018-01-05 00:00:00
label                          g
Name: 4, dtype: object
Encoded label: 0

--- Sample index 5 ---
Features (sequence):
   timestamp  upper_shadow     body  lower_shadow  

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2025-08-27 09:49:59.299127: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-27 09:49:59.542758: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756275599.625086    2003 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.



📊 Validation Report:
              precision    recall  f1-score   support

           g       1.00      1.00      1.00       105
           r       1.00      1.00      1.00        95

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
[[105   0]
 [  0  95]]


In [8]:
# NOTE(review): `df_seq` is a global that train_model only assigns when it is
# called with test_mode=True. The training call shown in this notebook passes
# test_mode=False, so this cell relies on state left over from a different run
# and will raise NameError on a fresh kernel unless train_model(..., test_mode=True)
# is executed first.
df_seq

Unnamed: 0,upper_shadow,body,lower_shadow,Candle_Color,upper_body_ratio,lower_body_ratio,upper_lower_body_ratio
0,0.982071,-0.290528,0.267596,1.0,3.380301,0.921068,3.669979
1,0.192047,-0.002515,0.742453,1.0,5.0,5.0,0.258665
2,0.379816,0.012712,0.100885,2.0,5.0,5.0,3.764852
