In [2]:
import pandas as pd
import talib

# Read the daily BTC/USDT candle file.
df = pd.read_csv("/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv")

# Lower-case every header so the column lookups below do not depend on how the
# CSV capitalises them (adjust if your file differs).
# Columns used further down: timestamp, open, high, low, close
df.columns = list(map(str.lower, df.columns))

# TA-Lib candlestick recognisers to run, keyed by their TA-Lib function name.
candle_funcs = {
    pattern: getattr(talib, pattern)
    for pattern in (
        'CDLDOJI',
        'CDLHAMMER',
        'CDLHANGINGMAN',
        'CDLENGULFING',
        'CDLSHOOTINGSTAR',
        'CDLMORNINGSTAR',
        'CDLEVENINGSTAR',
        # add more as needed...
    )
}

# Start from an all-zero label column aligned with the candle index.
labels = pd.Series(0, index=df.index, dtype=int)

for name, func in candle_funcs.items():
    pattern_result = func(*(df[col] for col in ('open', 'high', 'low', 'close')))
    # TA-Lib returns +100/-100/0. Rows where this pattern is nonzero take its
    # value; rows where it is 0 keep whatever label they already had, so a
    # pattern later in `candle_funcs` overrides an earlier one on the same row.
    labels = labels.mask(pattern_result != 0, pattern_result)

# Pair each candle's timestamp with its final label.
result = pd.DataFrame({
    "timestamp": df["timestamp"],
    "label": labels
})

print(result.head())


    timestamp  label
0  2018-01-01      0
1  2018-01-02      0
2  2018-01-03      0
3  2018-01-04      0
4  2018-01-05      0


In [1]:
import sys
from pathlib import Path

# Current notebook location. Path() is the process working directory, so this is
# the notebook's folder only when the kernel was started from that folder.
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path so the project packages can be imported.
# Guarded so that re-running this cell does not append the same entry again.
if str(notebook_path.parent) not in sys.path:
    sys.path.append(str(notebook_path.parent))
from utils.json_to_csv import json_to_csv
import pandas as pd
# The value returned by json_to_csv is passed straight to pd.read_csv, so it is
# presumably the location of the converted CSV — confirm in utils/json_to_csv.py.
string_seq = json_to_csv("/home/iatell/projects/meta-learning/data/string_sequence.json")
df = pd.read_csv(string_seq)
df

Unnamed: 0,startIndex,endIndex,startTime,endTime,label
0,1386,1409,1636502400,1634515200,a
1,1407,1431,1638403200,1636329600,a
2,1468,1491,1641600000,1643587200,s
3,1494,1513,1643846400,1645488000,s
4,1303,1367,1627344000,1632873600,s


In [4]:
import sys
from pathlib import Path

# Current notebook location. Path() is the process working directory, so this is
# the notebook's folder only when the kernel was started from that folder.
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path so the project packages can be imported.
# Guarded so that re-running this cell does not append the same entry again.
if str(notebook_path.parent) not in sys.path:
    sys.path.append(str(notebook_path.parent))
import pytorch_lightning as pl

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch
class LSTMClassifier(pl.LightningModule):
    """LSTM classifier for zero-padded, variable-length sequence batches.

    The padded batch is packed with the true sequence lengths before it is fed
    to the LSTM, and the final hidden state of the last LSTM layer is projected
    to one logit per class.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, lr=0.001):
        super().__init__()
        self.save_hyperparameters()
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, num_classes)
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.lr = lr

    def forward(self, x, lengths):
        """Return class logits for a padded batch.

        Args:
            x: Padded batch laid out batch-first, i.e. (batch, max_seq_len, input_dim).
            lengths: True length of every sequence in `x`; a tensor on any
                device or a plain sequence of ints.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # pack_padded_sequence needs the lengths on the CPU; as_tensor also lets
        # callers pass a plain list instead of a tensor.
        cpu_lengths = torch.as_tensor(lengths).cpu()
        packed = pack_padded_sequence(x, cpu_lengths, batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        # hn: (num_layers, batch, hidden_dim); hn[-1] is the last layer's hidden state
        return self.fc(hn[-1])

    def _shared_step(self, batch):
        """Unpack an (X, y, lengths) batch and return (logits, y, loss)."""
        X, y, lengths = batch
        logits = self(X, lengths)
        return logits, y, self.loss_fn(logits, y)

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it once per epoch."""
        _, y, loss = self._shared_step(batch)
        # batch_size is passed explicitly so the epoch average is weighted by the
        # real number of sequences instead of a size inferred from the batch tuple.
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True, batch_size=y.size(0))
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one batch."""
        logits, y, loss = self._shared_step(batch)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log("val_loss", loss, prog_bar=True, batch_size=y.size(0))
        self.log("val_acc", acc, prog_bar=True, batch_size=y.size(0))

    def configure_optimizers(self):
        """Optimise all parameters with Adam at the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)


In [2]:
import sys
from pathlib import Path

# Current notebook location. Path() is the process working directory, so this is
# the notebook's folder only when the kernel was started from that folder.
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path so the project packages can be imported.
# Guarded so that re-running this cell does not append the same entry again.
if str(notebook_path.parent) not in sys.path:
    sys.path.append(str(notebook_path.parent))
from utils.json_to_csv import json_to_csv
import pandas as pd
# The value returned by json_to_csv is passed straight to pd.read_csv, so it is
# presumably the location of the converted CSV — confirm in utils/json_to_csv.py.
string_seq = json_to_csv("/home/iatell/projects/meta-learning/data/string_sequence.json")
df_label = pd.read_csv(string_seq)

In [5]:
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
from preprocess.classification_dif_seq import preprocess_sequences_csv
# from models.LSTM.lstm_classifier import LSTMClassifier
import pandas as pd 
from utils.print_batch import print_batch
from utils.to_address import to_address

# ---------------- Collate fn for variable-length sequences ---------------- #
def collate_batch(batch):
    """
    Pads variable-length sequences in the batch.

    Args:
        batch: non-empty list of tuples (X, y), where
               - X: Tensor of shape (seq_len, feature_dim)
               - y: scalar label, either a 0-dim Tensor or a plain number
    Returns:
        padded_X: Tensor of shape (batch, max_seq_len, feature_dim); shorter
                  sequences are zero-padded at the end
        y: Tensor of shape (batch,)
        lengths: 1-D int64 Tensor holding the original length of each sequence
    Raises:
        ValueError: if `batch` is empty.
    """
    if len(batch) == 0:
        # zip(*batch) on an empty list would fail with an opaque unpacking error.
        raise ValueError("collate_batch received an empty batch")

    Xs, ys = zip(*batch)
    lengths = torch.tensor([x.size(0) for x in Xs], dtype=torch.long)
    padded_X = pad_sequence(list(Xs), batch_first=True)  # (batch, max_seq_len, feature_dim)
    # as_tensor leaves tensors untouched and wraps plain numbers, so datasets
    # that yield Python/NumPy scalars as labels work as well.
    y = torch.stack([torch.as_tensor(label) for label in ys])
    return padded_X, y, lengths


# ---------------- Evaluation ---------------- #
def evaluate_model(model, val_loader, label_encoder):
    """Print a classification report and confusion matrix for a validation loader.

    Args:
        model: Callable as ``model(X_batch, lengths)`` and returning per-class
            logits of shape (batch, num_classes). It is switched to eval mode
            and left in that mode.
        val_loader: Iterable yielding ``(X_batch, y_batch, lengths)`` tuples.
        label_encoder: Object exposing ``classes_``; the integer labels in
            ``y_batch`` are assumed to be positions in that array (as produced
            by sklearn's LabelEncoder) — confirm against the preprocessing code.

    Returns:
        None. The report and the matrix are printed.
    """
    model.eval()
    # Feed the batches on the same device as the model's weights, so the
    # function does not depend on where the caller left the model.
    first_param = next(model.parameters(), None)
    all_preds, all_labels = [], []

    with torch.no_grad():
        for X_batch, y_batch, lengths in val_loader:
            if first_param is not None:
                X_batch = X_batch.to(first_param.device)
            logits = model(X_batch, lengths)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    # Pin the label set to every known class. Without `labels`, sklearn derives
    # it from the data and raises when a small validation split is missing one
    # of the classes listed in `target_names`.
    class_ids = list(range(len(label_encoder.classes_)))

    print("\n📊 Validation Report:")
    print(classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,  # classes with no samples report 0 without a warning
    ))

    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    print("Confusion Matrix:")
    print(cm)


# ---------------- Train ---------------- #
def _validation_accuracy(model, val_loader):
    """Return the fraction of sequences in `val_loader` that `model` classifies correctly."""
    model.eval()
    # Feed the batches on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch, lengths in val_loader:
            if first_param is not None:
                X_batch = X_batch.to(first_param.device)
            preds = torch.argmax(model(X_batch, lengths), dim=1).cpu()
            correct += (preds == y_batch.cpu()).sum().item()
            total += y_batch.numel()
    # An empty loader has no defined accuracy.
    return correct / total if total else float("nan")


def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    hidden_dim=128,
    num_layers=1,
    lr=0.001,
    batch_size=32,
    max_epochs=50,
    save_model=True,
    return_val_accuracy=True,
    test_mode=False
):
    """
    Train an LSTM classifier with variable-length sequences.

    Args:
        data_csv: Passed to preprocess_sequences_csv as the data source.
        labels_csv: Passed to preprocess_sequences_csv as the label source.
        model_out_dir: Directory that receives the checkpoint and the meta file.
        do_validation: If true, preprocessing is asked for a train/validation
            split, validation runs during fitting, and a report is printed
            after training.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        lr: Learning rate.
        batch_size: Batch size of the train and validation loaders.
        max_epochs: Upper bound on training epochs.
        save_model: If true, write the trainer checkpoint and a joblib file
            with the model configuration and label classes.
        return_val_accuracy: If true, return ``{"accuracy": val_acc}``.
        test_mode: If true, store the result of print_batch in the module-level
            global ``df_seq`` and run the trainer with ``fast_dev_run=True``.

    Returns:
        ``{"accuracy": val_acc}`` when `return_val_accuracy` is true (``val_acc``
        is None when `do_validation` is false); otherwise None.
    """
    import os

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_seq_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_seq_{timestamp}.pkl"

    # --- Get dataset(s) --- #
    # With val_split=False the whole dataset is the training set.
    if do_validation:
        train_ds, val_ds, label_encoder, _, feature_cols = preprocess_sequences_csv(
            data_csv, labels_csv, val_split=True
        )
    else:
        train_ds, label_encoder, _, feature_cols = preprocess_sequences_csv(
            data_csv, labels_csv, val_split=False
        )
        val_ds = None

    # --- Model config --- #
    # Each dataset item starts with a (seq_len, feature_dim) tensor.
    input_dim = train_ds[0][0].shape[1]
    num_classes = len(label_encoder.classes_)

    model = LSTMClassifier(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        num_classes=num_classes,
        lr=lr
    )

    # --- DataLoaders --- #
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = None
    if do_validation:
        val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch)

    # Debug batch
    if test_mode:
        # Exposed as a module-level global so it can be inspected after the call.
        global df_seq
        # NOTE(review): batch_idx=2 presumably needs at least three training
        # batches — confirm how print_batch handles shorter loaders.
        df_seq = print_batch(train_loader, feature_cols, batch_idx=2)

    # --- Trainer --- #
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        log_every_n_steps=10,
        fast_dev_run=test_mode,
    )

    trainer.fit(model, train_loader, val_loader)

    # --- Save --- #
    if save_model:
        # Create the target directory explicitly instead of relying on the
        # checkpoint writer to have done so before joblib.dump runs.
        if model_out_dir:
            os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
            'input_dim': input_dim,
            'hidden_dim': hidden_dim,
            'num_layers': num_layers,
            'num_classes': num_classes,
            'lr': lr,
            'label_classes': label_encoder.classes_
        }, meta_out)
        print(f"\n✅ Model saved to {model_out}")
        print(f"✅ Meta saved to {meta_out}")

    # --- Evaluation --- #
    val_acc = None
    if do_validation:
        evaluate_model(model, val_loader, label_encoder)
        val_acc = _validation_accuracy(model, val_loader)

    if return_val_accuracy:
        return {"accuracy": val_acc}


# Entry point: train with a validation split on the prepared daily candles.
# `df_label` is not created in this cell; it must already exist in the kernel
# (it is assigned by the cell that reads string_sequence.json).
if __name__ == "__main__":
    train_model(
        data_csv="/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles_prop.csv",
        labels_csv=to_address(df_label),
        do_validation=True,
    )


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | lstm    | LSTM             | 72.7 K | train
1 | fc      | Linear           | 129    | train
2 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
72.8 K    Trainable params
0         Non-trainable params
72.8 K    Total params
0.291     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n_steps=10). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=50` reached.



✅ Model saved to models/saved_models/lstm_model_seq_20250830_184140.pt
✅ Meta saved to models/saved_models/lstm_meta_seq_20250830_184140.pkl

📊 Validation Report:
              precision    recall  f1-score   support

           s       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

Confusion Matrix:
[[1]]


