In [None]:
import pandas as pd
label = pd.read_csv("/home/iatell/projects/meta-learning/data/seq_line_labels.csv")
label["seq_len"] = label["endIndex"] - label["startIndex"]
label

In [2]:

import pandas as pd
df = pd.read_csv("/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles_prop.csv")
df

Unnamed: 0,timestamp,open,high,low,close,volume,upper_shadow,body,lower_shadow,Candle_Color,upper_body_ratio,lower_body_ratio,upper_lower_body_ratio
0,2018-01-01,13707.91,13818.55,12750.00,13380.00,8607.15640,0.076003,-0.225254,0.432772,1,0.337410,1.921259,0.175619
1,2018-01-02,13382.16,15473.49,12890.02,14675.11,20078.16540,0.540071,0.874627,0.332912,2,0.617487,0.380633,1.622262
2,2018-01-03,14690.00,15307.56,14150.00,14919.51,15905.48210,0.263644,0.155931,0.366880,2,1.690776,2.352839,0.718611
3,2018-01-04,14919.51,15280.00,13918.04,15059.54,25224.41500,0.150006,0.095280,0.681423,2,1.574377,5.000000,0.220136
4,2018-01-05,15059.56,17176.24,14600.00,16960.39,23251.35200,0.144690,1.274181,0.308056,2,0.113556,0.241768,0.469688
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599,2022-05-19,28715.33,30545.18,28691.38,30319.23,67877.36415,0.109006,0.773779,0.011554,2,0.140875,0.014932,5.000000
1600,2022-05-20,30319.22,30777.33,28730.00,29201.01,60517.25325,0.221063,-0.539597,0.227288,1,0.409682,0.421218,0.972612
1601,2022-05-21,29201.01,29656.18,28947.28,29445.06,20987.13124,0.103235,0.119338,0.124071,2,0.865069,1.039664,0.832066
1602,2022-05-22,29445.07,30487.99,29255.11,30293.94,36158.98748,0.095648,0.418411,0.093632,2,0.228598,0.223780,1.021531


# model


## Hungarian

### Hungarian lstm

In [1]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence
from scipy.optimize import linear_sum_assignment


class LSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        # Predict max_len_y candidate values
        self.fc_reg = nn.Linear(hidden_dim, max_len_y)
        self.lr = lr

        self.loss_fn_reg = nn.MSELoss(reduction="none")  # per-element loss

    def forward(self, x, lengths):
        x = x["main"]
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        last_h = hn[-1]

        y_pred = self.fc_reg(last_h)  # (B, max_len_y)
        return y_pred

    def hungarian_loss(self, y_pred, y_true, mask):
        """
        Hungarian matching loss.
        y_pred: (B, max_len_y)
        y_true: (B, max_len_y)
        mask:   (B, max_len_y), 1 where valid target, 0 where padding
        """
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            gt_vals = y_true[i][mask[i] > 0]  # (L,)
            preds = y_pred[i]                 # (max_len_y,)

            if len(gt_vals) == 0:
                continue

            # Build cost matrix (L x max_len_y)
            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)  # squared error
            cost = cost.detach().cpu().numpy()

            # Hungarian assignment
            row_ind, col_ind = linear_sum_assignment(cost)

            # Compute loss only for assigned pairs
            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]
            loss = self.loss_fn_reg(matched_preds, matched_gts).sum()

            total_loss += loss
            total_count += len(gt_vals)

        return total_loss / max(total_count, 1)

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)

        mask = (y != 0).float()  # assume padding = 0

        # Hungarian matching loss
        loss_reg = self.hungarian_loss(y_pred, y, mask)

        self.log("train_loss", loss_reg, prog_bar=True)
        return loss_reg

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


### Hungarian lstm order weightening

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence
from scipy.optimize import linear_sum_assignment


class LSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        # Predict max_len_y candidate values
        self.fc_reg = nn.Linear(hidden_dim, max_len_y)
        self.lr = lr

        self.loss_fn_reg = nn.MSELoss(reduction="none")  # per-element loss

    def forward(self, x, lengths):
        x = x["main"]
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        last_h = hn[-1]

        y_pred = self.fc_reg(last_h)  # (B, max_len_y)
        return y_pred

    def hungarian_loss(self, y_pred, y_true, mask):
        """
        Hungarian matching loss with position-based weighting.
        Earlier ground-truth positions in y_true get higher weight.
        """
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            # Extract ground-truth values and their positions
            gt_vals = y_true[i][mask[i] > 0]  # (L,)
            gt_indices = torch.nonzero(mask[i] > 0, as_tuple=False).squeeze(1)  # positions in y_true

            preds = y_pred[i]  # (max_len_y,)

            if len(gt_vals) == 0:
                continue

            # Cost matrix (L x max_len_y) using squared error
            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()

            # Hungarian assignment
            row_ind, col_ind = linear_sum_assignment(cost)

            # Matched pairs
            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            # --- weighting logic ---
            # Lower index = higher weight (inverse rank)
            gt_pos = gt_indices[row_ind]  # actual positions of matched gts
            weights = 1.0 / (1.0 + gt_pos.float())  # e.g. pos=0 -> 1.0, pos=2 -> 0.33

            # Compute weighted MSE
            loss = (weights * self.loss_fn_reg(matched_preds, matched_gts)).sum()

            total_loss += loss
            total_count += weights.sum().item()

        return total_loss / max(total_count, 1.0)
    
    def hungarian_loss_unweighted(self, y_pred, y_true, mask):
        """
        Same Hungarian matching but without weights (baseline).
        """
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            gt_vals = y_true[i][mask[i] > 0]
            preds = y_pred[i]

            if len(gt_vals) == 0:
                continue

            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()

            row_ind, col_ind = linear_sum_assignment(cost)

            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            loss = self.loss_fn_reg(matched_preds, matched_gts).sum()

            total_loss += loss
            total_count += len(gt_vals)

        return total_loss / max(total_count, 1)
    
    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)

        mask = (y != 0).float()  # assume padding = 0

        # Hungarian matching loss (weighted)
        loss_reg = self.hungarian_loss(y_pred, y, mask)

        # Log both weighted and unweighted (for comparison/debug)
        unweighted_loss = self.hungarian_loss_unweighted(y_pred, y, mask)

        self.log("train_loss", loss_reg, prog_bar=True)              # weighted
        self.log("train_loss_unweighted", unweighted_loss)           # reference
        return loss_reg

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


### CNN -attention lstm hungarian - concatination

In [None]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment

# --- Sinusoidal positional encoding ---
class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (B, T, d_model)
        return x + self.pe[:, :x.size(1), :]



class LearnablePositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # nn.Embedding is a perfect layer for this.
        # It's a lookup table that stores embeddings of a fixed size.
        self.embedding = nn.Embedding(max_len, d_model)

    def forward(self, x):
        # x: (B, T, d_model)
        batch_size, seq_len, _ = x.shape
        
        # Create a tensor of positions [0, 1, 2, ..., T-1]
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0) # (1, T)
        
        # Look up the embeddings for these positions
        positional_encodings = self.embedding(positions) # (1, T, d_model)
        
        # Add to the input tensor
        return x + positional_encodings

class RotaryPositionalEncoding(nn.Module):
    def __init__(self, dim, base=10000):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            self.seq_len_cached = seq_len
            t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1)
            self.cos_cached = emb.cos()[:, None, None, :]
            self.sin_cached = emb.sin()[:, None, None, :]
        return self.cos_cached, self.sin_cached

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=x1.ndim - 1)

def apply_rotary_pos_emb(q, k, cos, sin):
    # q, k have shape (B, H, T, head_dim)
    # cos, sin have shape (T, 1, 1, head_dim)
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)

# --- How to use it in your Transformer ---
# self.rope = RotaryPositionalEncoding(dim=head_dim)
#
# def forward(self, x):
#     q, k, v = self.to_qkv(x)
#     cos, sin = self.rope(q)
#     q, k = apply_rotary_pos_emb(q, k, cos, sin)
#     # ... proceed with attention calculation using the new q and k

# --- CNN + Transformer Regressor ---
class CNNAttentionTransformerRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, nhead=4, lr=0.001):
        super().__init__()
        self.save_hyperparameters()

        # --- Multi-branch CNN ---
        self.branches = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=input_dim, out_channels=32, kernel_size=k, padding="same"),
                nn.BatchNorm1d(32),
                nn.ReLU(),
                nn.Dropout(0.3)
            ) for k in [3, 5, 7, 11]
        ])

        # --- Conv2d fusion ---
        self.fusion_conv2d = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=1, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(1),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # --- Positional encoding ---
        self.positional_encoding = SinusoidalPositionalEncoding(d_model=32)

        # --- Transformer encoder ---
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=32,
            nhead=nhead,
            dim_feedforward=hidden_dim,
            dropout=0.3,
            activation="relu",
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # --- Regressor ---
        self.regressor = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(32, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, max_len_y)
        )

        self.loss_fn_reg = nn.MSELoss(reduction="none")
        self.lr = lr

    # --- Forward ---
    def forward(self, x, lengths):
        # x["main"]: (B, T, input_dim)
        x = x["main"].transpose(1, 2)  # (B, input_dim, T)

        # Multi-branch CNN
        branch_outputs = [branch(x) for branch in self.branches]  # list of (B, 32, T)
        stacked = torch.stack(branch_outputs, dim=1)               # (B, 4, 32, T)

        # Conv2d fusion
        fused = self.fusion_conv2d(stacked)                        # (B, 1, 32, T)
        fused = fused.squeeze(1)                                   # (B, 32, T)
        fused = fused.transpose(1, 2)                               # (B, T, 32)

        # Positional encoding
        fused = self.positional_encoding(fused)                    # (B, T, 32)

        # Padding mask for transformer
        max_len = fused.size(1)
        mask = torch.arange(max_len, device=lengths.device)[None, :] >= lengths[:, None]  # True=masked

        # Transformer encoder
        transformer_out = self.transformer(fused, src_key_padding_mask=mask)  # (B, T, 32)

        # Masked mean pooling over sequence
        seq_mask = ~mask
        pooled = (transformer_out * seq_mask.unsqueeze(-1)).sum(1) / seq_mask.sum(1, keepdim=True)  # (B, 32)

        # Regression
        y_pred = self.regressor(pooled)  # (B, max_len_y)
        return y_pred

    # --- Hungarian weighted loss ---
    def hungarian_loss(self, y_pred, y_true, mask):
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            gt_vals = y_true[i][mask[i] > 0]
            gt_indices = torch.nonzero(mask[i] > 0, as_tuple=False).squeeze(1)
            preds = y_pred[i]

            if len(gt_vals) == 0:
                continue

            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()

            row_ind, col_ind = linear_sum_assignment(cost)
            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            gt_pos = gt_indices[row_ind]
            weights = 1.0 / (1.0 + gt_pos.float())

            loss = (weights * self.loss_fn_reg(matched_preds, matched_gts)).sum()
            total_loss += loss
            total_count += weights.sum().item()

        return total_loss / max(total_count, 1.0)

    # --- Hungarian unweighted loss ---
    def hungarian_loss_unweighted(self, y_pred, y_true, mask):
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            gt_vals = y_true[i][mask[i] > 0]
            preds = y_pred[i]
            if len(gt_vals) == 0:
                continue

            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()
            row_ind, col_ind = linear_sum_assignment(cost)
            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            loss = self.loss_fn_reg(matched_preds, matched_gts).sum()
            total_loss += loss
            total_count += len(gt_vals)

        return total_loss / max(total_count, 1)

    # --- Training step ---
    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)

        mask = (y != 0).float()  # assume padding = 0
        loss_reg = self.hungarian_loss(y_pred, y, mask)
        unweighted_loss = self.hungarian_loss_unweighted(y_pred, y, mask)

        self.log("train_loss", loss_reg, prog_bar=True)
        self.log("train_loss_unweighted", unweighted_loss)
        return loss_reg

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


depricated

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from scipy.optimize import linear_sum_assignment
import torch.nn.functional as F
from utils.load_attention import  load_attention
from importlib import import_module

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
        self.v_context = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, lstm_outputs, lengths):
        energy = torch.tanh(self.attn_layer(lstm_outputs))
        attn_scores = self.v_context(energy).squeeze(2)
        mask = torch.arange(
            lstm_outputs.size(1), device=lstm_outputs.device
        )[None, :] < lengths[:, None]
        attn_scores = attn_scores.masked_fill(mask == 0, -1e10)
        attn_weights = F.softmax(attn_scores, dim=1)
        context_vector = torch.bmm(attn_weights.unsqueeze(1), lstm_outputs).squeeze(1)
        return context_vector


class CNNAttentionLSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001):
        super().__init__()
        self.save_hyperparameters()

        # Multi-branch 1D convolutions
        self.branches = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=input_dim, out_channels=32, kernel_size=k, padding="same"),
                nn.BatchNorm1d(32),
                nn.ReLU(),
                nn.Dropout(0.3)
            ) for k in [3, 5, 7, 11]
        ])

        # Fusion with Conv2d over (branches × seq)
        self.fusion_conv2d = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=1, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(1),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # LSTM takes feature_dim = 32 after fusion
        self.lstm = nn.LSTM(
            input_size=32,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.3 if num_layers > 1 else 0.0
        )

        self.attention = Attention(hidden_dim)

        self.regressor = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, max_len_y)
        )

        self.loss_fn_reg = nn.MSELoss(reduction="none")
        self.lr = lr

    def forward(self, x, lengths):
        # Input: x["main"] → (B, T, input_dim)
        x = x["main"].transpose(1, 2)  # (B, input_dim, T)

        # Branch outputs
        branch_outputs = [branch(x) for branch in self.branches]  # list of (B, 32, T)
        stacked = torch.stack(branch_outputs, dim=1)  # (B, 4, 32, T)

        # Fusion conv2d
        fused = self.fusion_conv2d(stacked)  # (B, 1, 32, T)
        fused = fused.squeeze(1)             # (B, 32, T)

        # Prepare for LSTM
        lstm_input = fused.transpose(1, 2)   # (B, T, 32)

        # LSTM with packing
        packed_input = pack_padded_sequence(lstm_input, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        lstm_outputs, _ = pad_packed_sequence(packed_output, batch_first=True)  # (B, T, H)

        # Attention
        context_vector = self.attention(lstm_outputs, lengths)  # (B, H)

        # Regression
        y_pred = self.regressor(context_vector)  # (B, max_len_y)
        return y_pred

    def hungarian_loss(self, y_pred, y_true, mask):
        """
        Hungarian matching loss with position-based weighting.
        Earlier ground-truth positions in y_true get higher weight.
        """
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            # Extract ground-truth values and their positions
            gt_vals = y_true[i][mask[i] > 0]  # (L,)
            gt_indices = torch.nonzero(mask[i] > 0, as_tuple=False).squeeze(1)  # positions in y_true

            preds = y_pred[i]  # (max_len_y,)

            if len(gt_vals) == 0:
                continue

            # Cost matrix (L x max_len_y) using squared error
            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()

            # Hungarian assignment
            row_ind, col_ind = linear_sum_assignment(cost)

            # Matched pairs
            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            # --- weighting logic ---
            # Lower index = higher weight (inverse rank)
            gt_pos = gt_indices[row_ind]  # actual positions of matched gts
            weights = 1.0 / (1.0 + gt_pos.float())  # e.g. pos=0 -> 1.0, pos=2 -> 0.33

            # Compute weighted MSE
            loss = (weights * self.loss_fn_reg(matched_preds, matched_gts)).sum()

            total_loss += loss
            total_count += weights.sum().item()

        return total_loss / max(total_count, 1.0)
    
    def hungarian_loss_unweighted(self, y_pred, y_true, mask):
        """
        Same Hungarian matching but without weights (baseline).
        """
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            gt_vals = y_true[i][mask[i] > 0]
            preds = y_pred[i]

            if len(gt_vals) == 0:
                continue

            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()

            row_ind, col_ind = linear_sum_assignment(cost)

            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            loss = self.loss_fn_reg(matched_preds, matched_gts).sum()

            total_loss += loss
            total_count += len(gt_vals)

        return total_loss / max(total_count, 1)
    
    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)

        mask = (y != 0).float()  # assume padding = 0

        # Hungarian matching loss (weighted)
        loss_reg = self.hungarian_loss(y_pred, y, mask)

        # Log both weighted and unweighted (for comparison/debug)
        unweighted_loss = self.hungarian_loss_unweighted(y_pred, y, mask)

        self.log("train_loss", loss_reg, prog_bar=True)              # weighted
        self.log("train_loss_unweighted", unweighted_loss)           # reference
        return loss_reg

    def configure_optimizers(self):
        # Import optimizer dynamically
        opt_module = import_module(f"model.optimizer.{self.optimizer_name}")
        optimizer = opt_module.build(self, self.lr)

        # No scheduler
        if self.scheduler_name is None:
            return optimizer

        # Import scheduler dynamically
        sch_module = import_module(f"model.schedulers.{self.scheduler_name}")
        # OneCycle needs trainer
        if self.scheduler_name == "onecycle":
            scheduler = sch_module.build(optimizer, self.lr, self.trainer)
        else:
            scheduler = sch_module.build(optimizer)

        # Lightning accepts dict or list depending on scheduler type
        if isinstance(scheduler, dict):
            return {"optimizer": optimizer, "lr_scheduler": scheduler}
        elif isinstance(scheduler, torch.optim.lr_scheduler._LRScheduler):
            return [optimizer], [scheduler]
        else:
            raise ValueError(f"Unsupported scheduler return type: {type(scheduler)}")



### CNN simple attention lstm weightening

In [2]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from scipy.optimize import linear_sum_assignment
import torch.nn.functional as F
from utils.load_attention import  load_attention
from utils.load_class import load_class
from importlib import import_module
from models.losses.hungarian_loss import hungarian_loss_weighted
from models.losses.hungarian_loss_unweighted import hungarian_loss_unweighted

class CNNAttentionLSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001, attention_name="tanh_attention",optimizer_name="adamw",kernels= [3, 5, 7, 11],
    cnn_out_channels=32,first_drop= 0.3, second_drop=0.3, third_drop= 0.3,scheduler_name=None, scheduler_params=None, optimizer_params=None):
        super().__init__()
        self.save_hyperparameters()
        self.optimizer_name = optimizer_name
        self.scheduler_name = scheduler_name or None
        self.optimizer_params = optimizer_params or {}
        self.scheduler_params = scheduler_params or {}
        self.kernels = kernels
        # Multi-branch 1D convolutions
        self.branches = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=input_dim["candle_shape"], out_channels=cnn_out_channels, kernel_size=k, padding="same"),
                nn.BatchNorm1d(cnn_out_channels),
                nn.ReLU(),
                nn.Dropout(0.3)
            ) for k in self.kernels
        ])
        # Fusion with Conv2d over (branches × seq)
        self.fusion_conv2d = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=1, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(1),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # LSTM takes feature_dim = 32 after fusion
        self.lstm = nn.LSTM(
            input_size=cnn_out_channels,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.3 if num_layers > 1 else 0.0
        )
        self.attention = load_class(f"models.attention.{attention_name}", hidden_dim=hidden_dim)
        self.regressor = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, max_len_y)
        )

        self.loss_fn_reg = nn.MSELoss(reduction="none")
        self.lr = lr
        self.hungarian_loss = hungarian_loss_weighted
        self.hungarian_loss_unweighted = hungarian_loss_unweighted
        
    def forward(self, x, lengths):
        # Input: x["main"] → (B, T, input_dim)
        x = x["main"].transpose(1, 2)  # (B, input_dim, T)
        # Branch outputs
        branch_outputs = [branch(x) for branch in self.branches]  # list of (B, 32, T)
        stacked = torch.stack(branch_outputs, dim=1)  # (B, 4, 32, T)
        # Fusion conv2d
        fused = self.fusion_conv2d(stacked)  # (B, 1, 32, T)
        fused = fused.squeeze(1)             # (B, 32, T)
        # Prepare for LSTM
        lstm_input = fused.transpose(1, 2)   # (B, T, 32)
        # LSTM with packing
        packed_input = pack_padded_sequence(lstm_input, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        lstm_outputs, _ = pad_packed_sequence(packed_output, batch_first=True)  # (B, T, H)
        # Attention
        context_vector = self.attention(lstm_outputs, lengths)  # (B, H)
        # Regression
        y_pred = self.regressor(context_vector)  # (B, max_len_y)
        return y_pred

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)

        mask = (y != 0).float()  # assume padding = 0

        # Hungarian matching loss (weighted)
        loss_reg = self.hungarian_loss(y_pred, y, mask)

        # Log both weighted and unweighted (for comparison/debug)
        unweighted_loss = self.hungarian_loss_unweighted(y_pred, y, mask)

        self.log("train_loss", loss_reg, prog_bar=True)              # weighted
        self.log("train_loss_unweighted", unweighted_loss)           # reference
        return loss_reg

    def validation_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)
        mask = (y != 0).float()

        loss_reg = self.hungarian_loss(y_pred, y, mask)
        self.log("val_loss", loss_reg, on_step=False, on_epoch=True, prog_bar=True)
        return loss_reg

    def configure_optimizers(self):
        # Import optimizer dynamically
        opt_module = import_module(f"models.optimizer.{self.optimizer_name}")
        optimizer = opt_module.build(self, self.lr)

        # No scheduler
        if self.scheduler_name is None:
            return optimizer

        # Import scheduler dynamically
        sch_module = import_module(f"models.schedulers.{self.scheduler_name}")
        # OneCycle needs trainer
        if self.scheduler_name == "onecycle":
            scheduler = sch_module.build(optimizer, self.lr, self.trainer)
        else:
            scheduler = sch_module.build(optimizer)

        # Lightning accepts dict or list depending on scheduler type
        if isinstance(scheduler, dict):
            return {"optimizer": optimizer, "lr_scheduler": scheduler}
        elif isinstance(scheduler, torch.optim.lr_scheduler._LRScheduler):
            return [optimizer], [scheduler]
        else:
            raise ValueError(f"Unsupported scheduler return type: {type(scheduler)}")




In [5]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from scipy.optimize import linear_sum_assignment
import torch.nn.functional as F
from utils.load_attention import load_attention
from utils.load_class import load_class
from importlib import import_module
from models.losses.hungarian_loss import hungarian_loss_weighted
from models.losses.hungarian_loss_unweighted import hungarian_loss_unweighted
from typing import Optional

class CNNAttentionLSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001, attention_name="tanh_attention",optimizer_name="adamw",kernels= [3, 5, 7, 11],fusion_out_channels = 10,
    cnn_out_channels=32,first_drop= 0.3, second_drop=0.3, third_drop= 0.3,scheduler_name=None, scheduler_params=None, optimizer_params=None):
        super().__init__()
        self.save_hyperparameters()
        self.optimizer_name = optimizer_name
        self.scheduler_name = scheduler_name or None
        self.optimizer_params = optimizer_params or {}
        self.scheduler_params = scheduler_params or {}
        self.input_dim = input_dim["candle_shape"]
        self.cnn_out_channels = cnn_out_channels
        self.kernels = kernels
        self.num_branches = len(kernels)
        self.fusion_out_channels = fusion_out_channels
        self.main_feat_dim = input_dim['main']
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.max_len_y = max_len_y
        self.loss_fn_reg = nn.MSELoss(reduction="none")
        self.lr = lr
        self.attention = load_class(f"models.attention.{attention_name}", hidden_dim=hidden_dim)
        self.hungarian_loss = hungarian_loss_weighted
        self.hungarian_loss_unweighted = hungarian_loss_unweighted
        # ----- Branches: multiple Conv1d with different kernel sizes -----
        branches = []
        for k in kernels:
            pad = (k - 1) // 2  # 'same' padding for odd kernels; for even kernels behavior approximated
            branches.append(
                nn.Sequential(
                    nn.Conv1d(in_channels=self.input_dim , out_channels=cnn_out_channels, kernel_size=k, padding=pad),
                    nn.BatchNorm1d(cnn_out_channels),
                    nn.ReLU(inplace=True),
                    nn.Dropout(first_drop)
                )
            )
        self.branches = nn.ModuleList(branches)

        # ----- Fusion Conv2d: we will stack branch outputs into shape (B, num_branches, C, T)
        # in_channels should equal number of branches.
        # Kernel height must be cnn_out_channels to cover "full feature height".
        self.fusion_conv2d = nn.Sequential(
            nn.Conv2d(
                in_channels=self.num_branches,
                out_channels=self.fusion_out_channels,
                kernel_size=(self.cnn_out_channels, 1),
                padding=(0, 0)
            ),
            nn.BatchNorm2d(self.fusion_out_channels),
            nn.ReLU(inplace=True),
            nn.Dropout(second_drop)
        )

        # After fusion we will have (B, fusion_out_channels, 1, T) -> squeeze -> (B, fusion_out_channels, T)

        # ----- LSTM: input_size should be fusion_out_channels + main_feat_dim -----
        lstm_input_size = self.fusion_out_channels + self.main_feat_dim
        self.lstm = nn.LSTM(
            input_size=lstm_input_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=third_drop if num_layers > 1 else 0.0
        )

        # Regressor (maps attention context to target length)
        self.regressor = nn.Sequential(
            nn.Dropout(third_drop),
            nn.Linear(hidden_dim, max(4, hidden_dim // 2)),
            nn.ReLU(inplace=True),
            nn.Linear(max(4, hidden_dim // 2), max_len_y)
        )

    def forward(self, x: dict, lengths):
        """
        x is a dict with at least:
          - x["candle_shape"]: Tensor shape (B, input_dim, T)
          - x["main"]: Tensor shape (B, main_feat_dim, T)

        Returns:
          - out: (B, max_len_y)
          - optionally attention weights if you want them (we return both)
        """
        candle = x["candle_shape"]
        candle = candle.permute(0, 2, 1) 
        main = x["main"]
        main = main.permute(0, 2, 1)
        # ---- Validate shapes ----
        B, _, T = candle.shape
        assert main.shape[0] == B and main.shape[2] == T, \
            f"main must match batch and time dims, got {main.shape} vs candle {candle.shape}"
        # ---- Branches: each branch returns (B, C, T) ----
        branch_feats = [branch(candle) for branch in self.branches]  # list of (B, C, T)
        # stack into (B, num_branches, C, T)
        stacked = torch.stack(branch_feats, dim=1)

        # ---- Fusion Conv2d expects (B, in_channels=num_branches, height=C, width=T) ----
        fused = self.fusion_conv2d(stacked)  # -> (B, fusion_out_channels, 1, T)
        fused = fused.squeeze(2)  # -> (B, fusion_out_channels, T)

        # ---- Concatenate with main features along channel dimension -> (B, fusion_out + m, T) ----
        combined = torch.cat([fused, main], dim=1)

        # ---- Prepare for LSTM: LSTM batch_first expects (B, T, feat) ----
        combined_t = combined.permute(0, 2, 1)  # (B, T, feat_dim)
        packed_input = pack_padded_sequence(combined_t, lengths.cpu(), batch_first=True, enforce_sorted=False)
        # LSTM
        packed_output, _ = self.lstm(packed_input)  # lstm_out: (B, T, hidden_dim)
        lstm_outputs, _ = pad_packed_sequence(packed_output, batch_first=True)
        # Attention over LSTM outputs
        context= self.attention(lstm_outputs,lengths)  # context: (B, hidden_dim)

        # Regressor -> (B, max_len_y)
        y_pred = self.regressor(context)

        return y_pred

    # ---------------------------
    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)

        mask = (y != 0).float()  # assume padding = 0

        # Hungarian matching loss (weighted)
        loss_reg = self.hungarian_loss(y_pred, y, mask)

        # Log both weighted and unweighted (for comparison/debug)
        unweighted_loss = self.hungarian_loss_unweighted(y_pred, y, mask)

        self.log("train_loss", loss_reg, prog_bar=True)              # weighted
        self.log("train_loss_unweighted", unweighted_loss)           # reference
        return loss_reg

    def validation_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)
        mask = (y != 0).float()

        loss_reg = self.hungarian_loss(y_pred, y, mask)
        self.log("val_loss", loss_reg, on_step=False, on_epoch=True, prog_bar=True)
        return loss_reg

    def configure_optimizers(self):
        # Import optimizer dynamically
        opt_module = import_module(f"models.optimizer.{self.optimizer_name}")
        optimizer = opt_module.build(self, self.lr)

        # No scheduler
        if self.scheduler_name is None:
            return optimizer

        # Import scheduler dynamically
        sch_module = import_module(f"models.schedulers.{self.scheduler_name}")
        # OneCycle needs trainer
        if self.scheduler_name == "onecycle":
            scheduler = sch_module.build(optimizer, self.lr, self.trainer)
        else:
            scheduler = sch_module.build(optimizer)

        # Lightning accepts dict or list depending on scheduler type
        if isinstance(scheduler, dict):
            return {"optimizer": optimizer, "lr_scheduler": scheduler}
        elif isinstance(scheduler, torch.optim.lr_scheduler._LRScheduler):
            return [optimizer], [scheduler]
        else:
            raise ValueError(f"Unsupported scheduler return type: {type(scheduler)}")




### CNN LSTM Hungarian weightening

In [1]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from scipy.optimize import linear_sum_assignment


class CNNLSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001):
        super().__init__()
        self.save_hyperparameters()

        # Conv1d branches
        self.branches = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(in_channels=input_dim, out_channels=32, kernel_size=k, padding=k // 2),
                nn.BatchNorm1d(32),      # normalize per branch
                nn.ReLU(),
                nn.Dropout(0.3)          # regularize per branch
            )
            for k in [1, 3, 7, 10]
        ])

        # Conv2d fuse across branch dimension
        self.conv2d = nn.Conv2d(
            in_channels=4, out_channels=1, kernel_size=(1, 3), padding=(0, 1)
        )
        self.bn2d = nn.BatchNorm2d(1)   # normalize conv2d output
        self.dropout2d = nn.Dropout(0.3)

        self.lstm_input_dim = 32  # after conv2d → (B, 32, T)
        self.lstm = nn.LSTM(
            input_size=self.lstm_input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.3 if num_layers > 1 else 0.0   # built-in LSTM dropout
        )

        self.dropout_fc = nn.Dropout(0.3)
        self.fc_reg = nn.Linear(hidden_dim, max_len_y)

        self.lr = lr
        self.loss_fn_reg = nn.MSELoss(reduction="none")

    def forward(self, x, lengths):
        x = x["main"]  # (B, T, F)
        B, T, F = x.shape

        # Conv1d branches
        feats = [branch(x.transpose(1, 2)) for branch in self.branches]  # (B, 32, T) each
        fusion = torch.stack(feats, dim=1)                               # (B, 4, 32, T)

        # Conv2d fusion
        fusion2d = self.conv2d(fusion)                                   # (B, 1, 32, T)
        fusion2d = self.bn2d(fusion2d)
        fusion2d = F.relu(fusion2d)
        fusion2d = self.dropout2d(fusion2d)
        fusion2d = fusion2d.squeeze(1)                                   # (B, 32, T)

        # LSTM
        fusion2d = fusion2d.transpose(1, 2)                              # (B, T, 32)
        packed = pack_padded_sequence(fusion2d, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        last_h = hn[-1]

        # Fully connected with dropout
        last_h = self.dropout_fc(last_h)
        y_pred = self.fc_reg(last_h)  # (B, max_len_y)
        return y_pred



    # ------------------- Hungarian Losses (same as your code) -------------------
    def hungarian_loss(self, y_pred, y_true, mask):
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            gt_vals = y_true[i][mask[i] > 0]
            gt_indices = torch.nonzero(mask[i] > 0, as_tuple=False).squeeze(1)

            preds = y_pred[i]

            if len(gt_vals) == 0:
                continue

            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()

            row_ind, col_ind = linear_sum_assignment(cost)

            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            gt_pos = gt_indices[row_ind]
            weights = 1.0 / (1.0 + gt_pos.float())

            loss = (weights * self.loss_fn_reg(matched_preds, matched_gts)).sum()

            total_loss += loss
            total_count += weights.sum().item()

        return total_loss / max(total_count, 1.0)

    def hungarian_loss_unweighted(self, y_pred, y_true, mask):
        B, max_len = y_true.shape
        total_loss = 0.0
        total_count = 0

        for i in range(B):
            gt_vals = y_true[i][mask[i] > 0]
            preds = y_pred[i]

            if len(gt_vals) == 0:
                continue

            cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
            cost = cost.detach().cpu().numpy()

            row_ind, col_ind = linear_sum_assignment(cost)

            matched_preds = preds[col_ind]
            matched_gts = gt_vals[row_ind]

            loss = self.loss_fn_reg(matched_preds, matched_gts).sum()

            total_loss += loss
            total_count += len(gt_vals)

        return total_loss / max(total_count, 1)

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred = self(X, lengths)
        mask = (y != 0).float()

        loss_reg = self.hungarian_loss(y_pred, y, mask)
        unweighted_loss = self.hungarian_loss_unweighted(y_pred, y, mask)

        self.log("train_loss", loss_reg, prog_bar=True)
        self.log("train_loss_unweighted", unweighted_loss)
        return loss_reg

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


### transformer Hungarian

In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Injects position information into the input sequence."""
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (Sequence Length, Batch Size, Feature Dim)
        x = x + self.pe[:x.size(0)]
        return x

class TransformerRegressor(pl.LightningModule):
    def __init__(self, input_dim, model_dim, num_heads, num_encoder_layers, max_len_y, lr=0.001):
        super().__init__()
        self.save_hyperparameters()

        # 1. CNN Feature Extractor (same as before)
        self.branches = nn.ModuleList([...]) # Your Conv1D branches
        self.fusion_conv = nn.Sequential([...]) # Your 1x1 fusion conv
        
        # We need to ensure the output dim of fusion_conv matches model_dim
        # Let's assume fusion_conv outputs `model_dim` channels

        # 2. Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model=model_dim)

        # 3. Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, 
            nhead=num_heads, 
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # 4. Final Regressor Head
        self.regressor = nn.Linear(model_dim, max_len_y)

    def forward(self, x, lengths):
        x = x["main"].transpose(1, 2)
        
        # --- CNN Part ---
        branch_outputs = [branch(x) for branch in self.branches]
        fused_features = torch.cat(branch_outputs, dim=1)
        fused_features = self.fusion_conv(fused_features) # (B, model_dim, T)
        
        # --- Transformer Part ---
        # Reshape for Transformer: (B, T, F)
        transformer_input = fused_features.transpose(1, 2)
        
        # Add positional encoding
        # Note: Pytorch's default Transformer expects (T, B, F) or batch_first=True
        # We used batch_first=True, so shape is (B, T, F)
        transformer_input = self.pos_encoder(transformer_input.transpose(0, 1)).transpose(0, 1)

        # Create padding mask for the Transformer
        # (B, T) -> True for positions that should be ignored
        padding_mask = (torch.arange(x.size(2), device=x.device)[None, :] >= lengths[:, None])

        # Pass through Transformer Encoder
        transformer_output = self.transformer_encoder(transformer_input, src_key_padding_mask=padding_mask) # (B, T, F)

        # Aggregate the output sequence into a single vector for prediction.
        # Simple averaging is a common and effective method.
        aggregated_output = transformer_output.mean(dim=1)
        
        y_pred = self.regressor(aggregated_output)
        return y_pred

    # ... (training_step, loss functions, etc. would remain the same) ...

## two head lstm

In [14]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence

class LSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001, threshold=0.5):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        # Main regression output: predict all linePrices up to max_len_y
        self.fc_reg = nn.Linear(hidden_dim, max_len_y)
        # Length prediction branch: logits per possible line (max_len_y)
        self.fc_len = nn.Linear(hidden_dim, max_len_y)
        self.lr = lr
        self.threshold = threshold

        self.loss_fn_reg = nn.MSELoss(reduction="none")  # we'll mask padded values
        self.loss_fn_len = nn.BCEWithLogitsLoss()        # treat as multi-label classification

    def forward(self, x, lengths):
        x = x["main"] 
        print("x",x)
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        last_h = hn[-1]

        y_pred = self.fc_reg(last_h)      # regression outputs
        len_logits = self.fc_len(last_h)  # logits per possible line
        return y_pred, len_logits

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred, len_logits = self(X, lengths)

        # --- Regression loss with masking ---
        mask = (y != 0).float()  # assume padding = 0
        loss_reg = (self.loss_fn_reg(y_pred, y) * mask).sum() / mask.sum()

        # --- Length loss ---
        target_lengths = torch.zeros_like(len_logits, dtype=torch.float32)
        for i, l in enumerate(lengths):
            target_lengths[i, :l] = 1.0   # first l positions are 1, rest are 0

        loss_len = self.loss_fn_len(len_logits, target_lengths)

        loss = loss_reg + 0.1 * loss_len
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_loss_reg", loss_reg, prog_bar=True)
        self.log("train_loss_len", loss_len, prog_bar=True)

        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def predict_length(self, len_logits):
        """
        Convert logits to predicted number of lines using threshold.
        """
        probs = torch.sigmoid(len_logits)
        pred_len = (probs > self.threshold).sum(dim=1)
        return pred_len


## two head lstm greedy match

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence

def match_and_loss(y_pred, y_true, mask, loss_fn):
    """
    y_pred: (B, max_len_y)
    y_true: (B, max_len_y)
    mask: (B, max_len_y)  1 if real, 0 if padding
    loss_fn: pointwise loss, e.g. MSELoss(reduction="none")
    for each target find closest line
    """
    B, max_len = y_true.shape
    total_loss = 0.0
    total_count = 0

    for i in range(B):
        gt_vals = y_true[i][mask[i] > 0]  # real targets
        preds = y_pred[i]

        if len(gt_vals) == 0:
            continue

        # greedy matching: for each gt, find closest prediction
        used = set()
        for gt in gt_vals:
            dists = torch.abs(preds - gt)
            for u in used:
                dists[u] = float("inf")  # prevent reuse
            j = torch.argmin(dists)     # index of closest prediction
            used.add(j.item())

            total_loss += loss_fn(preds[j], gt)
            total_count += 1

    return total_loss / max(total_count, 1)

class LSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001, threshold=0.5):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        # Main regression output: predict all linePrices up to max_len_y
        self.fc_reg = nn.Linear(hidden_dim, max_len_y)
        # Length prediction branch: logits per possible line (max_len_y)
        self.fc_len = nn.Linear(hidden_dim, max_len_y)
        self.lr = lr
        self.threshold = threshold

        self.loss_fn_reg = nn.MSELoss(reduction="none")  # we'll mask padded values
        self.loss_fn_len = nn.BCEWithLogitsLoss()        # treat as multi-label classification

    def forward(self, x, lengths):
        x = x["main"] 
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        last_h = hn[-1]

        y_pred = self.fc_reg(last_h)      # regression outputs
        len_logits = self.fc_len(last_h)  # logits per possible line
        return y_pred, len_logits

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred, len_logits = self(X, lengths)

        mask = (y != 0).float()

        # --- New greedy-matching regression loss ---
        loss_reg = match_and_loss(y_pred, y, mask, nn.MSELoss())

        # --- Length loss (unchanged) ---
        target_lengths = torch.zeros_like(len_logits, dtype=torch.float32)
        for i, l in enumerate(lengths):
            target_lengths[i, :l] = 1.0

        loss_len = self.loss_fn_len(len_logits, target_lengths)

        loss = loss_reg + 0.1 * loss_len
        self.log("train_loss", loss, prog_bar=True)
        return loss


    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def predict_length(self, len_logits):
        """
        Convert logits to predicted number of lines using threshold.
        """
        probs = torch.sigmoid(len_logits)
        pred_len = (probs > self.threshold).sum(dim=1)
        return pred_len


## two head lstm sum of logits loss

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence
from models.losses.two_head_logit_sum import sum_of_logits
class LSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001, threshold=0.5):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc_reg = nn.Linear(hidden_dim, max_len_y)
        self.fc_len = nn.Linear(hidden_dim, max_len_y)

        self.lr = lr
        self.threshold = threshold

        self.loss_fn_reg = nn.MSELoss(reduction="none")  # masked regression
        self.loss_fn_len = nn.BCEWithLogitsLoss()        # multi-label classification
        self.compute_loss = sum_of_logits()
        
    def forward(self, x, lengths):
        x = x["main"] 
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        last_h = hn[-1]

        y_pred = self.fc_reg(last_h)
        len_logits = self.fc_len(last_h)
        return y_pred, len_logits

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred, len_logits = self(X, lengths)

        loss, loss_reg, loss_len = self.compute_loss(y_pred, len_logits, y, lengths)
        self.log("train_loss", loss, prog_bar=True)
        self.log("loss_reg", loss_reg, prog_bar=False)
        self.log("loss_len", loss_len, prog_bar=False)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def predict_length(self, len_logits):
        """
        Convert logits to predicted number of lines using threshold.
        """
        probs = torch.sigmoid(len_logits)
        pred_len = (probs > self.threshold).sum(dim=1)
        return pred_len


## two head lstm soft thresholding

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.nn.utils.rnn import pack_padded_sequence
import torch.nn.functional as F
from models.losses.soft_thresholding_two_head import soft_thresholding_loss

class LSTMMultiRegressor(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim, num_layers, max_len_y, lr=0.001, threshold=0.5, k_soft=20.0):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc_reg = nn.Linear(hidden_dim, max_len_y)
        self.fc_len = nn.Linear(hidden_dim, max_len_y)

        self.lr = lr
        self.threshold = threshold
        self.k_soft = k_soft

        self.loss_fn_reg = nn.MSELoss(reduction="none")
        self.loss_fn_len = nn.BCEWithLogitsLoss()
        self.compute_loss = soft_thresholding_loss
        
    def forward(self, x, lengths):
        x = x["main"]
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hn, _) = self.lstm(packed)
        last_h = hn[-1]

        y_pred = self.fc_reg(last_h)
        len_logits = self.fc_len(last_h)
        return y_pred, len_logits

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        y_pred, len_logits = self(X, lengths)

        loss, loss_reg, loss_len = self.compute_loss(y_pred, len_logits, y, lengths)
        self.log("train_loss", loss, prog_bar=True)
        self.log("loss_reg", loss_reg, prog_bar=False)
        self.log("loss_len", loss_len, prog_bar=False)

        return loss


    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

    def predict_length(self, len_logits):
        """
        Convert logits to predicted number of lines using threshold.
        """
        probs = torch.sigmoid(len_logits)
        pred_len = (probs > self.threshold).sum(dim=1)
        return pred_len


## FNNCNN

In [None]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

def mdn_split_params(raw_params, n_components):
    """
    raw_params: (B, 3K) tensor from mdn_head
    returns:
        pi    (B, K) mixture weights
        mu    (B, K) means
        sigma (B, K) std devs
    """
    B, threeK = raw_params.shape
    assert threeK == 3 * n_components

    raw = raw_params.view(B, n_components, 3)

    pi = raw[..., 0]                 # (B,K)
    mu = raw[..., 1]                 # (B,K)
    sigma = raw[..., 2]              # (B,K)

    pi = F.softmax(pi, dim=-1)       # weights sum to 1
    sigma = F.softplus(sigma) + 1e-4 # strictly positive
    return pi, mu, sigma


def mdn_nll_multitarget(y_line, pi, mu, sigma):
    """
    Negative log-likelihood for MDN with multiple valid targets per sample.
    Args:
        y_line : (B, L) padded targets (0 where invalid)
        pi, mu, sigma : (B, K) MDN params
    Returns:
        scalar loss
    """
    B, K = mu.shape
    losses = []

    for b in range(B):
        valid_y = y_line[b][y_line[b] > 0]  # (M,)
        if len(valid_y) == 0:
            continue

        # expand to (M, K)
        y_exp = valid_y.unsqueeze(-1).expand(-1, K)

        log_prob = -0.5 * ((y_exp - mu[b]) / (sigma[b] + 1e-8))**2 \
                   - torch.log(sigma[b] + 1e-8) \
                   - 0.5 * torch.log(torch.tensor(2.0 * torch.pi, device=y_line.device))

        log_mix = torch.log(pi[b] + 1e-8) + log_prob
        log_sum = torch.logsumexp(log_mix, dim=-1)  # (M,)

        losses.append(-log_sum.mean())

    if len(losses) == 0:
        return torch.tensor(0.0, device=y_line.device, requires_grad=True)

    return torch.stack(losses).mean()


class CNNLSTM_MDN(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim=128, num_layers=1, hidden_features=64, out_features=32,
                 lr=1e-3, n_components=5, cnn_channels=64, dropout=0.1):
        super().__init__()
        self.save_hyperparameters()

        # Time-distributed feature extractor
        self.fc1 = nn.Linear(input_dim, hidden_features)
        self.ln1 = nn.LayerNorm(hidden_features) # ADDED: LayerNorm for time-step features
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.ln2 = nn.LayerNorm(out_features) # ADDED: LayerNorm

        # CNN feature extractors
        self.conv1 = nn.Conv1d(out_features, cnn_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(cnn_channels) # ADDED: BatchNorm for convolutional features
        self.conv3 = nn.Conv1d(out_features, cnn_channels, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(cnn_channels) # ADDED: BatchNorm

        # Learnable mixer for CNN outputs
        self.mixer = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=1, bias=True)

        # LSTM for temporal dependency
        fused_dim = cnn_channels # Input to LSTM is the mixed CNN output
        self.lstm = nn.LSTM(fused_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)

        # MDN Head
        self.mdn_head = nn.Linear(hidden_dim, 3 * n_components)
        self.n_components = n_components
        self.lr = lr

        # Apply weight initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, (nn.Conv1d, nn.Conv2d)):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, X, lengths=None):
        # Input shape X["main"]: (B, T, F_in)
        x = X["main"] # REMOVED redundant transposes

        # 1. Time-distributed feature extraction
        x = self.fc1(x)
        x = F.relu(self.ln1(x)) # CHANGED: Apply LayerNorm before ReLU
        x = self.fc2(x)
        x = F.relu(self.ln2(x)) # CHANGED: Apply LayerNorm before ReLU

        # 2. CNN feature extraction
        x = x.transpose(1, 2)   # Shape: (B, C_in, L=T)
        x1 = F.relu(self.bn1(self.conv1(x))) # CHANGED: Apply BatchNorm before ReLU
        x3 = F.relu(self.bn3(self.conv3(x))) # CHANGED: Apply BatchNorm before ReLU

        # 3. Mix CNN outputs
        paired = torch.stack([x1, x3], dim=1) # Shape: (B, 2, C_out, L)
        mixed = self.mixer(paired)            # Shape: (B, 1, C_out, L)
        
        # Prepare for LSTM
        xf = mixed.squeeze(1).transpose(1, 2) # Shape: (B, L, C_out)

        # 4. LSTM for sequence summary
        if lengths is not None:
            packed_input = pack_padded_sequence(
                xf, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            _, (h_last, _) = self.lstm(packed_input)
        else:
            _, (h_last, _) = self.lstm(xf)
        
        last_h = h_last[-1] # Shape: (B, H)
        
        # 5. MDN head for distribution parameters
        raw = self.mdn_head(last_h)
        pi, mu, sigma = mdn_split_params(raw, self.n_components)
        return {"pi": pi, "mu": mu, "sigma": sigma}

    def training_step(self, batch, batch_idx):
        X, y_line, lengths = batch
        mdn = self(X, lengths)
        loss = mdn_nll_multitarget(y_line, mdn["pi"], mdn["mu"], mdn["sigma"])
        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y_line, lengths = batch
        mdn = self(X, lengths)
        loss = mdn_nll_multitarget(y_line, mdn["pi"], mdn["mu"], mdn["sigma"])
    # Log everything to progress bar
        self.log("val/loss", loss, prog_bar=True)
        self.log("val/pi_mean", mdn["pi"].mean(), prog_bar=True)
        self.log("val/pi_std", mdn["pi"].std(), prog_bar=True)
        self.log("val/mu_mean", mdn["mu"].mean(), prog_bar=True)
        self.log("val/mu_std", mdn["mu"].std(), prog_bar=True)
        self.log("val/sigma_mean", mdn["sigma"].mean(), prog_bar=True)
        self.log("val/sigma_std", mdn["sigma"].std(), prog_bar=True)
        
    def configure_optimizers(self): 
        return torch.optim.Adam(self.parameters(), lr=self.lr)
    # def configure_optimizers(self):
    #     optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    #     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #         optimizer,
    #         mode="min",
    #         factor=0.2,   # Reduce LR by 80%
    #         patience=5,   # After 5 epochs of no val_loss improvement
    #         verbose=True
    #     )
    #     return {
    #         "optimizer": optimizer,
    #         "lr_scheduler": {
    #             "scheduler": scheduler,
    #             "monitor": "val/loss",  # Important!
    #         },
        # }


## CNNLSTM weightening

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import pytorch_lightning as pl
# Your mdn_split_params function remains the same
def mdn_split_params(raw_params, n_components):
    B, threeK = raw_params.shape
    assert threeK == 3 * n_components
    raw = raw_params.view(B, n_components, 3)
    pi = F.softmax(raw[..., 0], dim=-1)
    mu = raw[..., 1]
    sigma = F.softplus(raw[..., 2]) + 1e-4
    return pi, mu, sigma

def weighted_mdn_nll(y_true, mdn_params, weights):
    total_loss = 0.0
    num_lines = y_true.shape[1]
    B = y_true.shape[0]

    # Keep track if any valid lines are found
    valid_line_found = False

    for i in range(num_lines):
        y_target = y_true[:, i:i+1]  # (B,1)
        pi, mu, sigma = mdn_params['pi'][i], mdn_params['mu'][i], mdn_params['sigma'][i]

        mask = (y_target != 0).squeeze()
        if mask.sum() == 0:
            continue

        valid_line_found = True
        y_target_masked = y_target[mask]
        pi_masked, mu_masked, sigma_masked = pi[mask], mu[mask], sigma[mask]

        dist = Normal(loc=mu_masked, scale=sigma_masked)
        log_prob = dist.log_prob(y_target_masked.expand_as(mu_masked))
        log_mix_prob = torch.log(pi_masked + 1e-8) + log_prob
        log_likelihood = torch.logsumexp(log_mix_prob, dim=1)
        line_loss = -log_likelihood.mean()
        total_loss += weights[i] * line_loss

    if not valid_line_found:
        # Avoid returning a Python float; create a tensor with requires_grad
        total_loss = torch.tensor(0.0, device=y_true.device, requires_grad=True)

    return total_loss


class CNNLSTM_MDN_MultiHead(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim=128, num_layers=1, hidden_features=64, out_features=32,
                 lr=1e-3, n_components=5, cnn_channels=64, dropout=0.1, num_lines=9):
        super().__init__()
        self.save_hyperparameters()

        # --- Your CNN and LSTM base remains the same ---
        self.fc1 = nn.Linear(input_dim, hidden_features)
        self.ln1 = nn.LayerNorm(hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.ln2 = nn.LayerNorm(out_features)
        self.conv1 = nn.Conv1d(out_features, cnn_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(cnn_channels)
        self.conv3 = nn.Conv1d(out_features, cnn_channels, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(cnn_channels)
        self.mixer = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=1, bias=True)
        fused_dim = cnn_channels
        self.lstm = nn.LSTM(fused_dim, hidden_dim, num_layers=num_layers,
                              batch_first=True, dropout=dropout if num_layers > 1 else 0)

        # === MODIFICATION: Create a list of MDN heads ===
        self.num_lines = num_lines
        self.mdn_heads = nn.ModuleList(
            [nn.Linear(hidden_dim, 3 * n_components) for _ in range(num_lines)]
        )

        self.n_components = n_components
        self.lr = lr

        # === Define importance weights here ===
        # Using exponential decay: w_i = 0.9^(i-1)
        weights = torch.tensor([0.9**i for i in range(self.num_lines)])
        self.register_buffer('loss_weights', weights)

        self.apply(self._init_weights)

    def _init_weights(self, module): # Your init function is fine
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None: nn.init.constant_(module.bias, 0)
        elif isinstance(module, (nn.Conv1d, nn.Conv2d)):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None: nn.init.constant_(module.bias, 0)

    def forward(self, X, lengths=None):
        # --- Your forward pass for the base model is the same ---
        x = X["main"]
        x = F.relu(self.ln1(self.fc1(x)))
        x = F.relu(self.ln2(self.fc2(x)))
        x = x.transpose(1, 2)
        x1 = F.relu(self.bn1(self.conv1(x)))
        x3 = F.relu(self.bn3(self.conv3(x)))
        paired = torch.stack([x1, x3], dim=1)
        mixed = self.mixer(paired)
        xf = mixed.squeeze(1).transpose(1, 2)
        
        # We'll assume lengths is None for simplicity here, but your implementation is fine
        _, (h_last, _) = self.lstm(xf)
        last_h = h_last[-1]

        # === MODIFICATION: Get parameters from all heads ===
        all_params = {'pi': [], 'mu': [], 'sigma': []}
        for i in range(self.num_lines):
            raw_params = self.mdn_heads[i](last_h)
            pi, mu, sigma = mdn_split_params(raw_params, self.n_components)
            all_params['pi'].append(pi)
            all_params['mu'].append(mu)
            all_params['sigma'].append(sigma)

        return all_params
    

    # This would be inside your CNNLSTM_MDN_MultiHead class

    def training_step(self, batch, batch_idx):
        # Assuming your batch now provides a y tensor of shape (B, 9)
        # where y has your target line values, padded with -1.
        X, y, lengths = batch

        # Get the dictionary of parameter lists from the forward pass
        mdn_params = self(X, lengths)

        # Calculate loss using the new weighted function
        loss = weighted_mdn_nll(y, mdn_params, self.loss_weights)

        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

    # NOTE: You'll also need a validation_step that mirrors the training_step logic
    def validation_step(self, batch, batch_idx):
        X, y, lengths = batch
        mdn_params = self(X, lengths)
        loss = weighted_mdn_nll(y, mdn_params, self.loss_weights)
        self.log("val/loss", loss, prog_bar=True)
        return loss

## LSTM weightening with pi order

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import pytorch_lightning as pl


def mdn_split_params(raw_params, n_components):
    """
    Splits raw MDN output into mixture weights (pi), means (mu), and stds (sigma)
    """
    B, threeK = raw_params.shape
    assert threeK == 3 * n_components
    raw = raw_params.view(B, n_components, 3)
    pi = F.softmax(raw[..., 0], dim=-1)           # mixture probabilities
    mu = raw[..., 1]                              # means
    sigma = F.softplus(raw[..., 2]) + 1e-4       # stds
    return pi, mu, sigma


def weighted_mdn_nll(y_true, mdn_params, weights):
    """
    y_true: (B, num_lines)
    mdn_params: dict with 'pi', 'mu', 'sigma' each of shape (B, n_components)
    weights: (num_lines,) tensor
    """
    B, num_lines = y_true.shape
    pi, mu, sigma = mdn_params['pi'], mdn_params['mu'], mdn_params['sigma']  # (B, n_components)

    # Sort components by pi descending
    _, idx = torch.sort(pi, descending=True, dim=1)  # (B, n_components)

    total_loss = 0.0
    valid_line_found = False

    for i in range(num_lines):
        y_target = y_true[:, i]  # (B,)

        # Skip masked/padded targets
        mask = (y_target != 0)
        if mask.sum() == 0:
            continue
        valid_line_found = True

        # Select top pi component for this line
        top_mu = mu.gather(1, idx[:, i].unsqueeze(1)).squeeze(1)      # (B,)
        top_sigma = sigma.gather(1, idx[:, i].unsqueeze(1)).squeeze(1) # (B,)
        y_target_masked = y_target[mask]
        top_mu_masked = top_mu[mask]
        top_sigma_masked = top_sigma[mask]

        dist = Normal(top_mu_masked, top_sigma_masked)
        line_loss = -dist.log_prob(y_target_masked).mean()
        total_loss += weights[i] * line_loss

    if not valid_line_found:
        total_loss = torch.tensor(0.0, device=y_true.device, requires_grad=True)

    return total_loss


class CNNLSTM_MDN(pl.LightningModule):
    def __init__(self, input_dim, feature_eng=15,hidden_dim=32, n_components=9, num_lines=9, lr=1e-3, dropout=0.1):
        super().__init__()
        self.save_hyperparameters()
        self.num_lines = num_lines
        self.n_components = n_components
        self.lr = lr

        # Base network
        self.fc1 = nn.Linear(input_dim, feature_eng)
        self.ln1 = nn.LayerNorm(feature_eng)
        self.lstm = nn.LSTM(feature_eng, hidden_dim, batch_first=True, dropout=dropout)

        # Single MDN head predicting n_components Gaussians
        self.mdn_head = nn.Linear(hidden_dim, 3 * n_components)

        # Importance weights for lines
        weights = torch.tensor([0.9**i for i in range(num_lines)], dtype=torch.float)
        self.register_buffer("loss_weights", weights)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None: nn.init.constant_(module.bias, 0)

    def forward(self, X, lengths=None):
        """
        X: (B, T, input_dim)
        """
        x = X["main"]
        x = F.relu(self.ln1(self.fc1(x)))
        
        if lengths is not None:
            x = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
            _, (h_last, _) = self.lstm(x)
        else:
            _, (h_last, _) = self.lstm(x)

        last_h = h_last[-1]  # (B, hidden_dim)
        raw_params = self.mdn_head(last_h)  # (B, 3*n_components)
        pi, mu, sigma = mdn_split_params(raw_params, self.n_components)
        return {"pi": pi, "mu": mu, "sigma": sigma}

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        mdn_params = self(X)
        loss = weighted_mdn_nll(y, mdn_params, self.loss_weights)
        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y, lengths = batch
        mdn_params = self(X)
        loss = weighted_mdn_nll(y, mdn_params, self.loss_weights)
        self.log("val/loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)


## CNNLSTM weightening order

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.distributions import Normal

def mdn_split_params(raw_params, n_components):
    """
    Splits raw MDN output into mixture weights (pi), means (mu), and stds (sigma)
    """
    B, threeK = raw_params.shape
    assert threeK == 3 * n_components
    raw = raw_params.view(B, n_components, 3)
    pi = F.softmax(raw[..., 0], dim=-1)           # mixture probabilities
    mu = raw[..., 1]                              # means
    sigma = F.softplus(raw[..., 2]) + 1e-4       # stds
    return pi, mu, sigma


def weighted_mdn_nll(y_true, mdn_params, weights):
    """
    y_true: (B, num_lines)
    mdn_params: dict with 'pi', 'mu', 'sigma' each of shape (B, n_components)
    weights: (num_lines,) tensor
    """
    B, num_lines = y_true.shape
    pi, mu, sigma = mdn_params['pi'], mdn_params['mu'], mdn_params['sigma']  # (B, n_components)

    # Sort components by pi descending
    _, idx = torch.sort(pi, descending=True, dim=1)  # (B, n_components)

    total_loss = 0.0
    valid_line_found = False

    for i in range(num_lines):
        y_target = y_true[:, i]  # (B,)

        # Skip masked/padded targets
        mask = (y_target != 0)
        if mask.sum() == 0:
            continue
        valid_line_found = True

        # Select top pi component for this line
        top_mu = mu.gather(1, idx[:, i].unsqueeze(1)).squeeze(1)      # (B,)
        top_sigma = sigma.gather(1, idx[:, i].unsqueeze(1)).squeeze(1) # (B,)
        y_target_masked = y_target[mask]
        top_mu_masked = top_mu[mask]
        top_sigma_masked = top_sigma[mask]

        dist = Normal(top_mu_masked, top_sigma_masked)
        line_loss = -dist.log_prob(y_target_masked).mean()
        total_loss += weights[i] * line_loss

    if not valid_line_found:
        total_loss = torch.tensor(0.0, device=y_true.device, requires_grad=True)

    return total_loss


class cnn_lstm(pl.LightningModule):
    def __init__(self, input_dim, feature_eng=15, hidden_dim=32, n_components=9, num_lines=9, lr=1e-3, dropout=0.1):
        super().__init__()
        self.save_hyperparameters()
        self.num_lines = num_lines
        self.n_components = n_components
        self.lr = lr

        # Base feature projection
        self.fc1 = nn.Linear(input_dim, feature_eng)
        self.ln1 = nn.LayerNorm(feature_eng)

        # Parallel conv1d branches
        self.k1 = nn.Conv1d(feature_eng, feature_eng, kernel_size=1, padding=0)
        self.k3 = nn.Conv1d(feature_eng, feature_eng, kernel_size=3, padding=1)

        # Fusion via conv2d
        # Input channels = 2 (from k1 + k3), Output = 1, kernel size (1,1) to fuse
        self.fusion_conv2d = nn.Conv2d(2, 1, kernel_size=(1, 1))

        # LSTM
        self.lstm = nn.LSTM(feature_eng, hidden_dim, batch_first=True, dropout=dropout)

        # Single MDN head predicting n_components Gaussians
        self.mdn_head = nn.Linear(hidden_dim, 3 * n_components)

        # Importance weights for lines
        weights = torch.tensor([0.9**i for i in range(num_lines)], dtype=torch.float)
        self.register_buffer("loss_weights", weights)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, X, lengths=None):
        """
        X: dict with key "main", value shape (B, T, input_dim)
        """
        x = X["main"]  # (B, T, input_dim)
        B, T, _ = x.shape

        # Fully connected projection
        x = F.relu(self.ln1(self.fc1(x)))  # (B, T, F)

        # Conv1d expects (B, F, T)
        x_cnn = x.transpose(1, 2)  # (B, F, T)

        # Parallel convs
        x1 = self.k1(x_cnn)  # (B, F, T)
        x3 = self.k3(x_cnn)  # (B, F, T)

        # Stack into 2-channel feature map
        stacked = torch.stack([x1, x3], dim=1)  # (B, 2, F, T)

        # Fuse with conv2d → (B, 1, F, T)
        fused = self.fusion_conv2d(stacked).squeeze(1)  # (B, F, T)

        # Back to (B, T, F)
        fused = fused.transpose(1, 2)

        # LSTM with packed sequence
        if lengths is not None:
            packed = nn.utils.rnn.pack_padded_sequence(fused, lengths.cpu(), batch_first=True, enforce_sorted=False)
            _, (h_last, _) = self.lstm(packed)
        else:
            _, (h_last, _) = self.lstm(fused)

        last_h = h_last[-1]  # (B, hidden_dim)
        raw_params = self.mdn_head(last_h)  # (B, 3 * n_components)

        # Assume you have mdn_split_params(pi, mu, sigma)
        pi, mu, sigma = mdn_split_params(raw_params, self.n_components)
        return {"pi": pi, "mu": mu, "sigma": sigma}
    
    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        mdn_params = self(X)
        loss = weighted_mdn_nll(y, mdn_params, self.loss_weights)
        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y, lengths = batch
        mdn_params = self(X)
        loss = weighted_mdn_nll(y, mdn_params, self.loss_weights)
        self.log("val/loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

## CNNLSTM weightening with sigma confidance

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import pytorch_lightning as pl
# Your mdn_split_params function remains the same
def mdn_split_params(raw_params, n_components):
    B, threeK = raw_params.shape
    assert threeK == 3 * n_components
    raw = raw_params.view(B, n_components, 3)
    pi = F.softmax(raw[..., 0], dim=-1)
    mu = raw[..., 1]
    sigma = F.softplus(raw[..., 2]) + 1e-4
    return pi, mu, sigma

def weighted_mdn_nll_with_sigma_penalty(y_true, mdn_params, weights, lambda_sigma=0.01):
    """
    Calculates weighted MDN NLL and adds a penalty for large sigmas.
    
    Args:
        lambda_sigma (float): The strength of the sigma penalty.
    """
    total_loss = 0.0
    num_lines = y_true.shape[1]

    for i in range(num_lines):
        y_target = y_true[:, i:i+1]
        pi, mu, sigma = mdn_params['pi'][i], mdn_params['mu'][i], mdn_params['sigma'][i]
        mask = (y_target != -1).squeeze()

        if mask.sum() == 0:
            continue

        y_target_masked = y_target[mask]
        pi_masked, mu_masked, sigma_masked = pi[mask], mu[mask], sigma[mask]
        
        # --- 1. NLL Loss Calculation (same as before) ---
        dist = Normal(loc=mu_masked, scale=sigma_masked)
        log_prob = dist.log_prob(y_target_masked.expand_as(mu_masked))
        log_mix_prob = torch.log(pi_masked + 1e-8) + log_prob
        log_likelihood = torch.logsumexp(log_mix_prob, dim=1)
        line_nll_loss = -log_likelihood.mean()

        # --- 2. NEW: Sigma Penalty Calculation ---
        # We penalize the mean of the sigmas for the most likely component
        # This focuses the penalty on the component the model actually uses
        most_likely_idx = torch.argmax(pi_masked, dim=1)
        most_likely_sigma = sigma_masked.gather(1, most_likely_idx.unsqueeze(1)).squeeze()
        sigma_penalty = torch.mean(most_likely_sigma)
        
        # --- 3. Combine and Weight ---
        combined_line_loss = line_nll_loss + (lambda_sigma * sigma_penalty)
        total_loss += weights[i] * combined_line_loss

    return total_loss

# In your training_step, you would call this new function:
# loss = weighted_mdn_nll_with_sigma_penalty(y, mdn_params, self.loss_weights, lambda_sigma=0.01)

class CNNLSTM_MDN_MultiHead(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim=128, num_layers=1, hidden_features=64, out_features=32,
                 lr=1e-3, n_components=5, cnn_channels=64, dropout=0.1, num_lines=9):
        super().__init__()
        self.save_hyperparameters()

        # --- Your CNN and LSTM base remains the same ---
        self.fc1 = nn.Linear(input_dim, hidden_features)
        self.ln1 = nn.LayerNorm(hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.ln2 = nn.LayerNorm(out_features)
        self.conv1 = nn.Conv1d(out_features, cnn_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(cnn_channels)
        self.conv3 = nn.Conv1d(out_features, cnn_channels, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(cnn_channels)
        self.mixer = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=1, bias=True)
        fused_dim = cnn_channels
        self.lstm = nn.LSTM(fused_dim, hidden_dim, num_layers=num_layers,
                              batch_first=True, dropout=dropout if num_layers > 1 else 0)

        # === MODIFICATION: Create a list of MDN heads ===
        self.num_lines = num_lines
        self.mdn_heads = nn.ModuleList(
            [nn.Linear(hidden_dim, 3 * n_components) for _ in range(num_lines)]
        )

        self.n_components = n_components
        self.lr = lr

        # === Define importance weights here ===
        # Using exponential decay: w_i = 0.9^(i-1)
        weights = torch.tensor([0.9**i for i in range(self.num_lines)])
        self.register_buffer('loss_weights', weights)

        self.apply(self._init_weights)

    def _init_weights(self, module): # Your init function is fine
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None: nn.init.constant_(module.bias, 0)
        elif isinstance(module, (nn.Conv1d, nn.Conv2d)):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None: nn.init.constant_(module.bias, 0)

    def forward(self, X, lengths=None):
        # --- Your forward pass for the base model is the same ---
        x = X["main"]
        x = F.relu(self.ln1(self.fc1(x)))
        x = F.relu(self.ln2(self.fc2(x)))
        x = x.transpose(1, 2)
        x1 = F.relu(self.bn1(self.conv1(x)))
        x3 = F.relu(self.bn3(self.conv3(x)))
        paired = torch.stack([x1, x3], dim=1)
        mixed = self.mixer(paired)
        xf = mixed.squeeze(1).transpose(1, 2)
        
        # We'll assume lengths is None for simplicity here, but your implementation is fine
        _, (h_last, _) = self.lstm(xf)
        last_h = h_last[-1]

        # === MODIFICATION: Get parameters from all heads ===
        all_params = {'pi': [], 'mu': [], 'sigma': []}
        for i in range(self.num_lines):
            raw_params = self.mdn_heads[i](last_h)
            pi, mu, sigma = mdn_split_params(raw_params, self.n_components)
            all_params['pi'].append(pi)
            all_params['mu'].append(mu)
            all_params['sigma'].append(sigma)

        return all_params
    

    # This would be inside your CNNLSTM_MDN_MultiHead class

    def training_step(self, batch, batch_idx):
        # Assuming your batch now provides a y tensor of shape (B, 9)
        # where y has your target line values, padded with -1.
        X, y, lengths = batch

        # Get the dictionary of parameter lists from the forward pass
        mdn_params = self(X, lengths)

        # Calculate loss using the new weighted function
        loss = weighted_mdn_nll_with_sigma_penalty(y, mdn_params, self.loss_weights)

        self.log("train/loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

    # NOTE: You'll also need a validation_step that mirrors the training_step logic
    def validation_step(self, batch, batch_idx):
        X, y, lengths = batch
        mdn_params = self(X, lengths)
        loss = weighted_mdn_nll_with_sigma_penalty(y, mdn_params, self.loss_weights)
        self.log("val/loss", loss, prog_bar=True)
        return loss

## CNNlSTM

In [2]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

def mdn_split_params(raw_params, n_components):
    """
    raw_params: (B, 3K) tensor from mdn_head
    returns:
        pi    (B, K) mixture weights
        mu    (B, K) means
        sigma (B, K) std devs
    """
    B, threeK = raw_params.shape
    assert threeK == 3 * n_components

    raw = raw_params.view(B, n_components, 3)

    pi = raw[..., 0]                 # (B,K)
    mu = raw[..., 1]                 # (B,K)
    sigma = raw[..., 2]              # (B,K)

    pi = F.softmax(pi, dim=-1)       # weights sum to 1
    sigma = F.softplus(sigma) + 1e-4 # strictly positive
    return pi, mu, sigma


def mdn_nll_multitarget(y_line, pi, mu, sigma):
    """
    Negative log-likelihood for MDN with multiple valid targets per sample.
    Args:
        y_line : (B, L) padded targets (0 where invalid)
        pi, mu, sigma : (B, K) MDN params
    Returns:
        scalar loss
    """
    B, K = mu.shape
    losses = []

    for b in range(B):
        valid_y = y_line[b][y_line[b] > 0]  # (M,)
        if len(valid_y) == 0:
            continue

        # expand to (M, K)
        y_exp = valid_y.unsqueeze(-1).expand(-1, K)

        log_prob = -0.5 * ((y_exp - mu[b]) / (sigma[b] + 1e-8))**2 \
                   - torch.log(sigma[b] + 1e-8) \
                   - 0.5 * torch.log(torch.tensor(2.0 * torch.pi, device=y_line.device))

        log_mix = torch.log(pi[b] + 1e-8) + log_prob
        log_sum = torch.logsumexp(log_mix, dim=-1)  # (M,)

        losses.append(-log_sum.mean())

    if len(losses) == 0:
        return torch.tensor(0.0, device=y_line.device, requires_grad=True)

    return torch.stack(losses).mean()


class CNNLSTM_MDN(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim=128, num_layers=1, hidden_features=64, out_features=32,
                 lr=1e-3, n_components=5, cnn_channels=64, dropout=0.1):
        super().__init__()
        self.save_hyperparameters()

        # Time-distributed feature extractor
        self.fc1 = nn.Linear(input_dim, hidden_features)
        self.ln1 = nn.LayerNorm(hidden_features) # ADDED: LayerNorm for time-step features
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.ln2 = nn.LayerNorm(out_features) # ADDED: LayerNorm

        # CNN feature extractors
        self.conv1 = nn.Conv1d(out_features, cnn_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(cnn_channels) # ADDED: BatchNorm for convolutional features
        self.conv3 = nn.Conv1d(out_features, cnn_channels, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(cnn_channels) # ADDED: BatchNorm

        # Learnable mixer for CNN outputs
        self.mixer = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=1, bias=True)

        # LSTM for temporal dependency
        fused_dim = cnn_channels # Input to LSTM is the mixed CNN output
        self.lstm = nn.LSTM(fused_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)

        # MDN Head
        self.mdn_head = nn.Linear(hidden_dim, 3 * n_components)
        self.n_components = n_components
        self.lr = lr

        # Apply weight initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, (nn.Conv1d, nn.Conv2d)):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, X, lengths=None):
        # Input shape X["main"]: (B, T, F_in)
        x = X["main"] # REMOVED redundant transposes

        # 1. Time-distributed feature extraction
        x = self.fc1(x)
        x = F.relu(self.ln1(x)) # CHANGED: Apply LayerNorm before ReLU
        x = self.fc2(x)
        x = F.relu(self.ln2(x)) # CHANGED: Apply LayerNorm before ReLU
        # 2. CNN feature extraction
        x = x.transpose(1, 2)   # Shape: (B, C_in, L=T)
        x1 = F.relu(self.bn1(self.conv1(x))) # CHANGED: Apply BatchNorm before ReLU
        x3 = F.relu(self.bn3(self.conv3(x))) # CHANGED: Apply BatchNorm before ReLU

        # 3. Mix CNN outputs
        paired = torch.stack([x1, x3], dim=1) # Shape: (B, 2, C_out, L)
        mixed = self.mixer(paired)            # Shape: (B, 1, C_out, L)
        
        # Prepare for LSTM
        xf = mixed.squeeze(1).transpose(1, 2) # Shape: (B, L, C_out)

        # 4. LSTM for sequence summary
        if lengths is not None:
            packed_input = pack_padded_sequence(
                xf, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            _, (h_last, _) = self.lstm(packed_input)
        else:
            _, (h_last, _) = self.lstm(xf)
        
        last_h = h_last[-1] # Shape: (B, H)
        
        # 5. MDN head for distribution parameters
        raw = self.mdn_head(last_h)
        pi, mu, sigma = mdn_split_params(raw, self.n_components)
        return {"pi": pi, "mu": mu, "sigma": sigma}

    def training_step(self, batch, batch_idx):
        X, y_line, lengths = batch
        mdn = self(X, lengths)
        loss = mdn_nll_multitarget(y_line, mdn["pi"], mdn["mu"], mdn["sigma"])
        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y_line, lengths = batch
        mdn = self(X, lengths)
        loss = mdn_nll_multitarget(y_line, mdn["pi"], mdn["mu"], mdn["sigma"])
    # Log everything to progress bar
        self.log("val/loss", loss, prog_bar=True)
        self.log("val/pi_mean", mdn["pi"].mean(), prog_bar=True)
        self.log("val/pi_std", mdn["pi"].std(), prog_bar=True)
        self.log("val/mu_mean", mdn["mu"].mean(), prog_bar=True)
        self.log("val/mu_std", mdn["mu"].std(), prog_bar=True)
        self.log("val/sigma_mean", mdn["sigma"].mean(), prog_bar=True)
        self.log("val/sigma_std", mdn["sigma"].std(), prog_bar=True)
        
    # Inside your CNNLSTM_MDN class
    # def configure_optimizers(self):
    #     optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        
    #     # Define the scheduler
    #     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #         optimizer,
    #         mode='min',      # We want to minimize the validation loss
    #         factor=0.5,      # Reduce LR by 50% (1.0 -> 0.2)
    #         patience=10,      # Wait 5 validation epochs with no improvement before reducing
    #         verbose=True
    #     )
        
    #     return {
    #         "optimizer": optimizer,
    #         "lr_scheduler": {
    #             "scheduler": scheduler,
    #             "monitor": "val/loss",  # The metric to watch
    #         },
    #     }
    
    def configure_optimizers(self): 
        return torch.optim.Adam(self.parameters(), lr=self.lr)
    # def configure_optimizers(self):
    #     optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    #     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #         optimizer,
    #         mode="min",
    #         factor=0.2,   # Reduce LR by 80%
    #         patience=5,   # After 5 epochs of no val_loss improvement
    #         verbose=True
    #     )
    #     return {
    #         "optimizer": optimizer,
    #         "lr_scheduler": {
    #             "scheduler": scheduler,
    #             "monitor": "val/loss",  # Important!
    #         },
        # }


## CNNLSTM scalie

In [2]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

def mdn_split_params(raw_params, n_components, mu_scale=10, mu_bias=.9, sigma_scale=10.0):
    """
    Split raw MDN parameters into (pi, mu, sigma).

    Args:
        raw_params: (B, 3 * K) from the network
        n_components: number of mixture components
        mu_scale: scaling factor for mu (default 1.0 = no scaling)
        mu_bias: shift/bias applied after scaling
        sigma_scale: scaling factor for sigma (default 10.0)
    """
    B = raw_params.size(0)
    raw = raw_params.view(B, n_components, 3)

    pi_raw = raw[..., 0]
    mu_raw = raw[..., 1]
    sigma_raw = raw[..., 2]

    pi = F.softmax(pi_raw, dim=-1)
    mu = mu_raw / mu_scale + mu_bias
    sigma = F.softplus(sigma_raw / sigma_scale) + 1e-4

    return pi, mu, sigma


def mdn_nll_multitarget(y_line, pi, mu, sigma):
    """
    Negative log-likelihood for MDN with multiple valid targets per sample.
    Args:
        y_line : (B, L) padded targets (0 where invalid)
        pi, mu, sigma : (B, K) MDN params
    Returns:
        scalar loss
    """
    B, K = mu.shape
    losses = []

    for b in range(B):
        valid_y = y_line[b][y_line[b] > 0]  # (M,)
        if len(valid_y) == 0:
            continue

        # expand to (M, K)
        y_exp = valid_y.unsqueeze(-1).expand(-1, K)

        log_prob = -0.5 * ((y_exp - mu[b]) / (sigma[b] + 1e-8))**2 \
                   - torch.log(sigma[b] + 1e-8) \
                   - 0.5 * torch.log(torch.tensor(2.0 * torch.pi, device=y_line.device))

        log_mix = torch.log(pi[b] + 1e-8) + log_prob
        log_sum = torch.logsumexp(log_mix, dim=-1)  # (M,)

        losses.append(-log_sum.mean())

    if len(losses) == 0:
        return torch.tensor(0.0, device=y_line.device, requires_grad=True)

    return torch.stack(losses).mean()


class CNNLSTM_MDN(pl.LightningModule):
    def __init__(self, input_dim, hidden_dim=128, num_layers=1, hidden_features=64, out_features=32,
                 lr=1e-3, n_components=5, cnn_channels=64, dropout=0.1):
        super().__init__()
        self.save_hyperparameters()

        # Time-distributed feature extractor
        self.fc1 = nn.Linear(input_dim, hidden_features)
        self.ln1 = nn.LayerNorm(hidden_features) # ADDED: LayerNorm for time-step features
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.ln2 = nn.LayerNorm(out_features) # ADDED: LayerNorm

        # CNN feature extractors
        self.conv1 = nn.Conv1d(out_features, cnn_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(cnn_channels) # ADDED: BatchNorm for convolutional features
        self.conv3 = nn.Conv1d(out_features, cnn_channels, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(cnn_channels) # ADDED: BatchNorm

        # Learnable mixer for CNN outputs
        self.mixer = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=1, bias=True)

        # LSTM for temporal dependency
        fused_dim = cnn_channels # Input to LSTM is the mixed CNN output
        self.lstm = nn.LSTM(fused_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)

        # MDN Head
        self.mdn_head = nn.Linear(hidden_dim, 3 * n_components)
        self.n_components = n_components
        self.lr = lr

        # Apply weight initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, (nn.Conv1d, nn.Conv2d)):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, X, lengths=None):
        # Input shape X["main"]: (B, T, F_in)
        x = X["main"] 

        # --- Debug print first candle ---
        # if x.ndim == 3:  # batched: (B, T, F)
        #     first_candle = x[0, 0, :]   # first sample, first time step, all features
        #     print("First candle features:", first_candle.detach().cpu().numpy())
        # elif x.ndim == 2:  # single sequence: (T, F)
        #     first_candle = x[0, :]      # first time step, all features
        #     print("First candle features:", first_candle.detach().cpu().numpy())
        # else:
        #     print("Unexpected shape for x:", x.shape)
        # 1. Time-distributed feature extraction
        x = self.fc1(x)
        x = F.relu(self.ln1(x)) # CHANGED: Apply LayerNorm before ReLU
        x = self.fc2(x)
        x = F.relu(self.ln2(x)) # CHANGED: Apply LayerNorm before ReLU

        # 2. CNN feature extraction
        x = x.transpose(1, 2)   # Shape: (B, C_in, L=T)
        x1 = F.relu(self.bn1(self.conv1(x))) # CHANGED: Apply BatchNorm before ReLU
        x3 = F.relu(self.bn3(self.conv3(x))) # CHANGED: Apply BatchNorm before ReLU

        # 3. Mix CNN outputs
        paired = torch.stack([x1, x3], dim=1) # Shape: (B, 2, C_out, L)
        mixed = self.mixer(paired)            # Shape: (B, 1, C_out, L)
        
        # Prepare for LSTM
        xf = mixed.squeeze(1).transpose(1, 2) # Shape: (B, L, C_out)

        # 4. LSTM for sequence summary
        if lengths is not None:
            packed_input = pack_padded_sequence(
                xf, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            _, (h_last, _) = self.lstm(packed_input)
        else:
            _, (h_last, _) = self.lstm(xf)
        
        last_h = h_last[-1] # Shape: (B, H)
        
        # 5. MDN head for distribution parameters
        raw = self.mdn_head(last_h)
        pi, mu, sigma = mdn_split_params(raw, self.n_components)
        return {"pi": pi, "mu": mu, "sigma": sigma}

    def training_step(self, batch, batch_idx):
        X, y_line, lengths = batch
        mdn = self(X, lengths)
        loss = mdn_nll_multitarget(y_line, mdn["pi"], mdn["mu"], mdn["sigma"])
        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y_line, lengths = batch
        mdn = self(X, lengths)
        loss = mdn_nll_multitarget(y_line, mdn["pi"], mdn["mu"], mdn["sigma"])
    # Log everything to progress bar
        self.log("val/loss", loss, prog_bar=True)
        self.log("val/pi_mean", mdn["pi"].mean(), prog_bar=True)
        self.log("val/pi_std", mdn["pi"].std(), prog_bar=True)
        self.log("val/mu_mean", mdn["mu"].mean(), prog_bar=True)
        self.log("val/mu_std", mdn["mu"].std(), prog_bar=True)
        self.log("val/sigma_mean", mdn["sigma"].mean(), prog_bar=True)
        self.log("val/sigma_std", mdn["sigma"].std(), prog_bar=True)
        
    # # Inside your CNNLSTM_MDN class
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=1e-4)
    # def configure_optimizers(self):
    #     optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        
    #     # Define the scheduler
    #     scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #         optimizer,
    #         mode='min',      # We want to minimize the validation loss
    #         factor=0.5,      # Reduce LR by 80% (1.0 -> 0.2)
    #         patience=10,      # Wait 5 validation epochs with no improvement before reducing
    #         verbose=True
    #     )
        
    #     return {
    #         "optimizer": optimizer,
    #         "lr_scheduler": {
    #             "scheduler": scheduler,
    #             "monitor": "val/loss",  # The metric to watch
    #         },
    #     }
    # def configure_optimizers(self):
    #     return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


## CNNtransformer wheightening order

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import pytorch_lightning as pl
import math

# --- Helper Functions and Modules ---

def mdn_split_params(raw_params, n_components):
    """
    Splits raw MDN output into mixture weights (pi), means (mu), and stds (sigma).
    This function is used by each individual MDN head.
    """
    B, threeK = raw_params.shape
    assert threeK == 3 * n_components
    raw = raw_params.view(B, n_components, 3)
    pi = F.softmax(raw[..., 0], dim=-1)
    mu = raw[..., 1]
    sigma = F.softplus(raw[..., 2]) + 1e-6 # Added a small epsilon for stability
    return pi, mu, sigma

class PositionalEncoding(nn.Module):
    """
    Injects positional information into the input sequence for the Transformer.
    """
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# --- Weighted Loss Function for Multi-Head Architecture ---

def weighted_mdn_nll_multihead(y_true, mdn_params_list, weights, padding_value=-1):
    """
    Calculates the weighted negative log-likelihood for a multi-headed MDN.
    This version correctly handles multiple heads and calculates the full NLL for each.

    Args:
        y_true (Tensor): Padded target values, shape (B, num_lines).
        mdn_params_list (list): A list of dicts, one for each head.
        weights (Tensor): A 1D tensor of importance weights, shape (num_lines,).
        padding_value (int): Value used for padding in y_true.
    """
    total_loss = 0.0
    num_lines = y_true.shape[1]
    
    for i in range(num_lines):
        y_target = y_true[:, i:i+1]
        pi, mu, sigma = mdn_params_list[i]['pi'], mdn_params_list[i]['mu'], mdn_params_list[i]['sigma']

        # Create a mask for valid (non-padded) targets for this line
        mask = (y_target != padding_value).squeeze()

        if mask.sum() == 0:  # Skip if no valid targets for this line in the batch
            continue

        # Select only the valid data for this line's loss calculation
        y_target_masked = y_target[mask]
        pi_masked, mu_masked, sigma_masked = pi[mask], mu[mask], sigma[mask]
        
        # Use torch.distributions for a clean and stable calculation
        dist = Normal(loc=mu_masked, scale=sigma_masked)
        
        # Calculate log probabilities of the target values in each Gaussian component
        log_prob = dist.log_prob(y_target_masked.expand_as(mu_masked))
        
        # Mix the probabilities using the mixture weights (pi)
        log_mix_prob = torch.log(pi_masked + 1e-8) + log_prob
        
        # Use logsumexp for numerical stability to get the log-likelihood
        log_likelihood = torch.logsumexp(log_mix_prob, dim=1)
        
        # Calculate the mean negative log-likelihood for this line
        line_loss = -log_likelihood.mean()

        # Apply the importance weight and add to total loss
        total_loss += weights[i] * line_loss

    # If no valid lines were found in the entire batch, return a zero tensor
    if not isinstance(total_loss, torch.Tensor):
        return torch.tensor(0.0, device=y_true.device, requires_grad=True)
        
    return total_loss

# --- The CNN-Transformer Model ---

class cnn_transformer(pl.LightningModule):
    def __init__(self, input_dim, cnn_out_channels=64, d_model=128, nhead=4, num_encoder_layers=2,
                 n_components=9, num_lines=9, lr=1e-4, dropout=0.1):
        super().__init__()
        self.save_hyperparameters()
        self.num_lines = num_lines
        self.n_components = n_components
        self.lr = lr
        
        # 1. CNN Feature Extractor Block
        self.cnn_extractor = nn.Sequential(
            nn.Conv1d(input_dim, cnn_out_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(cnn_out_channels),
            nn.ReLU(),
            nn.Conv1d(cnn_out_channels, d_model, kernel_size=3, padding=1),
            nn.BatchNorm1d(d_model),
            nn.ReLU()
        )
        
        # 2. Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        # 3. Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        
        # 4. Multi-Head MDN Output
        self.mdn_heads = nn.ModuleList([
            nn.Linear(d_model, 3 * n_components) for _ in range(num_lines)
        ])
        
        # Importance weights for lines (exponential decay)
        weights = torch.tensor([0.9**i for i in range(num_lines)], dtype=torch.float)
        self.register_buffer("loss_weights", weights)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="relu")
            if module.bias is not None: nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")

    def forward(self, X, src_key_padding_mask=None):
        """
        X: (B, T, input_dim)
        src_key_padding_mask: (B, T) boolean mask for padded elements in X
        """
        x = X["main"]
        
        # 1. CNN Feature Extraction
        # Input for Conv1d needs to be (B, C_in, L), so we permute
        x = x.permute(0, 2, 1)
        x = self.cnn_extractor(x)
        # Permute back to (B, T, C_out) for Transformer
        x = x.permute(0, 2, 1)

        # 2. Add Positional Encoding
        # Transformer expects (T, B, C), so permute again
        x = x.permute(1, 0, 2)
        x = self.pos_encoder(x)
        # Permute back to (B, T, C) for batch_first=True
        x = x.permute(1, 0, 2)

        # 3. Transformer Encoder
        # The mask should indicate which key values are NOT to be attended to
        encoded_seq = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)

        # We use the representation of the last valid timestep for prediction
        # (A common strategy, alternatively you could use mean pooling)
        # For simplicity, we'll take the last hidden state of the sequence.
        sequence_summary = encoded_seq[:, -1, :] # (B, d_model)
        
        # 4. Get parameters from all MDN heads
        mdn_params_list = []
        for i in range(self.num_lines):
            raw_params = self.mdn_heads[i](sequence_summary)
            pi, mu, sigma = mdn_split_params(raw_params, self.n_components)
            mdn_params_list.append({"pi": pi, "mu": mu, "sigma": sigma})

        return mdn_params_list

    def training_step(self, batch, batch_idx):
        X, y, lengths = batch
        # Create the padding mask for the transformer
        # True values indicate positions that should be ignored.
        max_len = X['main'].shape[1]
        mask = torch.arange(max_len, device=self.device)[None, :] >= lengths[:, None]

        mdn_params = self(X, src_key_padding_mask=mask)
        # Use a padding value of -1 for the loss function
        loss = weighted_mdn_nll_multihead(y, mdn_params, self.loss_weights, padding_value=-1)
        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        X, y, lengths = batch
        max_len = X['main'].shape[1]
        mask = torch.arange(max_len, device=self.device)[None, :] >= lengths[:, None]
        
        mdn_params = self(X, src_key_padding_mask=mask)
        loss = weighted_mdn_nll_multihead(y, mdn_params, self.loss_weights, padding_value=-1)
        self.log("val/loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)


# data manipulation

In [32]:
import pandas as pd
df_labels = pd.read_csv("/home/iatell/projects/meta-learning/data/line_seq_ordered.csv")
cols = [f'price_line{i}' for i in range(1, 10)]
df_labels = df_labels.dropna(subset=cols, how='all')
df_labels = df_labels.rename(columns={c: c.replace('price_line', 'linePrice_') 
                        for c in df_labels.columns if c.startswith('price_line')})
df_labels.to_csv("/home/iatell/projects/meta-learning/data/line_seq_ordered.csv", index=False)      
#     # overwrites the old file
df_labels

Unnamed: 0,startTime,endTime,startIndex,endIndex,linePrice_1,linePrice_2,linePrice_3,linePrice_4,linePrice_5,linePrice_6,linePrice_7,linePrice_8,linePrice_9
0,1514764800,1515110400,0,4,,0.878016,0.788209,,,,,,
1,1514764800,1515283200,0,6,,1.055290,0.923251,0.828937,,,,,
2,1515024000,1515369600,3,7,1.143628,,,,,,,,
3,1515456000,1514937600,2,8,1.139775,,,,,,,,
4,1515110400,1515542400,4,9,1.143279,0.964469,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,1651795200,1649116800,1555,1586,0.873150,0.825739,0.905267,0.938913,,,,0.955736,
330,1652054400,1652227200,1589,1591,1.063729,,,1.023085,,,,,
331,1652572800,1651881600,1587,1595,0.813907,0.870793,,,,,,0.788406,0.904141
332,1653264000,1652227200,1591,1603,1.042211,1.075683,0.992004,0.958532,,,,,


# train

## simple

In [None]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from datetime import datetime
from preprocess.multi_regression_seq_dif import preprocess_sequences_csv_multilines
# from models.LSTM.lstm_multi_line_reg_seq_dif import LSTMMultiRegressor
from utils.make_step import make_step
from utils.padding_batch_reg import collate_batch
from utils.get_init_argumens import get_init_args
import pandas as pd
import io
import numpy as np
import os
from add_ons.drop_column import drop_columns
from add_ons.normalize_candle_seq import add_label_normalized_candles
from add_ons.feature_pipeline3 import FeaturePipeline
from add_ons.candle_dif_rate_of_change_percentage import add_candle_rocp
from add_ons.candle_rate_of_change import add_candle_ratios
from sklearn.metrics import accuracy_score, f1_score
# ---------------- Evaluation ---------------- #
@torch.no_grad()
def evaluate_model_mdn(model, val_loader, zero_idx=0, threshold=0.1):
    """
    Evaluate CNN–LSTM–MDN model (last-output version).

    Args
    ----
    model : pl.LightningModule with MDN forward
    val_loader : DataLoader yielding (X, y, lengths)
    zero_idx : which mixture component is considered "no-line" (usually 0)
    threshold : if pi[:,zero_idx] > threshold → predict invalid

    Returns
    -------
    dict with mse, mae, acc, f1
    """
    model.eval()
    all_preds_reg, all_labels_reg = [], []
    all_preds_len, all_labels_len = [], []

    device = next(model.parameters()).device

    with torch.no_grad():
        for X_batch, y_batch, lengths in val_loader:
            if isinstance(X_batch, dict):
                X_batch = {k: v.to(device) for k, v in X_batch.items()}
            else:
                X_batch = X_batch.to(device)

            y_batch = y_batch.to(device)
            mdn = model(X_batch, lengths)
            pi, mu, sigma = mdn["pi"], mdn["mu"], mdn["sigma"]  # (B,K)

            # regression expectation
            y_pred = (pi * mu).sum(dim=-1)  # (B,)
            B = y_batch.size(0)
            y_len = (y_batch > 0).sum(dim=1)                # (B,)
            idx = torch.clamp(y_len - 1, min=0)             # last valid index
            y_true = y_batch[torch.arange(B, device=y_batch.device), idx]  # (B,)
            # only last step
            # print("lengths(features):", lengths[:10])
            # print("lengths(labels):", y_len[:10])

            all_preds_reg.append(y_pred.cpu().numpy())
            all_labels_reg.append(y_true.cpu().numpy())

            # validity classification
            pi_zero = pi[:, zero_idx]  # (B,)
            pred_valid = (pi_zero < (1 - threshold)).long()
            true_valid = torch.ones_like(pred_valid)  # last step always valid

            all_preds_len.extend(pred_valid.cpu().numpy().tolist())
            all_labels_len.extend(true_valid.cpu().numpy().tolist())


        # ----- Regression metrics -----
    all_preds_reg = np.concatenate(all_preds_reg)  # (N,)
    all_labels_reg = np.concatenate(all_labels_reg)
    mse = ((all_preds_reg - all_labels_reg) ** 2).mean()
    mae = np.abs(all_preds_reg - all_labels_reg).mean()
    # ----- Validity metrics -----
    acc = accuracy_score(all_labels_len, all_preds_len)
    f1 = f1_score(all_labels_len, all_preds_len, average="macro")

    print("\n📊 Validation Metrics (MDN, last-output):")
    print(f"  Regression → MSE: {mse:.6f}, MAE: {mae:.6f}")
    print(f"  Validity   → Acc: {acc:.4f}, F1: {f1:.4f}")

    return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}
# ---------------- Train ---------------- #
def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    hidden_dim=200,
    num_layers=1,
    lr=0.001,
    batch_size=32,
    max_epochs=1000,
    save_model=False,
    return_val_accuracy = True,
    test_mode = False,
    early_stop = False
):

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_multireg_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_multireg_{timestamp}.pkl"

    pipeline = FeaturePipeline(
        steps=[
            # make_step(add_label_normalized_candles),
            make_step(add_candle_rocp),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
            
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "robust", "body": "standard", "lower_shadow": "standard",
        #         "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
            False, 
          False, 
        #   True
                ]
    )
    # Preprocess: pad linePrices and sequences
    if do_validation:
        train_ds, val_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline
        )
    else:
        train_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=False,
            debug_sample=False,
            feature_pipeline=pipeline
        )
        val_ds = None

    sample = train_ds[0][0]  # first sample's features
    if isinstance(sample, dict):  # multiple feature groups
        input_dim = sample['main'].shape[1]
    else:  # single tensor
        input_dim = sample.shape[1]

    model = CNNLSTM_MDN(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        lr=lr
    )
    init_args = get_init_args(model, input_dim=input_dim, hidden_dim=hidden_dim, num_layers=num_layers, lr=lr)

    model_class_info = {
        "module": model.__class__.__module__,
        "class": model.__class__.__name__,
        "init_args": init_args
    }

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch) if val_ds else None
    
    # --- Debug / Test mode --- #
    if test_mode:
        save_model = False
        from itertools import islice

        # Try to grab 3rd batch; if not available, take first
        try:
            batch = next(islice(iter(train_loader), 2, 3))
        except StopIteration:
            batch = next(iter(train_loader))

        X_batch_dict, y_batch, lengths = batch

        print("🔍 Debug batch:")
        if isinstance(X_batch_dict, dict):
            print("  Keys in X_batch:", list(X_batch_dict.keys()))
        print("  y_batch shape:", y_batch.shape)
        print("  First label in batch:", y_batch[0])

        # --- Track real column names for each feature group ---
        feature_names_dict = {}
        for name, X_batch in X_batch_dict.items():
            if name == "main":
                # Use actual feature columns after preprocessing
                feature_names_dict[name] = feature_cols
            else:
                # For extra feature groups, fallback to generic names
                feature_names_dict[name] = [f"{name}_{i}" for i in range(X_batch.shape[2])]

        dfs = []
        for name, X_batch in X_batch_dict.items():
            print(f"\nFeature group: {name}")
            print("  X_batch shape:", X_batch.shape)
            print("  First sequence in batch (first  steps):\n", X_batch[0][:])

            batch_size_, seq_len, feature_dim = X_batch.shape
            df_part = pd.DataFrame(
                X_batch.reshape(batch_size_ * seq_len, feature_dim).numpy(),
                columns=feature_names_dict[name]
            )
            dfs.append(df_part)

        # Combine all feature groups horizontally
        global df_seq
        df_seq = pd.concat(dfs, axis=1)
        print("\n✅ Combined df_seq shape:", df_seq.shape)
        print("✅ Column names in df_seq:", df_seq.columns.tolist())

    # --- Early stopping --- #
    if early_stop == True:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
        early_stop_callback = EarlyStopping(
            monitor="val_loss",   # metric to monitor (must be logged in your LightningModule)
            patience=10,          # number of epochs with no improvement before stopping
            min_delta=0.001,      # minimum improvement to qualify as "better"
            mode="min",           # "min" for loss, "max" for accuracy
            verbose=True
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=model_out_dir,
            filename="best_model",
            save_top_k=1,
            monitor="val_loss",
            mode="min"
        )
        callbacks=[early_stop_callback,checkpoint_callback]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        fast_dev_run=test_mode,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
        callbacks= callbacks if early_stop else None
    )

    trainer.fit(model, train_loader, val_loader)

    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "max_len_y": max_len_y,
    "feature_cols": feature_cols,
    "scalers": pipeline.scalers,
    "pipeline_config": pipeline.export_config(),
    "model_class_info": model_class_info   # ✅ save model class info
}, meta_out)
        
    # --- Evaluation --- #
    if do_validation:
        mse, mae, acc, f1 = evaluate_model_mdn(model, val_loader)
        if return_val_accuracy:
            return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}
        
if __name__ == "__main__":
    train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/seq_line_labels.csv",
        save_model=True,
        do_validation=True,
        test_mode = False
    )


### hungarian lstm

In [None]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
from preprocess.multi_regression_seq_dif3 import preprocess_sequences_csv_multilines
# from models.LSTM.lstm_multi_line_reg_seq_dif import LSTMMultiRegressor
from utils.print_batch import print_batch
from utils.to_address import to_address
from utils.json_to_csv import json_to_csv_in_memory
from utils.padding_batch_reg import collate_batch
import pandas as pd
import io
import numpy as np
import os
from sklearn.metrics import accuracy_score, f1_score
from add_ons.feature_pipeline5 import FeaturePipeline
from add_ons.drop_column import drop_columns
from add_ons.candle_dif_rate_of_change_percentage2 import add_candle_rocp
from add_ons.candle_proportion import add_candle_proportions
from add_ons.candle_rate_of_change import add_candle_ratios
from utils.make_step import make_step

# ---------------- Evaluation ---------------- #
def evaluate_model(model, val_loader):
    model.eval()
    all_preds_reg, all_labels_reg = [], []

    with torch.no_grad():
        for X_batch, y_batch, lengths in val_loader:
            device = next(model.parameters()).device
            X_batch = {k: v.to(device) for k, v in X_batch.items()}
            y_batch = y_batch.to(device)
            lengths = lengths.to(device)

            # Forward pass: regression only
            y_pred = model(X_batch, lengths)

            mask = (y_batch != 0).float()

            # --- Hungarian assignment per batch ---
            batch_preds = []
            batch_labels = []
            for i in range(y_batch.shape[0]):
                gt_vals = y_batch[i][mask[i] > 0]  # true targets
                preds = y_pred[i]

                if len(gt_vals) == 0:
                    continue

                cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
                row_ind, col_ind = linear_sum_assignment(cost.cpu().numpy())

                matched_preds = preds[col_ind].cpu().numpy()
                matched_labels = gt_vals[row_ind].cpu().numpy()

                batch_preds.extend(matched_preds.tolist())
                batch_labels.extend(matched_labels.tolist())

            all_preds_reg.extend(batch_preds)
            all_labels_reg.extend(batch_labels)

    # Convert to arrays
    all_preds_reg = np.array(all_preds_reg)
    all_labels_reg = np.array(all_labels_reg)

    # Regression metrics
    mse = ((all_preds_reg - all_labels_reg) ** 2).mean()
    mae = np.abs(all_preds_reg - all_labels_reg).mean()

    print("\n📊 Validation Metrics (Hungarian matched):")
    print(f"  Regression → MSE: {mse:.6f}, MAE: {mae:.6f}")

    return {"mse": mse, "mae": mae}


# ---------------- Train ---------------- #
def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    hidden_dim=128,
    num_layers=1,
    lr=0.001,
    batch_size=50,
    max_epochs=300,
    save_model=True,
    return_val_accuracy = True,
    test_mode = True,
    early_stop = False
):

    pipeline = FeaturePipeline(
        steps=[
            # make_step(add_label_normalized_candles),
            make_step(add_candle_rocp),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
            
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "robust", "body": "standard", "lower_shadow": "standard",
        #         "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
            False, 
          False, 
        #   True
                ]
    )
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_multireg_multihead_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_multireg_multihead_{timestamp}.pkl"

    # Preprocess: pad linePrices and sequences
    if do_validation:
        train_ds, val_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline
        )
    else:
        train_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=False,
            debug_sample=False
        )
        val_ds = None

    sample = train_ds[0][0]  # first sample's features
    if isinstance(sample, dict):  # multiple feature groups
        input_dim = sample['main'].shape[1]
    else:  # single tensor
        input_dim = sample.shape[1]

    model = LSTMMultiRegressor(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        max_len_y=max_len_y,
        lr=lr
    )
    init_args = {
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "max_len_y": max_len_y,
    "lr": lr
}

    model_class_info = {
        "module": model.__class__.__module__,
        "class": model.__class__.__name__,
        "init_args": init_args
    }

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch) if val_ds else None
    # --- Early stopping --- #
    if early_stop == True:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
        early_stop_callback = EarlyStopping(
            monitor="val_loss",   # metric to monitor (must be logged in your LightningModule)
            patience=10,          # number of epochs with no improvement before stopping
            min_delta=0.001,      # minimum improvement to qualify as "better"
            mode="min",           # "min" for loss, "max" for accuracy
            verbose=True
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=model_out_dir,
            filename="best_model",
            save_top_k=1,
            monitor="val_loss",
            mode="min"
        )
        callbacks=[early_stop_callback,checkpoint_callback]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        fast_dev_run=test_mode,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
        callbacks= callbacks if early_stop else None
    )

    trainer.fit(model, train_loader, val_loader)

    # --- Debug / Test mode --- #
    if test_mode:
        save_model = False
        from itertools import islice

        # Try to grab 3rd batch; if not available, take first
        try:
            batch = next(islice(iter(train_loader), 2, 3))
        except StopIteration:
            batch = next(iter(train_loader))

        X_batch_dict, y_batch, lengths = batch

        print("🔍 Debug batch:")
        if isinstance(X_batch_dict, dict):
            print("  Keys in X_batch:", list(X_batch_dict.keys()))
        print("  y_batch shape:", y_batch.shape)
        print("  First label in batch:", y_batch[0])

        # --- Track real column names for each feature group ---
        feature_names_dict = {}
        for name, X_batch in X_batch_dict.items():
            if name == "main":
                # Use actual feature columns after preprocessing
                feature_names_dict[name] = feature_cols
            else:
                # For extra feature groups, fallback to generic names
                feature_names_dict[name] = [f"{name}_{i}" for i in range(X_batch.shape[2])]

        dfs = []
        for name, X_batch in X_batch_dict.items():
            print(f"\nFeature group: {name}")
            print("  X_batch shape:", X_batch.shape)
            print("  First sequence in batch (first  steps):\n", X_batch[0][:])

            batch_size_, seq_len, feature_dim = X_batch.shape
            df_part = pd.DataFrame(
                X_batch.reshape(batch_size_ * seq_len, feature_dim).numpy(),
                columns=feature_names_dict[name]
            )
            dfs.append(df_part)

        # Combine all feature groups horizontally
        global df_seq
        df_seq = pd.concat(dfs, axis=1)
        print("\n✅ Combined df_seq shape:", df_seq.shape)
        print("✅ Column names in df_seq:", df_seq.columns.tolist())

        
    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
            "input_dim": input_dim,
            "hidden_dim": hidden_dim,
            "num_layers": num_layers,
            "max_len_y": max_len_y,
            "feature_cols": feature_cols,
            "scalers": pipeline.scalers,
            "pipeline_config": pipeline.export_config(),
            "model_class_info": model_class_info 
        }, meta_out)
        print(f"✅ Model saved to {model_out}")
        print(f"✅ Meta saved to {meta_out}")


        
    # --- Evaluation --- #
    if do_validation:
        metrics = evaluate_model(model, val_loader)
        if return_val_accuracy:
            return metrics

        
if __name__ == "__main__":
    train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/line_seq_ordered.csv",
        do_validation=True,
        test_mode = False
    )


## ordered

### cnn lstm

In [2]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from datetime import datetime
from preprocess.multi_regression_seq_dif3 import preprocess_sequences_csv_multilines
# from models.LSTM.lstm_multi_line_reg_seq_dif import LSTMMultiRegressor
from utils.make_step import make_step
from utils.padding_batch_reg import collate_batch
from utils.get_init_argumens import get_init_args
import pandas as pd
import io
import numpy as np
import os
from add_ons.drop_columns2 import drop_columns
from add_ons.normalize_candle_seq import add_label_normalized_candles
from add_ons.feature_pipeline5 import FeaturePipeline
from add_ons.candle_dif_rate_of_change_percentage2 import add_candle_rocp
from add_ons.candle_rate_of_change import add_candle_ratios
from add_ons.candle_proportion_simple import add_candle_shape_features
from sklearn.metrics import accuracy_score, f1_score,mean_squared_error,mean_absolute_error
from utils.to_address import to_address
# ---------------- Evaluation ---------------- #
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score
import torch

@torch.no_grad()
def evaluate_model_mdn(model, val_loader, threshold=0.1):
    """
    Evaluate CNN–LSTM–MDN model (multi-head, top-pi selection per line).

    Args
    ----
    model : pl.LightningModule with multi-head MDN forward
    val_loader : DataLoader yielding (X, y, lengths)
    threshold : optional threshold for validity classification

    Returns
    -------
    dict with mse, mae, acc, f1
    """
    model.eval()
    all_preds_reg, all_labels_reg = [], []
    all_preds_len, all_labels_len = [], []

    device = next(model.parameters()).device

    for X_batch, y_batch, lengths in val_loader:
        # Move to device
        if isinstance(X_batch, dict):
            X_batch = {k: v.to(device) for k, v in X_batch.items()}
        else:
            X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        mdn_params = model(X_batch, lengths)

        B, num_lines = y_batch.shape
        y_pred_lines = []

        for i in range(num_lines):
            pi, mu = mdn_params['pi'], mdn_params['mu']  # both (B, n_components)
            
            # Pick component with highest pi per sample
            top_idx = torch.argmax(pi, dim=1, keepdim=True)     # (B,1)
            selected_mu = mu.gather(1, top_idx).squeeze(1)     # (B,)

            # Mask padded targets
            mask = (y_batch[:, i] != 0)
            selected_mu[~mask] = 0.0

            y_pred_lines.append(selected_mu)

        y_pred_all = torch.stack(y_pred_lines, dim=1)  # (B, num_lines)

        # Last valid step per sample
        y_len = (y_batch > 0).sum(dim=1)
        idx = torch.clamp(y_len - 1, min=0)
        y_true = y_batch[torch.arange(B), idx]
        y_pred = y_pred_all[torch.arange(B), idx]

        all_preds_reg.append(y_pred.cpu().numpy())
        all_labels_reg.append(y_true.cpu().numpy())

        # --- Validity classification ---
        pred_valid_lines = []
        for i in range(num_lines):
            pi = mdn_params['pi']    # (B, n_components)
            top_idx = torch.argmax(pi, dim=1, keepdim=True)
            pi_max = pi.gather(1, top_idx).squeeze(1)
            pred_valid_lines.append((pi_max > threshold).long())

        pred_valid_all = torch.stack(pred_valid_lines, dim=1)
        pred_valid_last = pred_valid_all[torch.arange(B), idx]
        true_valid_last = torch.ones_like(pred_valid_last)

        all_preds_len.extend(pred_valid_last.cpu().numpy().tolist())
        all_labels_len.extend(true_valid_last.cpu().numpy().tolist())

    # Concatenate all batches
    y_pred_reg = np.concatenate(all_preds_reg)
    y_true_reg = np.concatenate(all_labels_reg)

    mse = mean_squared_error(y_true_reg, y_pred_reg)
    mae = mean_absolute_error(y_true_reg, y_pred_reg)
    acc = accuracy_score(all_labels_len, all_preds_len)
    f1 = f1_score(all_labels_len, all_preds_len)

    print("mse:", mse, "mae:", mae, "acc:", acc, "f1:", f1)
    return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}

# ---------------- Train ---------------- #
def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=False,
    hidden_dim=32,
    num_layers=1,
    lr=0.001,
    feature_eng=15,
    n_components=9,
    dropout = 0.1,
    batch_size=2,
    max_epochs=600,
    save_model=False,
    return_val_accuracy = True,
    test_mode = False,
    early_stop = False
):

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_multireg_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_multireg_{timestamp}.pkl"

    pipeline = FeaturePipeline(
        steps=[
            # make_step(add_label_normalized_candles),
            make_step(add_candle_rocp),
            make_step(add_candle_shape_features),
            make_step(add_label_normalized_candles),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
            
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "standard", "body": "standard", "lower_shadow": "standard",
        #         # "open_dif":"standard","close_dif":"standard","high_dif":"standard","low_dif":"standard"
        #         # "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         # "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
            False, 
          False, 
          True,
          True
                ]
    )
    # Preprocess: pad linePrices and sequences

    if do_validation:
        train_ds, val_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline,
            preserve_order= True
        )
    else:
        train_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline,
            preserve_order= True,
        )
        val_ds = None
    print("features",feature_cols)
    sample = train_ds[0][0]  # first sample's features
    if isinstance(sample, dict):  # multiple feature groups
        input_dim = sample['main'].shape[1]
    else:  # single tensor
        input_dim = sample.shape[1]

    model = cnn_lstm(input_dim=input_dim, feature_eng= feature_eng, hidden_dim=hidden_dim, 
                     n_components=n_components,  lr=lr, dropout=dropout,num_lines=max_len_y)
    init_args = get_init_args(model, input_dim=input_dim,feature_eng= feature_eng
                              ,hidden_dim=hidden_dim, n_components=n_components,
                              lr=lr, dropout=dropout,num_lines=max_len_y)
    model_class_info = {
        "module": model.__class__.__module__,
        "class": model.__class__.__name__,
        "init_args": init_args
    }

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch) if val_ds else None
    
    # --- Debug / Test mode --- #
    if test_mode:
        save_model = False
        from itertools import islice

        # Try to grab 3rd batch; if not available, take first
        try:
            batch = next(islice(iter(train_loader), 2, 3))
        except StopIteration:
            batch = next(iter(train_loader))

        X_batch_dict, y_batch, lengths = batch

        print("🔍 Debug batch:")
        if isinstance(X_batch_dict, dict):
            print("  Keys in X_batch:", list(X_batch_dict.keys()))
        print("  y_batch shape:", y_batch.shape)
        print("  First label in batch:", y_batch[0])

        # --- Track real column names for each feature group ---
        feature_names_dict = {}
        for name, X_batch in X_batch_dict.items():
            if name == "main":
                # Use actual feature columns after preprocessing
                feature_names_dict[name] = feature_cols
            else:
                # For extra feature groups, fallback to generic names
                feature_names_dict[name] = [f"{name}_{i}" for i in range(X_batch.shape[2])]

        dfs = []
        for name, X_batch in X_batch_dict.items():
            print(f"\nFeature group: {name}")
            print("  X_batch shape:", X_batch.shape)
            print("  First sequence in batch (first  steps):\n", X_batch[0][:])

            batch_size_, seq_len, feature_dim = X_batch.shape
            df_part = pd.DataFrame(
                X_batch.reshape(batch_size_ * seq_len, feature_dim).numpy(),
                columns=feature_names_dict[name]
            )
            dfs.append(df_part)

        # Combine all feature groups horizontally
        global df_seq
        df_seq = pd.concat(dfs, axis=1)
        print("\n✅ Combined df_seq shape:", df_seq.shape)
        print("✅ Column names in df_seq:", df_seq.columns.tolist())

    # --- Early stopping --- #
    if early_stop == True:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
        early_stop_callback = EarlyStopping(
            monitor="val_loss",   # metric to monitor (must be logged in your LightningModule)
            patience=10,          # number of epochs with no improvement before stopping
            min_delta=0.001,      # minimum improvement to qualify as "better"
            mode="min",           # "min" for loss, "max" for accuracy
            verbose=True
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=model_out_dir,
            filename="best_model",
            save_top_k=1,
            monitor="val_loss",
            mode="min"
        )
        callbacks=[early_stop_callback,checkpoint_callback]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        fast_dev_run=test_mode,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
        callbacks= callbacks if early_stop else None
    )

    trainer.fit(model, train_loader, val_loader)

    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "max_len_y": max_len_y,
    "feature_cols": feature_cols,
    "scalers": pipeline.scalers,
    "pipeline_config": pipeline.export_config(),
    "model_class_info": model_class_info   # ✅ save model class info
}, meta_out)
        
    # --- Evaluation --- #
    if do_validation:
        mse, mae, acc, f1 = evaluate_model_mdn(model, val_loader)
        if return_val_accuracy:
            return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}
        
if __name__ == "__main__":
    train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/debug_test_seq.csv",
        save_model=True,
        do_validation=False,
        test_mode = True
    )


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision



=== DEBUG SAMPLE CHECK (Torch mode) ===

--- Sequence 0 ---
Label: [1.235186 0.       0.       0.       0.       0.       0.       0.
 0.      ] Encoded (padded): [1.235186 0.       0.       0.       0.       0.       0.       0.
 0.      ]
[main] Shape: (3, 12)
[main] First few rows:
 [[ 0.0334583  -0.0164952  -0.08295181 -0.05172484  0.0091133   0.06722008
   0.05172484  0.3         1.3036697   1.3155504   1.1531377   1.2362376 ]
 [-0.05151443 -0.0062422   0.04603237  0.0048193   0.05244192  0.02449848
   0.00457536  0.7         1.236512    1.3073386   1.2062193   1.2421954 ]
 [ 0.00163378 -0.04961828 -0.31281227 -0.19497368  0.00318     0.17110091
   0.19259259  0.3         1.2385321   1.2424706   0.8288991   1.        ]]

features ['open_dif', 'high_dif', 'low_dif', 'close_dif', 'upper_shadow', 'lower_shadow', 'body', 'color', 'open_prop', 'high_prop', 'low_prop', 'close_prop']
🔍 Debug batch:
  Keys in X_batch: ['main']
  y_batch shape: torch.Size([2, 9])
  First label in batch: t


  | Name          | Type      | Params | Mode 
----------------------------------------------------
0 | fc1           | Linear    | 195    | train
1 | ln1           | LayerNorm | 30     | train
2 | k1            | Conv1d    | 240    | train
3 | k3            | Conv1d    | 690    | train
4 | fusion_conv2d | Conv2d    | 3      | train
5 | lstm          | LSTM      | 6.3 K  | train
6 | mdn_head      | Linear    | 891    | train
----------------------------------------------------
8.3 K     Trainable params
0         Non-trainable params
8.3 K     Total params
0.033     Total estimated model params size (MB)
7         Modules in train mode
0         Modules in eval mode
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.


### cnn transforemer

In [3]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from datetime import datetime
from preprocess.multi_regression_seq_dif2 import preprocess_sequences_csv_multilines
# from models.LSTM.lstm_multi_line_reg_seq_dif import LSTMMultiRegressor
from utils.make_step import make_step
from utils.padding_batch_reg import collate_batch
from utils.get_init_argumens import get_init_args
import pandas as pd
import io
import numpy as np
import os
from add_ons.drop_column import drop_columns
from add_ons.normalize_candle_seq import add_label_normalized_candles
from add_ons.feature_pipeline4 import FeaturePipeline
from add_ons.candle_dif_rate_of_change_percentage2 import add_candle_rocp
from add_ons.candle_rate_of_change import add_candle_ratios
from sklearn.metrics import accuracy_score, f1_score,mean_squared_error,mean_absolute_error
from utils.to_address import to_address
# ---------------- Evaluation ---------------- #
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, f1_score
import torch

@torch.no_grad()
def evaluate_model_mdn(model, val_loader, threshold=0.1):
    """
    Evaluate CNN–LSTM–MDN model (multi-head, top-pi selection per line).

    Args
    ----
    model : pl.LightningModule with multi-head MDN forward
    val_loader : DataLoader yielding (X, y, lengths)
    threshold : optional threshold for validity classification

    Returns
    -------
    dict with mse, mae, acc, f1
    """
    model.eval()
    all_preds_reg, all_labels_reg = [], []
    all_preds_len, all_labels_len = [], []

    device = next(model.parameters()).device

    for X_batch, y_batch, lengths in val_loader:
        # Move to device
        if isinstance(X_batch, dict):
            X_batch = {k: v.to(device) for k, v in X_batch.items()}
        else:
            X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        mdn_params = model(X_batch, lengths)

        B, num_lines = y_batch.shape
        y_pred_lines = []

        for i in range(num_lines):
            pi, mu = mdn_params['pi'], mdn_params['mu']  # both (B, n_components)
            
            # Pick component with highest pi per sample
            top_idx = torch.argmax(pi, dim=1, keepdim=True)     # (B,1)
            selected_mu = mu.gather(1, top_idx).squeeze(1)     # (B,)

            # Mask padded targets
            mask = (y_batch[:, i] != 0)
            selected_mu[~mask] = 0.0

            y_pred_lines.append(selected_mu)

        y_pred_all = torch.stack(y_pred_lines, dim=1)  # (B, num_lines)

        # Last valid step per sample
        y_len = (y_batch > 0).sum(dim=1)
        idx = torch.clamp(y_len - 1, min=0)
        y_true = y_batch[torch.arange(B), idx]
        y_pred = y_pred_all[torch.arange(B), idx]

        all_preds_reg.append(y_pred.cpu().numpy())
        all_labels_reg.append(y_true.cpu().numpy())

        # --- Validity classification ---
        pred_valid_lines = []
        for i in range(num_lines):
            pi = mdn_params['pi']    # (B, n_components)
            top_idx = torch.argmax(pi, dim=1, keepdim=True)
            pi_max = pi.gather(1, top_idx).squeeze(1)
            pred_valid_lines.append((pi_max > threshold).long())

        pred_valid_all = torch.stack(pred_valid_lines, dim=1)
        pred_valid_last = pred_valid_all[torch.arange(B), idx]
        true_valid_last = torch.ones_like(pred_valid_last)

        all_preds_len.extend(pred_valid_last.cpu().numpy().tolist())
        all_labels_len.extend(true_valid_last.cpu().numpy().tolist())

    # Concatenate all batches
    y_pred_reg = np.concatenate(all_preds_reg)
    y_true_reg = np.concatenate(all_labels_reg)

    mse = mean_squared_error(y_true_reg, y_pred_reg)
    mae = mean_absolute_error(y_true_reg, y_pred_reg)
    acc = accuracy_score(all_labels_len, all_preds_len)
    f1 = f1_score(all_labels_len, all_preds_len)

    print("mse:", mse, "mae:", mae, "acc:", acc, "f1:", f1)
    return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}

# ---------------- Train ---------------- #
def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    hidden_dim=32,
    num_layers=1,
    lr=0.001,
    batch_size=32,
    max_epochs=500,
    save_model=False,
    return_val_accuracy = True,
    test_mode = False,
    early_stop = False
):

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_multireg_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_multireg_{timestamp}.pkl"

    pipeline = FeaturePipeline(
        steps=[
            # make_step(add_label_normalized_candles),
            make_step(add_candle_rocp),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
            
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "robust", "body": "standard", "lower_shadow": "standard",
        #         "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
            False, 
          False, 
        #   True
                ]
    )
    # Preprocess: pad linePrices and sequences

    if do_validation:
        train_ds, val_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline,
            preserve_order= True
        )
    else:
        train_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=False,
            debug_sample=False,
            feature_pipeline=pipeline,
            preserve_order= True
        )
        val_ds = None

    sample = train_ds[0][0]  # first sample's features
    if isinstance(sample, dict):  # multiple feature groups
        input_dim = sample['main'].shape[1]
    else:  # single tensor
        input_dim = sample.shape[1]

    model = cnn_transformer(input_dim, feature_eng=15, hidden_dim=32, n_components=9, num_lines=9, lr=1e-3, dropout=0.1
    )
    init_args = get_init_args(model, input_dim=input_dim,num_lines= max_len_y )

    model_class_info = {
        "module": model.__class__.__module__,
        "class": model.__class__.__name__,
        "init_args": init_args
    }

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch) if val_ds else None
    
    # --- Debug / Test mode --- #
    if test_mode:
        save_model = False
        from itertools import islice

        # Try to grab 3rd batch; if not available, take first
        try:
            batch = next(islice(iter(train_loader), 2, 3))
        except StopIteration:
            batch = next(iter(train_loader))

        X_batch_dict, y_batch, lengths = batch

        print("🔍 Debug batch:")
        if isinstance(X_batch_dict, dict):
            print("  Keys in X_batch:", list(X_batch_dict.keys()))
        print("  y_batch shape:", y_batch.shape)
        print("  First label in batch:", y_batch[0])

        # --- Track real column names for each feature group ---
        feature_names_dict = {}
        for name, X_batch in X_batch_dict.items():
            if name == "main":
                # Use actual feature columns after preprocessing
                feature_names_dict[name] = feature_cols
            else:
                # For extra feature groups, fallback to generic names
                feature_names_dict[name] = [f"{name}_{i}" for i in range(X_batch.shape[2])]

        dfs = []
        for name, X_batch in X_batch_dict.items():
            print(f"\nFeature group: {name}")
            print("  X_batch shape:", X_batch.shape)
            print("  First sequence in batch (first  steps):\n", X_batch[0][:])

            batch_size_, seq_len, feature_dim = X_batch.shape
            df_part = pd.DataFrame(
                X_batch.reshape(batch_size_ * seq_len, feature_dim).numpy(),
                columns=feature_names_dict[name]
            )
            dfs.append(df_part)

        # Combine all feature groups horizontally
        global df_seq
        df_seq = pd.concat(dfs, axis=1)
        print("\n✅ Combined df_seq shape:", df_seq.shape)
        print("✅ Column names in df_seq:", df_seq.columns.tolist())

    # --- Early stopping --- #
    if early_stop == True:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
        early_stop_callback = EarlyStopping(
            monitor="val_loss",   # metric to monitor (must be logged in your LightningModule)
            patience=10,          # number of epochs with no improvement before stopping
            min_delta=0.001,      # minimum improvement to qualify as "better"
            mode="min",           # "min" for loss, "max" for accuracy
            verbose=True
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=model_out_dir,
            filename="best_model",
            save_top_k=1,
            monitor="val_loss",
            mode="min"
        )
        callbacks=[early_stop_callback,checkpoint_callback]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        fast_dev_run=test_mode,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
        callbacks= callbacks if early_stop else None
    )

    trainer.fit(model, train_loader, val_loader)

    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "max_len_y": max_len_y,
    "feature_cols": feature_cols,
    "scalers": pipeline.scalers,
    "pipeline_config": pipeline.export_config(),
    "model_class_info": model_class_info   # ✅ save model class info
}, meta_out)
        
    # --- Evaluation --- #
    if do_validation:
        mse, mae, acc, f1 = evaluate_model_mdn(model, val_loader)
        if return_val_accuracy:
            return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}
        
if __name__ == "__main__":
    train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/line_seq_ordered.csv",
        save_model=True,
        do_validation=True,
        test_mode = False
    )



=== DEBUG SAMPLE CHECK (Torch mode) ===

--- Sequence 0 ---
Label: [1.143628 0.       0.       0.       0.       0.       0.       0.
 0.      ] Encoded (padded): [1.143628 0.       0.       0.       0.       0.       0.       0.
 0.      ]
Shape: (5, 4)
First few rows of sequence:
 [[ 0.01562355 -0.00180042 -0.01639293  0.0093857 ]
 [ 0.00938704  0.12409948  0.04899828  0.12622231]
 [ 0.12622082 -0.00192766  0.09665822  0.00645032]
 [ 0.00645032 -0.00251821 -0.02505807 -0.05388233]
 [-0.04985064 -0.0454773  -0.17924407 -0.07724382]]



NameError: name 'cnn_transformer' is not defined

### two head lstm

In [4]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
from preprocess.multi_regression_seq_dif3 import preprocess_sequences_csv_multilines
# from models.LSTM.lstm_multi_line_reg_seq_dif import LSTMMultiRegressor
from utils.print_batch import print_batch
from utils.to_address import to_address
from utils.json_to_csv import json_to_csv_in_memory
from utils.padding_batch_reg import collate_batch
import pandas as pd
import io
import numpy as np
import os
from sklearn.metrics import accuracy_score, f1_score
from add_ons.feature_pipeline5 import FeaturePipeline
from add_ons.drop_column import drop_columns
from add_ons.candle_dif_rate_of_change_percentage2 import add_candle_rocp
from add_ons.candle_proportion import add_candle_proportions
from add_ons.candle_rate_of_change import add_candle_ratios
from utils.make_step import make_step

# ---------------- Evaluation ---------------- #
def evaluate_model(model, val_loader, threshold=0.5):
    model.eval()
    all_preds_reg, all_labels_reg = [], []
    all_preds_len, all_labels_len = [], []

    with torch.no_grad():
        for X_batch, y_batch, lengths in val_loader:
            # Send to same device as model
            device = next(model.parameters()).device
            X_batch = {k: v.to(device) for k, v in X_batch.items()}
            y_batch = y_batch.to(device)
            lengths = lengths.to(device)

            # Forward pass: regression + length logits
            y_pred, len_logits = model(X_batch, lengths)

            # Regression targets
            all_preds_reg.append(y_pred.cpu().numpy())
            all_labels_reg.append(y_batch.cpu().numpy())

            # Length targets
            true_lengths = lengths.cpu().numpy()
            pred_lengths = model.predict_length(len_logits).cpu().numpy()

            all_labels_len.extend(true_lengths.tolist())
            all_preds_len.extend(pred_lengths.tolist())

    # ----- Regression metrics -----
    all_preds_reg = np.vstack(all_preds_reg)
    all_labels_reg = np.vstack(all_labels_reg)

    mse = ((all_preds_reg - all_labels_reg) ** 2).mean()
    mae = np.abs(all_preds_reg - all_labels_reg).mean()

    # ----- Length metrics -----


    acc = accuracy_score(all_labels_len, all_preds_len)
    f1 = f1_score(all_labels_len, all_preds_len, average="macro")

    print("\n📊 Validation Metrics:")
    print(f"  Regression → MSE: {mse:.6f}, MAE: {mae:.6f}")
    print(f"  Length     → Acc: {acc:.4f}, F1: {f1:.4f}")

    return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}


# ---------------- Train ---------------- #
def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    hidden_dim=128,
    num_layers=1,
    lr=0.001,
    batch_size=32,
    max_epochs=50,
    save_model=True,
    return_val_accuracy = True,
    test_mode = True,
    early_stop = False
):

    pipeline = FeaturePipeline(
        steps=[
            # make_step(add_label_normalized_candles),
            make_step(add_candle_rocp),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
            
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "robust", "body": "standard", "lower_shadow": "standard",
        #         "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
            False, 
          False, 
        #   True
                ]
    )
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_multireg_multihead_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_multireg_multihead_{timestamp}.pkl"

    # Preprocess: pad linePrices and sequences
    if do_validation:
        train_ds, val_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline
        )
    else:
        train_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=False,
            debug_sample=False
        )
        val_ds = None

    sample = train_ds[0][0]  # first sample's features
    if isinstance(sample, dict):  # multiple feature groups
        input_dim = sample['main'].shape[1]
    else:  # single tensor
        input_dim = sample.shape[1]

    model = LSTMMultiRegressor(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        max_len_y=max_len_y,
        lr=lr
    )
    init_args = {
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "max_len_y": max_len_y,
    "lr": lr
}

    model_class_info = {
        "module": model.__class__.__module__,
        "class": model.__class__.__name__,
        "init_args": init_args
    }

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch) if val_ds else None
    # --- Early stopping --- #
    if early_stop == True:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
        early_stop_callback = EarlyStopping(
            monitor="val_loss",   # metric to monitor (must be logged in your LightningModule)
            patience=10,          # number of epochs with no improvement before stopping
            min_delta=0.001,      # minimum improvement to qualify as "better"
            mode="min",           # "min" for loss, "max" for accuracy
            verbose=True
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=model_out_dir,
            filename="best_model",
            save_top_k=1,
            monitor="val_loss",
            mode="min"
        )
        callbacks=[early_stop_callback,checkpoint_callback]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        fast_dev_run=test_mode,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
        callbacks= callbacks if early_stop else None
    )

    trainer.fit(model, train_loader, val_loader)

    # --- Debug / Test mode --- #
    if test_mode:
        save_model = False
        from itertools import islice

        # Try to grab 3rd batch; if not available, take first
        try:
            batch = next(islice(iter(train_loader), 2, 3))
        except StopIteration:
            batch = next(iter(train_loader))

        X_batch_dict, y_batch, lengths = batch

        print("🔍 Debug batch:")
        if isinstance(X_batch_dict, dict):
            print("  Keys in X_batch:", list(X_batch_dict.keys()))
        print("  y_batch shape:", y_batch.shape)
        print("  First label in batch:", y_batch[0])

        # --- Track real column names for each feature group ---
        feature_names_dict = {}
        for name, X_batch in X_batch_dict.items():
            if name == "main":
                # Use actual feature columns after preprocessing
                feature_names_dict[name] = feature_cols
            else:
                # For extra feature groups, fallback to generic names
                feature_names_dict[name] = [f"{name}_{i}" for i in range(X_batch.shape[2])]

        dfs = []
        for name, X_batch in X_batch_dict.items():
            print(f"\nFeature group: {name}")
            print("  X_batch shape:", X_batch.shape)
            print("  First sequence in batch (first  steps):\n", X_batch[0][:])

            batch_size_, seq_len, feature_dim = X_batch.shape
            df_part = pd.DataFrame(
                X_batch.reshape(batch_size_ * seq_len, feature_dim).numpy(),
                columns=feature_names_dict[name]
            )
            dfs.append(df_part)

        # Combine all feature groups horizontally
        global df_seq
        df_seq = pd.concat(dfs, axis=1)
        print("\n✅ Combined df_seq shape:", df_seq.shape)
        print("✅ Column names in df_seq:", df_seq.columns.tolist())

        
    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
            "input_dim": input_dim,
            "hidden_dim": hidden_dim,
            "num_layers": num_layers,
            "max_len_y": max_len_y,
            "feature_cols": feature_cols,
            "scalers": pipeline.scalers,
            "pipeline_config": pipeline.export_config(),
            "model_class_info": model_class_info 
        }, meta_out)
        print(f"✅ Model saved to {model_out}")
        print(f"✅ Meta saved to {meta_out}")


        
    # --- Evaluation --- #
    if do_validation:
        mse, mae, acc, f1 = evaluate_model(model, val_loader)
        if return_val_accuracy:
            return {"mse": mse, "mae": mae, "acc": acc, "f1": f1}
        
if __name__ == "__main__":
    train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/seq_line_labels.csv",
        do_validation=True,
        test_mode = False
    )


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | lstm        | LSTM              | 68.6 K | train
1 | fc_reg      | Linear            | 774    | train
2 | fc_len      | Linear            | 774    | train
3 | loss_fn_reg | MSELoss           | 0      | train
4 | loss_fn_len | BCEWithLogitsLoss | 0      | train
----------------------------------------------------------
70.


=== DEBUG SAMPLE CHECK (Torch mode) ===

--- Sequence 0 ---
Label: [1.086008 1.126277 1.165107 0.970955 0.       0.      ] Encoded (padded): [1.086008 1.126277 1.165107 0.970955 0.       0.      ]
[main] Shape: (5, 4)
[main] First few rows:
 [[ 0.00645032 -0.00251821 -0.02505807 -0.05388233]
 [-0.04985064 -0.0454773  -0.17924407 -0.07724382]
 [-0.08115927 -0.05037893  0.09358804 -0.03372177]
 [-0.03365467 -0.03511871 -0.06278902  0.03521458]
 [ 0.03742796  0.00087057 -0.13184595 -0.11191386]]



/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=50` reached.


✅ Model saved to models/saved_models/lstm_model_multireg_multihead_20250913_143827.pt
✅ Meta saved to models/saved_models/lstm_meta_multireg_multihead_20250913_143827.pkl

📊 Validation Metrics:
  Regression → MSE: 0.454865, MAE: 0.510467
  Length     → Acc: 0.0667, F1: 0.0096


### xgboost two head

In [None]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import joblib
from datetime import datetime
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import io
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from utils.make_step import make_step
from preprocess.multi_regression_seq_dif2 import preprocess_sequences_csv_multilines
from add_ons.drop_column import drop_columns
from add_ons.feature_pipeline4 import FeaturePipeline
from add_ons.normalize_candle_seq import add_label_normalized_candles
from add_ons.candle_dif_rate_of_change_percentage2 import add_candle_rocp
from add_ons.candle_rate_of_change import add_candle_ratios
# ---------------- Evaluation ---------------- #
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import warnings

def evaluate_model(model, length_model, X_val, y_val, true_lengths, return_sequences=False):
    """
    Evaluate multi-output regression with predicted sequence lengths.
    Permutation-invariant: sorts both predictions and true values before computing metrics.
    Can optionally return the predicted vs true sequences for inspection.
    """
    y_pred_full = model.predict(X_val)
    pred_lengths = np.round(length_model.predict(X_val)).astype(int)

    print("\n📊 Validation Report (Multi-Regression with variable-length sequences):")
    mse_list, mae_list, r2_list = [], [], []

    pred_vs_true_list = []  # store predicted vs true sequences if needed

    for i, (pred, pred_len, true_y, true_len) in enumerate(zip(y_pred_full, pred_lengths, y_val, true_lengths)):
        L = min(pred_len, true_len)
        pred_trunc = np.sort(pred[:L])       # sort predictions for permutation-invariant metrics
        true_trunc = np.sort(true_y[:L])     # sort true values

        mse = mean_squared_error(true_trunc, pred_trunc)
        mae = mean_absolute_error(true_trunc, pred_trunc)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                r2 = r2_score(true_trunc, pred_trunc)
            except ValueError:
                r2 = np.nan

        mse_list.append(mse)
        mae_list.append(mae)
        r2_list.append(r2)

        print(f"\nSample {i}:")
        print(f"  Predicted length: {pred_len}, True length: {true_len}")
        print(f"  MSE: {mse:.6f}, MAE: {mae:.6f}, R²: {r2:.6f}")
        print(f"  Predicted lines: {pred_trunc}")
        print(f"  True lines     : {true_trunc}")

        if return_sequences:
            pred_vs_true_list.append((pred_trunc, true_trunc))

    print("\n--- Global Scores ---")
    print(f"Mean MSE: {np.mean(mse_list):.6f}")
    print(f"Mean MAE: {np.mean(mae_list):.6f}")
    print(f"Mean R²: {np.nanmean(r2_list):.6f}")

    results = {"mse": np.mean(mse_list), "mae": np.mean(mae_list), "r2": np.nanmean(r2_list)}
    
    if return_sequences:
        results["pred_vs_true"] = pred_vs_true_list
    
    return results

# ---------------- Train ---------------- #
def train_model_xgb_multireg(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    n_estimators=1000,
    max_depth=16,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    save_model=False,
    return_val_metrics=True,
    **model_params
):
    """
    Train a multi-output XGBoost regressor with a linked sequence-length predictor.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/xgb_model_multireg_{timestamp}.pkl"
    length_model_out = f"{model_out_dir}/xgb_model_seq_len_{timestamp}.pkl"
    meta_out = f"{model_out_dir}/xgb_meta_multireg_{timestamp}.pkl"

    pipeline = FeaturePipeline(
        steps=[
            # make_step(add_label_normalized_candles),
            make_step(add_candle_rocp),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
            
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "robust", "body": "standard", "lower_shadow": "standard",
        #         "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
            False, 
          False, 
        #   True
                ]
    )
    # --- Preprocess data ---
    if do_validation:
        X_train, y_train, X_val, y_val, df, feature_cols, max_len_y, seq_lengths_true = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=True,
            debug_sample=True,
            feature_pipeline=pipeline
        )
    else:
        X_train, y_train, df, feature_cols, max_len_y, seq_lengths_true = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=True,
            feature_pipeline=pipeline
        )
        X_val, y_val = None, None


    # --- Sequence length targets ---
    if do_validation:
        idx_train, idx_val = train_test_split(
            np.arange(len(seq_lengths_true)),
            test_size=0.2,  # match your preprocess split
            random_state=42
        )
        train_lengths = np.array(seq_lengths_true)[idx_train]
        val_lengths   = np.array(seq_lengths_true)[idx_val]
    else:
        train_lengths = np.array(seq_lengths_true)

    # --- Train max-line regression ---
    xgb_model = xgb.XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective="reg:squarederror",
        **model_params
    )
    model = MultiOutputRegressor(xgb_model, n_jobs=-1)
    model.fit(X_train, y_train)

    # --- Train length predictor ---
    xgb_len_model = xgb.XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        objective="reg:squarederror",
        **model_params
    )
    xgb_len_model.fit(X_train, train_lengths)


    # --- Save models ---
    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        
        # Save trained models
        joblib.dump(model, model_out)
        joblib.dump(xgb_len_model, length_model_out)
        
        # Save full metadata
        meta_dict = {
            "feature_cols": feature_cols,
            "target_dim": max_len_y,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "learning_rate": learning_rate,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "model_params": model_params,
            "scalers": pipeline.scalers,
            "pipeline_config": pipeline.export_config(),
            "multioutput_wrapper": {
                "class": model.__class__.__name__,
                "module": model.__class__.__module__,
            }
        }
        joblib.dump(meta_dict, meta_out)
        
        print(f"✅ Model saved to {model_out}")
        print(f"✅ Length predictor saved to {length_model_out}")
        print(f"✅ Metadata saved to {meta_out}")
    # --- Evaluate ---
    val_metrics = None
    if do_validation:
        metrics = evaluate_model(model, xgb_len_model, X_val, y_val, val_lengths, return_sequences=True)


    if return_val_metrics:
        return val_metrics

# ---------------- Main ---------------- #
if __name__ == "__main__":
    train_model_xgb_multireg(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/line_seq_ordered.csv",
        do_validation=True,
        save_model=False
    )


### Hungarian lstm

In [None]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
from preprocess.multi_regression_seq_dif3 import preprocess_sequences_csv_multilines
# from models.LSTM.lstm_multi_line_reg_seq_dif import LSTMMultiRegressor
from utils.print_batch import print_batch
from utils.to_address import to_address
from utils.json_to_csv import json_to_csv_in_memory
from utils.padding_batch_reg import collate_batch
import pandas as pd
import io
import numpy as np
import os
from sklearn.metrics import accuracy_score, f1_score
from add_ons.feature_pipeline5 import FeaturePipeline
from add_ons.drop_column import drop_columns
from add_ons.candle_dif_rate_of_change_percentage2 import add_candle_rocp
from add_ons.candle_proportion import add_candle_proportions
from add_ons.candle_rate_of_change import add_candle_ratios
from utils.make_step import make_step

# ---------------- Evaluation ---------------- #
def evaluate_model(model, val_loader):
    model.eval()
    all_preds_reg, all_labels_reg = [], []

    with torch.no_grad():
        for X_batch, y_batch, lengths in val_loader:
            device = next(model.parameters()).device
            X_batch = {k: v.to(device) for k, v in X_batch.items()}
            y_batch = y_batch.to(device)
            lengths = lengths.to(device)

            # Forward pass: regression only
            y_pred = model(X_batch, lengths)

            mask = (y_batch != 0).float()

            # --- Hungarian assignment per batch ---
            batch_preds = []
            batch_labels = []
            #y_batch.shape[0] is batch actually
            for i in range(y_batch.shape[0]):
                gt_vals = y_batch[i][mask[i] > 0]  # true targets
                preds = y_pred[i]

                if len(gt_vals) == 0:
                    continue

                cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
                row_ind, col_ind = linear_sum_assignment(cost.cpu().numpy())

                matched_preds = preds[col_ind].cpu().numpy()
                matched_labels = gt_vals[row_ind].cpu().numpy()

                batch_preds.extend(matched_preds.tolist())
                batch_labels.extend(matched_labels.tolist())

            all_preds_reg.extend(batch_preds)
            all_labels_reg.extend(batch_labels)

    # Convert to arrays
    all_preds_reg = np.array(all_preds_reg)
    all_labels_reg = np.array(all_labels_reg)

    # Regression metrics
    mse = ((all_preds_reg - all_labels_reg) ** 2).mean()
    mae = np.abs(all_preds_reg - all_labels_reg).mean()

    print("\n📊 Validation Metrics (Hungarian matched):")
    print(f"  Regression → MSE: {mse:.6f}, MAE: {mae:.6f}")

    return {"mse": mse, "mae": mae}


# ---------------- Train ---------------- #
def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    hidden_dim=30,
    num_layers=1,
    lr=0.001,
    batch_size=50,
    max_epochs=100,
    save_model=True,
    return_val_accuracy = True,
    test_mode = True,
    early_stop = False
):

    pipeline = FeaturePipeline(
        steps=[
            # make_step(add_label_normalized_candles),
            make_step(add_candle_rocp),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
            
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "robust", "body": "standard", "lower_shadow": "standard",
        #         "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
            False, 
          False, 
        #   True
                ]
    )
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_multireg_multihead_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_multireg_multihead_{timestamp}.pkl"

    # Preprocess: pad linePrices and sequences
    if do_validation:
        train_ds, val_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline,
            preserve_order= True
        )
    else:
        train_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=False,
            debug_sample=False,
            preserve_order= True
        )
        val_ds = None

    sample = train_ds[0][0]  # first sample's features
    if isinstance(sample, dict):  # multiple feature groups
        input_dim = sample['main'].shape[1]
    else:  # single tensor
        input_dim = sample.shape[1]

    model = LSTMMultiRegressor(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        max_len_y=max_len_y,
        lr=lr
    )
    init_args = {
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "max_len_y": max_len_y,
    "lr": lr
}

    model_class_info = {
        "module": model.__class__.__module__,
        "class": model.__class__.__name__,
        "init_args": init_args
    }

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch) if val_ds else None
    # --- Early stopping --- #
    if early_stop == True:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
        early_stop_callback = EarlyStopping(
            monitor="val_loss",   # metric to monitor (must be logged in your LightningModule)
            patience=10,          # number of epochs with no improvement before stopping
            min_delta=0.001,      # minimum improvement to qualify as "better"
            mode="min",           # "min" for loss, "max" for accuracy
            verbose=True
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=model_out_dir,
            filename="best_model",
            save_top_k=1,
            monitor="val_loss",
            mode="min"
        )
        callbacks=[early_stop_callback,checkpoint_callback]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        fast_dev_run=test_mode,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
        callbacks= callbacks if early_stop else None
    )

    trainer.fit(model, train_loader, val_loader)

    # --- Debug / Test mode --- #
    if test_mode:
        save_model = False
        from itertools import islice

        # Try to grab 3rd batch; if not available, take first
        try:
            batch = next(islice(iter(train_loader), 2, 3))
        except StopIteration:
            batch = next(iter(train_loader))

        X_batch_dict, y_batch, lengths = batch

        print("🔍 Debug batch:")
        if isinstance(X_batch_dict, dict):
            print("  Keys in X_batch:", list(X_batch_dict.keys()))
        print("  y_batch shape:", y_batch.shape)
        print("  First label in batch:", y_batch[0])

        # --- Track real column names for each feature group ---
        feature_names_dict = {}
        for name, X_batch in X_batch_dict.items():
            if name == "main":
                # Use actual feature columns after preprocessing
                feature_names_dict[name] = feature_cols
            else:
                # For extra feature groups, fallback to generic names
                feature_names_dict[name] = [f"{name}_{i}" for i in range(X_batch.shape[2])]

        dfs = []
        for name, X_batch in X_batch_dict.items():
            print(f"\nFeature group: {name}")
            print("  X_batch shape:", X_batch.shape)
            print("  First sequence in batch (first  steps):\n", X_batch[0][:])

            batch_size_, seq_len, feature_dim = X_batch.shape
            df_part = pd.DataFrame(
                X_batch.reshape(batch_size_ * seq_len, feature_dim).numpy(),
                columns=feature_names_dict[name]
            )
            dfs.append(df_part)

        # Combine all feature groups horizontally
        global df_seq
        df_seq = pd.concat(dfs, axis=1)
        print("\n✅ Combined df_seq shape:", df_seq.shape)
        print("✅ Column names in df_seq:", df_seq.columns.tolist())

        
    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
            "input_dim": input_dim,
            "hidden_dim": hidden_dim,
            "num_layers": num_layers,
            "max_len_y": max_len_y,
            "feature_cols": feature_cols,
            "scalers": pipeline.scalers,
            "pipeline_config": pipeline.export_config(),
            "model_class_info": model_class_info 
        }, meta_out)
        print(f"✅ Model saved to {model_out}")
        print(f"✅ Meta saved to {meta_out}")


        
    # --- Evaluation --- #
    if do_validation:
        metrics = evaluate_model(model, val_loader)
        if return_val_accuracy:
            return metrics

        
if __name__ == "__main__":
    train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/line_seq_ordered.csv",
        do_validation=True,
        test_mode = False
    )


### Hungarian CNN-attention lstm weighted

In [6]:
import sys
# from pathlib import Path

# # Current notebook location
# notebook_path = Path().resolve()

# # Add parent folder (meta/) to sys.path
# sys.path.append(str(notebook_path.parent))
import joblib
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime
from preprocess.multi_regression_seq_dif3 import preprocess_sequences_csv_multilines
# from models.LSTM.lstm_multi_line_reg_seq_dif import LSTMMultiRegressor
from utils.print_batch import print_batch
from utils.to_address import to_address
from utils.json_to_csv import json_to_csv_in_memory
from utils.padding_batch_reg import collate_batch
import pandas as pd
import io
import numpy as np
import os
import pickle
from sklearn.metrics import accuracy_score, f1_score
from add_ons.feature_pipeline5 import FeaturePipeline
from add_ons.drop_columns2 import drop_columns
from add_ons.candle_dif_rate_of_change_percentage2 import add_candle_rocp
from add_ons.candle_proportion import add_candle_proportions
from add_ons.candle_rate_of_change import add_candle_ratios
from add_ons.candle_proportion_simple import add_candle_shape_features
from add_ons.normalize_candle_seq import add_label_normalized_candles
from utils.make_step import make_step
from scipy.optimize import linear_sum_assignment

# ---------------- Evaluation ---------------- #
def evaluate_model(model, val_loader):
    model.eval()
    all_preds_reg, all_labels_reg = [], []

    with torch.no_grad():
        for X_batch, y_batch, lengths in val_loader:
            device = next(model.parameters()).device
            X_batch = {k: v.to(device) for k, v in X_batch.items()}
            y_batch = y_batch.to(device)
            lengths = lengths.to(device)

            # Forward pass: regression only
            y_pred = model(X_batch, lengths)

            mask = (y_batch != 0).float()

            # --- Hungarian assignment per batch ---
            batch_preds = []
            batch_labels = []
            #y_batch.shape[0] is batch actually
            for i in range(y_batch.shape[0]):
                gt_vals = y_batch[i][mask[i] > 0]  # true targets
                preds = y_pred[i]

                if len(gt_vals) == 0:
                    continue

                cost = torch.cdist(gt_vals.unsqueeze(1), preds.unsqueeze(1), p=2).pow(2)
                row_ind, col_ind = linear_sum_assignment(cost.cpu().numpy())

                matched_preds = preds[col_ind].cpu().numpy()
                matched_labels = gt_vals[row_ind].cpu().numpy()

                batch_preds.extend(matched_preds.tolist())
                batch_labels.extend(matched_labels.tolist())

            all_preds_reg.extend(batch_preds)
            all_labels_reg.extend(batch_labels)

    # Convert to arrays
    all_preds_reg = np.array(all_preds_reg)
    all_labels_reg = np.array(all_labels_reg)

    # Regression metrics
    mse = ((all_preds_reg - all_labels_reg) ** 2).mean()
    mae = np.abs(all_preds_reg - all_labels_reg).mean()

    print("\n📊 Validation Metrics (Hungarian matched):")
    print(f"  Regression → MSE: {mse:.6f}, MAE: {mae:.6f}")

    return {"mse": mse, "mae": mae}


# ---------------- Train ---------------- #
def train_model(
    data_csv,
    labels_csv,
    model_out_dir="models/saved_models",
    do_validation=True,
    hidden_dim=30,
    num_layers=1,
    lr=0.001,
    batch_size=50,
    max_epochs=500,
    save_model=True,
    return_val_accuracy = True,
    test_mode = False,
    early_stop = False,
    attention_name = "tanh_attention",
    optimizer_name= "adamw",
    kernels = [3,5,7,11],
    cnn_out_channels =32,
    first_drop = 0.3,
    second_drop = 0.3,
    third_drop= 0.3,
    scheduler_name = "reduce_on_plateau",
    optimizer_params={"weight_decay": 0.01},
    scheduler_params={"factor": 0.2, "patience": 3} 
):

    pipeline = FeaturePipeline(
        steps=[
            make_step(add_candle_shape_features, seperatable = "complete", dict_name = "candle_shape"),
            # make_step(add_candle_rocp),
            make_step(add_label_normalized_candles),
            make_step(drop_columns, cols_to_drop=["open","high","low","close","volume"]),
        ],
        # norm_methods={
        #     "main": {
        #         "upper_shadow": "robust", "body": "standard", "lower_shadow": "standard",
        #         "upper_body_ratio": "standard", "lower_body_ratio": "standard",
        #         "upper_lower_body_ratio": "standard", "Candle_Color": "standard"
        #     }
        # },
        per_window_flags=[
        False, 
        True, 
        True
                ]
    )
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_out = f"{model_out_dir}/lstm_model_multireg_multihead_{timestamp}.pt"
    meta_out  = f"{model_out_dir}/lstm_meta_multireg_multihead_{timestamp}.pkl"

    # Preprocess: pad linePrices and sequences
    if do_validation:
        train_ds, val_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=True,
            for_xgboost=False,
            debug_sample=True,
            feature_pipeline=pipeline,
            preserve_order= True
        )
    else:
        train_ds, df, feature_cols, max_len_y = preprocess_sequences_csv_multilines(
            data_csv, labels_csv,
            val_split=False,
            for_xgboost=False,
            debug_sample=False,
            preserve_order= True,
            feature_pipeline=pipeline,
        )
        val_ds = None

    sample = train_ds[0][0]  # first sample's features
    if isinstance(sample, dict):
        # build a dict of input_dims for all feature groups
        input_dim = {k: v.shape[1] for k, v in sample.items()}
    else:
        # single tensor → wrap into dict with a default key
        input_dim = {"main": sample.shape[1]}

    model = CNNAttentionLSTMMultiRegressor(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        max_len_y=max_len_y,
        lr=lr,
        attention_name = attention_name,
        optimizer_name= optimizer_name,
        kernels = kernels,
        cnn_out_channels =cnn_out_channels,
        first_drop = first_drop,
        second_drop = second_drop,
        third_drop = third_drop,
        scheduler_name = scheduler_name,
        optimizer_params= optimizer_params,
        scheduler_params= scheduler_params 
    )
    init_args = {
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "num_layers": num_layers,
    "max_len_y": max_len_y,
    "lr": lr,
    "attention_name" : attention_name,
    "optimizer_name": optimizer_name,
    "kernels" : kernels,
    "cnn_out_channels" :cnn_out_channels,
    "first_drop" : first_drop,
    "second_drop" : second_drop,
    "third_drop": third_drop,
    "scheduler_name" : scheduler_name,
    "optimizer_params":optimizer_params,
    "scheduler_params":scheduler_params
}

    model_class_info = {
        "module": model.__class__.__module__ ,
        "class": model.__class__.__name__ ,
        "init_args": init_args
    }
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    val_loader = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_batch) if val_ds else None
    # --- Early stopping --- #
    if early_stop == True:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
        early_stop_callback = EarlyStopping(
            monitor="val_loss",   # metric to monitor (must be logged in your LightningModule)
            patience=10,          # number of epochs with no improvement before stopping
            min_delta=0.001,      # minimum improvement to qualify as "better"
            mode="min",           # "min" for loss, "max" for accuracy
            verbose=True
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=model_out_dir,
            filename="best_model",
            save_top_k=1,
            monitor="val_loss",
            mode="min"
        )
        callbacks=[early_stop_callback,checkpoint_callback]

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator="auto",
        devices=1,
        fast_dev_run=test_mode,
        gradient_clip_val=1.0,
        gradient_clip_algorithm="norm",
        callbacks= callbacks if early_stop else None
    )

    trainer.fit(model, train_loader, val_loader)

    # --- Debug / Test mode --- #
    if test_mode:
        save_model = False
        from itertools import islice

        # Try to grab 3rd batch; if not available, take first
        try:
            batch = next(islice(iter(train_loader), 2, 3))
        except StopIteration:
            batch = next(iter(train_loader))

        X_batch_dict, y_batch, lengths = batch

        print("🔍 Debug batch:")
        if isinstance(X_batch_dict, dict):
            print("  Keys in X_batch:", list(X_batch_dict.keys()))
        print("  y_batch shape:", y_batch.shape)
        print("  First label in batch:", y_batch[0])

        # --- Track real column names for each feature group ---
        feature_names_dict = {}
        for name, X_batch in X_batch_dict.items():
            if name == "main":
                # Use actual feature columns after preprocessing
                feature_names_dict[name] = feature_cols
            else:
                # For extra feature groups, fallback to generic names
                feature_names_dict[name] = [f"{name}_{i}" for i in range(X_batch.shape[2])]

        dfs = []
        for name, X_batch in X_batch_dict.items():
            print(f"\nFeature group: {name}")
            print("  X_batch shape:", X_batch.shape)
            print("  First sequence in batch (first  steps):\n", X_batch[0][:])

            batch_size_, seq_len, feature_dim = X_batch.shape
            df_part = pd.DataFrame(
                X_batch.reshape(batch_size_ * seq_len, feature_dim).numpy(),
                columns=feature_names_dict[name]
            )
            dfs.append(df_part)

        # Combine all feature groups horizontally
        global df_seq
        df_seq = pd.concat(dfs, axis=1)
        print("\n✅ Combined df_seq shape:", df_seq.shape)
        print("✅ Column names in df_seq:", df_seq.columns.tolist())

    if save_model:
        os.makedirs(model_out_dir, exist_ok=True)
        trainer.save_checkpoint(model_out)
        joblib.dump({
            "input_dim": input_dim,
            "hidden_dim": hidden_dim,
            "num_layers": num_layers,
            "max_len_y": max_len_y,
            "feature_cols": feature_cols,
            "scalers": pipeline.scalers,
            "pipeline_config": pipeline.export_config(),
            "model_class_info": model_class_info,
            "target_scalers": pipeline.target_scalers, 
        }, meta_out)
        print(f"✅ Model saved to {model_out}")
        print(f"✅ Meta saved to {meta_out}")

        
    # --- Evaluation --- #
    if do_validation:
        metrics = evaluate_model(model, val_loader)
        if return_val_accuracy:
            return {"accuracy": metrics["mse"] * (-1)}

        
if __name__ == "__main__":
    train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/line_seq_ordered.csv",
        do_validation=True,
        test_mode = True
    )


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type          | Params | Mode 
--------------------------------------------------------
0 | loss_fn_reg   | MSELoss       | 0      | train
1 | attention     | TanhAttention | 960    | train
2 | branches      | ModuleList    | 3.7 K  | train
3 | fusion_conv2d | Sequential    | 1.3 K  | train
4 | lstm          | LSTM          | 5.5 K  | train
5 | regressor     | Sequential    | 609    | train
--------------------------------------------------------
12.1 K    Trainable params
0         Non-trai


=== DEBUG SAMPLE CHECK (Torch mode) ===

--- Sequence 0 ---
Label: [ 0.        -0.8797163 -1.5644624  0.         0.         0.
  0.         0.         0.       ] Encoded (padded): [ 0.        -0.8797163 -1.5644624  0.         0.         0.
  0.         0.         0.       ]
[main] Shape: (5, 4)
[main] First few rows:
 [[0.8082308  0.81475425 0.75175154 0.7888969 ]
 [0.7890243  0.912331   0.7600072  0.8652578 ]
 [0.8661357  0.9025476  0.8342968  0.8796678 ]
 [0.8796678  0.90092266 0.82062024 0.88792413]
 [0.88792527 1.0127267  0.86082923 1.        ]]
[candle_shape] Shape: (5, 4)
[candle_shape] First few rows:
 [[0.00807125 0.0470852  0.02392123 0.3       ]
 [0.05440368 0.03677583 0.08810496 0.7       ]
 [0.02600957 0.0367597  0.01538321 0.7       ]
 [0.01463923 0.06712486 0.00929843 0.7       ]
 [0.01272671 0.03051616 0.11207467 0.7       ]]



/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=1` reached.


🔍 Debug batch:
  Keys in X_batch: ['main', 'candle_shape']
  y_batch shape: torch.Size([50, 9])
  First label in batch: tensor([ 0.4956, -0.0894,  0.0000, -0.5792,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000])

Feature group: main
  X_batch shape: torch.Size([50, 32, 4])
  First sequence in batch (first  steps):
 tensor([[1.0445, 1.0667, 0.9537, 0.9824],
        [0.9816, 1.0644, 0.9717, 1.0553],
        [1.0547, 1.1151, 1.0503, 1.0598],
        [1.0592, 1.0695, 0.9651, 0.9876],
        [0.9877, 1.0345, 0.9158, 0.9283],
        [0.9287, 0.9873, 0.9055, 0.9586],
        [0.9586, 0.9932, 0.8848, 0.9173],
        [0.9173, 0.9317, 0.8775, 0.9074],
        [0.9074, 0.9882, 0.8847, 0.9768],
        [0.9765, 1.0285, 0.9576, 1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000],
     

In [7]:
df_seq = df_seq.loc[~(df_seq==0).all(axis=1)]
df_seq

Unnamed: 0,open_prop,high_prop,low_prop,close_prop,candle_shape_0,candle_shape_1,candle_shape_2,candle_shape_3
0,1.044518,1.066700,0.953729,0.982438,0.021237,0.029223,0.059434,0.3
1,0.981642,1.064430,0.971737,1.055252,0.008697,0.010090,0.069756,0.7
2,1.054695,1.115145,1.050293,1.059793,0.052229,0.004173,0.004810,0.7
3,1.059232,1.069542,0.965083,0.987564,0.009733,0.022764,0.067661,0.3
4,0.987698,1.034478,0.915788,0.928281,0.047363,0.013458,0.060157,0.3
...,...,...,...,...,...,...,...,...
1573,0.929585,0.995679,0.921836,0.931931,0.068404,0.008336,0.002518,0.7
1574,0.931369,1.048423,0.925005,1.030514,0.017379,0.006833,0.096209,0.7
1575,1.031084,1.053510,0.957832,0.976830,0.021751,0.019449,0.052618,0.3
1576,0.977235,1.012543,0.963175,1.004886,0.007620,0.014387,0.027516,0.7


# server

## MDN server

### cnn lstm

In [None]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
import glob
import joblib
import torch
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, render_template
from servers.pre_process.multi_reg_dif_seq2 import ServerPreprocess, import_class, build_pipeline_from_config
# from models.LSTM.cnn_lstm_mdn import CNNLSTM_MDN  # <-- your updated "last-output" model

app = Flask(__name__)

# ---------------- Load model and meta ----------------
meta_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_meta_multireg_*.pkl")[0]
state_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_model_multireg*.pt")[0]

meta = joblib.load(meta_path)
FEATURES = meta['feature_cols']
print("features",FEATURES)
# ---------------- Model ----------------
# Reconstruct model class
#for python file:
# model_cls_info = meta["model_class_info"]
# ModelClass = import_class(model_cls_info["module"], model_cls_info["class"])
model_cls_info = meta["model_class_info"]
ModelClass = cnn_lstm
# Initialize model with original args
model = ModelClass(**model_cls_info["init_args"])
model = cnn_lstm.load_from_checkpoint(state_path)
model.eval()

# ---------------- Load data ----------------
df = pd.read_csv( "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv", parse_dates=['timestamp'])

# ---------------- Setup pipeline ----------------
pipeline = build_pipeline_from_config(meta["pipeline_config"])
pipeline.scalers = meta["scalers"]

# Stateful preprocessing instance
preproc = ServerPreprocess(feature_pipeline=pipeline)


# ---------------- Routes ----------------
@app.route("/")
def home():
    return render_template("sequential.html")


@app.route("/get_and_add_data")
def get_and_add_data():
    dense = df.set_index('timestamp').asfreq('D').ffill()
    initial_seq_len = 21
    next_idx = request.args.get("idx", type=int)
    if next_idx is None:
        # First call → load initial candles
        if len(preproc.dataset) == 0:
            for _, row in dense.iloc[:initial_seq_len].iterrows():
                preproc.add_candle(row)
        candles = [
            {'time': int(ts.timestamp()),
             'open': float(row.open),
             'high': float(row.high),
             'low': float(row.low),
             'close': float(row.close)}
            for ts, row in dense.iloc[:initial_seq_len].iterrows()
        ]
        print("Returning initial candles:", candles)

        return jsonify({
            "initial_seq_len": initial_seq_len,
            "next_idx": initial_seq_len,
            "candles": candles
        })
    else:
        # Subsequent calls → 1 candle
        if next_idx >= len(dense):
            print("Reached end of data at index:", next_idx)
            return jsonify({"error": "End of data"}), 404

        row = dense.iloc[next_idx]
        candle = {
            'time': int(row.name.timestamp()),
            'open': float(row.open),
            'high': float(row.high),
            'low': float(row.low),
            'close': float(row.close)
        }

        # ✅ Add to preproc automatically
        preproc.add_candle(row)

        return jsonify({
            "next_idx": next_idx + 1,
            "candle": candle
        })


@app.route("/predict", methods=['POST'])
def predict():
    data = request.get_json(force=True)
    seq_len = data.get("seq_len")

    if not seq_len or not isinstance(seq_len, int):
        return jsonify({"error": "Provide 'seq_len' as an int"}), 400

    try:
        # prepare subsequence from current state
        seq_dict = preproc.prepare_seq(seq_len)  # returns dict of DataFrames
    except ValueError as e:
        return jsonify({"error": str(e)}), 400

    # Convert dict of DataFrames to dict of tensors
    dict_x = {k: torch.from_numpy(v.values.astype(np.float32)).unsqueeze(0)
            for k, v in seq_dict.items()}


    with torch.no_grad():
        mdn_out = model(dict_x)

    pi    = mdn_out['pi'][0].cpu().numpy()
    mu    = mdn_out['mu'][0].cpu().numpy()
    sigma = mdn_out['sigma'][0].cpu().numpy()
    last_close = preproc.reference_dataset.iloc[-1]['close']

    return jsonify({
        'pred_prices': (last_close * mu).tolist(),
        'pred_sigmas': (last_close * sigma).tolist(),
        'pi': pi.tolist()
    })


if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)


## lstm two head

In [None]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
from pathlib import Path
import glob
import joblib
import torch
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, render_template
from servers.pre_process.multi_reg_dif_seq2 import ServerPreprocess, import_class, build_pipeline_from_config
# from models.LSTM.two_head_lstm import LSTMMultiRegressor  # your new model

app = Flask(__name__)

# ---------------- Load model and meta ----------------
meta_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_meta_multireg_multihead_*.pkl")[0]
state_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_model_multireg_multihead_*.pt")[0]


meta = joblib.load(meta_path)
FEATURES = meta['feature_cols']
print("features", FEATURES)

# Initialize model class
model_cls_info = meta["model_class_info"]
init_args = model_cls_info["init_args"]
model = LSTMMultiRegressor.load_from_checkpoint(state_path, **init_args)
model.eval()


# ---------------- Load data ----------------
df = pd.read_csv("/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv", parse_dates=['timestamp'])

# ---------------- Setup pipeline ----------------
pipeline = build_pipeline_from_config(meta["pipeline_config"])
pipeline.scalers = meta["scalers"]
preproc = ServerPreprocess(feature_pipeline=pipeline)

# ---------------- Routes ----------------
@app.route("/")
def home():
    return render_template("two_head.html")


@app.route("/get_and_add_data")
def get_and_add_data():
    dense = df.set_index('timestamp').asfreq('D').ffill()
    initial_seq_len = 21
    next_idx = request.args.get("idx", type=int)

    if next_idx is None:
        if len(preproc.dataset) == 0:
            for _, row in dense.iloc[:initial_seq_len].iterrows():
                preproc.add_candle(row)
        candles = [{'time': int(ts.timestamp()),
                    'open': float(row.open),
                    'high': float(row.high),
                    'low': float(row.low),
                    'close': float(row.close)}
                   for ts, row in dense.iloc[:initial_seq_len].iterrows()]
        return jsonify({
            "initial_seq_len": initial_seq_len,
            "next_idx": initial_seq_len,
            "candles": candles
        })
    else:
        if next_idx >= len(dense):
            return jsonify({"error": "End of data"}), 404
        row = dense.iloc[next_idx]
        candle = {'time': int(row.name.timestamp()),
                  'open': float(row.open),
                  'high': float(row.high),
                  'low': float(row.low),
                  'close': float(row.close)}
        preproc.add_candle(row)
        return jsonify({"next_idx": next_idx + 1, "candle": candle})


@app.route("/predict", methods=['POST'])
def predict():
    data = request.get_json(force=True)
    seq_len = data.get("seq_len")

    if not seq_len or not isinstance(seq_len, int):
        return jsonify({"error": "Provide 'seq_len' as an int"}), 400

    try:
        seq_dict = preproc.prepare_seq(seq_len)
    except ValueError as e:
        return jsonify({"error": str(e)}), 400

    # Convert dict of DataFrames to dict of tensors
    dict_x = {k: torch.from_numpy(v.values.astype(np.float32)).unsqueeze(0)
            for k, v in seq_dict.items()}
    print("dict",dict_x)
    lengths = torch.tensor([seq_len], dtype=torch.long)

    with torch.no_grad():
        y_pred, len_logits = model( dict_x, lengths)

    last_close = preproc.reference_dataset.iloc[-1]['close']
    pred_prices = (last_close * y_pred[0]).tolist()
    pred_len = model.predict_length(len_logits).item()

    return jsonify({
        "pred_prices": pred_prices,
        "pred_len": pred_len
    })


if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)


## Hungarian 

In [8]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()

# Add parent folder (meta/) to sys.path
sys.path.append(str(notebook_path.parent))
from pathlib import Path
import glob
import joblib
import torch
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, render_template
from servers.pre_process.multi_reg_dif_seq2 import ServerPreprocess, import_class, build_pipeline_from_config
# from models.LSTM.two_head_lstm import LSTMMultiRegressor  # your new model

app = Flask(__name__)

# ---------------- Load model and meta ----------------
meta_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_meta_multireg_multihead_*.pkl")[0]
state_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_model_multireg_multihead_*.pt")[0]


meta = joblib.load(meta_path)
FEATURES = meta['feature_cols']
print("features", FEATURES)

# Initialize model class
model_cls_info = meta["model_class_info"]
init_args = model_cls_info["init_args"]
model = CNNAttentionLSTMMultiRegressor.load_from_checkpoint(state_path, **init_args)
model.eval()

# ---------------- Load data ----------------
df = pd.read_csv("/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv", parse_dates=['timestamp'])

# ---------------- Setup pipeline ----------------
pipeline = build_pipeline_from_config(meta["pipeline_config"])
pipeline.scalers = meta["scalers"]
preproc = ServerPreprocess(feature_pipeline=pipeline)

# ---------------- Routes ----------------
@app.route("/")
def home():
    return render_template("hungarian.html")


@app.route("/get_and_add_data")
def get_and_add_data():
    dense = df.set_index('timestamp').asfreq('D').ffill()
    initial_seq_len = 21
    next_idx = request.args.get("idx", type=int)

    if next_idx is None:
        if len(preproc.dataset) == 0:
            for _, row in dense.iloc[:initial_seq_len].iterrows():
                preproc.add_candle(row)
        candles = [{'time': int(ts.timestamp()),
                    'open': float(row.open),
                    'high': float(row.high),
                    'low': float(row.low),
                    'close': float(row.close)}
                   for ts, row in dense.iloc[:initial_seq_len].iterrows()]
        return jsonify({
            "initial_seq_len": initial_seq_len,
            "next_idx": initial_seq_len,
            "candles": candles
        })
    else:
        if next_idx >= len(dense):
            return jsonify({"error": "End of data"}), 404
        row = dense.iloc[next_idx]
        candle = {'time': int(row.name.timestamp()),
                  'open': float(row.open),
                  'high': float(row.high),
                  'low': float(row.low),
                  'close': float(row.close)}
        preproc.add_candle(row)
        return jsonify({"next_idx": next_idx + 1, "candle": candle})


@app.route("/predict", methods=['POST'])
def predict():
    data = request.get_json(force=True)
    seq_len = data.get("seq_len")

    if not seq_len or not isinstance(seq_len, int):
        return jsonify({"error": "Provide 'seq_len' as an int"}), 400

    try:
        seq_dict = preproc.prepare_seq(seq_len)
    except ValueError as e:
        return jsonify({"error": str(e)}), 400

    # Convert dict of DataFrames to dict of tensors
    dict_x = {k: torch.from_numpy(v.values.astype(np.float32)).unsqueeze(0)
              for k, v in seq_dict.items()}
    lengths = torch.tensor([seq_len], dtype=torch.long)

    with torch.no_grad():
        y_pred = model(dict_x, lengths)  # only regression head now
    print(y_pred)
    last_close = preproc.reference_dataset.iloc[-1]['close']
    pred_prices = (last_close * y_pred[0]).tolist()

    return jsonify({
        "pred_prices": pred_prices
    })



if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)


features ['open_prop', 'high_prop', 'low_prop', 'close_prop']
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [17/Sep/2025 23:48:54] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:48:55] "GET /get_and_add_data HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:48:55] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [17/Sep/2025 23:48:59] "GET /get_and_add_data?idx=21 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:48:59] "POST /predict HTTP/1.1" 200 -


tensor([[0.8057, 0.5789, 0.9783, 0.7046, 1.1104, 0.9502, 0.1393, 0.8178, 0.6108]])


127.0.0.1 - - [17/Sep/2025 23:49:18] "GET /get_and_add_data?idx=22 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:18] "POST /predict HTTP/1.1" 200 -


tensor([[0.8077, 0.5808, 0.9814, 0.7017, 1.1184, 0.9495, 0.1368, 0.8219, 0.6133]])


127.0.0.1 - - [17/Sep/2025 23:49:03] "GET /get_and_add_data?idx=23 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:03] "POST /predict HTTP/1.1" 200 -


tensor([[0.7915, 0.5632, 0.9664, 0.6838, 1.1046, 0.9332, 0.1326, 0.8123, 0.5991]])


127.0.0.1 - - [17/Sep/2025 23:49:04] "GET /get_and_add_data?idx=24 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:04] "POST /predict HTTP/1.1" 200 -


tensor([[0.7961, 0.5658, 0.9690, 0.6901, 1.1071, 0.9385, 0.1342, 0.8149, 0.6028]])


127.0.0.1 - - [17/Sep/2025 23:49:04] "GET /get_and_add_data?idx=25 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:04] "POST /predict HTTP/1.1" 200 -


tensor([[0.7955, 0.5663, 0.9682, 0.6915, 1.1037, 0.9382, 0.1351, 0.8127, 0.6021]])


127.0.0.1 - - [17/Sep/2025 23:49:04] "GET /get_and_add_data?idx=26 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:04] "POST /predict HTTP/1.1" 200 -


tensor([[0.7915, 0.5631, 0.9653, 0.6863, 1.1030, 0.9340, 0.1333, 0.8112, 0.5989]])


127.0.0.1 - - [17/Sep/2025 23:49:05] "GET /get_and_add_data?idx=27 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:05] "POST /predict HTTP/1.1" 200 -


tensor([[0.7893, 0.5598, 0.9633, 0.6827, 1.1022, 0.9312, 0.1320, 0.8105, 0.5973]])


127.0.0.1 - - [17/Sep/2025 23:49:05] "GET /get_and_add_data?idx=28 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:05] "POST /predict HTTP/1.1" 200 -


tensor([[0.8025, 0.5716, 0.9735, 0.6988, 1.1103, 0.9452, 0.1364, 0.8178, 0.6080]])


127.0.0.1 - - [17/Sep/2025 23:49:05] "GET /get_and_add_data?idx=29 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:05] "POST /predict HTTP/1.1" 200 -


tensor([[0.8135, 0.5851, 0.9841, 0.7118, 1.1187, 0.9567, 0.1397, 0.8232, 0.6179]])


127.0.0.1 - - [17/Sep/2025 23:49:06] "GET /get_and_add_data?idx=30 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:06] "POST /predict HTTP/1.1" 200 -


tensor([[0.8059, 0.5787, 0.9787, 0.7016, 1.1145, 0.9475, 0.1366, 0.8189, 0.6114]])


127.0.0.1 - - [17/Sep/2025 23:49:06] "GET /get_and_add_data?idx=31 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:06] "POST /predict HTTP/1.1" 200 -


tensor([[0.8202, 0.5917, 0.9908, 0.7154, 1.1288, 0.9616, 0.1393, 0.8296, 0.6246]])


127.0.0.1 - - [17/Sep/2025 23:49:07] "GET /get_and_add_data?idx=32 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:07] "POST /predict HTTP/1.1" 200 -


tensor([[0.8078, 0.5809, 0.9803, 0.7056, 1.1135, 0.9507, 0.1394, 0.8198, 0.6125]])


127.0.0.1 - - [17/Sep/2025 23:49:07] "GET /get_and_add_data?idx=33 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:07] "POST /predict HTTP/1.1" 200 -


tensor([[0.7986, 0.5718, 0.9733, 0.6912, 1.1114, 0.9397, 0.1341, 0.8164, 0.6057]])


127.0.0.1 - - [17/Sep/2025 23:49:07] "GET /get_and_add_data?idx=34 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:07] "POST /predict HTTP/1.1" 200 -


tensor([[0.8164, 0.5863, 0.9871, 0.7124, 1.1242, 0.9594, 0.1405, 0.8285, 0.6203]])


127.0.0.1 - - [17/Sep/2025 23:49:07] "GET /get_and_add_data?idx=35 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:07] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "GET /get_and_add_data?idx=36 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "POST /predict HTTP/1.1" 200 -


tensor([[0.8264, 0.6002, 0.9975, 0.7266, 1.1294, 0.9718, 0.1445, 0.8317, 0.6293]])
tensor([[0.7965, 0.5731, 0.9709, 0.6911, 1.1070, 0.9403, 0.1343, 0.8130, 0.6035]])


127.0.0.1 - - [17/Sep/2025 23:49:23] "GET /get_and_add_data?idx=37 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:23] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "GET /get_and_add_data?idx=38 HTTP/1.1" 200 -


tensor([[0.8002, 0.5722, 0.9747, 0.6929, 1.1166, 0.9441, 0.1353, 0.8215, 0.6073]])


127.0.0.1 - - [17/Sep/2025 23:49:08] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "GET /get_and_add_data?idx=39 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "POST /predict HTTP/1.1" 200 -


tensor([[0.7761, 0.5471, 0.9525, 0.6688, 1.0874, 0.9185, 0.1306, 0.8017, 0.5846]])
tensor([[0.7782, 0.5496, 0.9536, 0.6707, 1.0937, 0.9209, 0.1289, 0.8040, 0.5880]])


127.0.0.1 - - [17/Sep/2025 23:49:08] "GET /get_and_add_data?idx=40 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "GET /get_and_add_data?idx=41 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:08] "POST /predict HTTP/1.1" 200 -


tensor([[0.7907, 0.5605, 0.9632, 0.6863, 1.1006, 0.9340, 0.1338, 0.8107, 0.5981]])
tensor([[0.8034, 0.5739, 0.9747, 0.7011, 1.1094, 0.9467, 0.1379, 0.8175, 0.6084]])


127.0.0.1 - - [17/Sep/2025 23:49:09] "GET /get_and_add_data?idx=42 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:09] "GET /get_and_add_data?idx=43 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:09] "POST /predict HTTP/1.1" 200 -


tensor([[0.7840, 0.5570, 0.9597, 0.6774, 1.0972, 0.9260, 0.1311, 0.8063, 0.5928]])
tensor([[0.7993, 0.5684, 0.9715, 0.6934, 1.1105, 0.9417, 0.1352, 0.8179, 0.6055]])


127.0.0.1 - - [17/Sep/2025 23:49:09] "GET /get_and_add_data?idx=44 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:09] "POST /predict HTTP/1.1" 200 -


tensor([[0.7755, 0.5480, 0.9510, 0.6699, 1.0877, 0.9182, 0.1295, 0.7998, 0.5852]])


127.0.0.1 - - [17/Sep/2025 23:49:09] "GET /get_and_add_data?idx=45 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:10] "GET /get_and_add_data?idx=46 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:10] "POST /predict HTTP/1.1" 200 -


tensor([[0.7782, 0.5496, 0.9537, 0.6719, 1.0943, 0.9218, 0.1304, 0.8054, 0.5881]])
tensor([[0.7820, 0.5523, 0.9557, 0.6771, 1.0933, 0.9254, 0.1315, 0.8053, 0.5905]])


127.0.0.1 - - [17/Sep/2025 23:49:10] "GET /get_and_add_data?idx=47 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:10] "POST /predict HTTP/1.1" 200 -


tensor([[0.7757, 0.5478, 0.9510, 0.6698, 1.0895, 0.9185, 0.1293, 0.8011, 0.5857]])


127.0.0.1 - - [17/Sep/2025 23:49:10] "GET /get_and_add_data?idx=48 HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 23:49:10] "POST /predict HTTP/1.1" 200 -


tensor([[0.7993, 0.5686, 0.9710, 0.6952, 1.1091, 0.9428, 0.1363, 0.8174, 0.6055]])


In [None]:
# import sys
# from pathlib import Path

# # Current notebook location
# notebook_path = Path().resolve()

# # Add parent folder (meta/) to sys.path
# sys.path.append(str(notebook_path.parent))
# from pathlib import Path
import pickle
import glob
import joblib
import torch
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, render_template
from servers.pre_process.multi_reg_dif_seq2 import ServerPreprocess, import_class, build_pipeline_from_config
# from models.LSTM.two_head_lstm import LSTMMultiRegressor  # your new model

app = Flask(__name__)

# ---------------- Load model and meta ----------------
meta_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_meta_multireg_multihead_*.pkl")[0]
state_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/lstm_model_multireg_multihead_*.pt")[0]


meta = joblib.load(meta_path)
FEATURES = meta['feature_cols']
print("features", FEATURES)

# Initialize model class
model_cls_info = meta["model_class_info"]
init_args = model_cls_info["init_args"]
model = CNNAttentionLSTMMultiRegressor.load_from_checkpoint(state_path, **init_args)
model.eval()

# ---------------- Load data ----------------
df = pd.read_csv(
    "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
    parse_dates=['timestamp']
)
# print(f"DF loaded, shape: {df.shape}")

# ---------------- Load validation ----------------
val_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/val_dataset_*.pkl")[0]
# print(f"Loading validation dataset from: {val_path}")

with open(val_path, "rb") as f:
    val_data = pickle.load(f)

# print(f"Validation data keys: {list(val_data.keys())}")
# print(f"Number of validation samples: {len(val_data['y'])}")

VAL_SAMPLES = []
for i in range(len(val_data["y"])):
    seq_len = int(val_data["x_lengths"][i])
    time_indices = val_data["time_indices"][i]  # timestamps, not integer indices
    end_ts = pd.to_datetime(time_indices[-1])   # convert last timestamp to pd.Timestamp
    
    # print(f"\nSample {i}:")
    # print(f"  seq_len: {seq_len}")
    # print(f"  time_indices: {time_indices}")
    # print(f"  end_ts: {end_ts}")
    
    # find row index in df corresponding to this timestamp
    matching_rows = df.index[df['timestamp'] == end_ts].tolist()
    # if not matching_rows:
    #     print(f"  WARNING: no matching row in df for timestamp {end_ts}, skipping sample")
    #     continue
    
    row_idx = matching_rows[0]
    end_time = str(df.iloc[row_idx].name)  # timestamp as string
    
    VAL_SAMPLES.append({
        "idx": i,
        "seq_len": seq_len,
        "end_idx": row_idx,
        "end_time": end_time
    })

# print(f"\nTotal valid samples loaded: {len(VAL_SAMPLES)}")


# ---------------- Setup pipeline ----------------
pipeline = build_pipeline_from_config(meta["pipeline_config"])
pipeline.scalers = meta["scalers"]
preproc = ServerPreprocess(feature_pipeline=pipeline)

# ---------------- Routes ----------------
@app.route("/")
def home():
    return render_template("hungarian2.html")

@app.route("/validation_samples")
def validation_samples():
    return jsonify(VAL_SAMPLES)

@app.route("/get_and_add_data")
def get_and_add_data():
    dense = df.set_index('timestamp').asfreq('D').ffill()
    initial_seq_len = 21
    next_idx = request.args.get("idx", type=int)

    if next_idx is None:
        if len(preproc.dataset) == 0:
            for _, row in dense.iloc[:initial_seq_len].iterrows():
                preproc.add_candle(row)
        candles = [{'time': int(ts.timestamp()),
                    'open': float(row.open),
                    'high': float(row.high),
                    'low': float(row.low),
                    'close': float(row.close)}
                   for ts, row in dense.iloc[:initial_seq_len].iterrows()]
        return jsonify({
            "initial_seq_len": initial_seq_len,
            "next_idx": initial_seq_len,
            "candles": candles
        })
    else:
        if next_idx >= len(dense):
            return jsonify({"error": "End of data"}), 404
        row = dense.iloc[next_idx]
        candle = {'time': int(row.name.timestamp()),
                  'open': float(row.open),
                  'high': float(row.high),
                  'low': float(row.low),
                  'close': float(row.close)}
        preproc.add_candle(row)
        return jsonify({"next_idx": next_idx + 1, "candle": candle})


@app.route("/predict", methods=['POST'])
def predict():
    data = request.get_json(force=True)
    seq_len = data.get("seq_len")

    if not seq_len or not isinstance(seq_len, int):
        return jsonify({"error": "Provide 'seq_len' as an int"}), 400

    try:
        seq_dict = preproc.prepare_seq(seq_len)
    except ValueError as e:
        return jsonify({"error": str(e)}), 400

    # Convert dict of DataFrames to dict of tensors
    dict_x = {k: torch.from_numpy(v.values.astype(np.float32)).unsqueeze(0)
              for k, v in seq_dict.items()}
    lengths = torch.tensor([seq_len], dtype=torch.long)

    with torch.no_grad():
        y_pred = model(dict_x, lengths)  # only regression head now
    print(y_pred)
    last_close = preproc.reference_dataset.iloc[-1]['close']
    pred_prices = (last_close * y_pred[0]).tolist()

    return jsonify({
        "pred_prices": pred_prices
    })

@app.route("/validation_test", methods=["POST"])
def validation_test():
    data = request.get_json(force=True)
    sample_idx = data.get("sample_idx")
    
    if sample_idx is None or not (0 <= sample_idx < len(val_data["y"])):
        return jsonify({"error": "Invalid sample_idx"}), 400

    seq_len = int(val_data["x_lengths"][sample_idx])
    raw_ts = int(val_data["time_indices"][sample_idx][-1])  # nanoseconds
    ts = pd.to_datetime(raw_ts)  # Timestamp

    target_y = val_data["y"][sample_idx][:seq_len].tolist()

    print(f"\n--- Validation Test Sample {sample_idx} ---")
    print(f"seq_len: {seq_len}, raw_ts: {raw_ts}")
    print(f"target_y: {target_y}")

    # --- Ensure reference dataset has datetime index ---
    ref = preproc.reference_dataset
    if np.issubdtype(ref.index.dtype, np.integer):
        print("Converting reference_dataset index from int64 to datetime...")
        ref.index = pd.to_datetime(ref.index)

    # --- Make sure requested timestamp exists ---
    if ts not in ref.index:
        print(f"Timestamp {ts} missing, reindexing...")
        freq = pd.infer_freq(ref.index[:10]) or "D"  # auto-detect, fallback daily
        full_index = pd.date_range(start=ref.index.min(), end=ref.index.max(), freq=freq)
        ref = ref.reindex(full_index, method="ffill")
        preproc.reference_dataset = ref

    # --- Prepare sequence ---
    try:
        seq_dict = preproc.prepare_seq_valid(end_idx=ts, seq_len=seq_len)
    except ValueError as e:
        print(f"prepare_seq_valid error: {e}")
        return jsonify({"error": str(e)}), 400

    # Convert to tensors
    dict_x = {}
    for k, v in seq_dict.items():
        arr = v.values.astype(np.float32)
        dict_x[k] = torch.from_numpy(arr).unsqueeze(0)  # add batch dim
        print(f"{k}: shape {arr.shape}, dtype {arr.dtype}")

    lengths = torch.tensor([seq_len], dtype=torch.long)
    print(f"lengths tensor: {lengths}")

    # Model forward
    try:
        with torch.no_grad():
            print(f"Passing dict_x['main'] to model: shape {dict_x['main'].shape}")
            y_pred = model(dict_x, lengths)
            print(f"y_pred shape: {y_pred.shape}")
    except Exception as e:
        print(f"Model forward error: {e}")
        return jsonify({"error": str(e)}), 500

    last_close = preproc.reference_dataset.loc[ts, 'close']
    pred_prices = (last_close * y_pred[0]).tolist()

    return jsonify({
        "sample_idx": sample_idx,
        "end_time": int(ts.timestamp() * 1000),  # unix ms for frontend
        "seq_len": seq_len,
        "pred_prices": pred_prices,
        "target_y": target_y
    })





if __name__ == "__main__":
    app.run(debug=True, use_reloader=False)


features ['open_dif', 'high_dif', 'low_dif', 'close_dif']
 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [17/Sep/2025 01:38:16] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 01:38:16] "GET /get_and_add_data HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 01:38:17] "GET /validation_samples HTTP/1.1" 200 -
127.0.0.1 - - [17/Sep/2025 01:38:19] "POST /validation_test HTTP/1.1" 500 -



--- Validation Test Sample 12 ---
seq_len: 112, raw_ts: 1554595200000000000
target_y: [0.7261959910392761, 0.7998110055923462, 0.6823400259017944, 0.9392110109329224, 0.6541470289230347, 0.0, 0.0, 0.0, 0.0]
Converting reference_dataset index from int64 to datetime...
Timestamp 2019-04-07 00:00:00 missing, reindexing...

--- prepare_seq_valid debug ---
Requested end_idx (raw): 2019-04-07 00:00:00, seq_len: 112
Timestamp 2018-12-17 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-18 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-19 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-20 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-21 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-22 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-23 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-24 00:00:00 missing in dataset → adding candle
Timestamp 2018-12-25 00:00:00 missing in dataset → adding candle
T

In [31]:
import pandas as pd
a = pd.read_csv( "/home/iatell/projects/meta-learning/data/line_seq_ordered_added.csv")
a

Unnamed: 0,startTime,endTime,startIndex,endIndex,linePrice_1,linePrice_2,linePrice_3,linePrice_4,linePrice_5,linePrice_6,linePrice_7,linePrice_8,linePrice_9
0,1514764800,1515110400,0,4,,0.878016,0.788209,,,,,,
1,1514764800,1515283200,0,6,,1.055290,0.923251,0.828937,,,,,
2,1515024000,1515369600,3,7,1.143628,,,,,,,,
3,1514937600,1515456000,2,8,1.139775,,,,,,,,
4,1515110400,1515542400,4,9,1.143279,0.964469,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,1647216000,1648339200,1533,1546,0.873783,0.889793,0.902754,0.847861,0.840999,0.814315,,,
360,1629417600,1630108800,1327,1335,1.001120,1.013533,0.976295,1.031057,0.963152,0.958041,0.944168,,
361,1612742400,1613174400,1134,1139,0.984341,1.000241,1.021441,0.949513,0.930585,1.035826,,,
362,1608940800,1609632000,1090,1098,0.795270,0.875328,0.805007,0.861264,0.783370,,,,


## xgboost two head

In [None]:
import sys
from pathlib import Path

# Current notebook location
notebook_path = Path().resolve()
sys.path.append(str(notebook_path.parent))

import glob
import joblib
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, render_template

from servers.pre_process.multi_reg_dif_seq import ServerPreprocess, build_pipeline_from_config

# ---------------- Flask ----------------
app = Flask(__name__)

# ---------------- Load models + meta ----------------
meta_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/xgb_meta_multireg_*.pkl")[0]
model_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/xgb_model_multireg_*.pkl")[0]
len_model_path = glob.glob("/home/iatell/projects/meta-learning/play_grounds/models/saved_models/xgb_model_seq_len_*.pkl")[0]

meta = joblib.load(meta_path)
FEATURES = meta['feature_cols']
print("features", FEATURES)

# Models
model = joblib.load(model_path)       # MultiOutputRegressor with XGBRegressor inside
len_model = joblib.load(len_model_path)

# ---------------- Load data ----------------
df = pd.read_csv("/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv", parse_dates=['timestamp'])

# ---------------- Setup pipeline ----------------
pipeline = build_pipeline_from_config(meta["pipeline_config"])
pipeline.scalers = meta["scalers"]

# Stateful preprocessing instance
preproc = ServerPreprocess(feature_pipeline=pipeline)


# ---------------- Routes ----------------
@app.route("/")
def home():
    return render_template("xgboost_seq.html")


@app.route("/get_and_add_data")
def get_and_add_data():
    dense = df.set_index('timestamp').asfreq('D').ffill()
    initial_seq_len = 21
    next_idx = request.args.get("idx", type=int)

    if next_idx is None:
        # First call → load initial candles
        if len(preproc.dataset) == 0:
            for _, row in dense.iloc[:initial_seq_len].iterrows():
                preproc.add_candle(row)

        candles = [
            {'time': int(ts.timestamp()),
             'open': float(row.open),
             'high': float(row.high),
             'low': float(row.low),
             'close': float(row.close)}
            for ts, row in dense.iloc[:initial_seq_len].iterrows()
        ]
        return jsonify({
            "initial_seq_len": initial_seq_len,
            "next_idx": initial_seq_len,
            "candles": candles
        })
    else:
        # Subsequent calls → 1 candle
        if next_idx >= len(dense):
            return jsonify({"error": "End of data"}), 404

        row = dense.iloc[next_idx]
        candle = {
            'time': int(row.name.timestamp()),
            'open': float(row.open),
            'high': float(row.high),
            'low': float(row.low),
            'close': float(row.close)
        }

        preproc.add_candle(row)

        return jsonify({
            "next_idx": next_idx + 1,
            "candle": candle
        })


@app.route("/predict", methods=['POST'])
def predict():
    data = request.get_json(force=True)
    seq_len = data.get("seq_len")

    if not seq_len or not isinstance(seq_len, int):
        return jsonify({"error": "Provide 'seq_len' as an int"}), 400

    try:
        # Use your XGBoost + preproc logic
        X_np = preproc.prepare_xgboost_seq(seq_len, model=len_model)
        pred_len = int(np.round(len_model.predict(X_np))[0])
        y_pred_full = model.predict(X_np)[0]
        pred_trunc = np.sort(y_pred_full[:pred_len])
        last_close = preproc.reference_dataset.iloc[-1]['close']
        pred_scaled = (last_close * pred_trunc).tolist()

        return jsonify({
            'pred_length': pred_len,
            'pred_lines': pred_scaled
        })
    except Exception as e:
        # <-- This will print the actual exception in the console
        import traceback
        traceback.print_exc()
        return jsonify({"error": str(e)}), 500





if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)


# Tensorboard

## tensorboard model

In [8]:
import os
import subprocess
import webbrowser

logdir = "lightning_logs"

# 1. Find all version folders
versions = [d for d in os.listdir(logdir) if d.startswith("version_")]
if not versions:
    raise ValueError("No version folders found in lightning_logs")

# 2. Sort numerically and get the latest
versions.sort(key=lambda x: int(x.split("_")[1]))
latest_version = versions[-1]
latest_logdir = os.path.join(logdir, latest_version)
print(f"Launching TensorBoard for: {latest_logdir}")

# 3. Choose a port
port = 6006

# 4. Launch TensorBoard as a background process
subprocess.Popen(["tensorboard", f"--logdir={latest_logdir}", f"--port={port}"])

# 5. Open TensorBoard in default browser
webbrowser.open(f"http://localhost:{port}")


Launching TensorBoard for: lightning_logs/version_167


True

gio: http://localhost:6006: Operation not supported


2025-09-17 14:16:59.892045: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-17 14:16:59.903731: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758106019.917511   11060 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758106019.921464   11060 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1758106019.932187   11060 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

## tensoarboard tuning

In [4]:
import os
import subprocess
import webbrowser

# Base Ray Tune log directory
base_logdir = "/home/iatell/projects/meta-learning/tune_logs"

# 1. Find all experiment folders
experiments = [d for d in os.listdir(base_logdir) if os.path.isdir(os.path.join(base_logdir, d))]
if not experiments:
    raise ValueError("No experiment folders found in tune_logs")

# 2. Sort by modification time and get the latest experiment
experiments.sort(key=lambda x: os.path.getmtime(os.path.join(base_logdir, x)))
latest_experiment = experiments[-1]
latest_logdir = os.path.join(base_logdir, latest_experiment)
print(f"🚀 Launching TensorBoard for: {latest_logdir}")

# 3. Choose a port
port = 6006

# 4. Launch TensorBoard as a background process
subprocess.Popen([
    "tensorboard",
    f"--logdir={latest_logdir}",
    f"--port={port}"
])

# 5. Open TensorBoard in default browser
webbrowser.open(f"http://localhost:{port}")


🚀 Launching TensorBoard for: /home/iatell/projects/meta-learning/tune_logs/cnn_lstm_tuning


2025-09-16 01:18:23.217633: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-16 01:18:23.229407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757972903.243704  158487 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757972903.248104  158487 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757972903.259545  158487 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

True

gio: http://localhost:6006: Operation not supported

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.19.0 at http://localhost:6006/ (Press CTRL+C to quit)


# Tuning

## tuning cnn- attention lstm

In [3]:

import os
import torch
import psutil
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray import air
def resource_usage():
    """Print current CPU, RAM, and GPU usage."""
    cpu = psutil.cpu_percent(interval=0.5)
    ram = psutil.virtual_memory().percent
    usage = f"💻 CPU: {cpu:.1f}% | 🧠 RAM: {ram:.1f}%"
    try:
        import GPUtil
        gpus = GPUtil.getGPUs()
        if gpus:
            usage += f" | 🎮 GPU: {gpus[0].load*100:.1f}% VRAM: {gpus[0].memoryUtil*100:.1f}%"
    except ImportError:
        pass
    print(usage)


def train_cnn_lstm_tune(config):
    """
    Single Ray Tune trial.
    Args:
        config (dict): hyperparameters for this trial.
    """
    resource_usage()  # Show current hardware usage

    # Train using existing train_model function
    metrics = train_model(
        "/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
        "/home/iatell/projects/meta-learning/data/line_seq_ordered_added.csv",
        do_validation=True,
        model_out_dir="models/tuned",
        # seq_len=config["seq_len"],
        hidden_dim=config["hidden_dim"],
        # num_layers=config["num_layers"],
        lr=config["lr"],
        batch_size=config["batch_size"],
        max_epochs=config["max_epochs"],
        return_val_accuracy=True,  # Expects dict with "accuracy" and optionally "loss"
        save_model=False  # Never save during search
    )

    # Report metrics to Ray Tune
    tune.report(metrics)

def run_tuning(save_model=True):
    """Hyperparameter tuning for CNN LSTM with Ray Tune."""

    search_space = {
        # LSTM / model
        "hidden_dim": tune.choice([32, 64, 128, 256]),
        "num_layers": tune.choice([1, 2, 3]),
        "attention_name": tune.choice(["simple_attention", "tanh_attention"]),

        # Learning rate & optimizer
        "lr": tune.loguniform(1e-4, 1e-2),
        "optimizer_name": tune.choice(["adamw", "adam"]),

        # Scheduler
        "scheduler_name": tune.choice(["reduce_on_plateau", "cosine", "onecycle"]),
        "scheduler_params": {
            "factor": tune.loguniform(0.05, 0.5),      # only used for ReduceLROnPlateau
            "patience": tune.choice([2, 3, 5, 7]),     # only used for ReduceLROnPlateau
            "T_max": tune.choice([10, 20, 50]),        # only used for CosineAnnealingLR
            "eta_min": tune.loguniform(1e-6, 1e-4)    # only used for CosineAnnealingLR
        },
        "optimizer_params": {
            "weight_decay": tune.loguniform(1e-5, 1e-2)
        },

        # CNN params
        "kernels": tune.choice([[3,5,7,11], [3,5,7], [3,5]]),
        "cnn_out_channels": tune.choice([16, 32, 64]),
        "first_drop": tune.uniform(0.1, 0.5),
        "second_drop": tune.uniform(0.1, 0.5),
        "third_drop": tune.uniform(0.1, 0.5),

        # Training
        "batch_size": tune.choice([32, 64, 128]),
        "max_epochs": tune.choice([50, 100, 150]),
    }    

    scheduler = ASHAScheduler(
        metric="accuracy",  # must exist in metrics dict from train_model
        mode="max",
        grace_period=1,
        reduction_factor=2
    )

    tuner = tune.Tuner(
        train_cnn_lstm_tune,
        param_space=search_space,
        tune_config=tune.TuneConfig(
            scheduler=scheduler,
            num_samples=10
        ),
    run_config=air.RunConfig(
        name="cnn_lstm_tuning",
        storage_path="/home/iatell/projects/meta-learning/tune_logs",
    ),
    # runtime_env=runtime_env
    )

    results = tuner.fit()

    # Best trial
    best_result = results.get_best_result(metric="accuracy", mode="max")
    print("\n🏆 Best Config:", best_result.config)
    print(f"Best Accuracy: {best_result.metrics['accuracy']:.4f}")

    # Optional: retrain best model on full data and save
    if save_model:
        print("\n🔁 Retraining best model on full dataset for saving...")

        # Map scheduler params depending on scheduler type
        scheduler_name = best_result.config.get("scheduler_name")
        scheduler_params_config = best_result.config.get("scheduler_params", {})

        if scheduler_name == "reduce_on_plateau":
            scheduler_params = {
                "factor": scheduler_params_config.get("factor", 0.2),
                "patience": scheduler_params_config.get("patience", 3)
            }
        elif scheduler_name == "cosine":
            scheduler_params = {
                "T_max": scheduler_params_config.get("T_max", 10),
                "eta_min": scheduler_params_config.get("eta_min", 1e-6)
            }
        else:  # onecycle or others
            scheduler_params = {}

        # Optimizer params
        optimizer_params = best_result.config.get("optimizer_params", {"weight_decay": 0.01})

        train_model(
            data_csv="/home/iatell/projects/meta-learning/data/Bitcoin_BTCUSDT_kaggle_1D_candles.csv",
            labels_csv="/home/iatell/projects/meta-learning/data/line_seq_ordered_added.csv",
            do_validation=True,
            return_val_accuracy=True,
            model_out_dir="models/saved_models",
            hidden_dim=best_result.config.get("hidden_dim", 32),
            num_layers=best_result.config.get("num_layers", 1),
            attention_name=best_result.config.get("attention_name", "tanh_attention"),
            optimizer_name=best_result.config.get("optimizer_name", "adamw"),
            lr=best_result.config.get("lr", 1e-3),
            batch_size=best_result.config.get("batch_size", 32),
            max_epochs=best_result.config.get("max_epochs", 10),
            kernels=best_result.config.get("kernels", [3, 5, 7, 11]),
            cnn_out_channels=best_result.config.get("cnn_out_channels", 32),
            first_drop=best_result.config.get("first_drop", 0.3),
            second_drop=best_result.config.get("second_drop", 0.3),
            third_drop=best_result.config.get("third_drop", 0.3),
            scheduler_name=scheduler_name,
            scheduler_params=scheduler_params,
            optimizer_params=optimizer_params
        )
if __name__ == "__main__":
    run_tuning(save_model=True)


0,1
Current time:,2025-09-16 01:13:15
Running for:,00:02:32.57
Memory:,6.5/15.5 GiB

Trial name,status,loc,attention_name,batch_size,cnn_out_channels,first_drop,hidden_dim,kernels,lr,max_epochs,num_layers,optimizer_name,optimizer_params/wei ght_decay,scheduler_name,scheduler_params/T_m ax,scheduler_params/eta _min,scheduler_params/fac tor,scheduler_params/pat ience,second_drop,third_drop,iter,total time (s),accuracy
train_cnn_lstm_tune_a1399_00000,TERMINATED,172.18.55.78:156016,simple_attention,32,64,0.465832,64,"[3, 5]",0.000856737,100,1,adamw,0.000473473,cosine,50,1.06994e-06,0.489182,5,0.457087,0.334107,1,63.7998,-0.00376912
train_cnn_lstm_tune_a1399_00001,TERMINATED,172.18.55.78:156020,simple_attention,128,64,0.134259,128,"[3, 5, 7, 11]",0.00267099,150,3,adamw,0.000143287,onecycle,20,5.06906e-06,0.159452,7,0.478276,0.124169,1,107.862,-0.00171126
train_cnn_lstm_tune_a1399_00002,TERMINATED,172.18.55.78:156017,tanh_attention,64,16,0.279615,64,"[3, 5, 7, 11]",0.00188269,150,3,adam,0.000105002,cosine,50,1.55418e-05,0.164757,3,0.176765,0.132369,1,81.0614,-0.00212857
train_cnn_lstm_tune_a1399_00003,TERMINATED,172.18.55.78:156014,tanh_attention,64,32,0.201896,256,"[3, 5, 7, 11]",0.00475725,150,1,adamw,6.51102e-05,reduce_on_plateau,10,3.861e-05,0.252568,3,0.314782,0.451033,1,147.619,-0.00156249
train_cnn_lstm_tune_a1399_00004,TERMINATED,172.18.55.78:156018,tanh_attention,32,32,0.272345,32,"[3, 5, 7, 11]",0.00292937,100,2,adamw,0.00418518,cosine,20,8.51017e-06,0.143769,5,0.246152,0.416094,1,59.3197,-0.003694
train_cnn_lstm_tune_a1399_00005,TERMINATED,172.18.55.78:156015,tanh_attention,128,16,0.365547,128,"[3, 5]",0.00682566,100,2,adamw,4.15075e-05,reduce_on_plateau,10,2.38103e-06,0.225908,5,0.260768,0.109157,1,85.0899,-0.0018151
train_cnn_lstm_tune_a1399_00006,TERMINATED,172.18.55.78:156021,tanh_attention,128,16,0.433743,64,"[3, 5]",0.000378752,100,2,adam,1.5505e-05,onecycle,50,1.21986e-05,0.130873,7,0.255645,0.147076,1,65.8118,-0.00548884
train_cnn_lstm_tune_a1399_00007,TERMINATED,172.18.55.78:156022,tanh_attention,32,64,0.391982,64,"[3, 5, 7, 11]",0.00855961,50,2,adamw,0.000110736,onecycle,50,1.53114e-06,0.0515268,3,0.317599,0.479085,1,37.1472,-0.00233827
train_cnn_lstm_tune_a1399_00008,TERMINATED,172.18.55.78:156019,simple_attention,128,64,0.353015,128,"[3, 5, 7, 11]",0.000588154,50,2,adam,0.000870115,cosine,50,1.49388e-05,0.148289,5,0.289063,0.239295,1,56.0168,-0.00315866
train_cnn_lstm_tune_a1399_00009,TERMINATED,172.18.55.78:156013,simple_attention,32,32,0.1977,32,"[3, 5, 7, 11]",0.000236319,50,3,adam,0.000198311,cosine,10,5.4739e-06,0.192097,5,0.168235,0.494452,1,34.1624,-0.00717554


[36m(train_cnn_lstm_tune pid=156016)[0m 💻 CPU: 0.9% | 🧠 RAM: 67.8% | 🎮 GPU: 0.0% VRAM: 1.2%
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m === DEBUG SAMPLE CHECK (Torch mode) ===
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m --- Sequence 0 ---
[36m(train_cnn_lstm_tune pid=156015)[0m Label: [1.143628 0.       0.       0.       0.       0.       0.       0.
[36m(train_cnn_lstm_tune pid=156015)[0m  0.      ] Encoded (padded): [1.143628 0.       0.       0.       0.       0.       0.       0.
[36m(train_cnn_lstm_tune pid=156015)[0m  0.      ]
[36m(train_cnn_lstm_tune pid=156015)[0m [main] Shape: (5, 4)
[36m(train_cnn_lstm_tune pid=156015)[0m [main] First few rows:
[36m(train_cnn_lstm_tune pid=156015)[0m  [[ 0.01562355 -0.00180042 -0.01639293  0.0093857 ]
[36m(train_cnn_lstm_tune pid=156015)[0m  [ 0.00938704  0.12409948  0.04899828  0.12622231]
[36m(train_cnn_lstm_tune pid=156015)[0m  [ 0.12622082

[36m(train_cnn_lstm_tune pid=156016)[0m 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
[36m(train_cnn_lstm_tune pid=156016)[0m GPU available: False, used: False
[36m(train_cnn_lstm_tune pid=156016)[0m TPU available: False, using: 0 TPU cores
[36m(train_cnn_lstm_tune pid=156016)[0m HPU available: False, using: 0 HPUs
[36m(train_cnn_lstm_tune pid=156016)[0m 2025-09-16 01:10:49.264842: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[36m(train_cnn_lstm_tune pid=156016)[0m 2025-09-16 01:10:49.331359: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch 0:   0%|          | 0/10 [00:00<?, ?it/s]                            
Epoch 0:   0%|          | 0/3 [00:00<?, ?it/s]                             


[36m(train_cnn_lstm_tune pid=156016)[0m 
[36m(train_cnn_lstm_tune pid=156016)[0m   | Name          | Type          | Params | Mode 
[36m(train_cnn_lstm_tune pid=156016)[0m --------------------------------------------------------
[36m(train_cnn_lstm_tune pid=156016)[0m 0 | branches      | ModuleList    | 3.7 K  | train
[36m(train_cnn_lstm_tune pid=156016)[0m 1 | fusion_conv2d | Sequential    | 15     | train
[36m(train_cnn_lstm_tune pid=156016)[0m 2 | lstm          | LSTM          | 25.1 K | train
[36m(train_cnn_lstm_tune pid=156016)[0m 3 | attention     | TanhAttention | 4.2 K  | train
[36m(train_cnn_lstm_tune pid=156016)[0m 4 | regressor     | Sequential    | 2.4 K  | train
[36m(train_cnn_lstm_tune pid=156016)[0m 5 | loss_fn_reg   | MSELoss       | 0      | train
[36m(train_cnn_lstm_tune pid=156016)[0m --------------------------------------------------------
[36m(train_cnn_lstm_tune pid=156016)[0m 35.4 K    Trainable params
[36m(train_cnn_lstm_tune pid=156016)[

Epoch 0:  10%|█         | 1/10 [00:10<01:36,  0.09it/s, v_num=0, train_loss=0.820]
[36m(train_cnn_lstm_tune pid=156013)[0m 💻 CPU: 1.5% | 🧠 RAM: 68.1% | 🎮 GPU: 0.0% VRAM: 1.2%[32m [repeated 9x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(train_cnn_lstm_tune pid=156013)[0m === DEBUG SAMPLE CHECK (Torch mode) ===[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tune pid=156013)[0m --- Sequence 0 ---[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tune pid=156013)[0m Label: [1.143628 0.       0.       0.       0.       0.       0.       0.[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tune pid=156013)[0m  0.      ] Encoded (padded): [1.143628 0.       0.       0.       0.       0.       0.       0.[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tu

[36m(train_cnn_lstm_tune pid=156013)[0m `Trainer.fit` stopped: `max_epochs=50` reached.
[36m(train_cnn_lstm_tune pid=156020)[0m 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tune pid=156020)[0m GPU available: False, used: False[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tune pid=156020)[0m TPU available: False, using: 0 TPU cores[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tune pid=156020)[0m HPU available: False, using: 0 HPUs[32m [repeated 9x across cluster][0m
[36m(train_cnn_lstm_tune pid=156013)[0m 2025-09-16 01:10:49.264840: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them

[36m(train_cnn_lstm_tune pid=156013)[0m 
[36m(train_cnn_lstm_tune pid=156013)[0m 📊 Validation Metrics (Hungarian matched):
[36m(train_cnn_lstm_tune pid=156013)[0m   Regression → MSE: 0.007176, MAE: 0.060398
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156019)[0m 
[36m(train_cnn_lstm_tune pid=156017)[0m 
[36m(train_cnn_lstm_tune pid=156017)[0m 
[36m(train_cnn_lstm_tune pid=156022)[0m 
Epoch 46:  60%|██████    | 6/10 [-1:59:50<-1:59:54, -0.58it/s, v_num=0, train_loss=0.00463, val_loss=0.0023]
[36m(train_cnn_lstm_tune pid=156017)[0m 
[36m(train_cnn_lstm_tune pid=156017)[0m 
[36m(train_cnn_lstm_tune pid=156018)[0m 
[36m(train_cnn_lstm_tune pid=156022)[0m 
[36m(train_cnn_lstm_tune pid=156020)[0m 
Epoch 47:  10%|█         | 1/10 [00:00<00:00, 37.42it/s, v_num=0, train_loss=7.27e-5, val_loss=0.00317] 
[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156022)[0m 
[36m(train_cn

[36m(train_cnn_lstm_tune pid=156019)[0m `Trainer.fit` stopped: `max_epochs=50` reached.[32m [repeated 2x across cluster][0m


[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156021)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 
[36m(train_cnn_lstm_tune pid=156016)[0m 
Epoch 96:  50%|█████     | 5/10 [00:00<00:00, 28.73it/s, v_num=0, train_loss=0.00991, val_loss=0.00339]
[36m(train_cnn_lstm_tune pid=156021)[0m 
[36m(train_cnn_lstm_tune pid=156017)[0m 
[36m(train_cnn_lstm_tune pid=156017)[0m 
[36m(train_cnn_lstm_tune pid=156017)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 
[36m(train_cnn_lstm_tune pid=156018)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 
Epoch 98:  10%|█         | 1/10 [00:00<00:00, 38.46it/s, v_num=0, train_loss=0.00582, val_loss=0.00372]  
Epoch 89:  20%|██        | 2/10 [00:00<00:00, 42.07it/s, v_num=0, train_loss=0.0051, val_loss=0.00322]  
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156018)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 
[36m(train_cnn_lstm_tune pid=156021)[0

[36m(train_cnn_lstm_tune pid=156016)[0m `Trainer.fit` stopped: `max_epochs=100` reached.[32m [repeated 2x across cluster][0m


Epoch 97: 100%|██████████| 10/10 [00:00<00:00, 24.87it/s, v_num=0, train_loss=0.00749, val_loss=0.00319]
Epoch 36:  80%|████████  | 4/5 [00:00<00:00,  5.05it/s, v_num=0, train_loss=0.00487, val_loss=0.00161][32m [repeated 89x across cluster][0m
Epoch 108:  80%|████████  | 4/5 [00:10<00:02,  0.37it/s, v_num=0, train_loss=0.00389, val_loss=0.00173][32m [repeated 18x across cluster][0m
Epoch 99: 100%|██████████| 10/10 [00:00<00:00, 27.55it/s, v_num=0, train_loss=0.00376, val_loss=0.003][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 50x across cluster][0m
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 50x across cluster][0m
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A[32m [repeated 50x across cluster][0m
Epoch 97: 100%|██████████| 3/3 [00:00<00:00,  8.70it/s, v_num=0, train_loss=0.0143, val_loss=0.00536] [32m [repeated 26x across cluster][0m
Validation DataLoader 0:  50%|█████     | 

[36m(train_cnn_lstm_tune pid=156017)[0m `Trainer.fit` stopped: `max_epochs=150` reached.[32m [repeated 2x across cluster][0m


[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156015)[0m 
[36m(train_cnn_lstm_tune pid=156020)[0m 
Epoch 66:  20%|██        | 1/5 [00:00<00:00, 12.47it/s, v_num=0, train_loss=0.00145, val_loss=0.00133]
[36m(train_cnn_lstm_tune pid=156014)[0m 
[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 
Epoch 119:  33%|███▎      | 1/3 [00:00<00:00,  4.19it/s, v_num=0, train_loss=0.00529, val_loss=0.0016][32m [repeated 124x across cluster][0m
Epoch 63:  40%|████      | 2/5 [00:00<00:00, 11.78it/s, v_num=0, train_loss=0.00246, val_loss=0.00134][32m [repeated 7x across cluster][0m
Validation:

[36m(train_cnn_lstm_tune pid=156020)[0m `Trainer.fit` stopped: `max_epochs=150` reached.[32m [repeated 2x across cluster][0m


[36m(train_cnn_lstm_tune pid=156020)[0m 
[36m(train_cnn_lstm_tune pid=156020)[0m 📊 Validation Metrics (Hungarian matched):
[36m(train_cnn_lstm_tune pid=156020)[0m   Regression → MSE: 0.001711, MAE: 0.030193
Epoch 88:  40%|████      | 2/5 [00:00<00:00, 14.84it/s, v_num=0, train_loss=0.00574, val_loss=0.00138]
Epoch 90:   0%|          | 0/5 [00:00<?, ?it/s, v_num=0, train_loss=0.00236, val_loss=0.00136]        
Epoch 90:  60%|██████    | 3/5 [00:00<00:00,  6.20it/s, v_num=0, train_loss=0.00418, val_loss=0.00136]
[36m(train_cnn_lstm_tune pid=156014)[0m 
Epoch 91:  20%|██        | 1/5 [00:00<00:00, 17.67it/s, v_num=0, train_loss=0.00187, val_loss=0.00136]
Epoch 94:  40%|████      | 2/5 [00:00<00:00, 10.16it/s, v_num=0, train_loss=0.00361, val_loss=0.00132]
Epoch 95:   0%|          | 0/5 [00:00<?, ?it/s, v_num=0, train_loss=0.00324, val_loss=0.00134]        
[36m(train_cnn_lstm_tune pid=156014)[0m 
Epoch 97:  60%|██████    | 3/5 [00:00<00:00,  6.73it/s, v_num=0, train_loss=0.00487

[36m(train_cnn_lstm_tune pid=156014)[0m `Trainer.fit` stopped: `max_epochs=150` reached.
2025-09-16 01:13:15,294	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/iatell/projects/meta-learning/tune_logs/cnn_lstm_tuning' in 0.0034s.


Epoch 149: 100%|██████████| 5/5 [00:00<00:00,  8.07it/s, v_num=0, train_loss=0.0017, val_loss=0.00135]


2025-09-16 01:13:15,299	INFO tune.py:1041 -- Total run time: 152.61 seconds (152.57 seconds for the tuning loop).



🏆 Best Config: {'hidden_dim': 256, 'num_layers': 1, 'attention_name': 'tanh_attention', 'lr': 0.004757249272982139, 'optimizer_name': 'adamw', 'scheduler_name': 'reduce_on_plateau', 'scheduler_params': {'factor': 0.2525681845866281, 'patience': 3, 'T_max': 10, 'eta_min': 3.860995904914844e-05}, 'optimizer_params': {'weight_decay': 6.51102196686824e-05}, 'kernels': [3, 5, 7, 11], 'cnn_out_channels': 32, 'first_drop': 0.20189569032451213, 'second_drop': 0.3147822852430936, 'third_drop': 0.45103251677083245, 'batch_size': 64, 'max_epochs': 150}
Best Accuracy: -0.0016

🔁 Retraining best model on full dataset for saving...
[36m(train_cnn_lstm_tune pid=156014)[0m 
[36m(train_cnn_lstm_tune pid=156014)[0m 📊 Validation Metrics (Hungarian matched):
[36m(train_cnn_lstm_tune pid=156014)[0m   Regression → MSE: 0.001562, MAE: 0.026651


💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



=== DEBUG SAMPLE CHECK (Torch mode) ===

--- Sequence 0 ---
Label: [1.143628 0.       0.       0.       0.       0.       0.       0.
 0.      ] Encoded (padded): [1.143628 0.       0.       0.       0.       0.       0.       0.
 0.      ]
[main] Shape: (5, 4)
[main] First few rows:
 [[ 0.01562355 -0.00180042 -0.01639293  0.0093857 ]
 [ 0.00938704  0.12409948  0.04899828  0.12622231]
 [ 0.12622082 -0.00192766  0.09665822  0.00645032]
 [ 0.00645032 -0.00251821 -0.02505807 -0.05388233]
 [-0.04985064 -0.0454773  -0.17924407 -0.07724382]]



2025-09-16 01:13:15.857479: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-16 01:13:15.868540: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757972595.880236  153878 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757972595.883575  153878 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757972595.895234  153878 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=19` in the `DataLoader` to improve performance.
/home/iatell/envs/Rllib2.43/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=150` reached.


✅ Model saved to models/saved_models/lstm_model_multireg_multihead_20250916_011315.pt
✅ Meta saved to models/saved_models/lstm_meta_multireg_multihead_20250916_011315.pkl

📊 Validation Metrics (Hungarian matched):
  Regression → MSE: 0.001342, MAE: 0.026533
