# Test of looping over $\beta$-VAE to detect and classify outliers

In [1]:
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt

from pathlib import Path

In [2]:
from magnetics_diagnostic_analysis.project_vae.setting_vae import config



Choosen device = cuda:0


In [3]:
import torch
from torch import nn

from magnetics_diagnostic_analysis.ml_tools.pytorch_device_selection import print_torch_info
print_torch_info()


Torch version?  2.4.1+cu121
Cuda?           True

GPU number : 2
GPU 0: Tesla T4
GPU 1: Tesla T4


### 1. Create dataset and DataLoader

I decided that one data sample is the full time series of one shot, across all diagnostics. This makes it easy to restrict the study to a single diagnostic later on. (It would not have been the case if one sample were all diagnostics at a single time step: restricting to one diagnostic would then leave a single number, which is too small.)

As the shots all have different lengths, we use an LSTM unit at the input of our VAE. This LSTM unit is fed padded sequences that are packed, which gives it a masking behaviour: the padded time steps are ignored.

Thus, after the LSTM, we will have a constant size tensor (the LSTM hidden state) that we can use in our VAE.

Consideration:

We want our model to be robust to sequences of any length at test time.

Thus, we compute the maximum sequence length (`max_length`) separately for each batch of the DataLoader, and pad only up to that length.

And thanks to the two functions `pack_padded_sequence` and `pad_packed_sequence`, the LSTM is aware of the true length of each sequence and masks out the padded time steps.

In [4]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

In [5]:
# The preprocessed dataset is located relative to the directory two levels
# above the current working directory.
project_root = Path.cwd().parent.parent
path = project_root / "data/preprocessed/mscred/data_magnetics_mscred_cleaned.nc"
data_all = xr.open_dataset(path)
data_all

In [6]:
def find_seq_length(data: xr.Dataset) -> np.ndarray:
    """Count how many samples carry each ``shot_index`` value.

    Args:
        data: object exposing a ``'shot_index'`` entry whose ``.values`` is an
            array of non-negative integers.

    Returns:
        Array whose entry ``i`` is the number of samples with
        ``shot_index == i``; index values that never occur get a count of 0.
    """
    return np.bincount(data["shot_index"].values)
# Per-shot sample counts for the whole dataset; the bare expression below
# displays the array as the cell output.
lengths = find_seq_length(data_all)
lengths

array([2084, 1989, 2230, ..., 3153, 2089, 1947])

In [7]:
class TimeSeriesDataset(Dataset):
    """Dataset in which one item is the complete multivariate series of one shot.

    Every data variable except ``'shot_index'`` is used as a feature.
    1-D variables contribute one feature column; other variables are
    concatenated along axis 1 as they are.
    """

    def __init__(self, data: xr.Dataset):
        """Group the samples of ``data`` by their ``shot_index`` value.

        Args:
            data: dataset holding a ``'shot_index'`` variable plus the feature
                variables. All variables are indexed along their first axis
                with the same boolean mask, so they must share that axis.
        """
        self.shot_indices = data['shot_index'].values
        self.unique_shots = np.unique(self.shot_indices)

        # Build the (n_samples, n_features) matrix once. Extracting the
        # variables is independent of the shot, so it does not belong in the
        # per-shot loop.
        feature_columns = []
        for var in data.data_vars:
            if var == 'shot_index':
                continue
            values = data[var].values
            if data[var].ndim == 1:
                values = values[:, np.newaxis]
            feature_columns.append(values)

        # Precompute one (length, n_features) array per shot.
        self.sequences = {}
        if len(self.unique_shots) > 0:
            features = np.concatenate(feature_columns, axis=1)  # axis=1 => along features dimension
            for shot in self.unique_shots:
                self.sequences[shot] = features[self.shot_indices == shot]

        self.lengths = {shot: len(self.sequences[shot]) for shot in self.unique_shots}

    def __len__(self):
        """Return the number of distinct shots."""
        return len(self.unique_shots)

    def __getitem__(self, idx):
        """Return ``(sequence, length)`` for the ``idx``-th shot in sorted shot order."""
        shot = self.unique_shots[idx]
        return self.sequences[shot], self.lengths[shot]

In [8]:
def collate_fn(batch):
    """Merge ``(sequence, length)`` samples into one zero-padded batch.

    Args:
        batch: iterable of ``(numpy array, int)`` pairs.

    Returns:
        ``(padded, lengths)``: a float32 tensor with the batch on the first
        axis, each sequence right-padded with zeros up to the longest one in
        the batch, and an int64 tensor holding the given lengths.
    """
    arrays, seq_lens = zip(*batch)

    padded = pad_sequence(
        [torch.from_numpy(array).to(torch.float32) for array in arrays],
        batch_first=True,
        padding_value=0.0,
    )
    return padded, torch.tensor(seq_lens, dtype=torch.long)

In [11]:
def create_data_loaders(
    data: xr.Dataset,
    batch_size: int = 10,
    set_separation: int = 12000,
) -> tuple[DataLoader, DataLoader]:
    """
    Create train and test data loaders from time series data.

    The split never cuts a shot in two. The test set is made of the shots
    that start at or after ``set_separation`` and end no later than
    ``config.DATA_NUMBER``; a trailing shot truncated by that limit is left
    out. The train set is everything before the first test shot.

    Args:
        data: xarray Dataset with a ``shot_index`` variable and a ``time``
            dimension. Each shot is assumed to occupy one contiguous block
            along ``time`` (TODO confirm against the preprocessing step).
        batch_size: batch size for both data loaders
        set_separation: time index marking the boundary between train and test sets

    Returns:
        train_loader, test_loader: DataLoader objects

    Raises:
        ValueError: if no complete shot falls inside the test range.
    """
    shot_indices = data['shot_index'].values
    n_samples = len(shot_indices)
    test_limit = min(config.DATA_NUMBER, n_samples)

    # First time index of every shot in a single pass, ordered by start time.
    _, first_positions = np.unique(shot_indices, return_index=True)
    shot_starts = np.sort(first_positions)
    # A shot ends where the next one starts; the last one ends with the data.
    shot_ends = np.append(shot_starts[1:], n_samples)

    # Keep only shots lying entirely inside [set_separation, test_limit).
    in_test = (shot_starts >= set_separation) & (shot_ends <= test_limit)
    if not in_test.any():
        raise ValueError(
            "No complete shot found between set_separation="
            f"{set_separation} and DATA_NUMBER={config.DATA_NUMBER}"
        )

    # Real boundaries of the test split: preserving shot integrity.
    real_test_start = int(shot_starts[in_test][0])
    real_test_end = int(shot_ends[in_test][-1])

    # Create datasets for each split
    train_dataset = TimeSeriesDataset(data.isel(time=slice(0, real_test_start)))
    test_dataset = TimeSeriesDataset(data.isel(time=slice(real_test_start, real_test_end)))

    # Create DataLoaders with custom collate function
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        drop_last=False
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        drop_last=False
    )

    return train_loader, test_loader

In [12]:
# Build the train and test loaders, taking the batch size and the train/test
# boundary from the project configuration.
train_loader, test_loader = create_data_loaders(
    data_all, 
    batch_size=config.BATCH_SIZE,
    set_separation=config.SET_SEPARATION
)

In [13]:
print("Length of train_loader:", len(train_loader))

# Draw one batch and inspect both of its parts. The loader shuffles, so two
# separate next(iter(...)) calls would show shapes from different batches.
first_batch_data, first_batch_lengths = next(iter(train_loader))
print(first_batch_data.shape)  # Get the first batch: data
print(first_batch_lengths.shape)  # Get the first batch: lengths

Length of train_loader: 369
torch.Size([10, 2577, 96])
torch.Size([10])


## 2. Model LSTM-$\beta$-VAE implementation

In [None]:
class LengthAwareLSTMEncoder(nn.Module):
    """LSTM encoder mapping padded variable-length sequences to latent Gaussian parameters."""

    def __init__(self, input_dim, hidden_dim, latent_dim, num_layers):
        """
        Args:
            input_dim: number of features per time step
            hidden_dim: size of the LSTM hidden state
            latent_dim: size of the latent space
            num_layers: number of stacked LSTM layers
        """
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.linear_mean = nn.Linear(hidden_dim, latent_dim)
        self.linear_logvar = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x, lengths):
        """Encode a padded batch.

        Args:
            x: padded input of shape (batch, max_length, input_dim)
            lengths: true length of each sequence, shape (batch,)

        Returns:
            (mean, logvar), each of shape (batch, latent_dim)
        """
        # Packing makes the LSTM stop at the true end of each sequence, so the
        # final hidden state is not affected by the padding.
        packed_input = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_input)

        # Final hidden state of the top layer summarises each sequence.
        last_hidden = hidden[-1]
        mean = self.linear_mean(last_hidden)
        logvar = self.linear_logvar(last_hidden)
        return mean, logvar

In [None]:
class LengthAwareLSTMDecoder(nn.Module):
    """LSTM decoder that rebuilds variable-length sequences from latent vectors.

    The latent vector is projected to the initial hidden and cell states of
    every LSTM layer; the LSTM is then unrolled on an all-zero input for the
    requested number of steps.
    """

    def __init__(self, latent_dim, hidden_dim, output_dim, seq_length, num_layers):
        """
        Args:
            latent_dim: size of the latent space
            hidden_dim: size of the LSTM hidden state
            output_dim: number of features per reconstructed time step
            seq_length: stored for reference only; ``forward`` derives the
                sequence length from its ``lengths`` argument
            num_layers: number of stacked LSTM layers
        """
        super().__init__()
        self.linear_init = nn.Linear(latent_dim, hidden_dim * num_layers * 2)  # For hidden and cell states of each layer
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.seq_length = seq_length

    def forward(self, z, lengths):
        """Decode latent vectors into zero-padded sequences.

        Args:
            z: latent vectors of shape (batch, latent_dim)
            lengths: number of steps to generate per sample, shape (batch,)

        Returns:
            Tensor of shape (batch, max(lengths), output_dim), set to zero at
            every step beyond each sample's own length.
        """
        batch_size = z.size(0)

        # Initialise the hidden/cell states from the latent vector. Per sample
        # the projection is laid out as [hidden | cell], each holding
        # num_layers * hidden_dim values. The batch axis must stay first
        # during the reshape and only then be swapped with the layer axis,
        # otherwise values from different samples get mixed.
        init_states = self.linear_init(z).view(batch_size, 2, self.num_layers, self.hidden_dim)
        h0 = init_states[:, 0].transpose(0, 1).contiguous()  # (num_layers, batch, hidden_dim)
        c0 = init_states[:, 1].transpose(0, 1).contiguous()  # (num_layers, batch, hidden_dim)

        # Create the (all-zero) input sequence of the decoder
        max_length = int(lengths.max())
        input_seq = torch.zeros(batch_size, max_length, self.hidden_dim, device=z.device, dtype=z.dtype)

        # Forward pass through LSTM
        packed_input = pack_padded_sequence(input_seq, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input, (h0, c0))
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        output = self.output_layer(output)

        # Mask used to zero the steps beyond each sample's length: the output
        # layer's bias would otherwise leak into the padded positions.
        steps = torch.arange(max_length, device=z.device)
        mask = steps[None, :] < lengths.to(z.device)[:, None]
        mask = mask.unsqueeze(-1).to(output.dtype)  # [batch_size, max_length, 1]

        return output * mask

In [None]:
class LSTMBetaVAE(nn.Module):
    """Variational autoencoder for variable-length sequences, built from an LSTM encoder and decoder."""

    def __init__(self, input_dim, hidden_dim, latent_dim, seq_length, num_layers):
        """
        Args:
            input_dim: number of features per time step
            hidden_dim: size of the LSTM hidden states
            latent_dim: size of the latent space
            seq_length: forwarded unchanged to the decoder constructor
            num_layers: number of stacked LSTM layers in encoder and decoder
        """
        super().__init__()
        self.encoder = LengthAwareLSTMEncoder(input_dim, hidden_dim, latent_dim, num_layers)
        self.decoder = LengthAwareLSTMDecoder(latent_dim, hidden_dim, input_dim, seq_length, num_layers)

    def forward(self, x, lengths):
        """Encode ``x``, sample a latent vector and decode it.

        Args:
            x: padded input batch
            lengths: true length of each sequence in the batch

        Returns:
            (reconstruction, z_mean, z_logvar)
        """
        z_mean, z_logvar = self.encoder(x, lengths)
        z = self.reparameterize(z_mean, z_logvar)
        # The decoder needs the lengths to know how many steps to generate.
        return self.decoder(z, lengths), z_mean, z_logvar

    def reparameterize(self, mean, logvar):
        """Sample ``mean + eps * std`` with ``eps ~ N(0, I)`` so that sampling stays differentiable."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std

## 3. Train loop

In [None]:
def train_one_time(
    model: nn.Module,
    train_loader: DataLoader,
    n_epochs: int,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    beta: float = 1.0,
) -> list[float]:
    """Train a sequence VAE for ``n_epochs`` with a beta-VAE objective.

    The loss is the mean squared reconstruction error over the valid
    (non-padded) time steps plus ``beta`` times the KL divergence between the
    approximate posterior and a standard normal prior.

    Args:
        model: module called as ``model(data, lengths)`` and returning
            ``(reconstruction, z_mean, z_logvar)``. It is expected to already
            live on ``device``; it is not moved here.
        train_loader: iterable yielding ``(padded_data, lengths)`` batches
        n_epochs: number of passes over ``train_loader``
        optimizer: optimizer bound to the parameters of ``model``
        device: device the batches are moved to
        beta: weight of the KL term

    Returns:
        Mean training loss of each epoch.
    """
    # TODO: flagging anomalies from the reconstruction error (the original
    # draft mentioned a 95th-percentile threshold) is not implemented yet.
    epoch_losses = []

    for _ in range(n_epochs):
        model.train()
        running_loss = 0.0
        n_batches = 0
        for batch_data, lengths in train_loader:
            batch_data = batch_data.to(device)
            lengths = lengths.to(device)

            optimizer.zero_grad()
            reconstruction, z_mean, z_logvar = model(batch_data, lengths)

            # Only real time steps count in the reconstruction error.
            steps = torch.arange(batch_data.size(1), device=batch_data.device)
            valid = (steps[None, :] < lengths[:, None]).unsqueeze(-1)
            squared_error = (reconstruction - batch_data).pow(2) * valid
            n_valid = (lengths.sum() * batch_data.size(-1)).clamp(min=1)
            reconstruction_loss = squared_error.sum() / n_valid

            # KL(q(z|x) || N(0, I)), averaged over the batch.
            kl_per_sample = -0.5 * torch.sum(
                1 + z_logvar - z_mean.pow(2) - z_logvar.exp(), dim=1
            )
            loss = reconstruction_loss + beta * kl_per_sample.mean()

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        epoch_losses.append(running_loss / max(n_batches, 1))

    return epoch_losses




In [None]:
def train(
    n_iterations: int,
    n_epochs: int = 10,
    hidden_dim: int = 64,
    latent_dim: int = 16,
    num_layers: int = 1,
    learning_rate: float = 1e-3,
    device: "torch.device | str | None" = None,
) -> list:
    """Train ``n_iterations`` freshly initialised LSTM beta-VAEs on ``train_loader``.

    NOTE(review): the hyperparameter defaults are placeholder starting values,
    not tuned settings — replace them with the project's configuration.
    Removing detected outliers between iterations is not implemented here.

    Args:
        n_iterations: number of models to create and train
        n_epochs: epochs per model
        hidden_dim: LSTM hidden size
        latent_dim: latent space size
        num_layers: number of stacked LSTM layers
        learning_rate: Adam learning rate
        device: training device; CUDA when available, else CPU, if omitted

    Returns:
        The trained models, in training order.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)

    # The feature count is read from one batch: (batch, time, features).
    input_dim = next(iter(train_loader))[0].shape[-1]

    trained_models = []
    for _ in range(n_iterations):
        # No fixed sequence length: the lengths are supplied with each batch.
        model = LSTMBetaVAE(input_dim, hidden_dim, latent_dim, None, num_layers).to(device)
        # Each new model needs its own optimizer bound to its own parameters.
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        train_one_time(model, train_loader, n_epochs, optimizer, device)
        trained_models.append(model)
    return trained_models

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler