References
WESAD: https://archive.ics.uci.edu/dataset/465/wesad+wearable+stress+and+affect+detection \
TabTransformer:\
https://aravindkolli.medium.com/mastering-tabular-data-with-tabtransformer-a-comprehensive-guide-119f6dbf5a79 \
https://medium.com/@cristianleo120/the-math-behind-tabtransformer-78b78c12cfc1 \
https://towardsdatascience.com/transformers-for-tabular-data-b3e196fab6f4/\
https://towardsdatascience.com/transformers-for-tabular-data-tabtransformer-deep-dive-5fb2438da820/


Steps:
1. Import Dataset
2. Train-test split and Data Loader
3. Transformer/ Neural network
    1) Create a model
    2) Choose a loss function
    3) Set an optimizer 
    4) Run a training loop
        Calculate loss (Forward pass)
        Compute gradients (Backpropagation)
        Updating model parameters
4. Evaluation

In [2]:
#1. Import Dataset
import os
import pickle
import numpy as np
from scipy.signal import resample
from scipy.stats import mode
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
from torch.utils.data import TensorDataset, Subset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [4]:
class WESADDataset(Dataset):
    def __init__(self, data_path, window_size=128, overlap=0.0):
        self.data_path = data_path
        self.window_size = window_size
        self.overlap = overlap
        self.signal_names = ['ACC','Resp','EDA','Temp','ECG','EMG']  
        self.data, self.labels, self.subjects = self.load_dataset()
    
    def load_dataset(self):
        subjects = [f'S{i}' for i in range(1, 18) if i not in [1, 12]]  # S1 and S12 are not available (Problem with sensors)
        all_data = []
        all_labels = []
        all_subjects = []
        
        orig_fs = 700
        target_fs = 32
        
        for subject in subjects:
            subj_dir = os.path.join(self.data_path, subject)
            data_file = os.path.join(subj_dir, f'{subject}.pkl')
            
            if not os.path.exists(data_file):
                print(f'Warning: {data_file} does not exist')
                continue
            
            try:
                with open(data_file, 'rb') as f:
                    raw = torch.load(f) if self.data_path.endswith('.pt') else pickle.load(f, encoding='latin1')
                
                # Extract chest data and label
                chest_data = raw['signal']['chest']
                labels = raw['label']
                
                # Process signals
                signals = []
                for name in self.signal_names:
                    if name in chest_data:
                        sig = chest_data[name]
                        
                        # Handle multi-dimensional signals (like ACC with x,y,z components)
                        if len(sig.shape) > 1:
                            if name == 'ACC':
                                # For accelerometer, compute magnitude from 3D components
                                if sig.shape[1] == 3:  # x, y, z components
                                    sig = np.sqrt(np.sum(sig**2, axis=1))  # Magnitude
                                else:
                                    sig = sig.flatten()
                            else:
                                sig = sig.flatten()
                        
                        # Resample signal
                        sig_resampled = resample(sig, int(len(sig) * target_fs / orig_fs))
                        signals.append(sig_resampled)
                    else:
                        print(f'Warning: {name} missing for {subject}')
                
                if len(signals) != len(self.signal_names):
                    print(f'Skipping {subject} due to missing modalities')
                    continue
                
                # Ensure all signals have same length
                min_len = min(map(len, signals))
                signals = [s[:min_len] for s in signals]
                signal_matrix = np.stack(signals, axis=1)
                
                # Resample labels
                labels_resampled = resample(labels, min_len)
                labels_resampled = np.round(labels_resampled).astype(int)
                
                # Create sliding windows
                win_data, win_labels = self.create_windows(signal_matrix, labels_resampled)
                
                all_data.extend(win_data)
                all_labels.extend(win_labels)
                all_subjects.extend([subject]*len(win_data))
                
                print(f'Loaded {len(win_data)} sliding windows for {subject}')
                
            except Exception as e:
                print(f'Error processing {subject}: {e}')
                continue
        
        return np.array(all_data), np.array(all_labels), np.array(all_subjects)
    
    def create_windows(self, data, labels):
        step = int(self.window_size * (1 - self.overlap))
        windows = []
        window_labels = []
        
        for start in range(0, data.shape[0] - self.window_size + 1, step):
            end = start + self.window_size
            label_window = labels[start:end]
            
            # Handle newer scipy versions
            mode_result = mode(label_window, keepdims=True)
            lbl = int(mode_result[0][0])
            
            if lbl == 1:  # Baseline
                windows.append(data[start:end])
                window_labels.append(0)
            elif lbl == 2:  # Stress
                windows.append(data[start:end])
                window_labels.append(1)
        
        return windows, window_labels
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)


In [5]:
lr_rate = 0.0001
n_epochs = 100

In [6]:
DATASET_PATH = '/Users/kumar/Library/Mobile Documents/com~apple~CloudDocs/Phoenix/OVGU/HiWi2/Tasks/10_WESAD/WESAD.nosync'
# DATASET_PATH =  '/home/bumu60du/WESAD_Dataset'

ds = WESADDataset(DATASET_PATH)

Loaded 440 sliding windows for S2
Loaded 445 sliding windows for S3
Loaded 449 sliding windows for S4
Loaded 460 sliding windows for S5
Loaded 458 sliding windows for S6
Loaded 457 sliding windows for S7
Loaded 460 sliding windows for S8
Loaded 456 sliding windows for S9
Loaded 476 sliding windows for S10
Loaded 465 sliding windows for S11
Loaded 461 sliding windows for S13
Loaded 464 sliding windows for S14
Loaded 464 sliding windows for S15
Loaded 463 sliding windows for S16
Loaded 476 sliding windows for S17


In [7]:
# Window size: How many timesteps or consecutive records each sample contains
# Batch size: How many independent samples are processed in parallel

In [8]:
print(len(ds)) # How 6894?, all devices sampled at same rate?
input_sample, label_sample = ds[0]
print(len(input_sample)) # Time steps = 128, (window size is downsampled from 700 Hz to 32Hz, 128/32 = 4 seconds of data per window)
print(len(input_sample[0])) # ['ACC','Resp','EDA','Temp','ECG','EMG'], 6 sensors
print('Input sample:', input_sample) # 128 * 6
print('Label sample:', label_sample) # 0 for Baseline, 1 for Stress label for 4 seconds window

6894
128
6
Input sample: tensor([[ 9.5370e-01,  2.2468e+00,  5.5277e+00,  2.9131e+01, -1.4182e-01,
         -6.0475e-03],
        [ 9.1147e-01,  2.3274e+00,  5.5262e+00,  2.9136e+01, -1.3497e-01,
          6.8507e-05],
        [ 9.0827e-01,  2.3982e+00,  5.5229e+00,  2.9145e+01, -9.1329e-02,
         -3.4008e-03],
        [ 9.2792e-01,  2.4003e+00,  5.5208e+00,  2.9142e+01, -1.2794e-01,
          5.1459e-05],
        [ 9.3718e-01,  2.4020e+00,  5.5201e+00,  2.9131e+01, -1.3628e-01,
         -4.2756e-03],
        [ 9.3415e-01,  2.3529e+00,  5.5167e+00,  2.9139e+01, -5.8765e-02,
         -2.8011e-03],
        [ 9.2376e-01,  2.2870e+00,  5.5171e+00,  2.9137e+01,  7.2924e-02,
         -3.5566e-03],
        [ 9.2641e-01,  2.2015e+00,  5.5147e+00,  2.9140e+01,  6.3188e-02,
         -2.0663e-03],
        [ 9.3307e-01,  2.0973e+00,  5.5099e+00,  2.9133e+01,  3.0525e-02,
         -2.8087e-03],
        [ 9.3486e-01,  1.9369e+00,  5.5108e+00,  2.9139e+01,  1.3204e-02,
         -2.8071e-03],
     

In [9]:
print(ds.data)
print(ds.labels)

[[[ 9.53695909e-01  2.24679160e+00  5.52765841e+00  2.91313362e+01
   -1.41824139e-01 -6.04750620e-03]
  [ 9.11465368e-01  2.32740996e+00  5.52615135e+00  2.91355190e+01
   -1.34965516e-01  6.85065422e-05]
  [ 9.08267120e-01  2.39820206e+00  5.52290337e+00  2.91450386e+01
   -9.13289276e-02 -3.40082603e-03]
  ...
  [ 9.28455273e-01  5.44533803e-01  5.31834160e+00  2.91541538e+01
   -1.43304494e-01 -3.84787691e-03]
  [ 9.24815791e-01  4.76052949e-01  5.31637878e+00  2.91448460e+01
    2.75417114e-02 -3.66836806e-03]
  [ 9.24795830e-01  4.45706483e-01  5.31359401e+00  2.91560955e+01
    2.91937129e-02 -1.51953093e-03]]

 [[ 9.19668626e-01  3.95555448e-01  5.31144317e+00  2.91575813e+01
    3.24984189e-02 -3.88746767e-03]
  [ 9.35759078e-01  3.17082208e-01  5.30857120e+00  2.91454124e+01
   -1.36019995e-04 -2.01393702e-03]
  [ 9.38470096e-01  2.59713959e-01  5.30977232e+00  2.91462269e+01
    1.47452032e-02 -4.34544230e-03]
  ...
  [ 9.28935423e-01 -8.98991761e-01  5.18789230e+00  2.91644

In [10]:
# Normalise data

means = ds.data.mean(axis=(0, 1))   # shape: across batch and time steps
stds = ds.data.std(axis=(0, 1))

# apply normalization in-place
ds.data = (ds.data - means[None, None, :]) / stds[None, None, :]

In [11]:
# 2. Train test size

train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

n_total = len(ds)
n_train = int(train_ratio * n_total)  
n_val = int(val_ratio * n_total)      
n_test = n_total - n_train - n_val    

print(f"Total samples: {n_total}, Train: {n_train}, Validation: {n_val}, Test: {n_test}")

# Train test split
train_ds = Subset(ds, range(0, n_train))
val_ds   = Subset(ds, range(n_train, n_train + n_val))
test_ds  = Subset(ds, range(n_train + n_val, n_total))

Total samples: 6894, Train: 4825, Validation: 1034, Test: 1035


In [12]:
# Data Loaders
train_dataloader = DataLoader(train_ds, batch_size = 32, shuffle = False) # batch size = grouping 32 samples
val_dataloader = DataLoader(val_ds, batch_size = 32, shuffle = False)
test_dataloader = DataLoader(test_ds, batch_size = 32, shuffle = False)

# Sample batch
for batch_inputs, batch_labels in train_dataloader:
    print('batch_inputs:', batch_inputs)
    print('batch_labels:', batch_labels)
    print(len(batch_inputs))
    print(len(batch_inputs[0]))
    break

batch_inputs: tensor([[[ 7.5376e-01,  5.9662e-01,  2.5280e-01, -3.0084e+00, -6.0045e-01,
          -7.8230e-01],
         [-6.5244e-01,  6.1856e-01,  2.5237e-01, -3.0057e+00, -5.7163e-01,
           7.4942e-01],
         [-7.5893e-01,  6.3782e-01,  2.5145e-01, -2.9994e+00, -3.8830e-01,
          -1.1945e-01],
         ...,
         [-8.6707e-02,  1.3346e-01,  1.9340e-01, -2.9934e+00, -6.0667e-01,
          -2.3141e-01],
         [-2.0789e-01,  1.1483e-01,  1.9284e-01, -2.9996e+00,  1.1113e-01,
          -1.8646e-01],
         [-2.0856e-01,  1.0657e-01,  1.9205e-01, -2.9922e+00,  1.1807e-01,
           3.5171e-01]],

        [[-3.7929e-01,  9.2928e-02,  1.9144e-01, -2.9912e+00,  1.3195e-01,
          -2.4133e-01],
         [ 1.5650e-01,  7.1576e-02,  1.9063e-01, -2.9992e+00, -5.1575e-03,
           2.2788e-01],
         [ 2.4677e-01,  5.5967e-02,  1.9097e-01, -2.9987e+00,  5.7365e-02,
          -3.5603e-01],
         ...,
         [-7.0719e-02, -2.5930e-01,  1.5638e-01, -2.9867e+00,  1.

3. Transformer/ Neural network
    1) Create a model
    2) Choose a loss function
    3) Define a dataset
    4) Set an optimizer 
    5) Run a training loop
        Calculate loss (Forward pass)
        Compute gradients (Backpropagation)
        Updating model parameters

In [13]:
# num_heads = width of attention (how many perspectives are considered in parallel).
# num_layers = depth of reasoning (how many times the model refines its understanding)

In [14]:
# 3.1 TabTransformer model class, modified from Medium
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class TabTransformer(nn.Module):
    def __init__(self, num_features, num_classes, dim_embedding, num_heads, num_layers):
        super(TabTransformer, self).__init__()
        self.embedding = nn.Linear(num_features, dim_embedding) # project input features -> embedding
        # transformer encoder (batch_first=True so input shape is [batch_size, timesteps, num_features/dim_embedding])
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim_embedding,nhead=num_heads,dim_feedforward=dim_embedding * 4,batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(dim_embedding, num_classes) # simple linear classifier

    def forward(self, x):
        # x: [batch, timesteps, features]
        x = self.embedding(x)            # -> [batch, timesteps, dim_embedding], project input to embedding
        x = self.transformer(x)          # -> [batch, timesteps, dim_embedding], passes through multiple [Attention + FFN + Norm] layers
        x = torch.mean(x, dim=1)         # -> [batch, dim_embedding], global mean pooling over timesteps 
        x = self.classifier(x)           # -> [batch, num_classes], final classification head
        return x

# Initialize model
model = TabTransformer(
    num_features = 6,        # 6 sensor features
    num_classes = 2,         # Binary classification
    dim_embedding = 64,      # Embedding dimension
    num_heads = 4,           # Number of attention heads
    num_layers = 4,          # Number of transformer layers
).to(device)

print(model)
print(f"Using device: {device}")

TabTransformer(
  (embedding): Linear(in_features=6, out_features=64, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features=64, out_features=2, bias=True)
)
Using device: cpu


In [15]:
# Loss function criterion and optimizer 
criterion = nn.CrossEntropyLoss().to(device) # measures the error between predicted and true
optimizer = optim.Adam(model.parameters(), lr=lr_rate) # updates the model weights by minimizing the loss
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

In [16]:
# Import additional libraries
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import torch.nn.functional as F

In [18]:
import os

#save_dir = "/home/bumu60du/test"
save_dir = "/Users/kumar/transformer_timeseries/checkpoints"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "best_model.pt")

In [20]:
for epoch in range(n_epochs):
    # Training phase 
    model.train()                   # model set to training model
    epoch_loss = 0.0                # accumulates loss for the epoch
    correct = 0                     # to comupte accuracy
    total = 0 

    for x, y in train_dataloader: # iterate training data in batches
        # Move to device
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()        # Reset gradients
        outputs = model(x)           # Forward pass
        loss = criterion(outputs, y) # Compute loss
        loss.backward()              # Backward pass
        optimizer.step()             # Update weights

        epoch_loss += loss.item() * x.size(0) # loss * batch_size, epoch loss += batchloss
        # ---- Compute training accuracy ----
        # For classification (outputs: logits)
        _, predicted = torch.max(outputs, dim=1)
        correct += (predicted == y).sum().item()
        total += y.size(0)

    # Epoch summary
    epoch_train_loss = epoch_loss / total # avg epoch loss
    epoch_train_acc = 100.0 * correct / total

    print(f"Epoch [{epoch+1}/{n_epochs}] - Loss: {epoch_train_loss:.4f} and Accuracy: {epoch_train_acc:.2f}%")

Epoch [1/100] - Loss: 0.6865 and Accuracy: 54.20%
Epoch [2/100] - Loss: 0.6180 and Accuracy: 61.87%
Epoch [3/100] - Loss: 0.5967 and Accuracy: 65.10%


KeyboardInterrupt: 

In [46]:
# ---- Evaluation ----
model.eval()
correct = 0
total = 0
all_preds = []
all_labels = []

# Convert test data to tensors and move to device
with torch.no_grad():
    for x, y in test_dataloader:
        x, y = x.to(device), y.to(device)
        outputs = model(x)
        _, predicted = torch.max(outputs.data, 1)
        
        total += y.size(0)
        correct += (predicted == y).sum().item()
        
        # Store predictions and labels for additional metrics
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(y.cpu().numpy())
    
accuracy = 100 * correct / total
f1 = f1_score(all_labels, all_preds, average='weighted')


print(f"Test Accuracy: {accuracy :.2f}%")

Test Accuracy: 93.33%


In [21]:
# Training loop with validation
def train_model(model, train_loader, val_loader, criterion, optimizer, n_epochs, device):
    best_val_acc = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    for epoch in range(n_epochs):
        # ---- Training phase ----
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for x, y in train_loader:
            # Move to device
            x, y = x.to(device), y.to(device)
            
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * x.size(0)
            _, predicted = torch.max(outputs, 1)
            train_correct += (predicted == y).sum().item()
            train_total += y.size(0)

        # Calculate training metrics
        epoch_train_loss = train_loss / train_total
        epoch_train_acc = 100.0 * train_correct / train_total
        
        # ---- Validation phase ----
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                outputs = model(x)
                loss = criterion(outputs, y)
                
                val_loss += loss.item() * x.size(0)
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == y).sum().item()
                val_total += y.size(0)
        
        # Calculate validation metrics
        epoch_val_loss = val_loss / val_total
        epoch_val_acc = 100.0 * val_correct / val_total
        
        # Store metrics
        history['train_loss'].append(epoch_train_loss)
        history['train_acc'].append(epoch_train_acc)
        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_val_acc)
        
        # Save best model
        if epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': best_val_acc,
            }, 'best_model.pt')
        
        # Print progress
        print(f"Epoch [{epoch+1}/{n_epochs}]")
        print(f"Train - Loss: {epoch_train_loss:.4f} | Acc: {epoch_train_acc:.2f}%")
        print(f"Valid - Loss: {epoch_val_loss:.4f} | Acc: {epoch_val_acc:.2f}%")
        print("-" * 50)
    
    return history

# Usage
n_epochs = 100
history = train_model(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    n_epochs=n_epochs,
    device=device
)

Epoch [1/100]
Train - Loss: 0.5609 | Acc: 65.20%
Valid - Loss: 0.5132 | Acc: 65.67%
--------------------------------------------------
Epoch [2/100]
Train - Loss: 0.5458 | Acc: 69.10%
Valid - Loss: 0.4885 | Acc: 70.70%
--------------------------------------------------
Epoch [3/100]
Train - Loss: 0.5478 | Acc: 67.65%
Valid - Loss: 0.4688 | Acc: 74.95%
--------------------------------------------------
Epoch [4/100]
Train - Loss: 0.5046 | Acc: 70.90%
Valid - Loss: 0.4141 | Acc: 81.24%
--------------------------------------------------
Epoch [5/100]
Train - Loss: 0.4756 | Acc: 75.19%
Valid - Loss: 0.3428 | Acc: 84.62%
--------------------------------------------------
Epoch [6/100]
Train - Loss: 0.4196 | Acc: 79.96%
Valid - Loss: 0.3255 | Acc: 85.59%
--------------------------------------------------
Epoch [7/100]
Train - Loss: 0.3852 | Acc: 81.72%
Valid - Loss: 0.3271 | Acc: 85.20%
--------------------------------------------------
Epoch [8/100]
Train - Loss: 0.3627 | Acc: 84.79%
Valid 

In [None]:
import matplotlib.pyplot as plt
plt.plot(history['train_acc'], label='Train Acc')
plt.plot(history['val_acc'], label='Val Acc')