References
WESAD: https://archive.ics.uci.edu/dataset/465/wesad+wearable+stress+and+affect+detection \
TabTransformer:\
https://aravindkolli.medium.com/mastering-tabular-data-with-tabtransformer-a-comprehensive-guide-119f6dbf5a79 \
https://medium.com/@cristianleo120/the-math-behind-tabtransformer-78b78c12cfc1 \
https://towardsdatascience.com/transformers-for-tabular-data-b3e196fab6f4/\
https://towardsdatascience.com/transformers-for-tabular-data-tabtransformer-deep-dive-5fb2438da820/


Steps:
1. Import Dataset
2. Train-test split and Data Loader
3. Transformer/ Neural network
    1) Create a model
    2) Choose a loss function
    3) Set an optimizer 
    4) Run a training loop
        Calculate loss (Forward pass)
        Compute gradients (Backpropagation)
        Updating model parameters
4. Evaluation

In [31]:
# pip install "numpy==1.23.5" "scipy==1.10.1"
# pip install torch

In [32]:
#1. Import libraries
import os
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


In [33]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, Subset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [34]:
# Parameters
lr_rate = 0.0001
n_epochs = 25


# Window size: How many timesteps or consecutive records each sample contains
# Batch size: How many independent samples are processed in parallel
# num_heads = width of attention (how many perspectives are considered in parallel).
# num_layers = depth of reasoning (how many times the model refines its understanding)

In [67]:
# import module
from load_data import WESADDataset
from transformer_model import TabTransformer
from train_loop import train_model
from train_loop4 import train_model4
from test_loop import evaluate_model


In [36]:
DATASET_PATH = '/Users/kumar/Library/Mobile Documents/com~apple~CloudDocs/Phoenix/OVGU/HiWi2/Tasks/10_WESAD/WESAD.nosync'
# DATASET_PATH =  '/home/bumu60du/WESAD_Dataset'

ds = WESADDataset(DATASET_PATH)

# print(ds.data)
# print(ds.labels)

print(len(ds)) # How 6894?, all devices sampled at same rate?
input_sample, label_sample = ds[0]
print(len(input_sample)) # Time steps = 128, (window size is downsampled from 700 Hz to 32Hz, 128/32 = 4 seconds of data per window)
print(len(input_sample[0])) # ['ACC','Resp','EDA','Temp','ECG','EMG'], 6 sensors
print('Input sample:', input_sample) # 128 * 6
print('Label sample:', label_sample) # 0 for Baseline, 1 for Stress label for 4 seconds window

Loaded 440 sliding windows for S2
Loaded 445 sliding windows for S3
Loaded 449 sliding windows for S4
Loaded 460 sliding windows for S5
Loaded 458 sliding windows for S6
Loaded 457 sliding windows for S7
Loaded 460 sliding windows for S8
Loaded 456 sliding windows for S9
Loaded 476 sliding windows for S10
Loaded 465 sliding windows for S11
Loaded 461 sliding windows for S13
Loaded 464 sliding windows for S14
Loaded 464 sliding windows for S15
Loaded 463 sliding windows for S16
Loaded 476 sliding windows for S17
6894
128
6
Input sample: tensor([[ 9.5370e-01,  2.2468e+00,  5.5277e+00,  2.9131e+01, -1.4182e-01,
         -6.0475e-03],
        [ 9.1147e-01,  2.3274e+00,  5.5262e+00,  2.9136e+01, -1.3497e-01,
          6.8507e-05],
        [ 9.0827e-01,  2.3982e+00,  5.5229e+00,  2.9145e+01, -9.1329e-02,
         -3.4008e-03],
        [ 9.2792e-01,  2.4003e+00,  5.5208e+00,  2.9142e+01, -1.2794e-01,
          5.1459e-05],
        [ 9.3718e-01,  2.4020e+00,  5.5201e+00,  2.9131e+01, -1.3628e-

In [37]:
# Normalise data

means = ds.data.mean(axis=(0, 1))   # shape: across batch and time steps
stds = ds.data.std(axis=(0, 1))

# apply normalization in-place
ds.data = (ds.data - means[None, None, :]) / stds[None, None, :]

In [38]:
# 2.1 Train test size with ratios

train_ratio, val_ratio, test_ratio = 0.7, 0.15, 0.15

n_total = len(ds)
n_train = int(train_ratio * n_total)  
n_val = int(val_ratio * n_total)      
n_test = n_total - n_train - n_val    

print(f"Total samples: {n_total}, Train: {n_train}, Validation: {n_val}, Test: {n_test}")

# Train val test split
train_ds = Subset(ds, range(0, n_train))
val_ds   = Subset(ds, range(n_train, n_train + n_val))
test_ds  = Subset(ds, range(n_train + n_val, n_total))


# Data Loaders
train_dataloader = DataLoader(train_ds, batch_size = 32, shuffle = False) # batch size = grouping 32 samples
val_dataloader = DataLoader(val_ds, batch_size = 32, shuffle = False)
test_dataloader = DataLoader(test_ds, batch_size = 32, shuffle = False)

# Sample batch
for batch_inputs, batch_labels in train_dataloader:
    print('batch_inputs:', batch_inputs)
    print('batch_labels:', batch_labels)
    print(len(batch_inputs))
    print(len(batch_inputs[0]))
    break

Total samples: 6894, Train: 4825, Validation: 1034, Test: 1035
batch_inputs: tensor([[[ 7.5376e-01,  5.9662e-01,  2.5280e-01, -3.0084e+00, -6.0045e-01,
          -7.8230e-01],
         [-6.5244e-01,  6.1856e-01,  2.5237e-01, -3.0057e+00, -5.7163e-01,
           7.4942e-01],
         [-7.5893e-01,  6.3782e-01,  2.5145e-01, -2.9994e+00, -3.8830e-01,
          -1.1945e-01],
         ...,
         [-8.6707e-02,  1.3346e-01,  1.9340e-01, -2.9934e+00, -6.0667e-01,
          -2.3141e-01],
         [-2.0789e-01,  1.1483e-01,  1.9284e-01, -2.9996e+00,  1.1113e-01,
          -1.8646e-01],
         [-2.0856e-01,  1.0657e-01,  1.9205e-01, -2.9922e+00,  1.1807e-01,
           3.5171e-01]],

        [[-3.7929e-01,  9.2928e-02,  1.9144e-01, -2.9912e+00,  1.3195e-01,
          -2.4133e-01],
         [ 1.5650e-01,  7.1576e-02,  1.9063e-01, -2.9992e+00, -5.1575e-03,
           2.2788e-01],
         [ 2.4677e-01,  5.5967e-02,  1.9097e-01, -2.9987e+00,  5.7365e-02,
          -3.5603e-01],
         ...,
  

In [39]:
# 2.2 Subject bins for LOOCV

subject_counts = {'S2': 440, 'S3': 445, 'S4': 449,'S5': 460,'S6': 458,'S7': 457,'S8': 460,'S9': 456,
                  'S10': 476,'S11': 465,'S13': 461,'S14': 464,'S15': 464,'S16': 463, 'S17': 476}

subject_indices = {}
start = 0
for subject, count in subject_counts.items():
    end = start + count
    subject_indices[subject] = [start, end]
    start = end

print(subject_counts)
print(subject_indices)

{'S2': 440, 'S3': 445, 'S4': 449, 'S5': 460, 'S6': 458, 'S7': 457, 'S8': 460, 'S9': 456, 'S10': 476, 'S11': 465, 'S13': 461, 'S14': 464, 'S15': 464, 'S16': 463, 'S17': 476}
{'S2': [0, 440], 'S3': [440, 885], 'S4': [885, 1334], 'S5': [1334, 1794], 'S6': [1794, 2252], 'S7': [2252, 2709], 'S8': [2709, 3169], 'S9': [3169, 3625], 'S10': [3625, 4101], 'S11': [4101, 4566], 'S13': [4566, 5027], 'S14': [5027, 5491], 'S15': [5491, 5955], 'S16': [5955, 6418], 'S17': [6418, 6894]}


3. Transformer/ Neural network
    1) Create a model
    2) Choose a loss function
    3) Define a dataset
    4) Set an optimizer 
    5) Run a training loop
        Calculate loss (Forward pass)
        Compute gradients (Backpropagation)
        Updating model parameters

In [53]:
# Initialize model
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)

model = TabTransformer(
    num_features = 6,        # 6 sensor features
    num_classes = 2,         # Binary classification
    dim_embedding = 64,      # Embedding dimension
    num_heads = 4,           # Number of attention heads
    num_layers = 4,          # Number of transformer layers
    dropout = 0.1            # Dropout rate
).to(device)

print(model)
print(f"Using device: {device}")

Using device: mps
TabTransformer(
  (embedding): Linear(in_features=6, out_features=64, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features=64, out_features=2, bias=True)
)
Using device: mps


In [41]:
# Loss function criterion and optimizer 
criterion = nn.CrossEntropyLoss().to(device) # measures the error between predicted and true
optimizer = optim.Adam(model.parameters(), lr=lr_rate) # updates the model weights by minimizing the loss
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

In [42]:
#save_dir = "/home/bumu60du/test"
save_dir = "/Users/kumar/Desktop/Projects/transformer_timeseries/checkpoints"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "best_model.pt")

In [43]:
# Print current epoch setting
print(f"Training with n_epochs = {n_epochs}")

Training with n_epochs = 25


In [70]:
# Simplified LOOCV - No separate validation phase
loocv_results = []

for test_subject, test_range in subject_indices.items():
    print(f"\n{'='*60}")
    print(f"Testing on Subject: {test_subject}")
    print(f"{'='*60}")
    
    # Create test set
    test_indices = list(range(test_range[0], test_range[1]))
    test_ds = Subset(ds, test_indices)
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)
    
    # Train on all other subjects
    train_indices = []
    for subj, subj_range in subject_indices.items():
        if subj != test_subject:
            train_indices.extend(range(subj_range[0], subj_range[1]))
    
    train_ds = Subset(ds, train_indices)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    
    print(f"Train: {len(train_ds)} samples, Test: {len(test_ds)} samples")
    
    # Initialize model
    model = TabTransformer(
        num_features=6, 
        num_classes=2, 
        dim_embedding=64, 
        num_heads=4, 
        num_layers=4,
        dropout=0.1
    ).to(device)
    
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

    # Training loop with early stopping
    history, best_model_state = train_model4(
        model=model,
        train_dataloader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        n_epochs=n_epochs,
        device=device,
        patience=3,  # Early stopping patience
        min_delta=1e-3  # Minimum improvement threshold
    )
    
    # Test evaluation 
    test_acc, test_f1, test_preds, test_labels = evaluate_model(
        model=model,
        test_loader=test_loader,
        device=device
    )
    
    print(f"{test_subject} Results: Acc={test_acc:.2f}%, F1={test_f1:.4f}\n")
    
    loocv_results.append({
        'subject': test_subject,
        'accuracy': test_acc,
        'f1_score': test_f1,
        'predictions': test_preds,
        'true_labels': test_labels
    })

# Summary
print(f"{'='*60}\nLOOCV Summary\n{'='*60}")
for r in loocv_results:
    print(f"{r['subject']}: Acc={r['accuracy']:.2f}%, F1={r['f1_score']:.4f}")

print(f"\nMean Accuracy: {np.mean([r['accuracy'] for r in loocv_results]):.2f}%")
print(f"Mean F1 Score: {np.mean([r['f1_score'] for r in loocv_results]):.4f}")


Testing on Subject: S2
Train: 6454 samples, Test: 440 samples
Epoch [1/25] Loss: 0.4121 | Acc: 77.84%
Epoch [2/25] Loss: 0.1700 | Acc: 93.59%
Epoch [3/25] Loss: 0.0973 | Acc: 96.87%
Epoch [4/25] Loss: 0.0779 | Acc: 97.33%
Epoch [5/25] Loss: 0.0580 | Acc: 97.86%
Epoch [6/25] Loss: 0.0403 | Acc: 98.53%
Epoch [7/25] Loss: 0.0374 | Acc: 98.76%
Epoch [8/25] Loss: 0.0254 | Acc: 99.13%
Epoch [9/25] Loss: 0.0274 | Acc: 98.99%
Early stopping counter: 1/3
Epoch [10/25] Loss: 0.0187 | Acc: 99.41%
Epoch [11/25] Loss: 0.0173 | Acc: 99.54%
Epoch [12/25] Loss: 0.0151 | Acc: 99.52%
Epoch [13/25] Loss: 0.0092 | Acc: 99.75%
Epoch [14/25] Loss: 0.0106 | Acc: 99.64%
Early stopping counter: 1/3
Epoch [15/25] Loss: 0.0089 | Acc: 99.80%
Early stopping counter: 2/3
Epoch [16/25] Loss: 0.0087 | Acc: 99.77%
Early stopping counter: 3/3

Early stopping triggered after 16 epochs!
S2 Results: Acc=65.00%, F1=0.5121


Testing on Subject: S3
Train: 6449 samples, Test: 445 samples
Epoch [1/25] Loss: 0.4437 | Acc: 77.1