# IY011 Contrastive Learning: Data Preparation

In [1]:
import os
from pathlib import Path
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import torch, itertools
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
# from classifiers.transformer_classifier import transformer_classifier
from models.simple_transformer import SimpleTransformer
from models.transformer import TransformerClassifier, train_model, evaluate_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Build groups
from utils.data_processing import build_groups
%load_ext autoreload
%autoreload 2

In [2]:
# Root folder of the EXP-25-IY011 data. Set the IY011_DATA_ROOT environment variable to
# point the notebook at another location; the default is the original hard-coded path.
DATA_ROOT = Path(
    os.environ.get(
        "IY011_DATA_ROOT",
        "/home/ianyang/stochastic_simulations/experiments/EXP-25-IY011/data",
    )
)
# this csv file stores all the simulation parameters used
RESULTS_PATH = DATA_ROOT / "IY011_simulation_parameters_sobol.csv"
df_params = pd.read_csv(RESULTS_PATH)
# Legacy alternative that rebuilt each filename from the target statistics:
# TRAJ_PATH = [DATA_ROOT / f"mRNA_trajectories_mu{row['mu_target']:.3f}_cv{row['cv_target']:.3f}_tac{row['t_ac_target']:.3f}.csv" for idx, row in df_params.iterrows()] # the trajectories
# One trajectory file per row of df_params, in row order (read from the 'trajectory_filename' column)
TRAJ_PATH = [DATA_ROOT / filename for filename in df_params['trajectory_filename']]
# Same paths with the suffix swapped to .npz
TRAJ_NPZ_PATH = [traj_file.with_suffix('.npz') for traj_file in TRAJ_PATH]

## Extract the steady-state part of each trajectory, then save it to `.npz` files for faster loading

```python
# Extract only the steady-state part of the trajectories and cache it as .npz files.
# NOTE: reference snippet. It uses names that the cells shown in this notebook do not
# define: parameter_sets, time_points, size, find_steady_state, and the stdlib modules
# tempfile and subprocess.

# Steady-state start index of every simulation; parameter_sets is paired with TRAJ_PATH
# by position (avoids a TRAJ_PATH.index() lookup per file).
ss_index_list = [
    find_steady_state(params)[1]
    for _, params in zip(TRAJ_PATH, parameter_sets)
]
# find the maximum steady state index across all trajectories
max_ss_index = max(ss_index_list)
new_time_points = time_points[max_ss_index:]

for traj_file, params in zip(TRAJ_PATH, parameter_sets):
    npz_path = traj_file.with_suffix('.npz')
    df_traj = pd.read_csv(traj_file)
    # drop() returns a new frame without 'label' if the column exists, and is a no-op otherwise
    df_traj = df_traj.drop(columns=['label'], errors='ignore')
    # ensure same length across all trajs by keeping only columns from max_ss_index onwards
    trajectories = df_traj.iloc[:, max_ss_index:].values
    # Single payload used by both save paths below, so they always write the same content
    trajectory_data = {
        'trajectories': trajectories.astype(np.float32),
        'time_points': new_time_points.astype(np.float32),
        'size': int(size),
        'parameters': params,
    }
    try:
        np.savez_compressed(npz_path, **trajectory_data)
    except PermissionError:
        # Fallback when the target is not writable: write to a temp file first
        with tempfile.NamedTemporaryFile(delete=False, suffix='.npz') as tmp_file:
            tmp_path = tmp_file.name
            np.savez_compressed(tmp_path, **trajectory_data)
        # Move temp file to final location with sudo, then hand ownership to $USER
        subprocess.run(['sudo', 'mv', tmp_path, npz_path], check=True)
        subprocess.run(['sudo', 'chown', f'{os.getenv("USER")}:{os.getenv("USER")}', npz_path], check=True)
```

In [3]:
# Load one simulation back from its .npz file: index 1, i.e. the second entry of TRAJ_NPZ_PATH.
# allow_pickle=True lets np.load unpickle object entries (the recorded output shows
# 'parameters' printing as a dict); only use it on trusted files.
data = np.load(TRAJ_NPZ_PATH[1], allow_pickle=True)
print(data['trajectories'])
print(len(data['time_points']))   # number of time points
for key in ('size', 'parameters'):
    print(data[key])

[[ 2053.  2023.  1994. ...  1433.  1411.  1386.]
 [ 8422.  8272.  8149. ... 23341. 22966. 22593.]
 [ 3147.  3098.  3047. ...   114.   114.   113.]
 ...
 [ 4381.  4308. 11701. ... 27395. 26960. 26544.]
 [14966. 14723. 14475. ... 44179. 43516. 42803.]
 [ 2659.  2624.  2583. ...  1603.  1570.  1541.]]
1811
1000
{'sigma_b': 0.0118563038378397, 'sigma_u': 0.9881436961621602, 'rho': 11958.995981515953, 'd': 0.0163008923214477, 'label': 0}


Instead of assigning binary labels based on summary statistics, we can use **pairwise contrastive learning**: take random pairs of trajectories and classify them (this also avoids class-imbalance issues).

In [4]:
NUM_GROUPS = 2   # a pair: 1 pos, 1 neg
num_traj = 100   # forwarded to build_groups as its num_traj argument
# build_groups returns a list of tuples (X, y)
groups = build_groups(
    TRAJ_NPZ_PATH,
    num_groups=NUM_GROUPS,
    num_traj=num_traj,
)

Building positive groups: 100%|██████████| 1/1 [00:00<00:00, 666.08it/s]
Building negative groups: 100%|██████████| 1/1 [00:00<00:00, 530.05it/s]


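Stack or flatten? Two ways to turn each group into per-trajectory samples: keep a trailing channel axis (stacked, shape `(N, seq_len, 1)`) or drop it (flattened, shape `(N, seq_len)`).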

In [5]:
# Stacked groups -> individual trajectory samples.
# Each Xg has shape (seq_len, K); every column becomes one sample of shape (seq_len, 1)
# and inherits the label yg of its group (or some other per-trajectory label).
X_samples = np.stack(
    [Xg[:, k:k+1] for Xg, _ in groups for k in range(Xg.shape[1])],
    0,
)                                       # (N_samples, seq_len, 1)
y_samples = np.array([yg for Xg, yg in groups for _ in range(Xg.shape[1])])
print(f'X_samples shape: {X_samples.shape}, y_samples shape: {y_samples.shape}')

X_samples shape: (200, 1811, 1), y_samples shape: (200,)


In [6]:
# Flatten groups -> individual 1D trajectory samples (seq_len,)
# Each Xg has shape (seq_len, K); pair every float32 column with its group's integer label.
flat_pairs = [
    (Xg[:, k].astype(np.float32), int(yg))
    for Xg, yg in groups
    for k in range(Xg.shape[1])
]
X_flat = np.stack([traj for traj, _ in flat_pairs], 0)      # (N_samples, seq_len)
y_flat = np.array([label for _, label in flat_pairs], dtype=np.int64)
print(f'X_flat shape: {X_flat.shape}, y_flat shape: {y_flat.shape}')

X_flat shape: (200, 1811), y_flat shape: (200,)


Train/val/test split, stratified on the group label

In [7]:
# Split the stacked samples into train / val / test, stratified on the label.
# Both splits share the same hold-out fraction and seed.
split_opts = dict(test_size=0.2, random_state=42)

# 1) hold out 20% of all samples as the test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_samples, y_samples, stratify=y_samples, **split_opts
)
# 2) hold out 20% of the remainder as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, **split_opts
)

# To use the flattened samples instead, swap in X_flat / y_flat:
# X_trainval, X_test, y_trainval, y_test = train_test_split(
#     X_flat, y_flat, stratify=y_flat, **split_opts
# )
# X_train, X_val, y_train, y_val = train_test_split(
#     X_trainval, y_trainval, stratify=y_trainval, **split_opts
# )

# NOTE(review): the counts below are len() of the label arrays, i.e. numbers of
# per-trajectory samples, although the message calls them "groups".
print("Data preparation:")
print(f"  Train groups: {len(y_train)}, Val groups: {len(y_val)}, Test groups: {len(y_test)}")


Data preparation:
  Train groups: 128, Val groups: 32, Test groups: 40


Standardise data

In [8]:
# === Standardise features (across time*batch, per-channel) ===
scaler = StandardScaler()


def _scale_per_channel(X, fit=False):
    """Scale an array with the shared `scaler`, one column per trailing-axis feature.

    X is reshaped to 2D, (-1, X.shape[-1]), so each feature is standardised across
    every other axis, then reshaped back to X's original shape.
    With fit=True the scaler is fitted on X before transforming it.
    """
    flat = X.reshape(-1, X.shape[-1])
    scaled = scaler.fit_transform(flat) if fit else scaler.transform(flat)
    return scaled.reshape(X.shape)


# Fit on the training split only; validation and test reuse the fitted scaler
X_train = _scale_per_channel(X_train, fit=True)
X_val = _scale_per_channel(X_val)
X_test = _scale_per_channel(X_test)

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)

X_train shape: (128, 1811, 1)
X_val shape: (32, 1811, 1)
X_test shape: (40, 1811, 1)


In [9]:
# Torch loaders
batch_size = NUM_GROUPS
# Options shared by all three loaders
loader_opts = dict(batch_size=batch_size, num_workers=4, pin_memory=True)

# === Convert to tensors: float32 features, long labels ===
X_train_t, X_val_t, X_test_t = (
    torch.tensor(arr, dtype=torch.float32) for arr in (X_train, X_val, X_test)
)
y_train_t, y_val_t, y_test_t = (
    torch.tensor(arr, dtype=torch.long) for arr in (y_train, y_val, y_test)
)

# === Wrap in loaders; only the training loader shuffles ===
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), shuffle=True, **loader_opts)
val_loader = DataLoader(TensorDataset(X_val_t, y_val_t), shuffle=False, **loader_opts)
test_loader = DataLoader(TensorDataset(X_test_t, y_test_t), shuffle=False, **loader_opts)

# check the data loaders: print the shapes of the first training batch, then stop
for X_batch, y_batch in train_loader:
    print(X_batch.shape, y_batch.shape)
    break

torch.Size([2, 1811, 1]) torch.Size([2])


Trying out this data structure in training (loss function: `CrossEntropyLoss`)

In [10]:
# Choose the model dimensions from the training data rather than hard-coding them.
# For (N, seq_len, features) input the feature dim is the last axis (1 for the
# per-trajectory samples); any other layout keeps the previous default of 1.
input_size = X_train.shape[-1] if X_train.ndim == 3 else 1
# Count classes from the training labels: these are the labels the model is fitted on,
# and the test split should not influence how the model is configured.
num_classes = len(np.unique(y_train))

model = TransformerClassifier(
    input_size=input_size,
    d_model=64,
    nhead=4,
    num_layers=2,
    num_classes=num_classes,
    dropout=0.001,
    use_conv1d=False,
)

# Train for at most 50 epochs; patience=10 is passed through to train_model
# (the training log counts "No improvement (k/10)").
train_history = train_model(model, train_loader, val_loader, epochs=50, patience=10)

Starting training...
Epoch [1/50] | train_loss 0.34 | train_acc 0.93 | val_loss 0.00 | val_acc 1.00
No improvement (1/10).
Epoch [2/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (2/10).
Epoch [3/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (3/10).
Epoch [4/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (4/10).
Epoch [5/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (5/10).
Epoch [6/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (6/10).
Epoch [7/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (7/10).
Epoch [8/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (8/10).
Epoch [9/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0.00 | val_acc 1.00
No improvement (9/10).
Epoch [10/50] | train_loss 0.00 | train_acc 1.00 | val_loss 0

In [11]:
# evaluate_model returns a (loss, accuracy) pair for the held-out test loader.
# NOTE(review): when test_loader is built from per-trajectory samples, the accuracy below
# is computed per trajectory even though the message says "Group-level".
test_metrics = evaluate_model(model, test_loader)
loss, acc = test_metrics
print(f"Group-level Test Accuracy (num_traj={num_traj}): {acc:.4f}")

Test — loss: N/A | acc: 1.00
Group-level Test Accuracy (num_traj=100): 1.0000
