# IY011 Training Models Post Simulation

load the simulated data (csv files)

In [1]:
import os
import subprocess
import tempfile
from pathlib import Path
import numpy as np
import pandas as pd
import time
# plotting 
import matplotlib.pyplot as plt
from visualisation.plots import plot_mRNA_dist, plot_mRNA_trajectory
# simulation
from simulation.julia_simulate_telegraph_model import simulate_telegraph_model
# ml
import torch, itertools
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from classifiers.transformer_classifier import transformer_classifier
# data handling
from utils.load_data import load_and_split_data
from utils.data_processing import add_binary_labels
from utils.standardise_time_series import standardise_time_series

%load_ext autoreload
%autoreload 2  

In [2]:
DATA_ROOT = Path("/home/ianyang/stochastic_simulations/experiments/EXP-25-IY011/data")
RESULTS_PATH = DATA_ROOT / "IY011_simulation_parameters_sobol.csv" #  this csv file stores all the simulation parameters used
df_params = pd.read_csv(RESULTS_PATH) 
# TRAJ_PATH = [DATA_ROOT / f"mRNA_trajectories_mu{row['mu_target']:.3f}_cv{row['cv_target']:.3f}_tac{row['t_ac_target']:.3f}.csv" for idx, row in df_params.iterrows()] # the trajectories 
TRAJ_PATH = [DATA_ROOT / df_params['trajectory_filename'].values[i] for i in range(len(df_params))]

# extract meta data
parameter_sets = [{
    'sigma_b': row['sigma_b'],
    'sigma_u': row['sigma_u'],
    'rho': row['rho'],
    'd': row['d'],
    'label': 0
} for idx, row in df_params.iterrows()]
time_points = np.arange(0, 3000, 1.0)
size = 1000

In [3]:
# read in the first trajectory file (csv)
df_traj = pd.read_csv(TRAJ_PATH[0])
df_traj = df_traj.drop(columns=['label'], errors='ignore') # drop in-place if it exists, do nothing otherwise
df_traj.head()

Unnamed: 0,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9,...,t_2990,t_2991,t_2992,t_2993,t_2994,t_2995,t_2996,t_2997,t_2998,t_2999
0,0,13004,17205,17025,16838,16669,16494,16309,16163,15994,...,4270,4234,4199,4150,4108,4057,4014,3979,3944,3894
1,0,9950,9848,9730,9653,9562,9458,9383,9302,9211,...,1637,1617,1598,1584,1565,1554,1540,1522,1509,1499
2,0,13065,25840,26840,26586,26310,26008,25742,25479,25251,...,235,232,228,226,222,222,218,217,217,216
3,0,9142,9057,8965,8869,8785,8686,8583,8471,8395,...,63,63,63,63,62,60,57,56,55,55
4,0,3420,3383,3340,3302,3269,3245,3214,3185,3156,...,5,5,5,5,5,5,5,5,5,5


label the datasets

In [None]:
label_column = 'mu_target'  # column name to base the binary labels on, mu_target for simplicity
labelled_df_params = add_binary_labels(df_params,label_column)
labels = [] 

for i in range(len(df_params)):
    # find the filename for each trajectory file
    trajectory_filename = df_params['trajectory_filename'].values[i]
    
    # get the corresponding label from labelled_df_params
    label_value = labelled_df_params[labelled_df_params['trajectory_filename'] == trajectory_filename]['label'].iloc[0]
    labels.append(label_value)

labels

[np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.int64(0),
 np.int64(0),
 np.int64(1),
 np.int64(1),
 np.in

Convert the datafiles from csv to NPZ format for faster loading in future <span style="color: red;">(this takes half an hour)</span>


In [None]:
# convert the trajectory files (csv) into npz format
for traj_file, params, label in zip(TRAJ_PATH, parameter_sets, labels):
    df_traj = pd.read_csv(traj_file)
    df_traj = df_traj.drop(columns=['label'], errors='ignore') # drop in-place if it exists, do nothing otherwise
    trajectories = df_traj.values
    trajectory_data = {
            'trajectories': trajectories.astype(np.float32),
            'time_points': time_points.astype(np.float32),
            'size': int(size),
            'parameters': params,
            'label': label
    }
    try:
        np.savez_compressed(
        traj_file.with_suffix('.npz'),
        trajectories=trajectories.astype(np.float32),
        time_points=time_points.astype(np.float32),
        size=size,
        parameters=params,
        label=label
        )
    except PermissionError:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.npz') as tmp_file:
            tmp_path = tmp_file.name
            np.savez_compressed(tmp_path, **trajectory_data)
        # Move temp file to final location with sudo
        subprocess.run(['sudo', 'mv', tmp_path, traj_file.with_suffix('.npz')], check=True)
        subprocess.run(['sudo', 'chown', f'{os.getenv("USER")}:{os.getenv("USER")}', traj_file.with_suffix('.npz')], check=True)


Each of the npz file will contain several arrays:
- trajectories
- time_points
- size
- parameters
- label (incorrectly named 'labels')

In [4]:
# load back the data (the first simulation) from the npz file
data = np.load(TRAJ_PATH[1].with_suffix('.npz'), allow_pickle=True)
print(data['trajectories'])
print(data['time_points'])
print(data['size'])
print(data['parameters'])
print(data['labels']) # ....i made a typo here during the conversion

[[    0. 11726. 23482. ...  1433.  1411.  1386.]
 [    0.  1686.  1661. ... 23341. 22966. 22593.]
 [    0.  3714.  3658. ...   114.   114.   113.]
 ...
 [    0.  2753.  2703. ... 27395. 26960. 26544.]
 [    0. 11854. 20635. ... 44179. 43516. 42803.]
 [    0.  9483.  9341. ...  1603.  1570.  1541.]]
[0.000e+00 1.000e+00 2.000e+00 ... 2.997e+03 2.998e+03 2.999e+03]
1000
{'sigma_b': 0.0118563038378397, 'sigma_u': 0.9881436961621602, 'rho': 11958.995981515953, 'd': 0.0163008923214477, 'label': 0}
1


In [5]:
# read traj back into dataframe
data_df = pd.DataFrame(data['trajectories'])
data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,0.0,11726.0,23482.0,35126.0,38492.0,37841.0,37283.0,36708.0,36116.0,35510.0,...,1598.0,1562.0,1544.0,1526.0,1497.0,1476.0,1446.0,1433.0,1411.0,1386.0
1,0.0,1686.0,1661.0,1637.0,1617.0,1592.0,1573.0,1547.0,1524.0,1496.0,...,26066.0,25679.0,25242.0,24844.0,24472.0,24069.0,23703.0,23341.0,22966.0,22593.0
2,0.0,3714.0,3658.0,3605.0,3534.0,3487.0,3431.0,3375.0,3331.0,3279.0,...,125.0,122.0,120.0,119.0,117.0,115.0,114.0,114.0,114.0,113.0
3,0.0,5611.0,5525.0,8457.0,8326.0,8193.0,8085.0,7955.0,7828.0,7708.0,...,7150.0,7037.0,6930.0,6805.0,6698.0,6591.0,6494.0,6374.0,6275.0,6160.0
4,0.0,6412.0,6313.0,6227.0,6117.0,5991.0,5888.0,5808.0,5717.0,5638.0,...,468.0,462.0,456.0,452.0,445.0,442.0,433.0,427.0,420.0,412.0


concatenate all trajectories into one big npz file <span style="color: red;">(this takes about 20 minutes)</span>

In [None]:
# Use the npz path
files = [str(p.with_suffix('.npz')) for p in TRAJ_PATH]

# Detect array keys from the first NPZ
# probe = np.load(files[0], allow_pickle=True)
# X_KEY = next(k for k in ["X", "trajectories", "M", "data"] if k in probe.files)
# Y_KEY = next(k for k in ["y", "labels", "label"] if k in probe.files)
# probe.close()
X_KEY = 'trajectories'
Y_KEY = 'labels'

# Load, stack, save
Xs, ys = [], []
for f in files:
    with np.load(f) as d:
        X, y = d[X_KEY], d[Y_KEY]
    if X.ndim == 2:            # add feature dim if missing
        X = X[..., None]       # -> (n, T, 1)
    n = X.shape[0]

    # --- make y shape (n,) regardless of how it’s stored ---
    y = np.asarray(y)
    if y.ndim == 0:                       # scalar label for the whole file
        y = np.full((n,), int(y), dtype=np.int64)
    else:
        y = y.reshape(-1)
        if y.shape[0] == 1 and n > 1:     # single value -> repeat for all n
            y = np.repeat(y.astype(np.int64), n)
        else:
            y = y.astype(np.int64)
    # --------------------------------------------------------

    Xs.append(X.astype(np.float32))
    ys.append(y)

X_all = np.concatenate(Xs, axis=0)   # (N_total, T, F)
y_all = np.concatenate(ys, axis=0)   # (N_total,)

In [None]:
out_path = DATA_ROOT / "combined.npz"
try: 
    np.savez_compressed(out_path, X=X_all, y=y_all)
except PermissionError:
    with tempfile.NamedTemporaryFile(delete=False, suffix='.npz') as tmp_file:
        tmp_path = tmp_file.name
        np.savez_compressed(tmp_path, X=X_all, y=y_all)
    # Move temp file to final location with sudo
    subprocess.run(['sudo', 'mv', tmp_path, out_path], check=True)
    subprocess.run(['sudo', 'chown', f'{os.getenv("USER")}:{os.getenv("USER")}', out_path], check=True)

print("Saved:", out_path, "| Shapes:", X_all.shape, y_all.shape)


Saved: /home/ianyang/stochastic_simulations/experiments/EXP-25-IY011/data/combined.npz | Shapes: (1024000, 3000, 1) (1024000,)


Load and split the combined data

In [4]:
COMBINED_NPZ_PATH = str(DATA_ROOT / "combined.npz")
# Use load_and_split_data for consistent splitting
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(
    COMBINED_NPZ_PATH, split_val_size=0.2
)

In [7]:
print(f"Data preparation:")
print(f"  Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}")
print(f"  Train labels: {y_train.shape}, Val labels: {y_val.shape}, Test labels: {y_test.shape}")

Data preparation:
  Train shape: (655360, 3000, 1), Val shape: (163840, 3000, 1), Test shape: (204800, 3000, 1)
  Train labels: (655360,), Val labels: (163840,), Test labels: (204800,)


since the data is quite large (1,024,000 samples of length 3000 each), we will initially only use a subset for training and testing
```
Memory Usage Comparison
Setup	Memory Usage	Training Speed
10k × 3000	~120 GB	3 min/epoch
1k × 3000	~12 GB	~18 sec/epoch
5k × 512	~10 GB	~15 sec/epoch
10k × 512	~20 GB	~30 sec/epoch
```

In [6]:
# Quick test with subset of data
subset_size = 1000  # Use only 1k samples

# Calculate subset sizes that preserve original ratios
train_ratio = len(X_train) / (len(X_train) + len(X_val) + len(X_test))
val_ratio = len(X_val) / (len(X_train) + len(X_val) + len(X_test))
test_ratio = len(X_test) / (len(X_train) + len(X_val) + len(X_test))

train_subset_size = int(subset_size * train_ratio)  # ~640
val_subset_size = int(subset_size * val_ratio)      # ~160
test_subset_size = int(subset_size * test_ratio)    # ~200

X_train_subset = X_train[:train_subset_size]
X_val_subset = X_val[:val_subset_size] 
X_test_subset = X_test[:test_subset_size]
y_train_subset = y_train[:train_subset_size]
y_val_subset = y_val[:val_subset_size]
y_test_subset = y_test[:test_subset_size]

print(f"New shapes: Train {X_train_subset.shape}, Val {X_val_subset.shape}, Test {X_test_subset.shape}")

transformer_accuracy = transformer_classifier(
    X_train_subset,
    X_val_subset, 
    X_test_subset,
    y_train_subset,
    y_val_subset,
    y_test_subset,
    d_model=64,  # Smaller model
    nhead=4,
    num_layers=2,  # Fewer layers
    batch_size=32,  
    epochs=10,  # Fewer epochs for testing
    use_conv1d=False,
    verbose=True
)

New shapes: Train (640, 3000, 1), Val (160, 3000, 1), Test (200, 3000, 1)
🔄 Using device: cuda (1 GPUs available)
DEBUG: Optimizer initialized? True
✅ Running on CUDA!
Epoch [1/10], Loss: 2.6983, Train Acc: 0.5188
Validation Acc: 0.8375
Epoch [2/10], Loss: 0.6989, Train Acc: 0.7781
Validation Acc: 0.8625
Epoch [3/10], Loss: 0.5946, Train Acc: 0.8141
Validation Acc: 0.8750
Epoch [4/10], Loss: 0.5405, Train Acc: 0.8328
Validation Acc: 0.8750
No improvement (1/10).
Epoch [5/10], Loss: 0.5085, Train Acc: 0.8297
Validation Acc: 0.8750
No improvement (2/10).
Epoch [6/10], Loss: 0.4529, Train Acc: 0.8578
Validation Acc: 0.8750
No improvement (3/10).
Epoch [7/10], Loss: 0.4463, Train Acc: 0.8734
Validation Acc: 0.8750
No improvement (4/10).
Epoch [8/10], Loss: 0.4541, Train Acc: 0.8656
Validation Acc: 0.8750
No improvement (5/10).
Epoch [9/10], Loss: 0.4733, Train Acc: 0.8438
Validation Acc: 0.8750
No improvement (6/10).
Epoch [10/10], Loss: 0.4666, Train Acc: 0.8500
Validation Acc: 0.8750
No 

The above approach still takes a long time, so let's try **truncating sequences**

Why Truncation Makes Sense:
1. Transformers struggle with very long sequences anyway (quadratic memory complexity)
2. Most temporal patterns in biological data are captured in shorter windows


In [7]:
# Truncate sequences to manageable length
max_seq_length = 1024  # Much more reasonable for transformers

# Quick test with subset AND truncated sequences
subset_size = 10000  # Can use more samples now

train_subset_size = int(subset_size * train_ratio) 
val_subset_size = int(subset_size * val_ratio)      
test_subset_size = int(subset_size * test_ratio)  
  
X_train_subset = X_train[:train_subset_size, :max_seq_length, :]  # Truncate time dimension
X_val_subset = X_val[:val_subset_size, :max_seq_length, :]
X_test_subset = X_test[:test_subset_size, :max_seq_length, :]
y_train_subset = y_train[:train_subset_size]
y_val_subset = y_val[:val_subset_size]
y_test_subset = y_test[:test_subset_size]

print(f"New shapes: Train {X_train_subset.shape}, Val {X_val_subset.shape}, Test {X_test_subset.shape}")

transformer_accuracy = transformer_classifier(
    X_train_subset,
    X_val_subset, 
    X_test_subset,
    y_train_subset,
    y_val_subset,
    y_test_subset,
    d_model=128,
    nhead=4,
    num_layers=2,
    batch_size=64,  # Can use larger batches now
    epochs=20,
    use_conv1d=False,
    verbose=True
)

New shapes: Train (6400, 1024, 1), Val (1600, 1024, 1), Test (2000, 1024, 1)
🔄 Using device: cuda (1 GPUs available)
DEBUG: Optimizer initialized? True
✅ Running on CUDA!
Epoch [1/20], Loss: 1.4164, Train Acc: 0.6664
Validation Acc: 0.8144
Epoch [2/20], Loss: 0.5114, Train Acc: 0.8155
Validation Acc: 0.8256
Epoch [3/20], Loss: 0.4663, Train Acc: 0.8413
Validation Acc: 0.8369
Epoch [4/20], Loss: 0.4430, Train Acc: 0.8483
Validation Acc: 0.8419
Epoch [5/20], Loss: 0.4385, Train Acc: 0.8536
Validation Acc: 0.8438
Epoch [6/20], Loss: 0.4279, Train Acc: 0.8553
Validation Acc: 0.8419
No improvement (1/10).
Epoch [7/20], Loss: 0.4276, Train Acc: 0.8553
Validation Acc: 0.8369
No improvement (2/10).
Epoch [8/20], Loss: 0.4201, Train Acc: 0.8611
Validation Acc: 0.8419
No improvement (3/10).
Epoch [9/20], Loss: 0.4216, Train Acc: 0.8589
Validation Acc: 0.8456
Epoch [10/20], Loss: 0.4151, Train Acc: 0.8638
Validation Acc: 0.8406
No improvement (1/10).
Epoch [11/20], Loss: 0.4186, Train Acc: 0.8581

Truncation: More data but slightly lower performance - suggests truncation may have removed important patterns

In [18]:
# Test different sequence lengths
sequence_lengths = [256, 512, 1024, 1536, 2048]
results = {}

for seq_len in sequence_lengths:
    print(f"\n🔬 Testing sequence length: {seq_len}")
    
    X_train_seq = X_train[:5000, :seq_len, :]
    X_val_seq = X_val[:1250, :seq_len, :]
    X_test_seq = X_test[:1250, :seq_len, :]
    
    accuracy = transformer_classifier(
        X_train_seq, X_val_seq, X_test_seq,
        y_train[:5000], y_val[:1250], y_test[:1250],
        d_model=128, nhead=4, num_layers=2,
        batch_size=64, epochs=15, use_conv1d=False, verbose=False
    )
    
    results[seq_len] = accuracy
    print(f"✅ Sequence length {seq_len}: {accuracy:.3f}")

# Find optimal length
best_seq_len = max(results, key=results.get)
print(f"\n🎯 Best sequence length: {best_seq_len} (accuracy: {results[best_seq_len]:.3f})")


🔬 Testing sequence length: 256


=== Vanilla Transformer Accuracy: 0.80 ===
✅ Sequence length 256: 0.802

🔬 Testing sequence length: 512
=== Vanilla Transformer Accuracy: 0.83 ===
✅ Sequence length 512: 0.830

🔬 Testing sequence length: 1024
=== Vanilla Transformer Accuracy: 0.85 ===
✅ Sequence length 1024: 0.851

🔬 Testing sequence length: 1536
=== Vanilla Transformer Accuracy: 0.87 ===
✅ Sequence length 1536: 0.870

🔬 Testing sequence length: 2048
=== Vanilla Transformer Accuracy: 0.90 ===
✅ Sequence length 2048: 0.895

🎯 Best sequence length: 2048 (accuracy: 0.895)


In [8]:
# Test different model sizes
configs = [
    {"d_model": 64, "nhead": 4, "num_layers": 2, "name": "Small"},
    {"d_model": 128, "nhead": 8, "num_layers": 2, "name": "Medium"},
    {"d_model": 256, "nhead": 8, "num_layers": 4, "name": "Large"},
    {"d_model": 128, "nhead": 4, "num_layers": 6, "name": "Deep"},
]

arch_results = {}
for config in configs:
    print(f"\n🏗️ Testing {config['name']} model")
    
    accuracy = transformer_classifier(
        X_train_subset, X_val_subset, X_test_subset,
        y_train_subset, y_val_subset, y_test_subset,
        d_model=config["d_model"],
        nhead=config["nhead"], 
        num_layers=config["num_layers"],
        batch_size=32, epochs=15, use_conv1d=False, verbose=False
    )
    
    arch_results[config["name"]] = accuracy
    print(f"✅ {config['name']}: {accuracy:.3f}")


🏗️ Testing Small model
=== Vanilla Transformer Accuracy: 0.85 ===
✅ Small: 0.854

🏗️ Testing Medium model
=== Vanilla Transformer Accuracy: 0.85 ===
✅ Medium: 0.855

🏗️ Testing Large model
=== Vanilla Transformer Accuracy: 0.86 ===
✅ Large: 0.858

🏗️ Testing Deep model
=== Vanilla Transformer Accuracy: 0.86 ===
✅ Deep: 0.857


TODO: How would model trained on synthetic telegraph model data perform on real experimental data?

TODO: How would the model perform on exp data if we train it on 
- a mix of different synthetic models: OU process, telegraph model, bursty model, etc.?
- a mix of synthetic and real data?

you can't load all 1024 files into memory at once, so just do them in mini batches

In [9]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
# try just with the first trajectory file
X = np.load(TRAJ_PATH[0].with_suffix('.npz'), allow_pickle=True)['trajectories']
y = np.array([0])  # dummy label


In [None]:
import torch, numpy as np
from torch.utils.data import Dataset, DataLoader

# 1) Your data (replace with your loader)
# X: list/array of shape [N, T, F] (or list of [T,F] tensors), y: [N]
# Example: X = np.load("trajectories.npy")  # (N,T,F); y = np.load("labels.npy")
X, y = ... , ...

# 2) Windowed dataset (keeps a padding mask)
class WindowedDS(Dataset):
    def __init__(self, X, y, win=1024, stride=512):
        self.samples = []
        for i, xi in enumerate(X):
            T = xi.shape[0]
            for s in range(0, max(1, T - win + 1), stride):
                e = min(T, s + win)
                self.samples.append((i, s, e))
        self.X, self.y, self.win = X, y, win

    def __len__(self): return len(self.samples)

    def __getitem__(self, k):
        i, s, e = self.samples[k]
        x  = torch.as_tensor(self.X[i][s:e])        # [t,F]
        t  = x.shape[0]
        pad = self.win - t
        if pad: x = torch.nn.functional.pad(x, (0,0,0,pad))   # -> [win,F]
        mask = torch.zeros(self.win, dtype=torch.bool); mask[:t] = True  # True=real token
        return x, mask, self.y[i], i                  # keep traj id if needed

# 3) Minimal collate (already equal length, so just stack)
def collate(batch):
    xs, masks, ys, ids = zip(*batch)
    return torch.stack(xs), torch.stack(masks), torch.tensor(ys), torch.tensor(ids)

ds = WindowedDS(X, y, win=1024, stride=512)
loader = DataLoader(ds, batch_size=8, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)

# 4) Example training step (nn.Transformer vs HF-style masks)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ... .to(device)               # your Transformer encoder/classifier
optim = torch.optim.AdamW(model.parameters(), lr=3e-4)
loss_fn = torch.nn.CrossEntropyLoss()

for xb,;;, mask, yb, _ in loader:
    xb, mask, yb = xb.to(device), mask.to(device), yb.to(device)
    # If your model expects src_key_padding_mask (nn.Transformer uses True=PAD):
    key_pad = ~mask                        # flip: True where padding
    # If your model expects attention_mask (HF uses 1=keep, 0=pad):
    attn_mask = mask.long()

    optim.zero_grad(set_to_none=True)
    # Choose the arg your model supports (uncomment the right one):
    # logits = model(xb, src_key_padding_mask=key_pad)
    # logits = model(xb, attention_mask=attn_mask)
    logits = model(xb)                     # if you handle masks inside the model
    loss = loss_fn(logits, yb)
    loss.backward()
    optim.step()


In [None]:
# load a mini batch of data


In [None]:

# Save to temporary file for load_and_split_data function
temp_path = BASE_DIR / "temp_t_ac_target.csv"
labelled_data.to_csv(temp_path, index=False)

# Use load_and_split_data for consistent splitting
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(
    temp_path, split_val_size=0.2
)

print(f"Data preparation:")
print(f"  Train shape: {X_train.shape}, Val shape: {X_val.shape}, Test shape: {X_test.shape}")
print(f"  Train labels: {y_train.shape}, Val labels: {y_val.shape}, Test labels: {y_test.shape}")

# =========================================================
# SVM Classification
# =========================================================
print("\n🤖 SVM Classification (Original Data)")
start_time = time.time()
svm_accuracy = svm_classifier(
    X_train, X_test, y_train, y_test,
    svm_C=1.0,
    svm_gamma='scale',
    svm_kernel='rbf',
    print_classification_report=True,
    print_confusion_matrix=True,
)
training_time = time.time() - start_time
print(f"⏱️  SVM training time: {training_time:.2f} seconds")

# =========================================================
# LSTM Classification
# =========================================================
print("\n🧠 LSTM Classification (Original Data)")
start_time = time.time()
# IY001A LSTM
lstm_accuracy = lstm_classifier(
    X_train,
    X_val,
    X_test,
    y_train,
    y_val,
    y_test,
    epochs=100,
    hidden_size=128,
    num_layers=4,
    dropout_rate=0.2,
    learning_rate=0.001,
    batch_size=64,
    use_conv1d=True,
    use_attention=True,
    num_attention_heads=4,
    use_auxiliary=True,
)
training_time = time.time() - start_time
print(f"⏱️  LSTM training time: {training_time:.2f} seconds")

# =========================================================
# Transformer Classification
# =========================================================
print("\n🤖 Transformer Classification (Original Data)")
start_time = time.time()
transformer_accuracy = transformer_classifier(
    X_train,
    X_val,
    X_test,
    y_train,
    y_val,
    y_test,
    d_model=128,
    nhead=8,
    num_layers=4,
    epochs=50,
    use_conv1d=True,
    use_auxiliary=True,
)
training_time = time.time() - start_time
print(f"⏱️  Transformer training time: {training_time:.2f} seconds")

# =========================================================
# Shuffled Data Analysis
# =========================================================
print("\n🔀 Experiment: Shuffling Time Series Data to Test Temporal Order Impact")
df = labelled_data.copy()
df_shuffled = shuffle_time_series(
    df, 
    preserve_columns=['label'], 
    random_state=42
 ,
        strategy='global'
)# Save shuffled data to temporary file

temp_shuffled_path = BASE_DIR / "temp_t_ac_target_shuffled.csv"
df_shuffled.to_csv(temp_shuffled_path, index=False)

# Load and split shuffled data
X_train_shuffled, X_val_shuffled, X_test_shuffled, y_train_shuffled, y_val_shuffled, y_test_shuffled = load_and_split_data(
    temp_shuffled_path, split_val_size=0.2
)

# SVM on shuffled data
print("\n🤖 SVM Classification (Shuffled Data)")
start_time = time.time()
svm_accuracy_shuffled = svm_classifier(
    X_train_shuffled, X_test_shuffled, y_train_shuffled, y_test_shuffled,
    svm_C=1.0,
    svm_gamma='scale',
    svm_kernel='rbf',
    print_classification_report=True,
    print_confusion_matrix=True,
)
training_time = time.time() - start_time
print(f"⏱️  SVM training time: {training_time:.2f} seconds")

# LSTM on shuffled data
print("\n🧠 LSTM Classification (Shuffled Data)")
start_time = time.time()
lstm_accuracy_shuffled = lstm_classifier(
    X_train_shuffled,
    X_val_shuffled,
    X_test_shuffled,
    y_train_shuffled,
    y_val_shuffled,
    y_test_shuffled,
    epochs=100,
    hidden_size=128,
    num_layers=4,
    dropout_rate=0.2,
    learning_rate=0.001,
    batch_size=64,
    use_conv1d=True,
    use_attention=True,
    num_attention_heads=4,
    use_auxiliary=True,
)
training_time = time.time() - start_time
print(f"⏱️  LSTM training time: {training_time:.2f} seconds")

# Transformer on shuffled data
print("\n🤖 Transformer Classification (Shuffled Data)")
start_time = time.time()
transformer_accuracy_shuffled = transformer_classifier(
    X_train_shuffled,
    X_val_shuffled,
    X_test_shuffled,
    y_train_shuffled,
    y_val_shuffled,
    y_test_shuffled,
    d_model=128,
    nhead=8,
    num_layers=4,
    epochs=50,
    use_conv1d=True,
    use_auxiliary=True,
)
training_time = time.time() - start_time
print(f"⏱️  Transformer training time: {training_time:.2f} seconds")

# Store results
results_dict['parameter'].append('Autocorrelation Time')
results_dict['svm_original_accuracy'].append(svm_accuracy)
results_dict['svm_shuffled_accuracy'].append(svm_accuracy_shuffled)
results_dict['lstm_original_accuracy'].append(lstm_accuracy)
results_dict['lstm_shuffled_accuracy'].append(lstm_accuracy_shuffled)
results_dict['transformer_original_accuracy'].append(transformer_accuracy)
results_dict['transformer_shuffled_accuracy'].append(transformer_accuracy_shuffled)

# Clean up temporary files
os.remove(temp_path)
os.remove(temp_shuffled_path)

In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import sys
import os
from models.TF_transformer import TFTransformer, ModelCfg
from models.TF_pretraining import TFPretrainTransformer

In [2]:
#!/usr/bin/env python3
"""
Example usage of the enhanced TF_transformer.py with new training capabilities.

This script demonstrates:
1. Basic model creation and configuration
2. Self-supervised pretraining
3. Transfer learning to classification task
4. Model training with validation
"""

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import sys
import os

from models.TF_transformer import TFTransformer, ModelCfg
from models.TF_pretraining import TFPretrainTransformer

def create_synthetic_data(n_samples: int = 1000, seq_len: int = 50, n_classes: int = 2):
    """Create synthetic time series data for demonstration."""
    torch.manual_seed(42)
    
    # Generate synthetic time series with different patterns for different classes
    data = []
    labels = []
    
    for i in range(n_samples):
        t = torch.linspace(0, 4*torch.pi, seq_len)
        
        if i % n_classes == 0:  # Class 0: sine wave with noise
            signal = torch.sin(t) + 0.3 * torch.randn(seq_len)
            label = 0
        else:  # Class 1: cosine wave with different frequency
            signal = torch.cos(2*t) + 0.3 * torch.randn(seq_len)
            label = 1
            
        data.append(signal.unsqueeze(-1))  # Add feature dimension
        labels.append(label)
    
    return torch.stack(data), torch.tensor(labels)

def main():
    print("🚀 TF Transformer Enhanced Usage Example")
    print("=" * 50)
    
    # Configuration
    cfg = ModelCfg(
        n_classes=2,
        d_model=64,
        n_heads=4,
        n_layers=2,
        dropout=0.1,
        learning_rate=0.001,
        optimizer='AdamW',
        verbose=True
    )
    
    # Create synthetic data
    print("📊 Creating synthetic time series data...")
    X, y = create_synthetic_data(n_samples=800, seq_len=50, n_classes=2)
    
    # Split data
    train_size = int(0.7 * len(X))
    val_size = int(0.15 * len(X))
    
    X_train, y_train = X[:train_size], y[:train_size]
    X_val, y_val = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
    X_test, y_test = X[train_size+val_size:], y[train_size+val_size:]
    
    # Create DataLoaders
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    test_dataset = TensorDataset(X_test, y_test)
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    
    print(f"📈 Data split: {len(X_train)} train, {len(X_val)} val, {len(X_test)} test")
    
    # Step 1: Self-supervised pretraining
    print("\n🎭 Step 1: Self-supervised pretraining")
    print("-" * 30)
    
    base_model = TFTransformer(cfg)
    pretrain_model = TFPretrainTransformer(base_model, input_features=1)
    
    # Pretrain for a few epochs (in practice, you'd use more epochs)
    pretrain_history = pretrain_model.train_pretraining(
        train_loader, 
        epochs=5, 
        learning_rate=1e-3,
        save_path="pretrained_tf_transformer.pth"
    )
    
    # Step 2: Transfer learning
    print("\n🔄 Step 2: Transfer learning to classification")
    print("-" * 40)
    
    # Create new model for classification
    classification_model = TFTransformer(cfg)
    classification_model.load_pretrained_encoder(pretrain_model)
    
    # Freeze encoder for fine-tuning
    classification_model.freeze_encoder(freeze=True)
    classification_model.reset_classifier()
    
    # Step 3: Fine-tuning
    print("\n🏋️ Step 3: Fine-tuning on classification task")
    print("-" * 40)
    
    # Train the classification model
    history = classification_model.train_model(
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=20,
        patience=5,
        save_path="best_tf_classifier.pth"
    )
    
    # Step 4: Evaluation
    print("\n📊 Step 4: Model evaluation")
    print("-" * 25)
    
    # Evaluate on test set
    test_acc = classification_model.evaluate(test_loader)
    print(f"🎯 Test Accuracy: {test_acc:.4f}")
    
    # Example predictions
    print("\n🔮 Example predictions:")
    sample_batch = X_test[:5]  # First 5 test samples
    predictions = classification_model.predict(sample_batch)
    probabilities = classification_model.predict_proba(sample_batch)
    true_labels = y_test[:5]
    
    for i in range(5):
        pred_class = predictions[i].item()
        true_class = true_labels[i].item()
        prob = probabilities[i].max().item()
        print(f"  Sample {i+1}: Predicted={pred_class}, True={true_class}, Confidence={prob:.3f}")
    
    print("\n🎉 Example complete!")
    print("💡 Key features demonstrated:")
    print("   ✅ Self-supervised pretraining with masked reconstruction")
    print("   ✅ Transfer learning with encoder freezing")
    print("   ✅ Fine-tuning with early stopping")
    print("   ✅ Model evaluation and prediction")

if __name__ == "__main__":
    main()


🚀 TF Transformer Enhanced Usage Example
📊 Creating synthetic time series data...
📈 Data split: 560 train, 120 val, 120 test

🎭 Step 1: Self-supervised pretraining
------------------------------
✅ CUDA detected, using GPU: NVIDIA GeForce RTX 2080 SUPER
🔧 Initializing TFTransformer with config: ModelCfg(n_classes=2, d_model=64, n_heads=4, n_layers=2, d_ff=128, dropout=0.1, max_len=2048, verbose=True, learning_rate=0.001, optimizer='AdamW', gradient_clip=1.0, label_smoothing=0.1)
🖥️  Using device: cuda
🚀 CUDA available with 1 GPU(s)
   GPU 0: NVIDIA GeForce RTX 2080 SUPER
ℹ️  Single GPU detected, using single GPU training
🔧 Training setup complete with AdamW optimizer
📊 Model initialized with 67,266 total parameters (67,266 trainable)
🎭 TFPretrainTransformer initialized
   Input features: 1
   Reconstruction head parameters: 65
🎭 Starting self-supervised pretraining
   Epochs: 5
   Learning rate: 0.001
   Trainable parameters: 67,137
Epoch [1/5], Train Loss: 0.5536, Avg Masked: 7.4
Epoch 

In [None]:
from torch.utils.tensorboard import SummaryWriter
from models.TF_transformer import TFTransformer, ModelCfg



In [None]:
from torch.utils.tensorboard import SummaryWriter
import torch, torch.nn as nn, torch.optim as optim

model = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
opt = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

writer = SummaryWriter("runs/exp1")        # logs go here

global_step = 0
for epoch in range(5):
    for x, y in train_loader:               # your DataLoader
        opt.zero_grad()
        logits = model(x.view(x.size(0), -1))
        loss = loss_fn(logits, y)
        loss.backward()
        opt.step()

        writer.add_scalar("train/loss", loss.item(), global_step)
        global_step += 1

# Optional: log graph once
dummy = torch.randn(1, 784)
writer.add_graph(model, dummy)
writer.close()


2025-10-17 14:59:31.897231: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-17 14:59:31.897282: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-17 14:59:31.947441: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-17 14:59:32.060628: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

A module that was compiled using NumPy 1.x cannot be

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/ianyang/micromamba/envs/stochastic_sim/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/ianyang/micromamba/envs/stochastic_sim/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/ianyang/micromamba/envs/stochastic_sim/lib/python3.11/site-packages/ipykernel/kernelapp.py

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/home/ianyang/micromamba/envs/stochastic_sim/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/ianyang/micromamba/envs/stochastic_sim/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/ianyang/micromamba/envs/stochastic_sim/lib/python3.11/site-packages/ipykernel/kernelapp.py

AttributeError: _ARRAY_API not found

ImportError: numpy.core._multiarray_umath failed to import

NameError: name 'train_loader' is not defined

: 