# Train transformer model on transformed experimental time series data

Using the preprocessed time series data from the `transformed_exp_time_series_data/` folder:
- 19316_2020_10_26_steadystate_glucose_144m_2w2_00 (label: 0)
- 20213_2021_09_07_steady_0p01glc_1344_1346_1347_00 (label: 1)

Each file contains individual cell time series data ready for classification.

In [3]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns

# Load time series data from transformed_exp_time_series_data folder
data_path = "transformed_exp_time_series_data/"
csv_files = glob.glob(data_path + "*.csv")

print(f"Found {len(csv_files)} CSV files")

# Load and combine all time series data
all_sequences = []
all_labels = []

for file in csv_files:
    df = pd.read_csv(file)
    
    # Extract experiment type from filename for labeling
    if "19316_2020_10_26" in file:
        label = 0  # glucose condition
    elif "20213_2021_09_07" in file:
        label = 1  # low glucose condition
    else:
        continue  # skip other experiments for now
    
    # Convert to numpy array (excluding any non-numeric columns if present)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    sequence = df[numeric_cols].values
    
    all_sequences.append(sequence)
    all_labels.append(label)

X = np.array(all_sequences, dtype=object)
y = np.array(all_labels)

print(f"Number of sequences: {len(X)}")
print(f"Labels distribution: {np.bincount(y)}")
print(f"Sample sequence shape: {X[0].shape}")
print(f"Number of features per timepoint: {X[0].shape[1]}")

Found 12 CSV files
Number of sequences: 6
Labels distribution: [3 3]
Sample sequence shape: (241, 451)
Number of features per timepoint: 451
Number of sequences: 6
Labels distribution: [3 3]
Sample sequence shape: (241, 451)
Number of features per timepoint: 451


In [4]:
# Prepare time series sequences for transformer training
from sklearn.model_selection import train_test_split

# Use the data as-is since it's already properly formatted
sequence_lengths = [seq.shape[0] for seq in X]
print(f"Sequence lengths - Min: {min(sequence_lengths)}, Max: {max(sequence_lengths)}, Mean: {np.mean(sequence_lengths):.2f}")

Sequence lengths - Min: 71, Max: 275, Mean: 163.33


In [5]:
# Prepare fixed-length sequences for transformer
def pad_or_truncate_sequences(sequences, target_length=450):
    padded_sequences = []
    for seq in sequences:
        if len(seq) > target_length:
            padded_seq = seq[:target_length]
        else:
            padding = np.zeros((target_length - len(seq), seq.shape[1]))
            padded_seq = np.vstack([seq, padding])
        padded_sequences.append(padded_seq)
    return np.array(padded_sequences)

target_length = 450
X_padded = pad_or_truncate_sequences(X, target_length)

# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X_padded, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print(f"Training: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (6, 450) + inhomogeneous part.

In [None]:
# Train transformer classifier
from classifiers.transformer_classifier import transformer_classifier

hyperparams = {
    'input_size': X_train.shape[2],
    'd_model': 128,
    'nhead': 4,
    'num_layers': 2,
    'output_size': 2,
    'dropout_rate': 0.1,
    'learning_rate': 0.01,
    'batch_size': 64,
    'epochs': 50,
    'patience': 5,
    'optimizer': 'Adam',
    'use_conv1d': False,
    'use_auxiliary': False,
    'pooling_strategy': 'last',
    'use_mask': False,
    'gradient_clip': 1.0
}

test_accuracy = transformer_classifier(X_train, X_val, X_test, y_train, y_val, y_test, **hyperparams)
print(f"Transformer Test Accuracy: {test_accuracy:.4f}")

Target sequence length: 450
Padded sequences shape: (941, 450, 57)
Training set: (564, 450, 57), labels: [140 424]
Validation set: (188, 450, 57), labels: [ 46 142]
Test set: (189, 450, 57), labels: [ 47 142]

Feature ranges (first few features):
  Feature 0: min=0.00, max=164.00, mean=126.20
  Feature 1: min=0.00, max=1493.00, mean=718.41
  Feature 2: min=0.00, max=291.72, mean=88.37
  Feature 3: min=0.00, max=1144.25, mean=624.63
  Feature 4: min=0.00, max=1164.44, mean=600.34


In [None]:
# Benchmark all classifiers
from classifiers.svm_classifier import svm_classifier
from classifiers.random_forest_classifier import random_forest_classifier  
from classifiers.logistic_regression_classifier import logistic_regression_classifier
from classifiers.mlp_classifier import mlp_classifier
from classifiers.random_classifier import random_classifier

# Extract statistical features for traditional ML models
def extract_statistical_features(X):
    features = []
    for seq in X:
        seq_features = []
        for feature_idx in range(seq.shape[1]):
            feature_series = seq[:, feature_idx]
            non_zero_mask = feature_series != 0
            if np.any(non_zero_mask):
                clean_series = feature_series[non_zero_mask]
                seq_features.extend([np.mean(clean_series), np.std(clean_series), 
                                   np.min(clean_series), np.max(clean_series), np.median(clean_series)])
            else:
                seq_features.extend([0, 0, 0, 0, 0])
        features.append(seq_features)
    return np.array(features)

X_train_features = extract_statistical_features(X_train)
X_test_features = extract_statistical_features(X_test)
X_val_features = extract_statistical_features(X_val)

# Train and evaluate all models
results = {'Transformer': test_accuracy}
results['SVM (RBF)'] = svm_classifier(X_train_features, X_test_features, y_train, y_test, svm_kernel='rbf')
results['SVM (Linear)'] = svm_classifier(X_train_features, X_test_features, y_train, y_test, svm_kernel='linear')
results['Random Forest'] = random_forest_classifier(X_train_features, X_test_features, y_train, y_test)
results['Logistic Regression'] = logistic_regression_classifier(X_train_features, X_test_features, y_train, y_test)
results['MLP'] = mlp_classifier(X_train_features, X_val_features, X_test_features, y_train, y_val, y_test)
results['Random'] = random_classifier(y_test)

# Display results
for model, acc in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"{model}: {acc:.4f}")

# Simple visualization
plt.figure(figsize=(10, 6))
models = list(results.keys())
accuracies = list(results.values())
plt.bar(models, accuracies)
plt.title('Model Performance Comparison')
plt.ylabel('Test Accuracy')
plt.xticks(rotation=45)
plt.show()

Hyperparameters:
  input_size: 57
  d_model: 128
  nhead: 4
  num_layers: 2
  output_size: 2
  dropout_rate: 0.1
  learning_rate: 0.01
  batch_size: 64
  epochs: 50
  patience: 5
  optimizer: Adam
  use_conv1d: False
  use_auxiliary: False
  pooling_strategy: last
  use_mask: False
  gradient_clip: 1.0
  save_path: /home/ianyang/stochastic_simulations/experiments/EXP-25-IY010/IY010A_transformer.pt

Starting transformer training...
✅ Model saved at /home/ianyang/stochastic_simulations/experiments/EXP-25-IY010/IY010A_transformer.pt (Best Val Acc: 0.4628)
✅ Model saved at /home/ianyang/stochastic_simulations/experiments/EXP-25-IY010/IY010A_transformer.pt (Best Val Acc: 0.8617)
✅ Model saved at /home/ianyang/stochastic_simulations/experiments/EXP-25-IY010/IY010A_transformer.pt (Best Val Acc: 0.9787)
✅ Model saved at /home/ianyang/stochastic_simulations/experiments/EXP-25-IY010/IY010A_transformer.pt (Best Val Acc: 0.9840)
✅ Model saved at /home/ianyang/stochastic_simulations/experiments/EXP