# Finetuning the LSTM model architecture, then hyperparameters

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import glob
import os
from utils.set_seed import set_seed
from utils.load_data import load_and_split_data
from utils.steady_state import find_steady_state, save_steady_state 
from visualisation.plots import *
from stats.report import statistical_report
from stats.autocorrelation import autocrosscorr
from models.lstm import LSTMClassifier
import ast 
%load_ext autoreload
%autoreload 2

## Data Prep
We're using some mRNA trajectories that have the same mean, variance and autocorrelations

In [3]:
# Read in the accuracy data and check which parameter set we want
df_acc_results = pd.read_csv("data/accuracy_results_12_04_2025.csv")
# The ratio of interest is 1, which is stored as 0.9999999999999996 because of
# floating-point round-off. An exact `==` stops matching as soon as the last bit differs
# (e.g. after the results are regenerated), so select the rows with a tight absolute
# tolerance around 1 instead.
is_unit_variance_ratio = np.isclose(df_acc_results['Variance Ratio'], 1.0, rtol=0.0, atol=1e-9)
df_acc_results[is_unit_variance_ratio]

Unnamed: 0,Parameter Sets,Stats,Variance Ratio,SVM (rbf) Accuracy,SVM (linear) Accuracy,Random Forest Accuracy,Logistic Regression Accuracy,MLP Accuracy,Random Classifier Accuracy,LSTM Conv1D 4-Head Attention Accuracy,LSTM Conv1D Accuracy
900,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.956060606060607, 'Stresse...",1.0,0.475,0.475,0.5125,0.525,0.525,0.45,0.5,0.4125
901,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
902,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
903,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
904,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
905,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
906,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
907,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
908,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125
909,"[{'sigma_u': 18.0, 'sigma_b': 0.05910239842216...","({'Stressed Mean': 9.716704545454546, 'Stresse...",1.0,0.4125,0.45,0.475,0.4625,0.475,0.45,0.4875,0.4125


In [6]:
##### Gather the per-file steady-state trajectories into one shuffled dataset
# Sorted so that the concatenation order (and therefore the seeded shuffle) is reproducible
file_paths = sorted(glob.glob('data/mRNA_trajectories_variance_1199_1200/steady_state_trajectories/m_traj_*.csv'))

# Load every file, stack the rows under a fresh 0..N-1 index,
# then shuffle with a fixed seed and renumber the rows
dfs = list(map(pd.read_csv, file_paths))
combined_df = (
    pd.concat(dfs, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the combined dataset (without the index column) and report its size
combined_df.to_csv(
    'data/mRNA_trajectories_variance_1199_1200/combined_traj_1199_1200_SS.csv',
    index=False,
)
print(f"✅ Combined {len(file_paths)} files into {combined_df.shape[0]} rows.")


✅ Combined 10 files into 4000 rows.


In [7]:
##### Gather the per-file steady-state trajectories into one shuffled dataset
# Sorted so that the concatenation order (and therefore the seeded shuffle) is reproducible
file_paths = sorted(glob.glob('data/mRNA_trajectories_variance_1211_1200/steady_state_trajectories/m_traj_*.csv'))

# Load every file, stack the rows under a fresh 0..N-1 index,
# then shuffle with a fixed seed and renumber the rows
dfs = list(map(pd.read_csv, file_paths))
combined_df = (
    pd.concat(dfs, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the combined dataset (without the index column) and report its size
combined_df.to_csv(
    'data/mRNA_trajectories_variance_1211_1200/combined_traj_1211_1200_SS.csv',
    index=False,
)
print(f"✅ Combined {len(file_paths)} files into {combined_df.shape[0]} rows.")


✅ Combined 10 files into 4000 rows.


In [None]:
######## Get only the steady state series and save them (if your data is not already in steady state)
# Path relative to the notebook's working directory, like the other data paths in this
# notebook, rather than a machine-specific absolute /home/... path.
output_file = 'data/mRNA_trajectories_variance_1199_1200/combined_traj_1199_1200.csv'
df_results = pd.read_csv(output_file)
time_points = np.arange(0, 144.0, 1.0)  # Time range: start, stop, step

# Extract the parameter sets recorded for the rows whose variance ratio is 1.
# The ratio is stored as 0.9999999999999996 (round-off), so match it with a tight
# tolerance instead of an exact float comparison.
unit_ratio_rows = df_acc_results[
    np.isclose(df_acc_results['Variance Ratio'], 1.0, rtol=0.0, atol=1e-9)
]
parameter_sets = unit_ratio_rows['Parameter Sets'].unique()
if len(parameter_sets) == 0:
    # Fail with a clear message instead of an IndexError on parameter_sets[0]
    raise ValueError("No rows with a variance ratio of 1 found in df_acc_results.")
# The column stores the parameter sets as text; parse the first distinct entry
parameter_sets = ast.literal_eval(parameter_sets[0])

# If save_path is a directory, the series is saved inside it as "{filename}_SS.csv";
# if it is a file name (e.g. 'data/steady_state_series/example_steady_state_series.csv'),
# the series is saved as that file.
save_path = 'data/steady_state_series/'
remaining_time_points, steady_state_series = save_steady_state(
    output_file, parameter_sets, time_points,
    save_path=save_path,  # reuse the variable so the next cell looks in the same place
)


Steady state series saved to data/steady_state_series/combined_traj_1199_1200_SS.csv


In [None]:
# Locate the steady-state CSV saved earlier: "<input file stem>_SS.csv" inside save_path
input_stem = os.path.splitext(os.path.basename(output_file))[0]
steady_state_file = os.path.join(save_path, f"{input_stem}_SS.csv")

# Split the SSA data for LSTM training; split_val_size must be given to obtain a validation set
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(
    steady_state_file,
    split_val_size=0.2,
)

# Standardise: fit the scaler on the training split, then apply it to the other splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val, X_test = scaler.transform(X_val), scaler.transform(X_test)

# The LSTM expects (batch_size, seq_len, num_features); append a feature axis of size 1
X_train = X_train.reshape(*X_train.shape, 1)
X_val = X_val.reshape(*X_val.shape, 1)
X_test = X_test.reshape(*X_test.shape, 1)

# Tensors: float32 inputs, int64 class labels
X_train_tensor, X_val_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_val)
)
y_train_tensor, y_val_tensor = (
    torch.tensor(labels, dtype=torch.long) for labels in (y_train, y_val)
)

# Datasets and loaders; only the training loader shuffles
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)

## Understand the Data (EDA)
Before architecture refinement or hyperparameter finetuning, let's understand the data

See ``LSTM_EDA.ipynb``

## LSTM Architecture Refinement
We will compare the following refinements to the LSTM architecture:
- Basic LSTM
- Convolutional Layer
- Attention Mechanism
- Multi-Head Attention (default: 4)
- Multi-Tasking: 
    - 🎯 Primary task: class label (via CrossEntropyLoss)
    - 🧮 Auxiliary task: number of peaks (via MSELoss)

In [6]:
################# LSTM Model #################
# Settings for the baseline bidirectional LSTM (no attention, no auxiliary task)
input_size = X_train.shape[2]                    # size of the last axis: one value per time step
hidden_size = 128                                # number of LSTM units
num_layers = 3                                   # number of stacked LSTM layers
output_size = len(torch.unique(y_train_tensor))  # number of distinct class labels
dropout_rate = 0.2
learning_rate = 0.01

model = LSTMClassifier(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
    dropout_rate=dropout_rate,
    learning_rate=learning_rate,
    optimizer='Adam',
    bidirectional=True,
    use_attention=False,
    num_attention_heads=4,
    use_auxiliary=False,
)

# Train for at most 50 epochs with patience=10
# (pass save_path='best_lstm_model.pt' to train_model to also save the model)
history = model.train_model(
    train_loader,
    val_loader=val_loader,
    epochs=50,
    patience=10,
)

# Build the test loader: float32 inputs shaped (n_samples, seq_len, 1), int64 labels
X_test_tensor = torch.tensor(
    X_test.reshape(X_test.shape[0], X_test.shape[1], 1),
    dtype=torch.float32,
)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=64)

# Evaluate on the held-out test split
test_acc = model.evaluate(test_loader)
print(f"✅ Test accuracy: {test_acc:.4f}")

🔄 Using device: cuda (1 GPUs available)
DEBUG: Optimizer initialized? True
✅ Running on CUDA!


  return self._call_impl(*args, **kwargs)


Epoch [1/50], Loss: 0.6947, Train Acc: 0.4906
Validation Acc: 0.5000
Epoch [2/50], Loss: 0.6937, Train Acc: 0.5000
Validation Acc: 0.5000
No improvement (1/10).
Epoch [3/50], Loss: 0.6934, Train Acc: 0.5008
Validation Acc: 0.5000
No improvement (2/10).
Epoch [4/50], Loss: 0.6933, Train Acc: 0.5102
Validation Acc: 0.5000
No improvement (3/10).
Epoch [5/50], Loss: 0.6938, Train Acc: 0.4988
Validation Acc: 0.5000
No improvement (4/10).
Epoch [6/50], Loss: 0.6945, Train Acc: 0.4805
Validation Acc: 0.5000
No improvement (5/10).
Epoch [7/50], Loss: 0.6935, Train Acc: 0.4863
Validation Acc: 0.5000
No improvement (6/10).
Epoch [8/50], Loss: 0.6936, Train Acc: 0.5074
Validation Acc: 0.5000
No improvement (7/10).
Epoch [9/50], Loss: 0.6935, Train Acc: 0.4984
Validation Acc: 0.5000
No improvement (8/10).
Epoch [10/50], Loss: 0.6935, Train Acc: 0.4914
Validation Acc: 0.5000
No improvement (9/10).
Epoch [11/50], Loss: 0.6936, Train Acc: 0.4949
Validation Acc: 0.5000
No improvement (10/10).
Stopping

We'll run multiple models with combinations of features to measure their individual and combined impact:

Model #	Conv1D	Attention	Multi-Head	Multi-Task	Description
1	❌	❌	❌	❌	Vanilla LSTM
2	✅	❌	❌	❌	Conv1D only
3	✅	✅	❌	❌	Conv1D + basic attention
4	✅	✅	✅	❌	Conv1D + multi-head attention
5	✅	✅	✅	✅	Full model (Conv1D + MHA + Multi)

<span style='color:red'>Don't run this cell in the notebook; it will take a long time.</span>

In [None]:
# Architecture ablation, intentionally disabled: the whole script is wrapped in a
# triple-quoted string, so running this cell only evaluates a string literal and trains
# nothing. Remove the surrounding quotes (or copy the body into a script) to run it.
#
# What the script does: for each of the five configurations in `configs` it trains a fresh
# LSTMClassifier `num_runs` (10) times with epochs=50 and patience=10, then reports the
# mean and standard deviation of the test accuracy together with the means of the
# last-epoch validation accuracy, train accuracy and train loss and the mean wall-clock
# time per run.
#
# NOTE(review): `val_accs` receives None when the history has no 'val_acc' entries;
# np.mean over a list containing None would then fail. Confirm that train_model always
# records 'val_acc' when a validation loader is passed.
'''
import torch
import numpy as np
import time
from models.lstm import LSTMClassifier
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from utils.load_data import load_and_split_data 

output_file = '~/stochastic_simulations/notebooks/data/steady_state_series/combined_traj_1199_1200_SS.csv'
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(output_file, split_val_size=0.2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

def to_tensor(data, labels):
    return TensorDataset(torch.tensor(data, dtype=torch.float32),
                         torch.tensor(labels, dtype=torch.long))

train_loader = DataLoader(to_tensor(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(to_tensor(X_val, y_val), batch_size=64)
test_loader = DataLoader(to_tensor(X_test, y_test), batch_size=64)

configs = [
    {"name": "Vanilla LSTM", "conv1d": False, "attention": False, "multihead": False, "aux": False},
    {"name": "Conv1D Only", "conv1d": True,  "attention": False, "multihead": False, "aux": False},
    {"name": "Conv1D + Attention", "conv1d": True,  "attention": True, "multihead": False, "aux": False},
    {"name": "Conv1D + MultiHead", "conv1d": True,  "attention": True, "multihead": True,  "aux": False},
    {"name": "Full Model", "conv1d": True,  "attention": True, "multihead": True,  "aux": True},
]

results = []
num_runs = 10

for cfg in configs:
    print(f"\n=== Repeated Training for {cfg['name']} ===")
    test_accs, val_accs, train_accs, train_losses, times = [], [], [], [], []

    for run in range(num_runs):
        print(f"Run {run + 1}/{num_runs}")
        start = time.time()

        model = LSTMClassifier(
            input_size=1,
            hidden_size=128,
            num_layers=3,
            output_size=len(np.unique(y_train)),
            dropout_rate=0.2,
            use_attention=cfg['attention'],
            num_attention_heads=4 if cfg['multihead'] else 1,
            use_auxiliary=cfg['aux'],
            use_conv1d=cfg['conv1d']
        )

        history = model.train_model(train_loader, val_loader, epochs=50, patience=10)
        test_acc = model.evaluate(test_loader)
        duration = time.time() - start

        test_accs.append(test_acc)
        val_accs.append(history['val_acc'][-1] if 'val_acc' in history and history['val_acc'] else None)
        train_accs.append(history['train_acc'][-1])
        train_losses.append(history['train_loss'][-1])
        times.append(duration)

    results.append({
        "model": cfg['name'],
        "test_acc_mean": np.mean(test_accs),
        "test_acc_std": np.std(test_accs),
        "val_acc_mean": np.mean(val_accs),
        "train_acc_mean": np.mean(train_accs),
        "train_loss_mean": np.mean(train_losses),
        "time_mean": np.mean(times)
    })

print("\n=== Averaged Experiment Summary ===")
for r in results:
    print(f"{r['model']}: Test Acc = {r['test_acc_mean']:.4f} ± {r['test_acc_std']:.4f}, Val Acc = {r['val_acc_mean']:.4f}, Train Acc = {r['train_acc_mean']:.4f}, Time = {r['time_mean']:.2f}s")
'''


=== Repeated Training for Vanilla LSTM ===
Run 1/10
🔄 Using device: cuda (1 GPUs available)
DEBUG: Optimizer initialized? True
✅ Running on CUDA!


  return self._call_impl(*args, **kwargs)


Epoch [1/50], Loss: 0.6936, Train Acc: 0.4891
Validation Acc: 0.5000
Epoch [2/50], Loss: 0.6933, Train Acc: 0.4898
Validation Acc: 0.4984
No improvement (1/10).
Epoch [3/50], Loss: 0.6934, Train Acc: 0.5035
Validation Acc: 0.5000
No improvement (2/10).
Epoch [4/50], Loss: 0.6931, Train Acc: 0.5008
Validation Acc: 0.5000
No improvement (3/10).
Epoch [5/50], Loss: 0.6932, Train Acc: 0.5078
Validation Acc: 0.5219
Epoch [6/50], Loss: 0.6935, Train Acc: 0.4984
Validation Acc: 0.5125
No improvement (1/10).
Epoch [7/50], Loss: 0.6923, Train Acc: 0.5234
Validation Acc: 0.5375
Epoch [8/50], Loss: 0.6898, Train Acc: 0.5336
Validation Acc: 0.5312
No improvement (1/10).
Epoch [9/50], Loss: 0.6882, Train Acc: 0.5391
Validation Acc: 0.5391
Epoch [10/50], Loss: 0.6828, Train Acc: 0.5570
Validation Acc: 0.5312
No improvement (1/10).
Epoch [11/50], Loss: 0.6841, Train Acc: 0.5590
Validation Acc: 0.5266
No improvement (2/10).
Epoch [12/50], Loss: 0.6870, Train Acc: 0.5453
Validation Acc: 0.5391
No impro

: 

=== Averaged Experiment Summary ===

Vanilla LSTM: Test Acc = 0.5844 ± 0.0900, Val Acc = 0.5792, Train Acc = 0.5673, Time = 29.10s
Conv1D Only: Test Acc = 0.8371 ± 0.1027, Val Acc = 0.8313, Train Acc = 0.8427, Time = 56.13s
Conv1D + Attention: Test Acc = 0.7528 ± 0.2077, Val Acc = 0.7548, Train Acc = 0.7298, Time = 61.83s
Conv1D + MultiHead: Test Acc = 0.7089 ± 0.2105, Val Acc = 0.7033, Train Acc = 0.6877, Time = 60.08s
Full Model: Test Acc = 0.6219 ± 0.1593, Val Acc = 0.6209, Train Acc = 0.6209, Time = 54.76s

## Hyperparameter Finetuning
Now that a 'best-performing' model architecture is clear, we can proceed to hyperparameter finetuning

<span style='color:red'>Don't run this cell in the notebook; it will take a long time.</span>

In [None]:
# Conv1D-LSTM grid search, intentionally disabled: the whole script is wrapped in a
# triple-quoted string, so running this cell only evaluates a string literal. Remove the
# surrounding quotes (or copy the body into a script) to run it.
#
# The script trains one Conv1D LSTMClassifier per hyperparameter combination
# (3 hidden sizes x 3 depths x 5 dropout rates x 4 learning rates x 3 batch sizes = 540),
# appends each result to the CSV as soon as it is available, and finally prints the ten
# configurations with the highest test accuracy.
#
# Fix relative to the earlier version: the model is no longer wrapped in nn.DataParallel.
# The wrapper does not forward custom methods, so on a multi-GPU host
# `model.train_model(...)` / `model.evaluate(...)` raised AttributeError.
'''
import itertools
import torch
import torch.nn as nn
import numpy as np
import time
import csv
from models.lstm import LSTMClassifier
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from utils.load_data import load_and_split_data
from tqdm import tqdm

# Load and preprocess data
output_file = 'data/combined_traj_1199_1200_SS.csv'
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(output_file, split_val_size=0.2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

def to_tensor(data, labels):
    return TensorDataset(torch.tensor(data, dtype=torch.float32),
                         torch.tensor(labels, dtype=torch.long))

# Grid search space
hidden_sizes = [64, 128, 256]
num_layers_list = [2, 3, 4]
dropout_rates = [0.01, 0.05, 0.1, 0.2, 0.3]
learning_rates = [0.01, 1e-3, 1e-4, 5e-4]
batch_sizes = [32, 64, 128]

param_grid = list(itertools.product(hidden_sizes, 
                                    num_layers_list, 
                                    dropout_rates, 
                                    learning_rates, 
                                    batch_sizes))

csv_columns = ["hidden_size", "num_layers", "dropout_rate", "learning_rate", "batch_size", "train_acc", "val_acc", "test_acc", "time"]
results_file = "lstm_conv1d_gridsearch_results_10_04_2025.csv"

# Initialize CSV file and write header
with open(results_file, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()

# Grid search with tqdm
results = []

for hidden_size, num_layers, dropout_rate, lr, batch_size in tqdm(param_grid, desc="Grid Search"):
    train_loader = DataLoader(to_tensor(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(to_tensor(X_val, y_val), batch_size=batch_size)
    test_loader = DataLoader(to_tensor(X_test, y_test), batch_size=batch_size)

    start = time.time()
    model = LSTMClassifier(
        input_size=1,
        hidden_size=hidden_size,
        num_layers=num_layers,
        output_size=len(np.unique(y_train)),
        dropout_rate=dropout_rate,
        learning_rate=lr,
        use_conv1d=True,
        use_attention=False,
        use_auxiliary=False
    )

    # Do NOT wrap the model in nn.DataParallel here: the wrapper only forwards forward(),
    # so train_model()/evaluate() would raise AttributeError whenever more than one GPU
    # is visible.
    history = model.train_model(train_loader, val_loader, epochs=50, patience=10)
    test_acc = model.evaluate(test_loader)
    duration = time.time() - start

    result = {
        "hidden_size": hidden_size,
        "num_layers": num_layers,
        "dropout_rate": dropout_rate,
        "learning_rate": lr,
        "batch_size": batch_size,
        "train_acc": history['train_acc'][-1],
        "val_acc": history['val_acc'][-1] if 'val_acc' in history and history['val_acc'] else None,
        "test_acc": test_acc,
        "time": duration
    }

    results.append(result)

    with open(results_file, "a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writerow(result)

# Sort results by test accuracy
results = sorted(results, key=lambda x: x['test_acc'], reverse=True)

print("\n=== Top Configs ===")
for r in results[:10]:
    print(f"Test Acc: {r['test_acc']:.4f}, Val Acc: {r['val_acc']}, Train Acc: {r['train_acc']:.4f}, Time: {r['time']:.2f}s")
    print(f"  Params: Hidden={r['hidden_size']}, Layers={r['num_layers']}, Dropout={r['dropout_rate']}, LR={r['learning_rate']}, Batch={r['batch_size']}")
'''

Let's take a look at the results

In [9]:
# Load the saved grid-search results and rank them by test accuracy, best first.
# NOTE(review): ranking by test_acc uses the test split for model selection; consider
# ranking by val_acc so the test accuracy stays an unbiased estimate.
lstm_conv1_gridsearch_results = (
    pd.read_csv("data/lstm_conv1d_gridsearch_results_10_04_2025.csv")
    .sort_values(by='test_acc', ascending=False)
)
lstm_conv1_gridsearch_results.head(10)

Unnamed: 0,hidden_size,num_layers,dropout_rate,learning_rate,batch_size,train_acc,val_acc,test_acc,time
3,64,2,0.01,0.001,32,0.978906,0.967187,0.975,24.412648
88,64,3,0.1,0.001,64,0.972266,0.9625,0.9675,18.737075
159,64,4,0.2,0.001,32,0.971094,0.971875,0.95875,33.564914
183,128,2,0.01,0.001,32,0.955469,0.94375,0.95375,47.279711
15,64,2,0.05,0.001,32,0.958594,0.967187,0.9525,24.447744
417,256,2,0.3,0.0005,32,0.930078,0.95,0.95,172.156268
231,128,2,0.3,0.001,32,0.892969,0.925,0.94875,50.619891
160,64,4,0.2,0.001,64,0.946094,0.928125,0.9475,25.75324
40,64,2,0.2,0.001,64,0.940234,0.951562,0.9475,13.369877
471,256,3,0.3,0.001,32,0.962109,0.95,0.945,274.888041


In [20]:
# Hyperparameter columns of the top-ranked grid-search row
hyperparameter_columns = ['hidden_size', 'num_layers', 'dropout_rate', 'learning_rate', 'batch_size']
best_params = lstm_conv1_gridsearch_results.iloc[0].loc[hyperparameter_columns]
best_params

hidden_size      64.000
num_layers        2.000
dropout_rate      0.010
learning_rate     0.001
batch_size       32.000
Name: 3, dtype: float64

In [25]:
# Load and preprocess data
output_file = 'data/steady_state_series/combined_traj_1199_1200_SS.csv'
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(output_file, split_val_size=0.2)

# Standardise: fit on the training split only, then apply to validation and test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Append a feature axis of size 1: (n_samples, seq_len) -> (n_samples, seq_len, 1)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

def to_tensor(data, labels):
    """Wrap features and labels in a TensorDataset (float32 inputs, int64 labels)."""
    return TensorDataset(torch.tensor(data, dtype=torch.float32),
                         torch.tensor(labels, dtype=torch.long))

# Best hyperparameters from the grid search. `best_params` is a float64 Series, so cast
# back to plain Python types (sizes and counts must be ints).
hidden_size = int(best_params['hidden_size'])
num_layers = int(best_params['num_layers'])
dropout_rate = float(best_params['dropout_rate'])
lr = float(best_params['learning_rate'])
batch_size = int(best_params['batch_size'])

# Data loaders using the selected batch size; only the training loader shuffles
train_loader = DataLoader(to_tensor(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(to_tensor(X_val, y_val), batch_size=batch_size)
test_loader = DataLoader(to_tensor(X_test, y_test), batch_size=batch_size)

model = LSTMClassifier(
    input_size=1,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=len(np.unique(y_train)),
    dropout_rate=dropout_rate,
    learning_rate=lr,
    use_conv1d=True,
    use_attention=False,
    use_auxiliary=False
)

# Do NOT wrap the model in nn.DataParallel here: the wrapper only forwards forward(),
# so the custom train_model()/evaluate() methods called below would raise AttributeError
# whenever more than one GPU is visible.
history = model.train_model(train_loader, val_loader, epochs=50, patience=10)
test_acc = model.evaluate(test_loader)
print(f"✅ Test accuracy: {test_acc:.4f}")

🔄 Using device: cuda (1 GPUs available)
DEBUG: Optimizer initialized? True
✅ Running on CUDA!


  return self._call_impl(*args, **kwargs)


Epoch [1/50], Loss: 0.6928, Train Acc: 0.5121
Validation Acc: 0.5281
Epoch [2/50], Loss: 0.6846, Train Acc: 0.5473
Validation Acc: 0.5563
Epoch [3/50], Loss: 0.6715, Train Acc: 0.5648
Validation Acc: 0.5266
No improvement (1/10).
Epoch [4/50], Loss: 0.6627, Train Acc: 0.5789
Validation Acc: 0.5500
No improvement (2/10).
Epoch [5/50], Loss: 0.6539, Train Acc: 0.6051
Validation Acc: 0.5781
Epoch [6/50], Loss: 0.6510, Train Acc: 0.6094
Validation Acc: 0.5766
No improvement (1/10).
Epoch [7/50], Loss: 0.6661, Train Acc: 0.6016
Validation Acc: 0.5891
Epoch [8/50], Loss: 0.6482, Train Acc: 0.6184
Validation Acc: 0.6078
Epoch [9/50], Loss: 0.6325, Train Acc: 0.6398
Validation Acc: 0.6203
Epoch [10/50], Loss: 0.6195, Train Acc: 0.6613
Validation Acc: 0.6344
Epoch [11/50], Loss: 0.6161, Train Acc: 0.6539
Validation Acc: 0.6109
No improvement (1/10).
Epoch [12/50], Loss: 0.6177, Train Acc: 0.6383
Validation Acc: 0.6344
No improvement (2/10).
Epoch [13/50], Loss: 0.5927, Train Acc: 0.6820
Validat

## Model Selection + Hyperparameter Refinement

Why don't we do model selection and hyperparameter tuning together?

<span style='color:red'>Don't run the following code; it will take a very long time, even on an HPC!</span>

In [None]:
# Joint architecture + hyperparameter sweep, intentionally disabled: the whole script is
# wrapped in a triple-quoted string, so running this cell only evaluates a string literal.
# Remove the surrounding quotes (or copy the body into a script) to run it.
#
# What the script does:
#   * sweeps 5 architectures x 1,620 hyperparameter combinations
#     (3 hidden sizes x 3 depths x 5 dropout rates x 4 learning rates x 3 batch sizes
#     x 3 epoch budgets), training each combination `num_runs` (10) times;
#   * before training a combination, claim_config() appends a placeholder row with
#     train_acc == "IN_PROGRESS" to IY001A.csv and skips combinations that already have
#     a row, so an interrupted sweep can be resumed against the same file;
#   * after the runs, the placeholder row is overwritten with the mean and standard
#     deviation of the test accuracy.
#
# NOTE(review): claim_config() reads the CSV and then appends without any locking, so
# two processes started at the same time could claim the same combination.
# NOTE(review): the stored row is a copy of the first run's stats with only test_acc and
# test_acc_std replaced, i.e. train_acc, val_acc and time come from run 1, not an average.
# NOTE(review): if CUDA runs out of memory before any run finishes, the placeholder row
# stays "IN_PROGRESS" and later passes skip that combination.
'''
import itertools
import torch
import torch.nn as nn
import numpy as np
import time
import csv
import os
import pandas as pd
from models.lstm import LSTMClassifier
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from utils.load_data import load_and_split_data
from tqdm import tqdm
import multiprocessing

# Auto-detect number of CPUs allocated by Grid Engine
num_workers = max(1, int(os.environ.get("NSLOTS", multiprocessing.cpu_count()) ) // 2)
print(f"Using {num_workers} workers for DataLoader.")

# Load and preprocess data
output_file = 'data/combined_traj_1199_1200_SS.csv'
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(output_file, split_val_size=0.2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

def to_tensor(data, labels):
    return TensorDataset(torch.tensor(data, dtype=torch.float32),
                         torch.tensor(labels, dtype=torch.long))

# Architecture configs to sweep
architectures = [
    {"name": "Vanilla LSTM", "conv1d": False, "attention": False, "multihead": False, "aux": False},
    {"name": "Conv1D Only", "conv1d": True,  "attention": False, "multihead": False, "aux": False},
    {"name": "Conv1D + Attention", "conv1d": True,  "attention": True, "multihead": False, "aux": False},
    {"name": "Conv1D + MultiHead", "conv1d": True,  "attention": True, "multihead": True,  "aux": False},
    {"name": "Full Model", "conv1d": True,  "attention": True, "multihead": True,  "aux": True},
]

# Grid search space
hidden_sizes = [64, 128, 256]
num_layers_list = [2, 3, 4]
dropout_rates = [0.01, 0.05, 0.1, 0.2, 0.3]
learning_rates = [0.01, 1e-3, 1e-4, 5e-4]
batch_sizes = [32, 64, 128]
epochs_list = [50, 100, 150]

param_grid = list(itertools.product(hidden_sizes,
                                    num_layers_list,
                                    dropout_rates,
                                    learning_rates,
                                    batch_sizes,
                                    epochs_list))

csv_columns = ["architecture", "hidden_size", "num_layers", "dropout_rate", "learning_rate", "batch_size", "epochs", "train_acc", "val_acc", "test_acc", "test_acc_std", "time"]
results_file = "IY001A.csv"

# Initialize CSV file if it doesn't exist
if not os.path.exists(results_file) or os.path.getsize(results_file) == 0:
    with open(results_file, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writeheader()

# Claim a config for exclusive access
def claim_config(config_key):
    """Claim a config by writing a placeholder row with 'IN_PROGRESS' if not yet taken."""
    if os.path.exists(results_file):
        df = pd.read_csv(results_file)
        keys = set((
            row["architecture"],
            row["hidden_size"],
            row["num_layers"],
            row["dropout_rate"],
            row["learning_rate"],
            row["batch_size"],
            row["epochs"]
        ) for _, row in df.iterrows())

        if config_key in keys:
            return False

    # Claim it with a placeholder
    with open(results_file, "a", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writerow({
            "architecture": config_key[0],
            "hidden_size": config_key[1],
            "num_layers": config_key[2],
            "dropout_rate": config_key[3],
            "learning_rate": config_key[4],
            "batch_size": config_key[5],
            "epochs": config_key[6],
            "train_acc": "IN_PROGRESS",
            "val_acc": None,
            "test_acc": None,
            "test_acc_std": None,
            "time": None
        })

    return True

num_runs = 10
results = []

# Grid search loop
for cfg in tqdm(architectures):
    print(f"\n=== Sweeping Architecture: {cfg['name']} ===")

    for hidden_size, num_layers, dropout_rate, lr, batch_size, epochs in tqdm(param_grid, desc="Grid Search"):
        config_key = (cfg['name'], hidden_size, num_layers, dropout_rate, lr, batch_size, epochs)

        if not claim_config(config_key):
            continue  # Already completed or in progress

        print(f"Testing: Hidden={hidden_size}, Layers={num_layers}, Dropout={dropout_rate}, LR={lr}, Batch={batch_size}, Epochs={epochs}")
        acc_runs = []
        run_stats = []

        for run in range(num_runs):
            print(f"Run {run + 1}/{num_runs}")
            train_loader = DataLoader(to_tensor(X_train, y_train), batch_size=batch_size, shuffle=True, num_workers=num_workers) # multi-threading data loading
            val_loader = DataLoader(to_tensor(X_val, y_val), batch_size=batch_size,num_workers=num_workers)
            test_loader = DataLoader(to_tensor(X_test, y_test), batch_size=batch_size,num_workers=num_workers)

            model = LSTMClassifier(
                input_size=1,
                hidden_size=hidden_size,
                num_layers=num_layers,
                output_size=len(np.unique(y_train)),
                dropout_rate=dropout_rate,
                learning_rate=lr,
                use_conv1d=cfg['conv1d'],
                use_attention=cfg['attention'],
                num_attention_heads=4 if cfg['multihead'] else 1,
                use_auxiliary=cfg['aux']
            )

            try:
                start = time.time()
                history = model.train_model(train_loader, val_loader, epochs=epochs, patience=10)
                test_acc = model.evaluate(test_loader)
                duration = time.time() - start

                acc_runs.append(test_acc)
                run_stats.append({
                    "architecture": cfg['name'],
                    "hidden_size": hidden_size,
                    "num_layers": num_layers,
                    "dropout_rate": dropout_rate,
                    "learning_rate": lr,
                    "batch_size": batch_size,
                    "epochs": epochs,
                    "train_acc": history['train_acc'][-1],
                    "val_acc": history['val_acc'][-1] if 'val_acc' in history and history['val_acc'] else None,
                    "test_acc": test_acc,
                    "test_acc_std": None,
                    "time": duration
                })

            except RuntimeError as e:
                if "out of memory" in str(e):
                    print("CUDA OOM error caught. Skipping this config.")
                    torch.cuda.empty_cache()
                    break
                else:
                    raise

        if acc_runs:
            mean_acc = np.mean(acc_runs)
            std_acc = np.std(acc_runs)

            avg_result = run_stats[0].copy()
            avg_result["test_acc"] = mean_acc
            avg_result["test_acc_std"] = std_acc
            results.append(avg_result)

            # Update the placeholder entry
            df = pd.read_csv(results_file)
            mask = (
                (df["architecture"] == config_key[0]) &
                (df["hidden_size"] == config_key[1]) &
                (df["num_layers"] == config_key[2]) &
                (df["dropout_rate"] == config_key[3]) &
                (df["learning_rate"] == config_key[4]) &
                (df["batch_size"] == config_key[5]) &
                (df["epochs"] == config_key[6])
            )
            df.loc[mask, avg_result.keys()] = list(avg_result.values())
            df.to_csv(results_file, index=False)

# Report top results
results = sorted(results, key=lambda x: x['test_acc'], reverse=True)

print("\n=== Top Configs ===")
for r in results[:10]:
    print(f"Test Acc: {r['test_acc']:.4f} ± {r['test_acc_std']:.4f}, Val Acc: {r['val_acc']}, Train Acc: {r['train_acc']:.4f}, Time: {r['time']:.2f}s")
    print(f"  Params: Hidden={r['hidden_size']}, Layers={r['num_layers']}, Dropout={r['dropout_rate']}, LR={r['learning_rate']}, Batch={r['batch_size']}, Epochs={r['epochs']}")
'''

In [7]:
# Load the joint architecture + hyperparameter sweep and rank it by test accuracy, best first
df_finetune_results = (
    pd.read_csv("data/IY001A.csv")
    .sort_values(by='test_acc', ascending=False)
)

# The configuration columns of the top-ranked row are the best parameters
config_columns = ['architecture', 'hidden_size', 'num_layers', 'dropout_rate', 'learning_rate', 'batch_size', 'epochs']
best_params = df_finetune_results.iloc[0].loc[config_columns]
best_params

architecture     Conv1D + MultiHead
hidden_size                      64
num_layers                        2
dropout_rate                    0.3
learning_rate                 0.001
batch_size                       64
epochs                          100
Name: 5030, dtype: object

## Use Concatenated Data as Input Data

We now use data simulated over a range of variance ratios. The data come from ``data_12_04_2025``, in which each folder already contains steady-state trajectories.

In [4]:
# All *steady state* CSV files; for simplicity only the first set of steady-state data
# per folder is used (file names ending in 0_SS.csv). Sorted for a reproducible order.
file_paths = sorted(glob.glob('/home/ianyang/stochastic_simulations/experiments/SSA_telegraph_model/var_v_accuracy_plot/data_12_04_2025/mRNA_trajectories_variance_*/steady_state_trajectories/m_traj_*_0_SS.csv'))

# Load every file, stack the rows under a fresh 0..N-1 index,
# then shuffle with a fixed seed and renumber the rows
dfs = list(map(pd.read_csv, file_paths))
combined_df = (
    pd.concat(dfs, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the combined dataset (without the index column) and report its size
input_file_path = 'data/IY002_input_A.csv'
combined_df.to_csv(input_file_path, index=False)
print(f"✅ Combined {len(file_paths)} files into {combined_df.shape[0]} rows.")

✅ Combined 290 files into 116000 rows.


In [None]:
from classifiers.lstm_classifier import lstm_classifier

# Reload the fine-tuning sweep, ranked so the highest test accuracy is the first row
df_finetune_results = pd.read_csv("data/IY001A.csv").sort_values(by='test_acc', ascending=False)

# Parameter columns of the top-ranked run
param_columns = [
    'architecture',
    'hidden_size',
    'num_layers',
    'dropout_rate',
    'learning_rate',
    'batch_size',
    'epochs',
]
best_params = df_finetune_results.iloc[0][param_columns]

# Feature switches for each named architecture variant
architectures = [
    dict(name="Vanilla LSTM", conv1d=False, attention=False, multihead=False, aux=False),
    dict(name="Conv1D Only", conv1d=True, attention=False, multihead=False, aux=False),
    dict(name="Conv1D + Attention", conv1d=True, attention=True, multihead=False, aux=False),
    dict(name="Conv1D + MultiHead", conv1d=True, attention=True, multihead=True, aux=False),
    dict(name="Full Model", conv1d=True, attention=True, multihead=True, aux=True),
]

# Find the config whose name matches the best architecture (first match wins)
best_arch_name = best_params['architecture']
best_arch_config = None
for candidate in architectures:
    if candidate['name'] == best_arch_name:
        best_arch_config = candidate
        break

if best_arch_config is None:
    raise ValueError(f"Architecture '{best_arch_name}' not found in predefined architectures.")

# Split the SSA data; split_val_size must be given here to get a validation set
X_train, X_val, X_test, y_train, y_val, y_test = load_and_split_data(
    input_file_path,
    split_val_size=0.2,
)

# NOTE(review): the head count depends only on the 'attention' flag; the 'multihead'
# flag is never read, so "Conv1D + Attention" would also get 4 heads — confirm intended.
attention_heads = 4 if best_arch_config['attention'] else 0

# Best hyperparameters cast to plain Python numbers, plus the architecture switches
model_settings = dict(
    epochs=int(best_params['epochs']),
    hidden_size=int(best_params['hidden_size']),
    num_layers=int(best_params['num_layers']),
    dropout_rate=float(best_params['dropout_rate']),
    learning_rate=float(best_params['learning_rate']),
    batch_size=int(best_params['batch_size']),
    use_conv1d=best_arch_config['conv1d'],
    use_attention=best_arch_config['attention'],
    num_attention_heads=attention_heads,
    use_auxiliary=best_arch_config['aux'],
    save_path='IY002A.pth',
)

# Train and evaluate the model using the best hyperparameters and architecture
lstm_accuracy = lstm_classifier(
    X_train, X_val, X_test, y_train, y_val, y_test,
    **model_settings,
)

🔄 Using device: cuda (1 GPUs available)
DEBUG: Optimizer initialized? True
✅ Running on CUDA!


  return self._call_impl(*args, **kwargs)


Epoch [1/100], Loss: 0.5434, Train Acc: 0.7926
Validation Acc: 0.8572
✅ Model saved at IY002A.pth (Best Validation Acc: 0.8572)
Epoch [2/100], Loss: 0.4032, Train Acc: 0.9021
Validation Acc: 0.9441
✅ Model saved at IY002A.pth (Best Validation Acc: 0.9441)
Epoch [3/100], Loss: 0.3251, Train Acc: 0.9462
Validation Acc: 0.9511
✅ Model saved at IY002A.pth (Best Validation Acc: 0.9511)
Epoch [4/100], Loss: 0.2876, Train Acc: 0.9640
Validation Acc: 0.9737
✅ Model saved at IY002A.pth (Best Validation Acc: 0.9737)
Epoch [5/100], Loss: 0.2659, Train Acc: 0.9733
Validation Acc: 0.9742
✅ Model saved at IY002A.pth (Best Validation Acc: 0.9742)
Epoch [6/100], Loss: 0.2570, Train Acc: 0.9765
Validation Acc: 0.9501
No improvement (1/10).
Epoch [7/100], Loss: 0.2500, Train Acc: 0.9796
Validation Acc: 0.9583
No improvement (2/10).
Epoch [8/100], Loss: 0.2466, Train Acc: 0.9801
Validation Acc: 0.9809
✅ Model saved at IY002A.pth (Best Validation Acc: 0.9809)
Epoch [9/100], Loss: 0.2443, Train Acc: 0.9808

In [None]:
input_file_path = 'data/IY002_input_A.csv'
