In [2]:
import os 
import re
import random 
import torch 
from torch import nn 
import numpy as np 
import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader , random_split
import pandas as pd 
import scipy 
from scipy.signal import savgol_filter 
        
random.seed(0)

In [3]:
# Define Savitzky Golay filter parameters 
order = 1
frame_length = 21
eps = 1e-8

In [4]:
class DTOFDataset(Dataset):
    """
    Dataset for DTOF signals stored in a single CSV.

    Each column (from column 1 onward) is a DTOF trace; column 0 is time_ns.
    Preprocessing applied:
    - Savitzky-Golay smoothing
    - Negative value clipping to eps
    - Standardisation (mean 0, std 1) per trace
    - 3-channel temporal masking (early/mid/late)
    - Labels parsed from column headers -> (mua, mus)
    """

    def __init__(
            self,
            csv_path,
            labels = None,
            window_length = frame_length,
            polyorder = order,
            eps = eps,
    ):
            super().__init__()
            self.csv_path = csv_path
            self.window_length = window_length
            self.polyorder = polyorder
            self.eps = eps

            # Load the CSV
            df = pd.read_csv(csv_path)

            # Column 0 is time, rest are DTOFs
            self.time_ns = df.iloc[:, 0].values
            dtof_matrix = df.iloc[:, 1:].values

            # Transpose so rows = signals, cols = time -> (N_signals, T)
            dtof_matrix = dtof_matrix.T
            self.n_signals, self.T = dtof_matrix.shape

            dtof_smooth = savgol_filter(
                dtof_matrix,
                window_length=window_length,
                polyorder=polyorder,
                axis=1
            )

            # Clip negative values to eps
            dtof_smooth[dtof_smooth < 0] = eps

            # Standardisation (mean = 0, std = 1)
            means = dtof_smooth.mean(axis=1, keepdims=True)
            stds = dtof_smooth.std(axis=1, keepdims=True)
            dtof_standardised = (dtof_smooth - means) / (stds + eps)

            # Store as float32 numpy array
            self.signals = dtof_standardised.astype(np.float32)

            # Build 3 temporal masks: early, mid, late
            t = self.time_ns
            early_mask = ((t >= 0.0) & (t < 0.5)).astype(np.float32)
            mid_mask = ((t >= 0.5) & (t < 4.0)).astype(np.float32)
            late_mask = ((t >= 4.0) & (t < 6.0)).astype(np.float32)

            # Stack -> (3, T) so we can broadcast with each DTOF
            self.masks = np.stack([early_mask, mid_mask, late_mask], axis=0).astype(np.float32)

            # Labels parsed from column headers (mua, mus). Allows overriding via labels arg.
            header_labels = self._parse_header_labels(df.columns[1:])
            if labels is None:
                self.labels = header_labels
            else:
                labels_arr = np.asarray(labels, dtype=np.float32)
                if len(labels_arr) != self.n_signals:
                    raise ValueError(f"labels length {len(labels_arr)} does not match number of signals {self.n_signals}")
                self.labels = labels_arr

    def _parse_header_labels(self, columns):
            parsed = []
            for col in columns:
                col_clean = col.strip()
                match = re.search(r"mua:\s*([0-9.]+)\s+mus:\s*([0-9.]+)", col_clean)
                if not match:
                    raise ValueError(f"Could not parse mua/mus from column name '{col}'")
                mua_val = float(match.group(1))
                mus_val = float(match.group(2))
                parsed.append((mua_val, mus_val))
            return np.asarray(parsed, dtype=np.float32)

    def __len__(self):
            return self.n_signals

    def __getitem__(self, index):
        # Get a single DTOF
        dtof = self.signals[index]

        # Apply the 3 masks -> (3, T)
        channels = self.masks * dtof
        signal = torch.from_numpy(channels)
        target = torch.tensor(self.labels[index])
        
        return signal, target


In [5]:
class Net(nn.Module):
    def __init__(self, in_channels = 3, input_length = 3000, output_dim = 2):
        """
        CNN for 1D DTOF signals with 3 input channels (early / mid / late masks)
        Blocks: [Conv1d -> BN -> ReLU -> MaxPool1d] x 3 -> Flatten -> FCs.
        """

        super().__init__()

        # Convolution blocks
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=32, kernel_size=7, padding=3)
        self.bn1 = nn.BatchNorm1d(32)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv1d(in_channels=32, out_channels=32, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv3 = nn.Conv1d(in_channels=32, out_channels=16, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(16)
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.act = nn.ReLU()

        # Compute the flattened feature size dynamically
        with torch.no_grad():
            dummy = torch.zeros(1, in_channels, input_length)
            feat = self._forward_features(dummy)
            self.flatten_dim = feat.shape[1]

        # Fully connected layers
        self.fc1 = nn.Linear(self.flatten_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def _forward_features(self, x):
        """Convolutional feature extractor followed by flatten."""
        # Block 1
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.act(x)
        x = self.pool1(x)

        # Block 2
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.act(x)
        x = self.pool2(x)

        # Block 3
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.act(x)
        x = self.pool3(x)

        # Flatten to (batch, features)
        x = x.view(x.size(0), -1)
        return x

    def forward(self, x):
        """
        x: (batch_size, in_channels = 3, time_points = 3000)
        """
        x = self._forward_features(x) # (batch, flatten_dim)
        x = self.act(self.fc1(x))
        x = self.act(self.fc2(x))
        x = self.fc3(x)
        return x


In [6]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, num_epochs, device, save_path = None): 
    """ 
    Train the CNN with a training + validation loop 

    Inputs: 
        model: instance of Net 
        train_loader: DataLoader for training set (yields signals, labels)
        val_loader: DataLoader for validation set 
        loss_fun : loss function, e.g. nn.MSELoss()
        optimiser: optimiser, e.g. torch.optim.Adam(...)
        num_epochs: number of epochs to train 
        device: u
        save_path: optional path to save best model, to use later when we develop more models (str or None)
    """
    # Move model to device 
    model.to(device)
    best_val_loss = float("inf")

    for epoch in range(num_epochs): 
        print(f"\nEpoch {epoch + 1}/ {num_epochs}")

        # TRAINING PHASE 
        model.train()
        running_loss = 0.0
        for signals, labels in train_loader: 
            # Move the batch to device 
            signals = signals.to(device) # (batch, 3, T = 3000)
            labels = labels.to(device).float() # (batch,) or (batch, 1)

            # Zero gradients 
            optimizer.zero_grad()

            # Forward pass 
            preds = model(signals) # (batch, 1) or (batch, )
            preds = preds.view_as(labels) # reshape the predictions to have the same shape as labels 

            # Loss
            loss = loss_fn(preds, labels)
            
            # Backward + update 
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

    train_loss = running_loss / len(train_loader)
    print(f"Train Loss: {train_loss:.4f}")

    # VALIDATION PHASE
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for signals, labels in val_loader: 
            signals = signals.to(device)
            labels = labels.to(device).float()
            
            preds = model(signals)
            preds = preds.view_as(labels)

            loss = loss_fn(preds, labels)
            val_loss += loss.item()
            
    val_loss /= len(val_loader)
    print(f"Validation Loss: {val_loss:.4f}")

    if val_loss < best_val_loss: 
        print(" -> Best validation loss so far, saving the model.")
        best_val_loss = val_loss
        if save_path is not None: 
            torch.save(model.state_dict(), save_path)




In [7]:
def train_test_split(data, target, test_size = 0.2, shuffle = True, random_state = None):
    """
    Splits the data and target lists into training and validation subsets. 
    """

    if len(data) != len(target): 
        raise ValueError("Data and target must have the same length.")
    
    if shuffle: 
        if random_state is not None: 
            random.seed(random_state)
        pairs = list(zip(data, target))
        random.shuffle(pairs)
        data, target = zip(*pairs)
    split_idx = int(len(data) * (1 - test_size))

    return (
        data[:split_idx], # X_train
        data[split_idx:], # X_val 
        target[:split_idx], # y_train
        target[split_idx:] # y_val
    )

In [8]:
def extract_labels_from_dtof_csv(csv_path, label_csv_path):
    """
    Extract (mua, mus) labels from DTOF CSV column headers and save to a new CSV file.

    Input: 
        csv_path : str
            Path to the large DTOF CSV file (first column = time_ns, others = DTOFs).
        label_csv_path : str
            Where to save the generated labels CSV.
    """

    df = pd.read_csv(csv_path)

    # DTOF columns start from index 1
    dtof_columns = df.columns[1:]

    labels = []

    for col in dtof_columns:
        col_clean = col.strip()

        # Regex matches text like "mua: 0.015 mus: 0.75"
        match = re.search(r"mua:\s*([0-9.]+)\s+mus:\s*([0-9.]+)", col_clean)
        if not match:
            raise ValueError(f"Could not parse mua/mus from column '{col}'")

        mua_val = float(match.group(1))
        mus_val = float(match.group(2))
        labels.append((mua_val, mus_val))

    labels = np.asarray(labels, dtype=np.float32)

    # Save to CSV
    label_df = pd.DataFrame(labels, columns=["mua", "mus"])
    label_df.to_csv(label_csv_path, index=False)

    print(f"Labels extracted and saved to: {label_csv_path}")
    print(f"Total signals: {len(labels)}")
    print(f"Example:\n{label_df.head()}")


In [9]:
class ModelEvaluator: 
    """
    Evaluates a trained model on a dataset and computes inversion accuracy metrics.  
    Assumes that targets are 2D: (mua, mus)
    """

    def __init__(self, model, device):
        self.model = model 
        self.device = device
        self.model.to(device)
        self.model.eval() # evaluation mode
    
    def evaluate(self, data_loader): 
        all_preds = []
        all_labels = []

        with torch.no_grad(): 
            for signals, labels in data_loader: 
                signals = signals.to(self.device)
                labels = labels.to(self.device).float()
                
                preds = self.model(signals)
                preds = preds.view_as(labels)

                all_preds.append(preds.cpu())
                all_labels.append(labels.cpu())

        all_preds = torch.cat(all_preds, dim = 0)
        all_labels = torch.cat(all_labels, dim = 0)

        # Compute errors 
        abs_err = torch.abs(all_preds - all_labels) # (N, 2)
        sq_err = (all_preds - all_labels) ** 2

        mae = abs_err.mean(dim = 0)
        rmse = torch.sqrt(sq_err.mean(dim = 0))

        metrics = {
            "MAE": mae.numpy(), 
            "RMSE": rmse.numpy(),  
            "preds": all_preds.numpy(), 
            "lables": all_labels.numpy()
        }

        return metrics

In [10]:
# Extract the labels from csv_path to label_csv_path

csv_path = r"/Users/lydialichen/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Year 3/Research Project in Biomedical Engineering/Code/Pre-obtained data/DTOFs_Homo_raw.csv"
label_csv_path = r"/Users/lydialichen/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Year 3/Research Project in Biomedical Engineering/Code/Pre-obtained data/DTOFs_Homo_labels.csv"
extract_labels_from_dtof_csv(csv_path, label_csv_path)

frame_length =21 
order = 1

# Load labels as a numpy array 
label_df = pd.read_csv(label_csv_path) # columns: ["mua", "mus"]
labels_arr = label_df.values.astype(np.float32)

# Create the DTOF dataset with labels

dataset = DTOFDataset(
    csv_path= csv_path,
    labels = labels_arr, 
    window_length= frame_length, 
    polyorder= order, 
    eps = 1e-8, 
)

loader = DataLoader(dataset, batch_size = 32, shuffle = True)
signals, labels = next(iter(loader))
print("signals:", signals.shape)
print("labels:", labels.shape)

Labels extracted and saved to: /Users/lydialichen/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Year 3/Research Project in Biomedical Engineering/Code/Pre-obtained data/DTOFs_Homo_labels.csv
Total signals: 400
Example:
        mua  mus
0  0.005000  2.0
1  0.005644  2.0
2  0.006371  2.0
3  0.007192  2.0
4  0.008119  2.0
signals: torch.Size([32, 3, 3000])
labels: torch.Size([32, 2])


In [11]:
# Build dataset splits 
train_frac = 0.8 
n_total = len(dataset)
n_train = int(train_frac * n_total)
n_val = n_total - n_train

generator = torch.Generator().manual_seed(42) #Â for reproducibility 

train_dataset, val_dataset = random_split(
    dataset, 
    [n_train, n_val], 
    generator = generator
)

print("Total samples:", n_total)
print("Train samples:", len(train_dataset))
print("Val samples:  ", len(val_dataset))

# Data Loaders 
batch_size = 32

train_loader = DataLoader(
    train_dataset, 
    batch_size = batch_size, 
    shuffle = True
)

val_loader = DataLoader(
    val_dataset, 
    batch_size = batch_size, 
    shuffle = False
)

Total samples: 400
Train samples: 320
Val samples:   80


In [12]:
# 1. Device 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 2. Model: instantiating the CNN and moving the model to the device before training
model = Net(
    in_channels=3, 
    input_length = dataset.T,
    output_dim = 2 # predicting both (mua, mus)
).to(device)

# 3. Loss + optimizer 
loss_fn = torch.nn.MSELoss() # MSE error 
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3) # learning rate of 0.001, decrease to 1e-4 if the training is unstable and increase to 3e-3 if training is too slow 

# 4. Train 
num_epochs = 20 

train_model(
    model = model, 
    train_loader = train_loader, 
    val_loader= val_loader, 
    loss_fn= loss_fn, 
    optimizer= optimizer, 
    num_epochs= num_epochs, 
    device=device, 
    save_path= "best_dtof_cnn.pth"

)

# Forward pass with real DTOF batch, to verify the model pipeline from input -> output runs without shape errors
outputs = model(signals)

# Instantiate evaluator 
evaluator = ModelEvaluator(model, device)

# Run evaluation on validation loader 
metrics = evaluator.evaluate(val_loader)

print("MAE:", metrics["MAE"])
print("RMSE:", metrics["RMSE"])

Using device: cpu

Epoch 1/ 20

Epoch 2/ 20

Epoch 3/ 20

Epoch 4/ 20

Epoch 5/ 20

Epoch 6/ 20

Epoch 7/ 20

Epoch 8/ 20

Epoch 9/ 20

Epoch 10/ 20

Epoch 11/ 20

Epoch 12/ 20

Epoch 13/ 20

Epoch 14/ 20

Epoch 15/ 20

Epoch 16/ 20

Epoch 17/ 20

Epoch 18/ 20

Epoch 19/ 20

Epoch 20/ 20
Train Loss: 0.1040
Validation Loss: 0.1360
 -> Best validation loss so far, saving the model.
MAE: [0.02461546 0.4895238 ]
RMSE: [0.03028578 0.5288253 ]
