# Baseline (Nucleotide Frequency → Linear) for 4-Component Prediction

This notebook trains a very simple baseline model that converts each DNA sequence into a 5-base frequency vector (A, T, G, C, N) and applies a linear classifier to predict the 4 component probabilities. It uses the existing training utilities for a fair comparison with other models.


In [1]:
import os
import sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

ROOT = "/Users/jaydenthai/Dev/University Work/2025sem2/DS-Research-Project-Tumor-Expression"
MODELS_DIR = f"{ROOT}/Main Work/models"
if MODELS_DIR not in sys.path:
    sys.path.append(MODELS_DIR)

from baseline_simple.model import BaselineLinear, BaselineFrequencyDataset
from utils.data import load_and_prepare_data
from utils.training import train_epoch, validate_epoch, evaluate_model

print(f"PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}, MPS: {getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available()}")


PyTorch: 2.6.0, CUDA: False, MPS: True


In [2]:
# Load data
sequences, targets = load_and_prepare_data(f"{ROOT}/Main Work/Processed-Data/ProSeq_with_4component_analysis.csv")
print(f"Samples: {len(sequences)}, targets: {targets.shape}")

# Stratified split by dominant component
labels = np.argmax(targets, axis=1)
train_seq, test_seq, train_targets, test_targets = train_test_split(
    sequences, targets, test_size=0.2, random_state=42, stratify=labels
)
train_labels = np.argmax(train_targets, axis=1)
train_seq, val_seq, train_targets, val_targets = train_test_split(
    train_seq, train_targets, test_size=0.2, random_state=42, stratify=train_labels
)

# Datasets/loaders
train_ds = BaselineFrequencyDataset(train_seq, train_targets)
val_ds = BaselineFrequencyDataset(val_seq, val_targets)
test_ds = BaselineFrequencyDataset(test_seq, test_targets)

batch_size = 128
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=0)

print(f"Splits — train: {len(train_ds)}, val: {len(val_ds)}, test: {len(test_ds)}")


Samples: 8735, targets: (8735, 4)
Splits — train: 5590, val: 1398, test: 1747


In [3]:
# Build model
hidden = 0  # set >0 for a tiny MLP
model = BaselineLinear(input_channels=5, num_classes=4, hidden=hidden, dropout=0.1)

# Device
if getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optim
criterion = nn.KLDivLoss(reduction='batchmean')
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

print(f"Params: {sum(p.numel() for p in model.parameters()):,}")
print(f"Device: {device}")


Params: 24
Device: mps


In [4]:
# Train
num_epochs = 60
train_losses, val_losses = [], []
best_val = float('inf')
bad = 0
patience = 12

for epoch in range(1, num_epochs + 1):
    tr = train_epoch(model, train_loader, criterion, optimizer, device)
    va = validate_epoch(model, val_loader, criterion, device)
    scheduler.step(va)
    train_losses.append(tr)
    val_losses.append(va)

    if va < best_val - 1e-6:
        best_val = va
        bad = 0
        torch.save(model.state_dict(), 'best_baseline_simple.pth')
    else:
        bad += 1
        if bad >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

    lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {epoch:03d}/{num_epochs} - train: {tr:.6f}  val: {va:.6f}  lr: {lr:.2e}")


Epoch 001/60 - train: 1.041880  val: 1.027721  lr: 1.00e-03
Epoch 002/60 - train: 1.006747  val: 0.996548  lr: 1.00e-03
Epoch 003/60 - train: 0.979309  val: 0.972090  lr: 1.00e-03
Epoch 004/60 - train: 0.958266  val: 0.952933  lr: 1.00e-03
Epoch 005/60 - train: 0.940947  val: 0.938099  lr: 1.00e-03
Epoch 006/60 - train: 0.928366  val: 0.926598  lr: 1.00e-03
Epoch 007/60 - train: 0.918685  val: 0.917456  lr: 1.00e-03
Epoch 008/60 - train: 0.910383  val: 0.910585  lr: 1.00e-03
Epoch 009/60 - train: 0.904159  val: 0.905194  lr: 1.00e-03
Epoch 010/60 - train: 0.899666  val: 0.901111  lr: 1.00e-03
Epoch 011/60 - train: 0.896838  val: 0.897973  lr: 1.00e-03
Epoch 012/60 - train: 0.893753  val: 0.895555  lr: 1.00e-03
Epoch 013/60 - train: 0.891415  val: 0.893722  lr: 1.00e-03
Epoch 014/60 - train: 0.890027  val: 0.892237  lr: 1.00e-03
Epoch 015/60 - train: 0.888944  val: 0.891224  lr: 1.00e-03
Epoch 016/60 - train: 0.887550  val: 0.890396  lr: 1.00e-03
Epoch 017/60 - train: 0.886739  val: 0.8

In [5]:
# Evaluate
model.load_state_dict(torch.load('best_baseline_simple.pth', map_location=device))
preds, tgts = evaluate_model(model, test_loader, device)

comp_names = ['Component_1', 'Component_2', 'Component_3', 'Component_4']
metrics = {}
for i, name in enumerate(comp_names):
    mse = mean_squared_error(tgts[:, i], preds[:, i])
    r2 = r2_score(tgts[:, i], preds[:, i])
    metrics[name] = {'MSE': float(mse), 'R2': float(r2)}
    print(f"{name}: MSE={mse:.6f}, R2={r2:.4f}")

overall_mse = mean_squared_error(tgts, preds)
overall_r2 = r2_score(tgts.flatten(), preds.flatten())
print(f"Overall: MSE={overall_mse:.6f}, R2={overall_r2:.4f}")


Component_1: MSE=0.158437, R2=0.0001
Component_2: MSE=0.068778, R2=0.0012
Component_3: MSE=0.095149, R2=0.0011
Component_4: MSE=0.155462, R2=0.0003
Overall: MSE=0.119457, R2=0.1138
