# Phase 2 - Unimodal Deep Learning Models
---

## Import all necessary libraries

In [1]:
## Import required libraries and modules
import sys
import os
import logging
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import importlib

# Add the src directory to the Python path.
# The membership check keeps this cell idempotent: re-running it in the same
# kernel no longer appends a duplicate entry each time.
SRC_DIR = os.path.abspath(os.path.join("..", "src"))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


from utils import load_config
from preprocess.preprocess import split_data
from models import FlexibleFCNN
from pipelines import DLModelsPipeline

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load Config
config = load_config("../config.yaml")

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Seed the global NumPy and PyTorch RNGs so that a fresh "Restart & Run All"
# repeats the same DataLoader shuffling order (and, presumably, the same
# weight initialisation inside the project models - TODO confirm).
# The value itself is arbitrary; GPU kernels may still be non-deterministic.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set device: prefer the GPU when one is visible, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

2024-12-14 17:45:19,251 - INFO - Using device: cuda


In [2]:
## Load, Split and Preprocess Dataset
# The TF and landmark tables are each read with a single read_csv call
logging.info("Loading datasets...")
tf_df = pd.read_csv(config["data_paths"]["preprocessed_tf_file"])
landmark_df = pd.read_csv(config["data_paths"]["preprocessed_landmark_file"])

# The gene table is read chunk_size rows at a time and stitched back together
logging.info("Loading gene dataset in chunks...")
chunk_size = 1000
gene_reader = pd.read_csv(
    config["data_paths"]["preprocessed_gene_file"], chunksize=chunk_size
)
chunks = [chunk for chunk in gene_reader]
gene_df = pd.concat(chunks, axis=0)
del chunks  # Free memory

# # Only sample 1000 rows for now
# tf_df = tf_df.sample(1000)
# landmark_df = landmark_df.sample(1000)
# gene_df = gene_df.sample(1000)

# Split each table into train/val/test features and "viability" targets
logging.info("Splitting datasets into train/val/test...")
(
    X_tf_train,
    y_tf_train,
    X_tf_val,
    y_tf_val,
    X_tf_test,
    y_tf_test,
) = split_data(tf_df, target_name="viability", config=config)
(
    X_landmark_train,
    y_landmark_train,
    X_landmark_val,
    y_landmark_val,
    X_landmark_test,
    y_landmark_test,
) = split_data(landmark_df, target_name="viability", config=config)
(
    X_gene_train,
    y_gene_train,
    X_gene_val,
    y_gene_val,
    X_gene_test,
    y_gene_test,
) = split_data(gene_df, target_name="viability", config=config)

2024-12-14 17:45:19,273 - INFO - Loading datasets...
2024-12-14 17:45:30,820 - INFO - Loading gene dataset in chunks...
2024-12-14 17:47:05,070 - INFO - Splitting datasets into train/val/test...


In [3]:
def create_dataloader(X, y, batch_size=32, shuffle=True):
    """Wrap features and targets in a float32 ``TensorDataset`` DataLoader.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series or array-like
        Targets, with the same number of rows as ``X``.
    batch_size : int, default 32
        Number of samples per batch.
    shuffle : bool, default True
        Whether the loader reshuffles the samples on every pass. The default
        keeps the previous behaviour; pass ``False`` for validation/test
        loaders when a stable sample order is wanted.

    Returns
    -------
    torch.utils.data.DataLoader
        Loader yielding ``(features, targets)`` float32 tensor pairs.
    """

    def _to_float_tensor(data):
        # pandas objects expose to_numpy(); anything else (NumPy arrays,
        # nested lists, CPU tensors) is converted through np.asarray, so the
        # function no longer fails on inputs that lack a `.values` attribute.
        array = data.to_numpy() if hasattr(data, "to_numpy") else np.asarray(data)
        return torch.tensor(array, dtype=torch.float32)

    dataset = TensorDataset(_to_float_tensor(X), _to_float_tensor(y))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Build one train/val/test DataLoader triple per feature table.
# NOTE(review): every loader here is created with shuffling enabled,
# including the validation and test ones - confirm that is intended.
logging.info("Creating DataLoaders...")
tf_train_loader, tf_val_loader, tf_test_loader = (
    create_dataloader(X_tf_train, y_tf_train),
    create_dataloader(X_tf_val, y_tf_val),
    create_dataloader(X_tf_test, y_tf_test),
)

landmark_train_loader, landmark_val_loader, landmark_test_loader = (
    create_dataloader(X_landmark_train, y_landmark_train),
    create_dataloader(X_landmark_val, y_landmark_val),
    create_dataloader(X_landmark_test, y_landmark_test),
)

gene_train_loader, gene_val_loader, gene_test_loader = (
    create_dataloader(X_gene_train, y_gene_train),
    create_dataloader(X_gene_val, y_gene_val),
    create_dataloader(X_gene_test, y_gene_test),
)

2024-12-14 17:47:14,819 - INFO - Creating DataLoaders...


In [4]:
from models import CNNRegressor, MLPMixer, TransformerRegressor


# Feature tables to train on, each as a (train, val, test) loader triple
feature_sets = {
    "TF Data": (tf_train_loader, tf_val_loader, tf_test_loader),
    "Landmark Data": (landmark_train_loader, landmark_val_loader, landmark_test_loader),
    # "Gene Data": (gene_train_loader, gene_val_loader, gene_test_loader),
}


def _shared_training_setup():
    """Return a fresh copy of the loss/optimizer/scheduler/training settings.

    Every model below uses the same values; building new objects on each call
    means no model config shares a criterion instance or a nested dict with
    another one.
    """
    return {
        "criterion": nn.MSELoss(),
        "optimizer_class": optim.AdamW,
        "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
        "scheduler_class": ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 5},
        "train_params": {
            "epochs": 10,
            "gradient_clipping": 1.0,
            "early_stopping_patience": 10,
        },
    }


# Define your model configurations: architecture-specific parameters first,
# followed by the shared training settings.
model_configs = {
    "FCNN_Model": {
        "model_class": FlexibleFCNN,
        "model_params": {
            "hidden_dims": [512, 256, 128, 64],
            "output_dim": 1,
            "activation_fn": "prelu",
            "dropout_prob": 0.2,
            "residual": True,
            "norm_type": "batchnorm",
            "weight_init": "xavier",
        },
        **_shared_training_setup(),
    },
    "Transformer_Model": {
        "model_class": TransformerRegressor,
        "model_params": {
            "d_model": 128,
            "nhead": 4,
            "num_layers": 2,
            "dim_feedforward": 256,
            "dropout": 0.1,
            "output_dim": 1,
        },
        **_shared_training_setup(),
    },
    # Disabled for now:
    # "CNN_Model": {
    #     "model_class": CNNRegressor,
    #     "model_params": {
    #         "num_filters": 64,
    #         "kernel_size": 7,
    #         "num_layers": 3,
    #         "dropout_prob": 0.2,
    #         "output_dim": 1,
    #     },
    #     **_shared_training_setup(),
    # },
}

In [5]:
# Build the pipeline from the per-model configuration mapping (model_configs)
# rather than from a single model_class / model_params pair.
pipeline = DLModelsPipeline(
    model_configs=model_configs,
    feature_sets=feature_sets,
)

In [6]:
# Train and evaluate the models
logging.info("Starting training and evaluation...")
pipeline.train_and_evaluate()

# Retrieve results
logging.info("Collecting results...")
results_df = pipeline.get_results()

# Save the results.
# The table rendered further down shows "Feature Set" / "Model Name" as index
# levels, so writing with index=False would drop the labels that say which
# row belongs to which model. Keep the index unless it is just the default
# positional RangeIndex, which carries no information.
# NOTE(review): confirm that get_results() itself returns the labelled index.
results_df.to_csv(
    "combined_metrics.csv",
    index=not isinstance(results_df.index, pd.RangeIndex),
)
logging.info("Results saved to combined_metrics.csv.")

2024-12-14 17:47:17,468 - INFO - Starting training and evaluation...
2024-12-14 17:47:29,614 - INFO - Epoch 1/10 - Model, Train Loss: 0.2286, Val Loss: 0.0345
2024-12-14 17:47:35,794 - INFO - Epoch 2/10 - Model, Train Loss: 0.0407, Val Loss: 0.0303
2024-12-14 17:47:42,162 - INFO - Epoch 3/10 - Model, Train Loss: 0.0340, Val Loss: 0.0305
2024-12-14 17:47:48,468 - INFO - Epoch 4/10 - Model, Train Loss: 0.0306, Val Loss: 0.0287
2024-12-14 17:47:54,649 - INFO - Epoch 5/10 - Model, Train Loss: 0.0286, Val Loss: 0.0295
2024-12-14 17:48:00,938 - INFO - Epoch 6/10 - Model, Train Loss: 0.0266, Val Loss: 0.0279
2024-12-14 17:48:07,600 - INFO - Epoch 7/10 - Model, Train Loss: 0.0248, Val Loss: 0.0275
2024-12-14 17:48:13,814 - INFO - Epoch 8/10 - Model, Train Loss: 0.0230, Val Loss: 0.0276
2024-12-14 17:48:19,824 - INFO - Epoch 9/10 - Model, Train Loss: 0.0218, Val Loss: 0.0275
2024-12-14 17:48:26,054 - INFO - Epoch 10/10 - Model, Train Loss: 0.0204, Val Loss: 0.0286
2024-12-14 17:50:06,669 - INFO

In [10]:
# Metric columns grouped by which direction counts as "best"
higher_is_better = ["R²", "Pearson Correlation"]
lower_is_better = ["MAE", "MSE"]

# Build the styled table step by step: 3-decimal formatting, a caption, then
# a light-green highlight on the best value of each metric column.
styled_results = results_df.style.format(precision=3)
styled_results = styled_results.set_caption("Regression Model Evaluation Metrics")
styled_results = styled_results.highlight_max(
    subset=higher_is_better, color="lightgreen"
)
styled_results = styled_results.highlight_min(
    subset=lower_is_better, color="lightgreen"
)
styled_results

Unnamed: 0_level_0,Unnamed: 1_level_0,MSE,MAE,R²,Pearson Correlation
Feature Set,Model Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Landmark Data,FCNN_Model,0.032,0.109,0.438,0.664
Landmark Data,Transformer_Model,0.038,0.133,0.34,0.587
TF Data,FCNN_Model,0.028,0.097,0.518,0.725
TF Data,Transformer_Model,0.038,0.136,0.345,0.593
