# Phase 2 - Unimodal Deep Learning Models
---

## Import all necessary libraries

In [1]:
## Import required libraries and modules
# Standard-library and third-party imports, each listed once.
import logging
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Add the src directory to the Python path. This has to run before any of the
# project-local imports below (utils, preprocess) can be resolved.
sys.path.append(os.path.abspath(os.path.join("..", "src")))

from utils import load_config

# Load Config to ensure reproducibility and syncing with other scripts
config = load_config("../config.yaml")

# Set logging configurations
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(message)s",
)
# The root logger is left at DEBUG, but at that level matplotlib's internal
# debug records (data/cache directories, backend loading) are printed into the
# notebook as soon as matplotlib gets imported. Raise the threshold for that
# one library only so the training logs stay readable and no local paths end
# up in the saved output.
logging.getLogger("matplotlib").setLevel(logging.WARNING)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logging.info(f"Device: {device}")

# Imported after logging is configured, in the same position relative to the
# logging setup as before.
from preprocess.preprocess import split_data

# from models import FlexibleFCNN

2024-12-13 20:51:21,856 - Device: cuda


In [2]:
## Load, Split and Preprocess Dataset
# The TF and landmark tables are each read with a single call.
tf_df = pd.read_csv(config["data_paths"]["preprocessed_tf_file"])
landmark_df = pd.read_csv(config["data_paths"]["preprocessed_landmark_file"])

# The gene table is read in pieces of `chunk_size` rows; every piece is kept
# unchanged and the pieces are stacked row-wise into one DataFrame.
chunk_size = 1000
chunks = list(
    pd.read_csv(
        config["data_paths"]["preprocessed_gene_file"],
        chunksize=chunk_size,
    )
)
gene_df = pd.concat(chunks, axis=0)

# Only sample a subset of the data for faster training
# tf_df = tf_df.sample(n=10000, random_state=42)
# landmark_df = landmark_df.sample(n=10000, random_state=42)

# Split each table into train/validation/test features and targets.
# The calls run in the order TF, landmark, gene; each result is unpacked into
# six names (X/y for train, val and test).
tf_splits = split_data(tf_df, target_name="viability", config=config)
X_tf_train, y_tf_train, X_tf_val, y_tf_val, X_tf_test, y_tf_test = tf_splits

landmark_splits = split_data(landmark_df, target_name="viability", config=config)
X_landmark_train, y_landmark_train = landmark_splits[0], landmark_splits[1]
X_landmark_val, y_landmark_val = landmark_splits[2], landmark_splits[3]
X_landmark_test, y_landmark_test = landmark_splits[4], landmark_splits[5]

gene_splits = split_data(gene_df, target_name="viability", config=config)
(
    X_gene_train,
    y_gene_train,
    X_gene_val,
    y_gene_val,
    X_gene_test,
    y_gene_test,
) = gene_splits


# Convert data to PyTorch tensors
def create_dataloader(X, y, batch_size=32, shuffle=True):
    """Wrap a feature table and its targets in a PyTorch ``DataLoader``.

    Both inputs are read through ``.values`` and converted to ``float32``
    tensors. The targets keep whatever shape ``y.values`` has, so a 1-D
    Series yields 1-D target batches.

    Args:
        X: Object exposing ``.values`` with the feature matrix
            (e.g. a pandas DataFrame).
        y: Object exposing ``.values`` with the targets
            (e.g. a pandas Series); must have as many rows as ``X``.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether the loader reshuffles the samples every
            epoch. Defaults to ``True`` (the previous fixed behaviour);
            pass ``False`` for validation/test loaders when a stable
            sample order is wanted.

    Returns:
        torch.utils.data.DataLoader: Loader yielding ``(features, targets)``
        batches.
    """
    features = torch.tensor(X.values, dtype=torch.float32)
    targets = torch.tensor(y.values, dtype=torch.float32)
    dataset = TensorDataset(features, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Build the (train, validation, test) loaders for every feature set. Each
# generator is consumed left to right by the unpacking, so the loaders are
# created in the order train, val, test for TF, then landmark, then gene.
tf_train_loader, tf_val_loader, tf_test_loader = (
    create_dataloader(features, targets)
    for features, targets in (
        (X_tf_train, y_tf_train),
        (X_tf_val, y_tf_val),
        (X_tf_test, y_tf_test),
    )
)

landmark_train_loader, landmark_val_loader, landmark_test_loader = (
    create_dataloader(features, targets)
    for features, targets in (
        (X_landmark_train, y_landmark_train),
        (X_landmark_val, y_landmark_val),
        (X_landmark_test, y_landmark_test),
    )
)

gene_train_loader, gene_val_loader, gene_test_loader = (
    create_dataloader(features, targets)
    for features, targets in (
        (X_gene_train, y_gene_train),
        (X_gene_val, y_gene_val),
        (X_gene_test, y_gene_test),
    )
)

In [3]:
# Map each feature-set label to its (train, validation, test) loaders.
feature_sets = {}
feature_sets["TF Data"] = (tf_train_loader, tf_val_loader, tf_test_loader)
feature_sets["Landmark Data"] = (
    landmark_train_loader,
    landmark_val_loader,
    landmark_test_loader,
)
feature_sets["Gene Data"] = (gene_train_loader, gene_val_loader, gene_test_loader)

# Empty list meant to collect results (note: a plain list, not a DataFrame).
combined_metrics = list()

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging

# Lookup from a lowercase activation name to the corresponding torch.nn module
# class (the class itself, not an instance). Every key is simply the class
# name lowercased, giving, in order:
# "relu", "leakyrelu", "tanh", "sigmoid", "gelu", "identity", "prelu", "elu".
ACTIVATION_MAP = {
    activation_cls.__name__.lower(): activation_cls
    for activation_cls in (
        nn.ReLU,
        nn.LeakyReLU,
        nn.Tanh,
        nn.Sigmoid,
        nn.GELU,
        nn.Identity,
        nn.PReLU,
        nn.ELU,
    )
}


class FlexibleFCNN(nn.Module):
    """Configurable fully-connected network.

    The network is a stack of hidden blocks followed by one linear output
    layer. Each hidden block applies, in this order: Linear -> optional
    BatchNorm1d -> dropout -> activation -> optional residual addition.
    """

    def __init__(
        self,
        input_dim,
        hidden_dims,
        output_dim,
        activation_fn="relu",
        dropout_prob=0.0,
        residual=False,
        use_batchnorm=True,
    ):
        """
        Flexible Fully-Connected Neural Network

        Args:
            input_dim (int): Dimensionality of input features.
            hidden_dims (sequence of int): Size of each hidden layer. Any
                sequence is accepted (list, tuple, ...); it is copied into a
                list. An empty sequence yields a single linear layer from
                input to output.
            output_dim (int): Dimension of the output.
            activation_fn (str or callable): Activation function to use.
                If str, must be in ACTIVATION_MAP (case-insensitive). If callable,
                it should be a nn.Module or lambda returning Tensor.
            dropout_prob (float): Dropout probability. 0.0 means no dropout.
            residual (bool): Whether to use residual connections when possible.
            use_batchnorm (bool): Whether to use BatchNorm after each linear layer.

        Raises:
            ValueError: If ``activation_fn`` is a string that is not a key of
                ACTIVATION_MAP.
        """
        super().__init__()
        # Copy into a list so tuples (or other sequences) work and so later
        # changes to the caller's object cannot alter the stored configuration.
        hidden_dims = list(hidden_dims)

        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.output_dim = output_dim
        self.residual = residual
        self.use_batchnorm = use_batchnorm
        self.dropout_prob = dropout_prob

        # Determine the activation function.
        # Note: one activation object is shared by every hidden block, so an
        # activation with learnable parameters (e.g. PReLU) shares them too.
        if isinstance(activation_fn, str):
            activation_key = activation_fn.lower()
            if activation_key not in ACTIVATION_MAP:
                raise ValueError(f"Unknown activation function {activation_fn}")
            self.activation_fn = ACTIVATION_MAP[activation_key]()
        else:
            # Assume callable
            self.activation_fn = activation_fn

        # Construct layers: consecutive pairs of layer_dims give each hidden
        # layer's (in, out) sizes.
        layer_dims = [input_dim] + hidden_dims
        self.layers = nn.ModuleList()
        self.bns = nn.ModuleList() if use_batchnorm else None
        self.dropouts = nn.ModuleList()

        for in_dim, out_dim in zip(layer_dims[:-1], layer_dims[1:]):
            self.layers.append(nn.Linear(in_dim, out_dim))

            if use_batchnorm:
                self.bns.append(nn.BatchNorm1d(out_dim))

            # An Identity placeholder keeps self.dropouts index-aligned with
            # self.layers when dropout is disabled.
            if dropout_prob > 0.0:
                self.dropouts.append(nn.Dropout(dropout_prob))
            else:
                self.dropouts.append(nn.Identity())

        self.output_layer = nn.Linear(layer_dims[-1], output_dim)

        # Initialize weights
        self._initialize_weights()

    def _linear_layers(self):
        """Return all linear layers: the hidden ones followed by the output layer."""
        return [*self.layers, self.output_layer]

    def _initialize_weights(self):
        """Xavier-uniform initialise every linear weight and zero every bias."""
        for layer in self._linear_layers():
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run the hidden blocks and the output layer on ``x``.

        Args:
            x (Tensor): Input batch whose last dimension is ``input_dim``.

        Returns:
            Tensor: Output of the final linear layer (last dimension
            ``output_dim``).
        """
        for i, layer in enumerate(self.layers):
            # Keep the block input for a possible residual connection.
            residual_input = x
            x = layer(x)

            # BatchNorm
            if self.use_batchnorm:
                x = self.bns[i](x)

            # Dropout before activation
            x = self.dropouts[i](x)

            # Activation
            x = self.activation_fn(x)

            # Residual connection (only if dimensions match); blocks that
            # change the feature size are silently left without a skip.
            if self.residual and residual_input.shape == x.shape:
                x = x + residual_input

        return self.output_layer(x)

    def get_regularization_loss(self, l1_lambda=0.0, l2_lambda=0.0):
        """
        Compute L1 and L2 regularization losses for all linear layers.
        l1_lambda and l2_lambda are coefficients for L1 and L2 penalties respectively.

        Only the weights of the hidden and output linear layers are penalised
        (biases and BatchNorm parameters are not). A coefficient contributes
        only when it is strictly positive.

        Args:
            l1_lambda (float): Weight for L1 regularization.
            l2_lambda (float): Weight for L2 regularization.

        Returns:
            reg_loss (Tensor): The regularization loss (scalar), located on the
            device of the model's first parameter.
        """
        reg_loss = torch.tensor(0.0, device=next(self.parameters()).device)
        if l1_lambda == 0.0 and l2_lambda == 0.0:
            return reg_loss

        # Accumulate out of place so the running total is an ordinary autograd
        # node rather than an in-place modified tensor.
        for layer in self._linear_layers():
            weight = layer.weight
            if l1_lambda > 0.0:
                reg_loss = reg_loss + l1_lambda * torch.sum(torch.abs(weight))
            if l2_lambda > 0.0:
                reg_loss = reg_loss + l2_lambda * torch.sum(weight**2)

        return reg_loss

In [5]:
# Train one FlexibleFCNN per feature set through DLModelsPipeline and export
# the resulting metrics table.
from pipelines import DLModelsPipeline


# Define the feature sets: label -> (train, validation, test) loaders
feature_sets = {
    "TF Data": (tf_train_loader, tf_val_loader, tf_test_loader),
    "Landmark Data": (landmark_train_loader, landmark_val_loader, landmark_test_loader),
    "Gene Data": (gene_train_loader, gene_val_loader, gene_test_loader),
}

# Define the model parameters (keyword arguments of FlexibleFCNN; the input
# dimension is not listed here)
model_params = {
    "hidden_dims": [512, 256, 128, 64],
    "output_dim": 1,
    "activation_fn": "prelu",
    "dropout_prob": 0.2,
    "residual": True,
}

# Initialize the pipeline
pipeline = DLModelsPipeline(
    model_class=FlexibleFCNN,
    feature_sets=feature_sets,
    model_params=model_params,
    epochs=20,
)

# Train and evaluate the models
pipeline.train_and_evaluate()

# Get the results
results_df = pipeline.get_results()
print(results_df)

# Save the results to a CSV file.
# The styled results table rendered in this notebook shows a named index
# ("Feature Set"); writing with index=False would drop those labels and leave
# the CSV rows unidentifiable. Keep the index whenever it carries a name, and
# omit it only for an unnamed (default) index.
index_is_labelled = any(name is not None for name in results_df.index.names)
results_df.to_csv("combined_metrics.csv", index=index_is_labelled)

2024-12-13 20:53:23,020 - matplotlib data path: C:\Users\20191678\AppData\Roaming\Python\Python310\site-packages\matplotlib\mpl-data
2024-12-13 20:53:23,086 - CONFIGDIR=C:\Users\20191678\.matplotlib
2024-12-13 20:53:23,100 - interactive is False
2024-12-13 20:53:23,100 - platform is win32
2024-12-13 20:53:23,515 - CACHEDIR=C:\Users\20191678\.matplotlib
2024-12-13 20:53:23,588 - Using fontManager instance from C:\Users\20191678\.matplotlib\fontlist-v330.json
2024-12-13 20:53:25,045 - Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-12-13 20:53:25,045 - Loaded backend module://matplotlib_inline.backend_inline version unknown.
2024-12-13 20:53:26,077 - Training model on TF Data...


Epoch 1/20 - Model, Train Loss: 0.1744, Val Loss: 0.0333
Epoch 2/20 - Model, Train Loss: 0.0398, Val Loss: 0.0324
Epoch 3/20 - Model, Train Loss: 0.0332, Val Loss: 0.0304
Epoch 4/20 - Model, Train Loss: 0.0301, Val Loss: 0.0297
Epoch 5/20 - Model, Train Loss: 0.0275, Val Loss: 0.0293
Epoch 6/20 - Model, Train Loss: 0.0256, Val Loss: 0.0285
Epoch 7/20 - Model, Train Loss: 0.0245, Val Loss: 0.0287
Epoch 8/20 - Model, Train Loss: 0.0228, Val Loss: 0.0281
Epoch 9/20 - Model, Train Loss: 0.0206, Val Loss: 0.0284
Epoch 10/20 - Model, Train Loss: 0.0191, Val Loss: 0.0276
Epoch 11/20 - Model, Train Loss: 0.0175, Val Loss: 0.0295
Epoch 12/20 - Model, Train Loss: 0.0166, Val Loss: 0.0297
Epoch 13/20 - Model, Train Loss: 0.0154, Val Loss: 0.0271
Epoch 14/20 - Model, Train Loss: 0.0145, Val Loss: 0.0280
Epoch 15/20 - Model, Train Loss: 0.0136, Val Loss: 0.0270
Epoch 16/20 - Model, Train Loss: 0.0134, Val Loss: 0.0278
Epoch 17/20 - Model, Train Loss: 0.0124, Val Loss: 0.0276
Epoch 18/20 - Model, Tr

2024-12-13 20:55:05,087 - Training model on Landmark Data...


Epoch 1/20 - Model, Train Loss: 0.2181, Val Loss: 0.0367
Epoch 2/20 - Model, Train Loss: 0.0449, Val Loss: 0.0338
Epoch 3/20 - Model, Train Loss: 0.0368, Val Loss: 0.0333
Epoch 4/20 - Model, Train Loss: 0.0342, Val Loss: 0.0330
Epoch 5/20 - Model, Train Loss: 0.0321, Val Loss: 0.0324
Epoch 6/20 - Model, Train Loss: 0.0308, Val Loss: 0.0320
Epoch 7/20 - Model, Train Loss: 0.0289, Val Loss: 0.0331
Epoch 8/20 - Model, Train Loss: 0.0266, Val Loss: 0.0317
Epoch 9/20 - Model, Train Loss: 0.0253, Val Loss: 0.0325
Epoch 10/20 - Model, Train Loss: 0.0235, Val Loss: 0.0335
Epoch 11/20 - Model, Train Loss: 0.0216, Val Loss: 0.0317
Epoch 12/20 - Model, Train Loss: 0.0192, Val Loss: 0.0316
Epoch 13/20 - Model, Train Loss: 0.0185, Val Loss: 0.0327
Epoch 14/20 - Model, Train Loss: 0.0170, Val Loss: 0.0329
Epoch 15/20 - Model, Train Loss: 0.0159, Val Loss: 0.0321
Epoch 16/20 - Model, Train Loss: 0.0151, Val Loss: 0.0329
Epoch 17/20 - Model, Train Loss: 0.0145, Val Loss: 0.0341
Epoch 18/20 - Model, Tr

2024-12-13 20:56:52,708 - Training model on Gene Data...


Epoch 1/20 - Model, Train Loss: 0.1875, Val Loss: 0.0343
Epoch 2/20 - Model, Train Loss: 0.0413, Val Loss: 0.0309
Epoch 3/20 - Model, Train Loss: 0.0336, Val Loss: 0.0299
Epoch 4/20 - Model, Train Loss: 0.0290, Val Loss: 0.0309
Epoch 5/20 - Model, Train Loss: 0.0257, Val Loss: 0.0295
Epoch 6/20 - Model, Train Loss: 0.0229, Val Loss: 0.0298
Epoch 7/20 - Model, Train Loss: 0.0206, Val Loss: 0.0291
Epoch 8/20 - Model, Train Loss: 0.0186, Val Loss: 0.0286
Epoch 9/20 - Model, Train Loss: 0.0167, Val Loss: 0.0311
Epoch 10/20 - Model, Train Loss: 0.0153, Val Loss: 0.0297
Epoch 11/20 - Model, Train Loss: 0.0144, Val Loss: 0.0323
Epoch 12/20 - Model, Train Loss: 0.0133, Val Loss: 0.0303
Epoch 13/20 - Model, Train Loss: 0.0123, Val Loss: 0.0292
Epoch 14/20 - Model, Train Loss: 0.0114, Val Loss: 0.0326
Epoch 15/20 - Model, Train Loss: 0.0096, Val Loss: 0.0296
Epoch 16/20 - Model, Train Loss: 0.0084, Val Loss: 0.0288
Epoch 17/20 - Model, Train Loss: 0.0082, Val Loss: 0.0283
Epoch 18/20 - Model, Tr

In [7]:
# Build the styled metrics table step by step: three-decimal formatting, a
# caption, then light-green highlights on the best value per column
# (maximum for R² / Pearson Correlation, minimum for MAE / MSE).
styled_results = results_df.style.format(precision=3)
styled_results = styled_results.set_caption("Regression Model Evaluation Metrics")
styled_results = styled_results.highlight_max(
    subset=["R²", "Pearson Correlation"],
    color="lightgreen",
)
styled_results = styled_results.highlight_min(
    subset=["MAE", "MSE"],
    color="lightgreen",
)
styled_results

Unnamed: 0_level_0,MSE,MAE,R²,Pearson Correlation
Feature Set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gene Data,0.027,0.092,0.526,0.736
Landmark Data,0.033,0.1,0.435,0.675
TF Data,0.028,0.093,0.521,0.73
