# Phase 2 - Unimodal Deep Learning Models
---

## Import all necessary libraries

In [1]:
## Import required libraries and modules
import sys
import os
import logging
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import importlib

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join("..", "src")))


from utils import load_config
from preprocess.preprocess import split_data
from models import FlexibleFCNN
from pipelines import DLModelsPipeline

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Project settings (e.g. the data paths read further down) live in the shared YAML file.
config = load_config("../config.yaml")

# Root logger: timestamped records, everything from DEBUG upwards.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.DEBUG)

# Prefer the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logging.info(f"Using device: {device}")

2024-12-19 14:31:23,140 - INFO - Using device: cuda


In [2]:
## Load, split and preprocess the datasets
# The TF and landmark tables are read in a single call each.
logging.info("Loading datasets...")
data_paths = config["data_paths"]
tf_df = pd.read_csv(data_paths["preprocessed_tf_file"])
landmark_df = pd.read_csv(data_paths["preprocessed_landmark_file"])

# The gene table is large, so it is parsed `chunk_size` rows at a time and
# the pieces are stitched back together afterwards.
logging.info("Loading gene dataset in chunks...")
chunk_size = 1000
chunks = list(pd.read_csv(data_paths["preprocessed_gene_file"], chunksize=chunk_size))
gene_df = pd.concat(chunks, axis=0)
del chunks  # the individual pieces are no longer needed once concatenated

# # Only sample 1000 rows for now
# tf_df = tf_df.sample(1000)
# landmark_df = landmark_df.sample(1000)
# gene_df = gene_df.sample(1000)

# Every table is split the same way: predict "viability", stratify on "cell_mfc_name".
logging.info("Splitting datasets into train/val/test...")
split_options = {
    "target_name": "viability",
    "config": config,
    "stratify_by": "cell_mfc_name",
}
X_tf_train, y_tf_train, X_tf_val, y_tf_val, X_tf_test, y_tf_test = split_data(
    tf_df, **split_options
)
(
    X_landmark_train,
    y_landmark_train,
    X_landmark_val,
    y_landmark_val,
    X_landmark_test,
    y_landmark_test,
) = split_data(landmark_df, **split_options)
X_gene_train, y_gene_train, X_gene_val, y_gene_val, X_gene_test, y_gene_test = split_data(
    gene_df, **split_options
)

2024-12-19 14:31:23,163 - INFO - Loading datasets...
2024-12-19 14:31:37,900 - INFO - Loading gene dataset in chunks...
2024-12-19 14:33:40,072 - INFO - Splitting datasets into train/val/test...


Train Shape: (19416, 683), Validation Shape: (5379, 683), Test Shape: (6364, 683)
Train Shape: (19416, 979), Validation Shape: (5379, 979), Test Shape: (6364, 979)
Train Shape: (19416, 12329), Validation Shape: (5379, 12329), Test Shape: (6364, 12329)


In [3]:
def create_dataloader(X, y, batch_size=32, shuffle=True):
    """
    Wrap a feature table and its targets in a PyTorch DataLoader.

    Args:
        X: Features, one row per sample. A pandas DataFrame or any array-like
            that `np.asarray` can turn into a numeric array.
        y: Targets, one entry per sample. A pandas Series/DataFrame or array-like.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether the sample order is reshuffled on every pass.
            Defaults to True so existing calls behave as before; pass False for
            validation/test loaders when a fixed evaluation order is wanted.

    Returns:
        DataLoader: Yields `(features, targets)` batches as float32 tensors.
    """
    # np.asarray handles DataFrames/Series as well as plain arrays, and
    # torch.tensor copies the data so the loader never aliases the caller's table.
    features = torch.tensor(np.asarray(X), dtype=torch.float32)
    # Targets keep the shape of `y` (1-D for a Series).
    # NOTE(review): a model emitting (batch, 1) would broadcast against (batch,)
    # targets in nn.MSELoss unless the training loop reshapes - confirm there.
    targets = torch.tensor(np.asarray(y), dtype=torch.float32)

    dataset = TensorDataset(features, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


logging.info("Creating DataLoaders...")

# One (train, val, test) loader triple per feature table, all with the default batch size.
tf_train_loader, tf_val_loader, tf_test_loader = (
    create_dataloader(X_tf_train, y_tf_train),
    create_dataloader(X_tf_val, y_tf_val),
    create_dataloader(X_tf_test, y_tf_test),
)

landmark_train_loader, landmark_val_loader, landmark_test_loader = (
    create_dataloader(X_landmark_train, y_landmark_train),
    create_dataloader(X_landmark_val, y_landmark_val),
    create_dataloader(X_landmark_test, y_landmark_test),
)

gene_train_loader, gene_val_loader, gene_test_loader = (
    create_dataloader(X_gene_train, y_gene_train),
    create_dataloader(X_gene_val, y_gene_val),
    create_dataloader(X_gene_test, y_gene_test),
)

2024-12-19 14:33:58,306 - INFO - Creating DataLoaders...


In [4]:
from models import CNNRegressor, MLPMixer, TransformerRegressor


# Feature sets to run, each mapped to its loaders in (train, val, test) order.
# Only the gene-level data is active; uncomment the other entries to include them.
gene_loaders = (gene_train_loader, gene_val_loader, gene_test_loader)
feature_sets = {
    # "TF Data": (tf_train_loader, tf_val_loader, tf_test_loader),
    # "Landmark Data": (landmark_train_loader, landmark_val_loader, landmark_test_loader),
    "Gene Data": gene_loaders,
}

# Settings stored under "model_params" for the FlexibleFCNN entry.
fcnn_model_params = {
    "hidden_dims": [512, 256, 128, 64],
    "output_dim": 1,
    "activation_fn": "prelu",
    "dropout_prob": 0.2,
    "residual": True,
    "norm_type": "batchnorm",
    "weight_init": "xavier",
}

# Settings stored under "train_params" for the FlexibleFCNN entry.
fcnn_train_params = {
    "epochs": 20,
    "gradient_clipping": 1.0,
    "early_stopping_patience": 10,
}

# One entry per model: class, constructor settings, loss, optimizer, LR scheduler
# and training-loop settings.
model_configs = {
    "FCNN_Model": {
        "model_class": FlexibleFCNN,
        "model_params": fcnn_model_params,
        "criterion": nn.MSELoss(),
        "optimizer_class": optim.AdamW,
        "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
        "scheduler_class": ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 5},
        "train_params": fcnn_train_params,
    },
    # "Transformer_Model": {
    #     "model_class": TransformerRegressor,
    #     "model_params": {
    #         "d_model": 128,
    #         "nhead": 4,
    #         "num_layers": 2,
    #         "dim_feedforward": 256,
    #         "dropout": 0.1,
    #         "output_dim": 1,
    #     },
    #     "criterion": nn.MSELoss(),
    #     "optimizer_class": optim.AdamW,
    #     "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
    #     "scheduler_class": ReduceLROnPlateau,
    #     "scheduler_params": {"mode": "min", "patience": 5},
    #     "train_params": {
    #         "epochs": 10,
    #         "gradient_clipping": 1.0,
    #         "early_stopping_patience": 10,
    #     },
    # },
    # "CNN_Model": {
    #     "model_class": CNNRegressor,
    #     "model_params": {
    #         "num_filters": 64,
    #         "kernel_size": 7,
    #         "num_layers": 3,
    #         "dropout_prob": 0.2,
    #         "output_dim": 1,
    #     },
    #     "criterion": nn.MSELoss(),
    #     "optimizer_class": optim.AdamW,
    #     "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
    #     "scheduler_class": ReduceLROnPlateau,
    #     "scheduler_params": {"mode": "min", "patience": 5},
    #     "train_params": {
    #         "epochs": 10,
    #         "gradient_clipping": 1.0,
    #         "early_stopping_patience": 10,
    #     },
    # },
}

In [5]:
# Build the pipeline from the feature-set loaders and the per-model configuration
# dictionaries (rather than from a single model_class / model_params pair).
pipeline = DLModelsPipeline(
    model_configs=model_configs,
    feature_sets=feature_sets,
)

In [6]:
# Run training and evaluation for every configured model / feature-set pair.
logging.info("Starting training and evaluation...")
pipeline.train_and_evaluate()

# Pull the collected metrics out of the pipeline as a table.
logging.info("Collecting results...")
results_df = pipeline.get_results()

# Persist the metrics next to the notebook (row index omitted).
results_df.to_csv("combined_metrics.csv", index=False)
logging.info("Results saved to combined_metrics.csv.")

2024-12-19 14:34:05,166 - INFO - Starting training and evaluation...
2024-12-19 14:34:31,593 - INFO - Epoch 1/20 - Model, Train Loss: 0.1780, Val Loss: 0.0369
2024-12-19 14:34:49,123 - INFO - Epoch 2/20 - Model, Train Loss: 0.0432, Val Loss: 0.0363
2024-12-19 14:35:06,758 - INFO - Epoch 3/20 - Model, Train Loss: 0.0347, Val Loss: 0.0355
2024-12-19 14:35:23,858 - INFO - Epoch 4/20 - Model, Train Loss: 0.0297, Val Loss: 0.0363
2024-12-19 14:35:41,558 - INFO - Epoch 5/20 - Model, Train Loss: 0.0254, Val Loss: 0.0373
2024-12-19 14:36:00,831 - INFO - Epoch 6/20 - Model, Train Loss: 0.0226, Val Loss: 0.0397
2024-12-19 14:36:19,724 - INFO - Epoch 7/20 - Model, Train Loss: 0.0208, Val Loss: 0.0390
2024-12-19 14:36:38,655 - INFO - Epoch 8/20 - Model, Train Loss: 0.0186, Val Loss: 0.0374
2024-12-19 14:36:57,118 - INFO - Epoch 9/20 - Model, Train Loss: 0.0169, Val Loss: 0.0400
2024-12-19 14:37:15,256 - INFO - Epoch 10/20 - Model, Train Loss: 0.0134, Val Loss: 0.0376
2024-12-19 14:37:36,170 - INFO

In [7]:
# Metrics where larger is better vs. metrics where smaller is better;
# the best value in each column is highlighted in green.
higher_is_better = ["R²", "Pearson Correlation"]
lower_is_better = ["MAE", "MSE"]

styled_results = results_df.style.format(precision=3)
styled_results = styled_results.set_caption("Regression Model Evaluation Metrics")
styled_results = styled_results.highlight_max(subset=higher_is_better, color="lightgreen")
styled_results = styled_results.highlight_min(subset=lower_is_better, color="lightgreen")
styled_results

Unnamed: 0_level_0,Unnamed: 1_level_0,MSE,MAE,R²,Pearson Correlation
Feature Set,Model Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gene Data,FCNN_Model,0.027,0.103,0.155,0.602


In [8]:
class SparseKnowledgeNetwork(nn.Module):
    def __init__(
        self,
        gene_tf_matrix: torch.Tensor,
        hidden_dims: list,
        output_dim: int = 1,
        first_activation: str = "tanh",  # Activation for gene-to-TF layer: Tanh or Sigmoid
        downstream_activation: str = "relu",  # Activation for downstream layers
        dropout_prob: float = 0.2,
        weight_init: str = "xavier",
        use_batchnorm: bool = True,
        enforce_sparsity: bool = False,
    ):
        """
        Knowledge-informed network whose first layer is initialised from a gene-TF prior.

        The first layer multiplies the input by a trainable (num_genes, num_tfs)
        weight matrix that starts as a copy of `gene_tf_matrix`. By default every
        entry of that matrix is trainable, including entries that are zero in the
        prior; set `enforce_sparsity=True` to keep those entries at zero.

        Args:
            gene_tf_matrix (torch.Tensor): Gene-TF connection matrix with values in
                (-1, 0, 1) and shape (num_genes, num_tfs).
            hidden_dims (list of int): Sizes of additional hidden layers after TF activations.
                May be empty, in which case the TF layer feeds the output layer directly.
            output_dim (int): Number of output features (1 for regression).
            first_activation (str): Activation function after the gene-to-TF layer (Tanh or Sigmoid).
            downstream_activation (str): Name of a function in `torch.nn.functional`
                used after every hidden layer (e.g., "relu").
            dropout_prob (float): Dropout probability.
            weight_init (str): Weight initialization method ("xavier" or "kaiming").
            use_batchnorm (bool): Whether to use batch normalization in downstream layers.
            enforce_sparsity (bool): If True, connections that are zero in
                `gene_tf_matrix` are masked out in the forward pass so they stay
                absent during training. Defaults to False (dense, fully trainable).

        Raises:
            ValueError: If `gene_tf_matrix` is not 2-D, or an activation or
                weight-initialization name is not recognised.
        """
        # Imported locally so the class does not depend on some other notebook
        # cell having bound the alias `F` before the model is instantiated.
        import torch.nn.functional as F

        super().__init__()

        if gene_tf_matrix.dim() != 2:
            raise ValueError(
                "gene_tf_matrix must be 2-D with shape (num_genes, num_tfs)."
            )

        # Copy so tuples are accepted and the caller's list is never aliased.
        layer_sizes = list(hidden_dims)
        self.hidden_dims = layer_sizes
        self.output_dim = output_dim
        self.dropout_prob = dropout_prob
        self.use_batchnorm = use_batchnorm

        # First activation function for gene-to-TF layer
        first_name = first_activation.lower()
        if first_name not in {"tanh", "sigmoid"}:
            raise ValueError("First activation must be 'tanh' or 'sigmoid'.")
        # torch.tanh / torch.sigmoid are the non-deprecated spellings of F.tanh / F.sigmoid.
        self.first_activation = getattr(torch, first_name)

        # Downstream activation function, looked up by name with a clear error.
        downstream_fn = getattr(F, downstream_activation.lower(), None)
        if not callable(downstream_fn):
            raise ValueError(
                f"Unknown downstream activation: {downstream_activation!r}"
            )
        self.downstream_activation = downstream_fn

        # Trainable gene-to-TF interaction weights (initialized with prior knowledge)
        prior = gene_tf_matrix.clone().float()
        self.gene_to_tf_weights = nn.Parameter(prior)

        # Optional fixed 0/1 mask of the prior's non-zero entries. Registered as a
        # buffer so it follows the module across devices; a None buffer is not
        # saved in the state dict, so default checkpoints keep their layout.
        mask = (prior != 0).float() if enforce_sparsity else None
        self.register_buffer("connection_mask", mask)

        # Define the hidden layers after the TF layer
        tf_dim = gene_tf_matrix.shape[1]  # Number of TFs
        dims = [tf_dim] + layer_sizes
        self.hidden_layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList() if use_batchnorm else None

        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            self.hidden_layers.append(nn.Linear(in_dim, out_dim))
            if use_batchnorm:
                self.batch_norms.append(nn.BatchNorm1d(out_dim))

        # Output layer
        self.output_layer = nn.Linear(dims[-1], output_dim)
        self.dropout = nn.Dropout(p=dropout_prob)

        # Initialize weights
        self._initialize_weights(weight_init)

    def _initialize_weights(self, method="xavier"):
        """
        Initialize the weights of the hidden and output linear layers.

        The gene-to-TF matrix is left untouched so it keeps its prior values.

        Args:
            method (str): "xavier" or "kaiming".

        Raises:
            ValueError: If `method` is neither "xavier" nor "kaiming".
        """
        for layer in list(self.hidden_layers) + [self.output_layer]:
            if method == "xavier":
                nn.init.xavier_uniform_(layer.weight)
            elif method == "kaiming":
                nn.init.kaiming_uniform_(layer.weight, nonlinearity="relu")
            else:
                raise ValueError(f"Unknown weight initialization method: {method}")
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """
        Forward pass for the knowledge-informed network.

        Args:
            x (torch.Tensor): Input tensor (gene expression data) whose last
                dimension equals the number of rows of the gene-TF matrix.

        Returns:
            torch.Tensor: Model output.
        """
        # Trainable gene-to-TF interaction layer (masked when sparsity is enforced,
        # which also zeroes the gradient of the masked entries).
        weights = self.gene_to_tf_weights
        if self.connection_mask is not None:
            weights = weights * self.connection_mask
        tf_activations = torch.matmul(x, weights)

        # Apply the first activation function (Tanh or Sigmoid)
        tf_activations = self.first_activation(tf_activations)

        # Apply additional hidden layers: linear -> (batchnorm) -> activation -> dropout
        hidden_activations = tf_activations
        for i, layer in enumerate(self.hidden_layers):
            hidden_activations = layer(hidden_activations)
            if self.use_batchnorm:
                hidden_activations = self.batch_norms[i](hidden_activations)
            hidden_activations = self.downstream_activation(hidden_activations)
            hidden_activations = self.dropout(hidden_activations)

        # Output layer
        output = self.output_layer(hidden_activations)
        return output

In [9]:
import decoupler as dc


def create_gene_tf_matrix(
    net: pd.DataFrame, genes: list = None, tfs: list = None
) -> torch.Tensor:
    """
    Creates a PyTorch tensor representing the gene-TF regulatory matrix from a network dataframe.

    Args:
        net (pd.DataFrame): DataFrame containing the regulatory network with the following columns:
            - "source": Transcription factors (TFs).
            - "target": Genes regulated by the TFs.
            - "weight": Interaction weight (1 for activation, -1 for inhibition).
        genes (list, optional): Genes to include, in the desired row order (any
            iterable of labels, e.g. a DataFrame's columns). Genes absent from
            `net` are dropped. If None, all unique genes in `net` are used, sorted.
        tfs (list, optional): TFs to include, in the desired column order. TFs
            absent from `net` are dropped. If None, all unique TFs in `net` are
            used, sorted.

    Returns:
        torch.Tensor: A float32 tensor of shape (num_genes, num_tfs) where:
            - `1` indicates an activating interaction.
            - `-1` indicates an inhibiting interaction.
            - `0` indicates no interaction.
        If the same (gene, TF) pair occurs several times in `net`, the last
        occurrence wins.

    Raises:
        ValueError: If `net` lacks any of the required columns.
    """
    # Validate input
    required_columns = {"source", "target", "weight"}
    if not required_columns.issubset(net.columns):
        raise ValueError(
            f"The `net` dataframe must contain the columns: {required_columns}"
        )

    # Sets give constant-time membership tests when filtering the requested labels.
    network_genes = set(net["target"].unique())
    network_tfs = set(net["source"].unique())

    if genes is None:
        genes = sorted(network_genes)
    else:
        # Filter out genes not in the network (caller's order is preserved).
        # Because of this the row count can be smaller than len(genes); callers
        # that align rows with feature columns should filter their data first.
        genes = [gene for gene in genes if gene in network_genes]

    if tfs is None:
        tfs = sorted(network_tfs)
    else:
        # Filter out TFs not in the network (caller's order is preserved).
        tfs = [tf for tf in tfs if tf in network_tfs]

    # Work on unique labels so positional lookup is well defined even if the
    # caller passed a label more than once.
    gene_labels = pd.Index(genes).unique()
    tf_labels = pd.Index(tfs).unique()

    # Keep only the last edge per (gene, TF) pair, so later rows override
    # earlier ones exactly as a row-by-row fill would.
    edges = net[["target", "source", "weight"]].drop_duplicates(
        subset=["target", "source"], keep="last"
    )

    # Translate labels to matrix positions; -1 marks labels that are not requested.
    row_pos = gene_labels.get_indexer(edges["target"])
    col_pos = tf_labels.get_indexer(edges["source"])
    selected = (row_pos >= 0) & (col_pos >= 0)

    # Zero everywhere (no interaction), then write all selected weights at once.
    unique_matrix = np.zeros((len(gene_labels), len(tf_labels)), dtype=float)
    unique_matrix[row_pos[selected], col_pos[selected]] = edges["weight"].to_numpy(
        dtype=float
    )[selected]

    # Expand back to the requested row/column order (repeats included).
    matrix = unique_matrix[gene_labels.get_indexer(genes)][:, tf_labels.get_indexer(tfs)]

    # Convert to a PyTorch tensor
    gene_to_tf_matrix = torch.tensor(matrix, dtype=torch.float32)
    return gene_to_tf_matrix


def filter_genes_to_collectri(dataset: pd.DataFrame, net: pd.DataFrame) -> pd.DataFrame:
    """
    Filters the dataset to include only genes present in the Collectri network.

    The surviving columns keep the order they have in `dataset`, so the result is
    identical across calls and across kernel sessions.

    Args:
        dataset (pd.DataFrame): Gene expression dataset (rows = samples, columns = genes).
        net (pd.DataFrame): Regulatory network dataframe with a "target" column containing gene names.

    Returns:
        pd.DataFrame: Filtered dataset containing only Collectri genes.
    """
    # Extract unique genes from the "target" column of the net dataframe
    collectri_genes = net["target"].unique()

    # A boolean column mask (rather than a set intersection) keeps the dataset's
    # own column order; set iteration order depends on string hashing and can
    # change between Python processes.
    in_network = dataset.columns.isin(collectri_genes)
    filtered_dataset = dataset.loc[:, in_network]

    logging.info(f"Filtered dataset shape: {filtered_dataset.shape}")
    return filtered_dataset




# Prior-knowledge TF -> target-gene network (CollecTRI), obtained through decoupler.
# The captured cell output shows a download from omnipathdb.org during this call.
net = dc.get_collectri(organism='human', split_complexes=False)

# Filter gene datasets to include only Collectri genes
X_gene_train = filter_genes_to_collectri(X_gene_train, net)
X_gene_val = filter_genes_to_collectri(X_gene_val, net)
X_gene_test = filter_genes_to_collectri(X_gene_test, net)

# The model reads features by position, so all splits must expose the same
# genes in the same order.
assert list(X_gene_val.columns) == list(X_gene_train.columns), (
    "Validation columns differ from training columns after filtering."
)
assert list(X_gene_test.columns) == list(X_gene_train.columns), (
    "Test columns differ from training columns after filtering."
)

# Create new dataloaders
gene_train_loader = create_dataloader(X_gene_train, y_gene_train)
gene_val_loader = create_dataloader(X_gene_val, y_gene_val)
gene_test_loader = create_dataloader(X_gene_test, y_gene_test)

# Create gene-TF matrix (rows follow the training columns)
gene_tf_matrix = create_gene_tf_matrix(net, X_gene_train.columns)

# create_gene_tf_matrix drops genes it cannot find in the network, so confirm
# that every feature column still has a matching matrix row.
assert gene_tf_matrix.shape[0] == X_gene_train.shape[1], (
    "Gene-TF matrix rows do not match the number of gene features."
)
gene_tf_matrix.shape

2024-12-19 14:38:28,424 - INFO - Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connectionpool.py", line 466, in _make_request
    self._validate_conn(conn)
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connectionpool.py", line 1095, in _validate_conn
    conn.connect()
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connection.py", line 730, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connection.py", line 909, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\util\ssl_.py", line 469, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, 

torch.Size([5576, 1186])

In [10]:
import torch.nn.functional as F
# Knowledge-informed model: gene-to-TF layer seeded from the prior matrix,
# followed by five progressively narrower hidden layers and a single output.
sparse_knowledge_model = SparseKnowledgeNetwork(
    gene_tf_matrix=gene_tf_matrix,
    hidden_dims=[512, 256, 128, 64, 32],
    output_dim=1,
)

In [13]:
from evaluation import evaluate_model
from training import train_model

# Loss and optimizer for the sparse knowledge model.
criterion = nn.MSELoss()
optimizer = optim.AdamW(
    sparse_knowledge_model.parameters(),
    lr=0.001,
    weight_decay=1e-4,
)

# Fit on the filtered gene loaders.
train_model(
    model=sparse_knowledge_model,
    train_loader=gene_train_loader,
    val_loader=gene_val_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    epochs=20,
    gradient_clipping=1.0,
    early_stopping_patience=10,
)

# Score the trained model on the held-out test loader.
sparse_knowledge_results = evaluate_model(
    model=sparse_knowledge_model,
    test_loader=gene_test_loader,
    criterion=criterion,
    device=device,
)

2024-12-19 15:01:45,412 - INFO - Epoch 1/20 - Model, Train Loss: 0.0356, Val Loss: 0.0396
2024-12-19 15:02:07,570 - INFO - Epoch 2/20 - Model, Train Loss: 0.0318, Val Loss: 0.0403
2024-12-19 15:02:29,897 - INFO - Epoch 3/20 - Model, Train Loss: 0.0298, Val Loss: 0.0390
2024-12-19 15:02:54,356 - INFO - Epoch 4/20 - Model, Train Loss: 0.0276, Val Loss: 0.0404
2024-12-19 15:03:16,173 - INFO - Epoch 5/20 - Model, Train Loss: 0.0247, Val Loss: 0.0400
2024-12-19 15:03:39,394 - INFO - Epoch 6/20 - Model, Train Loss: 0.0245, Val Loss: 0.0456
2024-12-19 15:04:00,600 - INFO - Epoch 7/20 - Model, Train Loss: 0.0227, Val Loss: 0.0405
2024-12-19 15:04:23,478 - INFO - Epoch 8/20 - Model, Train Loss: 0.0217, Val Loss: 0.0413
2024-12-19 15:04:41,375 - INFO - Epoch 9/20 - Model, Train Loss: 0.0206, Val Loss: 0.0432
2024-12-19 15:04:58,485 - INFO - Epoch 10/20 - Model, Train Loss: 0.0195, Val Loss: 0.0427
2024-12-19 15:05:15,186 - INFO - Epoch 11/20 - Model, Train Loss: 0.0188, Val Loss: 0.0407
2024-12-

In [12]:
# Display the metrics returned by evaluate_model for the sparse knowledge model
# on the gene test loader.
sparse_knowledge_results

{'MSE': 0.027327276876349633,
 'MAE': 0.11145621538162231,
 'R²': 0.1423114538192749,
 'Pearson Correlation': 0.5572795307373255}