# Phase 2 - Unimodal Deep Learning Models
---

## Import all necessary libraries

In [1]:
## Import required libraries and modules
import sys
import os
import logging
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import importlib

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join("..", "src")))


from utils import load_config
from preprocess.preprocess import split_data
from models import FlexibleFCNN
from pipelines import DLModelsPipeline

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Read the project configuration (path is relative to the notebook's working directory)
config = load_config("../config.yaml")

# Root logger: timestamped, level-tagged records from DEBUG upwards
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)

# Run on the GPU when CUDA is usable; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logging.info(f"Using device: {device}")

2025-01-13 14:38:30,217 - INFO - Using device: cuda


In [8]:
## Load, Split, and Preprocess Dataset
# Configurable parameters (notebook-level names read by the loading calls further down)
sample_size = 1000  # Number of rows to sample from each dataset
chunk_size = 1000  # Chunk size for loading large datasets

# Load datasets
logging.info("Loading datasets with sampling...")


def load_sampled_data(
    file_path, sample_size, use_chunks=False, chunk_size=None, random_state=None
):
    """
    Load at most `sample_size` rows of a CSV file, optionally reading it in chunks.

    Without chunking, the first `sample_size` rows are read as-is (no random
    sampling). With chunking, rows are taken from the leading chunks until
    `sample_size` rows are collected; the rows taken from each chunk are drawn
    with `DataFrame.sample`, so their order within a chunk is shuffled.

    Args:
        file_path (str): Path to the dataset file.
        sample_size (int): Number of rows to sample.
        use_chunks (bool): Whether to load the dataset in chunks.
        chunk_size (int, optional): Size of chunks; required (>= 1) if
            `use_chunks` is True.
        random_state (optional): Forwarded to `DataFrame.sample` for every chunk
            so the chunked path can be made reproducible. The default (None)
            keeps the previous unseeded behaviour.

    Returns:
        pd.DataFrame: Sampled DataFrame. The chunked path returns an empty frame
        that still carries the file's columns when no rows were collected.

    Raises:
        ValueError: If `use_chunks` is True and `chunk_size` is missing or < 1,
            or `sample_size` is negative.
    """
    if not use_chunks:
        logging.info(f"Sampling {sample_size} rows from {file_path}...")
        return pd.read_csv(file_path, nrows=sample_size)

    # Without a chunk size, read_csv returns a plain DataFrame and the loop
    # below would iterate over column labels instead of row chunks.
    if chunk_size is None or chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer when use_chunks=True, got {chunk_size!r}"
        )
    if sample_size < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size!r}")

    logging.info(f"Loading {file_path} in chunks...")
    chunks = []
    total_loaded = 0
    # The context manager closes the underlying file even when we stop early.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        for chunk in reader:
            if total_loaded >= sample_size:
                break

            # Determine how many rows to sample from this chunk
            sample_rows = min(sample_size - total_loaded, len(chunk))
            chunks.append(chunk.sample(sample_rows, random_state=random_state))
            total_loaded += sample_rows

    if not chunks:
        # pd.concat rejects an empty list; return a header-only frame instead.
        return pd.read_csv(file_path, nrows=0)

    return pd.concat(chunks, axis=0)


# Load data with sampling
# The TF, landmark and best-inferred files use the default (non-chunked) path of
# load_sampled_data, i.e. pd.read_csv(..., nrows=sample_size). File locations come
# from the "data_paths" section of the loaded config.

tf_df = load_sampled_data(config["data_paths"]["preprocessed_tf_file"], sample_size)
landmark_df = load_sampled_data(
    config["data_paths"]["preprocessed_landmark_file"], sample_size
)
best_inferred_df = load_sampled_data(
    config["data_paths"]["preprocessed_best_inferred_file"], sample_size
)

# Only the gene file is read through the chunked path (use_chunks=True).
gene_df = load_sampled_data(
    config["data_paths"]["preprocessed_gene_file"],
    sample_size,
    use_chunks=True,
    chunk_size=chunk_size,
)

# Split Data
# Each dataset is split with the project helper split_data, using "viability" as
# the target and "cell_mfc_name" as the stratification column. The six returned
# values are unpacked as (X_train, y_train, X_val, y_val, X_test, y_test).
# NOTE(review): split proportions and any seeding are controlled inside split_data /
# config and are not visible in this notebook -- confirm there for reproducibility.
logging.info("Splitting datasets into train/val/test...")

# TF features
X_tf_train, y_tf_train, X_tf_val, y_tf_val, X_tf_test, y_tf_test = split_data(
    tf_df, target_name="viability", config=config, stratify_by="cell_mfc_name"
)
# Landmark features
(
    X_landmark_train,
    y_landmark_train,
    X_landmark_val,
    y_landmark_val,
    X_landmark_test,
    y_landmark_test,
) = split_data(
    landmark_df, target_name="viability", config=config, stratify_by="cell_mfc_name"
)
# Best-inferred features
(
    X_best_inferred_train,
    y_best_inferred_train,
    X_best_inferred_val,
    y_best_inferred_val,
    X_best_inferred_test,
    y_best_inferred_test,
) = split_data(
    best_inferred_df,
    target_name="viability",
    config=config,
    stratify_by="cell_mfc_name",
)
# Gene features
X_gene_train, y_gene_train, X_gene_val, y_gene_val, X_gene_test, y_gene_test = (
    split_data(
        gene_df, target_name="viability", config=config, stratify_by="cell_mfc_name"
    )
)

2025-01-13 14:56:57,869 - INFO - Loading datasets with sampling...
2025-01-13 14:56:57,872 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_tf.csv...
2025-01-13 14:56:58,115 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_landmark.csv...
2025-01-13 14:56:58,284 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_best_inferred.csv...
2025-01-13 14:57:00,907 - INFO - Loading ..\data/processed/preprocessed_gene.csv in chunks...
2025-01-13 14:57:07,397 - INFO - Splitting datasets into train/val/test...


Train Shape: (520, 683), Validation Shape: (222, 683), Test Shape: (258, 683)
Train Shape: (520, 979), Validation Shape: (222, 979), Test Shape: (258, 979)
Train Shape: (520, 10175), Validation Shape: (222, 10175), Test Shape: (258, 10175)
Train Shape: (520, 12329), Validation Shape: (222, 12329), Test Shape: (258, 12329)


In [9]:
def create_dataloader(X, y, batch_size=32, shuffle=True):
    """
    Wrap features and targets in a float32 TensorDataset and return a DataLoader.

    Args:
        X: Feature table. A pandas DataFrame or anything `np.asarray` can turn
            into a numeric array (e.g. a NumPy array).
        y: Targets, aligned with the rows of `X`. A pandas Series/DataFrame or
            array-like.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether the loader reshuffles the samples each epoch.
            Defaults to True (the previous hard-coded behaviour); pass False for
            validation/test loaders when a fixed sample order is wanted.

    Returns:
        DataLoader: Yields `(features, targets)` batches of float32 tensors.
    """
    # np.asarray accepts DataFrames/Series as well as plain arrays, so callers are
    # no longer required to pass objects exposing `.values`.
    features = torch.tensor(np.asarray(X), dtype=torch.float32)
    targets = torch.tensor(np.asarray(y), dtype=torch.float32)
    dataset = TensorDataset(features, targets)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Build one train/val/test DataLoader triple per feature set.
# All loaders use create_dataloader's defaults (batch size 32, shuffling on),
# which means the validation and test loaders are shuffled as well.
logging.info("Creating DataLoaders...")
# TF features
tf_train_loader = create_dataloader(X_tf_train, y_tf_train)
tf_val_loader = create_dataloader(X_tf_val, y_tf_val)
tf_test_loader = create_dataloader(X_tf_test, y_tf_test)

# Landmark features
landmark_train_loader = create_dataloader(X_landmark_train, y_landmark_train)
landmark_val_loader = create_dataloader(X_landmark_val, y_landmark_val)
landmark_test_loader = create_dataloader(X_landmark_test, y_landmark_test)

# Best-inferred features
best_inferred_train_loader = create_dataloader(X_best_inferred_train, y_best_inferred_train)
best_inferred_val_loader = create_dataloader(X_best_inferred_val, y_best_inferred_val)
best_inferred_test_loader = create_dataloader(X_best_inferred_test, y_best_inferred_test)

# Gene features
gene_train_loader = create_dataloader(X_gene_train, y_gene_train)
gene_val_loader = create_dataloader(X_gene_val, y_gene_val)
gene_test_loader = create_dataloader(X_gene_test, y_gene_test)

2025-01-13 14:57:13,483 - INFO - Creating DataLoaders...


In [10]:
# Map a display name for each feature set to its (train, val, test) loader triple.
# The keys appear as the "Feature Set" labels in the results table shown later.
feature_sets = {
    "TF Data": (tf_train_loader, tf_val_loader, tf_test_loader),
    "Landmark Data": (landmark_train_loader, landmark_val_loader, landmark_test_loader),
    "Best Inferred Data": (best_inferred_train_loader, best_inferred_val_loader, best_inferred_test_loader),
    "Gene Data": (gene_train_loader, gene_val_loader, gene_test_loader),
}

# Define your model configurations
# One entry per model name; each entry bundles the model class, its constructor
# keyword arguments, the loss, the optimizer/scheduler classes with their keyword
# arguments, and training-loop settings.
# NOTE(review): how each key is consumed is decided inside DLModelsPipeline, which
# is not visible here -- e.g. the input dimension is not listed in model_params and
# is presumably derived per feature set; confirm against the pipeline.
model_configs = {
    "FCNN_Model": {
        "model_class": FlexibleFCNN,
        "model_params": {
            "hidden_dims": [512, 256, 128, 64],
            "output_dim": 1,
            "activation_fn": "prelu",
            "dropout_prob": 0.2,
            "residual": True,
            "norm_type": "batchnorm",
            "weight_init": "xavier",
        },
        # A single loss instance is created here, when the config is defined.
        "criterion": nn.MSELoss(),
        "optimizer_class": optim.AdamW,
        "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
        "scheduler_class": ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 5},
        "train_params": {
            "epochs": 20,
            "gradient_clipping": 1.0,
            "early_stopping_patience": 10,
        },
    },
}

In [11]:
# Now initialize the pipeline using model_configs instead of model_class & model_params
# The pipeline receives the loader triples (feature_sets) and the per-model
# configuration (model_configs) defined in the previous cell.
pipeline = DLModelsPipeline(feature_sets=feature_sets, model_configs=model_configs)

In [12]:
# Train and evaluate the models
logging.info("Starting training and evaluation...")
pipeline.train_and_evaluate()

# Retrieve results
logging.info("Collecting results...")
results_df = pipeline.get_results()

# Save the results
# The results table rendered in the next cell shows "Feature Set" and "Model Name"
# as index levels, so writing with index=False would drop the only columns that
# identify each metrics row. Keep the index whenever it carries named levels; an
# unnamed default index is still left out, as before.
results_df.to_csv(
    "combined_metrics.csv",
    index=any(level_name is not None for level_name in results_df.index.names),
)
logging.info("Results saved to combined_metrics.csv.")

2025-01-13 14:57:32,904 - INFO - Starting training and evaluation...
  scaler = GradScaler(enabled=use_mixed_precision)
  with autocast(enabled=use_mixed_precision):
  with torch.no_grad(), autocast(enabled=use_mixed_precision):
2025-01-13 14:57:33,202 - INFO - Epoch 1/20 - Model, Train Loss: 1.8064, Val Loss: 1.0470
  5%|▌         | 1/20 [00:00<00:05,  3.55it/s]2025-01-13 14:57:33,367 - INFO - Epoch 2/20 - Model, Train Loss: 0.8240, Val Loss: 0.3460
 10%|█         | 2/20 [00:00<00:03,  4.71it/s]2025-01-13 14:57:33,538 - INFO - Epoch 3/20 - Model, Train Loss: 0.4714, Val Loss: 0.1874
 15%|█▌        | 3/20 [00:00<00:03,  5.17it/s]2025-01-13 14:57:33,716 - INFO - Epoch 4/20 - Model, Train Loss: 0.3498, Val Loss: 0.1111
 20%|██        | 4/20 [00:00<00:02,  5.36it/s]2025-01-13 14:57:33,883 - INFO - Epoch 5/20 - Model, Train Loss: 0.3078, Val Loss: 0.1128
 25%|██▌       | 5/20 [00:00<00:02,  5.57it/s]2025-01-13 14:57:34,041 - INFO - Epoch 6/20 - Model, Train Loss: 0.2981, Val Loss: 0.1021
 

In [13]:
# Build the display table step by step; every Styler call returns the same styler.
styled_results = results_df.style.format(precision=3)
styled_results = styled_results.set_caption("Regression Model Evaluation Metrics")

# Higher is better for the correlation-type metrics
styled_results = styled_results.highlight_max(
    subset=["R²", "Pearson Correlation"], color="lightgreen"
)
# Lower is better for the error metrics
styled_results = styled_results.highlight_min(
    subset=["MAE", "MSE"], color="lightgreen"
)
styled_results

Unnamed: 0_level_0,Unnamed: 1_level_0,MSE,MAE,R²,Pearson Correlation
Feature Set,Model Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Best Inferred Data,FCNN_Model,0.044,0.154,0.314,0.573
Gene Data,FCNN_Model,0.036,0.147,0.443,0.677
Landmark Data,FCNN_Model,0.039,0.148,0.401,0.636
TF Data,FCNN_Model,0.05,0.174,0.233,0.495


In [9]:
import decoupler as dc


def create_gene_tf_matrix(
    net: pd.DataFrame, genes: list = None, tfs: list = None
) -> torch.Tensor:
    """
    Creates a PyTorch tensor representing the gene-TF regulatory matrix from a network dataframe.

    Args:
        net (pd.DataFrame): DataFrame containing the regulatory network with the following columns:
            - "source": Transcription factors (TFs).
            - "target": Genes regulated by the TFs.
            - "weight": Interaction weight (1 for activation, -1 for inhibition).
        genes (list, optional): Genes to include, in row order. Entries that do not occur in
            `net["target"]` are dropped. If None, all unique genes in `net` are used (sorted).
        tfs (list, optional): TFs to include, in column order. Entries that do not occur in
            `net["source"]` are dropped. If None, all unique TFs in `net` are used (sorted).

    Returns:
        torch.Tensor: A float32 tensor of shape (num_genes, num_tfs) holding the interaction
        weight for each (gene, TF) pair and `0` where `net` lists no interaction. If a pair
        occurs more than once in `net`, the last occurrence wins.

    Raises:
        ValueError: If `net` lacks any of the "source", "target" or "weight" columns.
    """
    # Validate input
    required_columns = {"source", "target", "weight"}
    if not required_columns.issubset(net.columns):
        raise ValueError(
            f"The `net` dataframe must contain the columns: {required_columns}"
        )

    # Use all unique genes and TFs if not provided
    available_genes = sorted(net["target"].unique())
    available_tfs = sorted(net["source"].unique())

    if genes is None:
        genes = available_genes
    else:
        # Filter out genes not in the network (set lookup instead of list scan)
        known_genes = set(available_genes)
        genes = [gene for gene in genes if gene in known_genes]

    if tfs is None:
        tfs = available_tfs
    else:
        # Filter out TFs not in the network (set lookup instead of list scan)
        known_tfs = set(available_tfs)
        tfs = [tf for tf in tfs if tf in known_tfs]

    gene_labels = pd.Index(genes)
    tf_labels = pd.Index(tfs)
    # Position lookups need unique labels; repeated requests are expanded at the end.
    unique_genes = gene_labels.unique()
    unique_tfs = tf_labels.unique()

    # Keep only edges between requested genes and TFs. Dropping earlier duplicates
    # makes "last occurrence wins" explicit, since NumPy does not define which value
    # survives when one cell is assigned twice in a single indexed assignment.
    edges = net.loc[
        net["target"].isin(unique_genes) & net["source"].isin(unique_tfs),
        ["target", "source", "weight"],
    ].drop_duplicates(subset=["target", "source"], keep="last")

    # Zeros mean "no interaction"; fill every edge weight in one vectorised step
    dense = np.zeros((len(unique_genes), len(unique_tfs)), dtype=float)
    dense[
        unique_genes.get_indexer(edges["target"]),
        unique_tfs.get_indexer(edges["source"]),
    ] = edges["weight"].to_numpy(dtype=float)

    # Restore the requested row/column layout (a no-op when labels are unique)
    dense = dense[unique_genes.get_indexer(gene_labels)][
        :, unique_tfs.get_indexer(tf_labels)
    ]

    # Convert the matrix to a PyTorch tensor
    gene_to_tf_matrix = torch.tensor(dense, dtype=torch.float32)
    return gene_to_tf_matrix


def filter_genes_to_collectri(dataset: pd.DataFrame, net: pd.DataFrame) -> pd.DataFrame:
    """
    Filters the dataset to include only genes present in the Collectri network.

    Columns are kept in the dataset's own order. Selecting through a Python set
    (as done previously) made the column order depend on string hashing, which can
    differ between interpreter sessions and would silently change the feature
    order fed to downstream code.

    Args:
        dataset (pd.DataFrame): Gene expression dataset (rows = samples, columns = genes).
        net (pd.DataFrame): Regulatory network dataframe with a "target" column containing gene names.

    Returns:
        pd.DataFrame: Filtered dataset containing only the columns whose name occurs
        in `net["target"]`, in their original order.
    """
    # Extract unique genes from the "target" column of the net dataframe
    collectri_genes = net["target"].unique()

    # Boolean mask over the dataset's columns: True where the column is a Collectri gene
    is_collectri_gene = dataset.columns.isin(collectri_genes)

    # Select the matching columns without disturbing their order
    filtered_dataset = dataset.loc[:, is_collectri_gene]

    logging.info(f"Filtered dataset shape: {filtered_dataset.shape}")
    return filtered_dataset




# Fetch the CollecTRI regulatory network for human via decoupler.
# NOTE(review): the captured output shows a download from omnipathdb.org, so this
# call appears to need network access -- consider caching the result to disk.
net = dc.get_collectri(organism='human', split_complexes=False)

# Filter gene datasets to include only Collectri genes
# NOTE: this rebinds X_gene_train / X_gene_val / X_gene_test, so the unfiltered
# gene splits are no longer reachable under these names after this cell runs.
X_gene_train = filter_genes_to_collectri(X_gene_train, net)
X_gene_val = filter_genes_to_collectri(X_gene_val, net)
X_gene_test = filter_genes_to_collectri(X_gene_test, net)

# Create new dataloaders
# These replace the gene loaders created earlier; the `feature_sets` dict built
# before this cell still holds the old loader objects.
gene_train_loader = create_dataloader(X_gene_train, y_gene_train)
gene_val_loader = create_dataloader(X_gene_val, y_gene_val)
gene_test_loader = create_dataloader(X_gene_test, y_gene_test)

# Create gene-TF matrix
# Rows follow the column order of the filtered training set; columns are all TFs
# in `net` because no `tfs` argument is passed.
gene_tf_matrix = create_gene_tf_matrix(net, X_gene_train.columns)
gene_tf_matrix.shape

2024-12-19 14:38:28,424 - INFO - Downloading data from `https://omnipathdb.org/queries/enzsub?format=json`
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connectionpool.py", line 466, in _make_request
    self._validate_conn(conn)
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connectionpool.py", line 1095, in _validate_conn
    conn.connect()
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connection.py", line 730, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\connection.py", line 909, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
  File "c:\Users\20191678\AppData\Local\miniconda3\envs\5ARG45\lib\site-packages\urllib3\util\ssl_.py", line 469, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, 

torch.Size([5576, 1186])

In [17]:
# Define your model configurations
from models import SparseKnowledgeNetwork


# Configuration for the knowledge-informed model.
# NOTE: this rebinds `model_configs`, replacing the FCNN configuration defined
# earlier in the notebook; re-running earlier cells afterwards will pick up
# whichever definition was executed last.
model_configs = {
    "SparseKnowledgeNetwork_Model": {
        "model_class": SparseKnowledgeNetwork,
        "model_params": {
            # Placeholder only: nothing in this cell assigns the gene-TF matrix.
            # TODO confirm where it is filled in before the model is constructed.
            "gene_tf_matrix": None,  # To be dynamically set
            "hidden_dims": [512, 256, 128, 64],
            "output_dim": 1,
            "first_activation": "tanh",  # Matches biological interpretation of TF interactions
            "downstream_activation": "prelu",
            "dropout_prob": 0.2,
            "residual": True,
            "norm_type": "batchnorm",
            "weight_init": "xavier",
        },
        # Loss, optimizer, scheduler and training settings mirror the FCNN entry.
        "criterion": nn.MSELoss(),
        "optimizer_class": optim.AdamW,
        "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
        "scheduler_class": ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 5},
        "train_params": {
            "epochs": 20,
            "gradient_clipping": 1.0,
            "early_stopping_patience": 10,
        },
    },
}