# Phase 2 - Unimodal Deep Learning Models
---

## Import all necessary libraries

In [1]:
## Import required libraries and modules
import sys
import os
import logging
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import importlib

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join("..", "src")))


from utils import load_config
from preprocess.preprocess import split_data
from models import FlexibleFCNN
from pipelines import DLModelsPipeline

from torch.optim.lr_scheduler import ReduceLROnPlateau

# Load Config
config = load_config("../config.yaml")

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

2025-01-14 08:30:34,067 - INFO - Using device: cuda


In [2]:
## Load, Split, and Preprocess Datase
# Configurable parameters
sample_size = 1000  # Number of rows to sample from each dataset
chunk_size = 1000  # Chunk size for loading large datasets

# Load datasets
logging.info("Loading datasets with sampling...")


def load_sampled_data(file_path, sample_size, use_chunks=False, chunk_size=None):
    """
    Load and sample a dataset, with optional chunked loading for large files.

    Args:
        file_path (str): Path to the dataset file.
        sample_size (int): Number of rows to sample.
        use_chunks (bool): Whether to load the dataset in chunks.
        chunk_size (int, optional): Size of chunks if `use_chunks` is True.

    Returns:
        pd.DataFrame: Sampled DataFrame.
    """
    if use_chunks:
        logging.info(f"Loading {file_path} in chunks...")
        chunks = []
        total_loaded = 0
        for chunk in pd.read_csv(file_path, chunksize=chunk_size):
            if total_loaded >= sample_size:
                break

            # Determine how many rows to sample from this chunk
            sample_rows = min(sample_size - total_loaded, len(chunk))
            chunks.append(chunk.sample(sample_rows))
            total_loaded += sample_rows

        sampled_df = pd.concat(chunks, axis=0)
        del chunks  # Free memory
    else:
        logging.info(f"Sampling {sample_size} rows from {file_path}...")
        sampled_df = pd.read_csv(file_path, nrows=sample_size)

    return sampled_df


# Load data with sampling

tf_df = load_sampled_data(config["data_paths"]["preprocessed_tf_file"], sample_size)
landmark_df = load_sampled_data(
    config["data_paths"]["preprocessed_landmark_file"], sample_size
)
best_inferred_df = load_sampled_data(
    config["data_paths"]["preprocessed_best_inferred_file"], sample_size
)

gene_df = load_sampled_data(
    config["data_paths"]["preprocessed_gene_file"],
    sample_size,
    use_chunks=True,
    chunk_size=chunk_size,
)

# Split Data
logging.info("Splitting datasets into train/val/test...")

X_tf_train, y_tf_train, X_tf_val, y_tf_val, X_tf_test, y_tf_test = split_data(
    tf_df, target_name="viability", config=config, stratify_by="cell_mfc_name"
)
(
    X_landmark_train,
    y_landmark_train,
    X_landmark_val,
    y_landmark_val,
    X_landmark_test,
    y_landmark_test,
) = split_data(
    landmark_df, target_name="viability", config=config, stratify_by="cell_mfc_name"
)
(
    X_best_inferred_train,
    y_best_inferred_train,
    X_best_inferred_val,
    y_best_inferred_val,
    X_best_inferred_test,
    y_best_inferred_test,
) = split_data(
    best_inferred_df,
    target_name="viability",
    config=config,
    stratify_by="cell_mfc_name",
)
X_gene_train, y_gene_train, X_gene_val, y_gene_val, X_gene_test, y_gene_test = (
    split_data(
        gene_df, target_name="viability", config=config, stratify_by="cell_mfc_name"
    )
)

2025-01-14 08:31:06,157 - INFO - Loading datasets with sampling...
2025-01-14 08:31:06,157 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_tf.csv...
2025-01-14 08:31:06,514 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_landmark.csv...
2025-01-14 08:31:06,686 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_best_inferred.csv...
2025-01-14 08:31:09,361 - INFO - Loading ..\data/processed/preprocessed_gene.csv in chunks...
2025-01-14 08:31:15,974 - INFO - Splitting datasets into train/val/test...
2025-01-14 08:31:15,990 - INFO - Train Groups: 22 unique values.
2025-01-14 08:31:15,993 - INFO - Validation Groups: 6 unique values.
2025-01-14 08:31:15,994 - INFO - Test Groups: 4 unique values.
2025-01-14 08:31:16,010 - INFO - Train Groups: 22 unique values.
2025-01-14 08:31:16,011 - INFO - Validation Groups: 6 unique values.
2025-01-14 08:31:16,011 - INFO - Test Groups: 4 unique values.
2025-01-14 08:31:16,107 - INFO - Train Groups: 22 unique va

In [3]:
def create_dataloader(X, y, batch_size=32):
    # Ensuring X and y are pandas DataFrames/Series:
    # If they are arrays, adjust accordingly.
    dataset = TensorDataset(
        torch.tensor(X.values, dtype=torch.float32),
        torch.tensor(y.values, dtype=torch.float32),
    )
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)


logging.info("Creating DataLoaders...")
tf_train_loader = create_dataloader(X_tf_train, y_tf_train)
tf_val_loader = create_dataloader(X_tf_val, y_tf_val)
tf_test_loader = create_dataloader(X_tf_test, y_tf_test)

landmark_train_loader = create_dataloader(X_landmark_train, y_landmark_train)
landmark_val_loader = create_dataloader(X_landmark_val, y_landmark_val)
landmark_test_loader = create_dataloader(X_landmark_test, y_landmark_test)

best_inferred_train_loader = create_dataloader(X_best_inferred_train, y_best_inferred_train)
best_inferred_val_loader = create_dataloader(X_best_inferred_val, y_best_inferred_val)
best_inferred_test_loader = create_dataloader(X_best_inferred_test, y_best_inferred_test)

gene_train_loader = create_dataloader(X_gene_train, y_gene_train)
gene_val_loader = create_dataloader(X_gene_val, y_gene_val)
gene_test_loader = create_dataloader(X_gene_test, y_gene_test)

2025-01-14 08:31:40,024 - INFO - Creating DataLoaders...


In [7]:
feature_sets = {
    "TF Data": (tf_train_loader, tf_val_loader, tf_test_loader),
    "Landmark Data": (landmark_train_loader, landmark_val_loader, landmark_test_loader),
    "Best Inferred Data": (best_inferred_train_loader, best_inferred_val_loader, best_inferred_test_loader),
    "Gene Data": (gene_train_loader, gene_val_loader, gene_test_loader),
}

# Define your model configurations
model_configs = {
    "FCNN_Model": {
        "model_class": FlexibleFCNN,
        "model_params": {
            "input_dim": None,
            "hidden_dims": [512, 256, 128, 64],
            "output_dim": 1,
            "activation_fn": "prelu",
            "dropout_prob": 0.2,
            "residual": True,
            "norm_type": "batchnorm",
            "weight_init": "xavier",
        },
        "criterion": nn.MSELoss(),
        "optimizer_class": optim.AdamW,
        "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
        "scheduler_class": ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 5},
        "train_params": {
            "epochs": 20,
            "gradient_clipping": 1.0,
            "early_stopping_patience": 10,
        },
    },
}

In [11]:
# Now initialize the pipeline using model_configs instead of model_class & model_params
pipeline = DLModelsPipeline(feature_sets=feature_sets, model_configs=model_configs)

In [12]:
# Train and evaluate the models
logging.info("Starting training and evaluation...")
pipeline.train_and_evaluate(device=device)

# Retrieve results
logging.info("Collecting results...")
results_df = pipeline.get_results()

# Save the results
results_df.to_csv("combined_metrics.csv", index=False)
logging.info("Results saved to combined_metrics.csv.")

2025-01-14 08:37:01,512 - INFO - Starting training and evaluation...


TypeError: DLModelsPipeline.train_and_evaluate() got an unexpected keyword argument 'device'

In [10]:
styled_results = (
    results_df.style.format(precision=3)
    .set_caption("Regression Model Evaluation Metrics")
    .highlight_max(
        subset=["R²", "Pearson Correlation"], color="lightgreen"
    )
    .highlight_min(subset=["MAE", "MSE"], color="lightgreen")
)
styled_results

Unnamed: 0_level_0,Unnamed: 1_level_0,MSE,MAE,R²,Pearson Correlation
Feature Set,Model Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Best Inferred Data,FCNN_Model,0.039,0.141,0.397,0.645
Gene Data,FCNN_Model,0.04,0.155,0.384,0.621
Landmark Data,FCNN_Model,0.044,0.151,0.328,0.588
TF Data,FCNN_Model,0.048,0.172,0.26,0.52


In [None]:
import decoupler as dc


def create_gene_tf_matrix(
    net: pd.DataFrame, genes: list = None, tfs: list = None
) -> torch.Tensor:
    """
    Creates a PyTorch tensor representing the gene-TF regulatory matrix from a network dataframe.

    Args:
        net (pd.DataFrame): DataFrame containing the regulatory network with the following columns:
            - "source": Transcription factors (TFs).
            - "target": Genes regulated by the TFs.
            - "weight": Interaction weight (1 for activation, -1 for inhibition).
        genes (list, optional): List of genes to include in the matrix. If None, all unique genes in `net` are used.
        tfs (list, optional): List of TFs to include in the matrix. If None, all unique TFs in `net` are used.

    Returns:
        torch.Tensor: A tensor of shape (num_genes, num_tfs) where:
            - `1` indicates an activating interaction.
            - `-1` indicates an inhibiting interaction.
            - `0` indicates no interaction.
    """
    # Validate input
    required_columns = {"source", "target", "weight"}
    if not required_columns.issubset(net.columns):
        raise ValueError(
            f"The `net` dataframe must contain the columns: {required_columns}"
        )

    # Use all unique genes and TFs if not provided
    available_genes = sorted(net["target"].unique())
    available_tfs = sorted(net["source"].unique())

    if genes is None:
        genes = available_genes
    else:
        # Filter out genes not in the network
        genes = [gene for gene in genes if gene in available_genes]

    if tfs is None:
        tfs = available_tfs
    else:
        # Filter out TFs not in the network
        tfs = [tf for tf in tfs if tf in available_tfs]

    # Initialize a DataFrame with zeros (default for no interaction)
    gene_to_tf_df = pd.DataFrame(0, index=genes, columns=tfs, dtype=float)

    # Populate the DataFrame with interaction weights
    for _, row in net.iterrows():
        gene = row["target"]
        tf = row["source"]
        weight = row["weight"]
        if gene in genes and tf in tfs:
            gene_to_tf_df.at[gene, tf] = weight

    # Convert the DataFrame to a PyTorch tensor
    gene_to_tf_matrix = torch.tensor(gene_to_tf_df.values, dtype=torch.float32)
    return gene_to_tf_matrix


# Define gene-TF matrix generator for SparseKnowledgeNetwork
def gene_tf_matrix_generator(feature_name, train_loader):
    if feature_name in {"Gene Data", "Best Inferred Data", "Landmark Data"}:
        # Extract gene names from training dataset
        gene_names = train_loader.dataset.dataset.columns
        return create_gene_tf_matrix(net, genes=list(gene_names))
    return None  # No gene-TF matrix needed for other feature sets


def filter_genes_to_collectri(dataset: pd.DataFrame, net: pd.DataFrame) -> pd.DataFrame:
    """
    Filters the dataset to include only genes present in the Collectri network.

    Args:
        dataset (pd.DataFrame): Gene expression dataset (rows = samples, columns = genes).
        net (pd.DataFrame): Regulatory network dataframe with a "target" column containing gene names.

    Returns:
        pd.DataFrame: Filtered dataset containing only Collectri genes.
    """
    # Extract unique genes from the "target" column of the net dataframe
    collectri_genes = net["target"].unique()

    # Find the intersection of dataset columns and Collectri genes
    intersecting_genes = set(dataset.columns).intersection(collectri_genes)

    # Check if any genes are found; warn if not
    if not intersecting_genes:
        logging.warning("No overlapping genes between dataset and Collectri network.")

    # Filter the dataset to include only the intersecting genes
    filtered_dataset = dataset[list(intersecting_genes)]

    logging.info(
        f"Filtered dataset shape: {filtered_dataset.shape} (Retained {len(intersecting_genes)} genes)"
    )
    return filtered_dataset


net = dc.get_collectri(organism='human', split_complexes=False)

# Filter gene datasets to include only Collectri genes
X_gene_train = filter_genes_to_collectri(X_gene_train, net)
X_gene_val = filter_genes_to_collectri(X_gene_val, net)
X_gene_test = filter_genes_to_collectri(X_gene_test, net)

X_best_inferred_train = filter_genes_to_collectri(X_best_inferred_train, net)
X_best_inferred_val = filter_genes_to_collectri(X_best_inferred_val, net)
X_best_inferred_test = filter_genes_to_collectri(X_best_inferred_test, net)

X_landmark_train = filter_genes_to_collectri(X_landmark_train, net)
X_landmark_val = filter_genes_to_collectri(X_landmark_val, net)
X_landmark_test = filter_genes_to_collectri(X_landmark_test, net)

# Create new dataloaders
gene_train_loader = create_dataloader(X_gene_train, y_gene_train)
gene_val_loader = create_dataloader(X_gene_val, y_gene_val)
gene_test_loader = create_dataloader(X_gene_test, y_gene_test)

best_inferred_train_loader = create_dataloader(X_best_inferred_train, y_best_inferred_train)
best_inferred_val_loader = create_dataloader(X_best_inferred_val, y_best_inferred_val)
best_inferred_test_loader = create_dataloader(X_best_inferred_test, y_best_inferred_test)

landmark_train_loader = create_dataloader(X_landmark_train, y_landmark_train)
landmark_val_loader = create_dataloader(X_landmark_val, y_landmark_val)
landmark_test_loader = create_dataloader(X_landmark_test, y_landmark_test)

In [17]:
import decoupler as dc
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

from models import SparseKnowledgeNetwork

# Define feature sets with dataloader tuples
feature_sets = {
    "Landmark Data": (landmark_train_loader, landmark_val_loader, landmark_test_loader),
    "Best Inferred Data": (
        best_inferred_train_loader,
        best_inferred_val_loader,
        best_inferred_test_loader,
    ),
    "Gene Data": (gene_train_loader, gene_val_loader, gene_test_loader),
}

# Define model configurations
model_configs = {
    "FlexibleFCNN": {
        "model_class": FlexibleFCNN,
        "model_params": {
            "hidden_dims": [512, 256, 128, 64],
            "output_dim": 1,
            "activation_fn": "relu",
            "dropout_prob": 0.2,
            "residual": True,
            "norm_type": "batchnorm",
            "weight_init": "xavier",
        },
        "criterion": nn.MSELoss(),
        "optimizer_class": optim.AdamW,
        "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
        "scheduler_class": ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 5},
        "train_params": {
            "epochs": 20,
            "gradient_clipping": 1.0,
            "early_stopping_patience": 10,
        },
    },
    "SparseKnowledgeNetwork": {
        "model_class": SparseKnowledgeNetwork,
        "model_params": {
            "hidden_dims": [512, 256, 128, 64],
            "output_dim": 1,
            "first_activation": "tanh",
            "downstream_activation": "relu",
            "dropout_prob": 0.2,
            "weight_init": "xavier",
            "use_batchnorm": True,
        },
        "criterion": nn.MSELoss(),
        "optimizer_class": optim.AdamW,
        "optimizer_params": {"lr": 0.001, "weight_decay": 1e-4},
        "scheduler_class": ReduceLROnPlateau,
        "scheduler_params": {"mode": "min", "patience": 5},
        "train_params": {
            "epochs": 20,
            "gradient_clipping": 1.0,
            "early_stopping_patience": 10,
        },
        "gene_tf_matrix_generator": gene_tf_matrix_generator,
    },
}

# Initialize and run the pipeline
pipeline = DLModelsPipeline(feature_sets, model_configs)
pipeline.train_and_evaluate()

# Retrieve results
results_df = pipeline.get_results()
print(results_df)