In [1]:
## Import required libraries and modules
import sys
import os
import logging
import pandas as pd
import importlib

# Add the src directory to the Python path.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
src_dir = os.path.abspath(os.path.join("..", "src"))
if src_dir not in sys.path:
    sys.path.append(src_dir)


from utils import load_config
from preprocess import split_data

# Project settings are read from the config file one directory above this notebook.
config = load_config("../config.yaml")

# Root logger: emit records from DEBUG upwards as "<timestamp> - <level> - <message>".
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.DEBUG)

In [2]:
## Load, Split, and Preprocess Dataset
# Tunable parameters for this cell
sample_size = 1_000  # rows to keep from each dataset
chunk_size = 1_000  # rows per chunk when a large file is read incrementally

# Announce the start of the loading stage
logging.info("Loading datasets with sampling...")


def load_sampled_data(
    file_path, sample_size, use_chunks=False, chunk_size=None, random_state=None
):
    """
    Load at most `sample_size` rows from a CSV file.

    Two strategies are available:

    * ``use_chunks=False`` (default): the first `sample_size` rows of the file
      are read via ``nrows``. This is a deterministic head of the file, not a
      random sample.
    * ``use_chunks=True``: the file is streamed chunk by chunk and rows are
      drawn at random from each successive chunk until `sample_size` rows have
      been collected. Reading stops as soon as enough rows are gathered, so
      only the leading part of the file is ever visited.

    Args:
        file_path (str): Path to the dataset file.
        sample_size (int): Maximum number of rows to return.
        use_chunks (bool): Whether to load the dataset in chunks.
        chunk_size (int, optional): Rows per chunk when `use_chunks` is True.
            Defaults to `sample_size` when omitted.
        random_state (optional): Forwarded to ``DataFrame.sample`` for every
            chunk so the chunked sample can be reproduced. Ignored when
            `use_chunks` is False.

    Returns:
        pd.DataFrame: Up to `sample_size` rows (fewer if the file is shorter).
        In chunked mode an empty frame carrying the file's columns is returned
        when no rows are requested or available.
    """
    if not use_chunks:
        logging.info(f"Sampling {sample_size} rows from {file_path}...")
        return pd.read_csv(file_path, nrows=sample_size)

    logging.info(f"Loading {file_path} in chunks...")

    # Nothing to sample: return just the header instead of failing in pd.concat.
    if sample_size <= 0:
        return pd.read_csv(file_path, nrows=0)

    # Without a chunk size read_csv would return a DataFrame, and iterating it
    # would yield column names rather than row chunks.
    if chunk_size is None:
        chunk_size = sample_size

    chunks = []
    total_loaded = 0
    # The context manager closes the underlying file even when we stop early.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        for chunk in reader:
            # Determine how many rows to sample from this chunk
            sample_rows = min(sample_size - total_loaded, len(chunk))
            if sample_rows == 0:
                continue  # empty chunk, nothing to draw from

            chunks.append(chunk.sample(sample_rows, random_state=random_state))
            total_loaded += sample_rows

            # Stop before the reader parses another (unneeded) chunk.
            if total_loaded >= sample_size:
                break

    if not chunks:
        # File contained a header but no data rows.
        return pd.read_csv(file_path, nrows=0)

    return pd.concat(chunks, axis=0)


# Read each preprocessed table, keeping at most `sample_size` rows of each.
_data_paths = config["data_paths"]

tf_df = load_sampled_data(_data_paths["preprocessed_tf_file"], sample_size)
landmark_df = load_sampled_data(_data_paths["preprocessed_landmark_file"], sample_size)
best_inferred_df = load_sampled_data(
    _data_paths["preprocessed_best_inferred_file"], sample_size
)

# The gene table goes through the chunked loading path.
gene_df = load_sampled_data(
    _data_paths["preprocessed_gene_file"],
    sample_size,
    use_chunks=True,
    chunk_size=chunk_size,
)

# Partition every table into train / validation / test pieces.
logging.info("Splitting datasets into train/val/test...")

# Every split uses the same target column, config and stratification column.
_split_kwargs = dict(
    target_name="viability", config=config, stratify_by="cell_mfc_name"
)

X_tf_train, y_tf_train, X_tf_val, y_tf_val, X_tf_test, y_tf_test = split_data(
    tf_df, **_split_kwargs
)

(
    X_landmark_train,
    y_landmark_train,
    X_landmark_val,
    y_landmark_val,
    X_landmark_test,
    y_landmark_test,
) = split_data(landmark_df, **_split_kwargs)

(
    X_best_inferred_train,
    y_best_inferred_train,
    X_best_inferred_val,
    y_best_inferred_val,
    X_best_inferred_test,
    y_best_inferred_test,
) = split_data(best_inferred_df, **_split_kwargs)

(
    X_gene_train,
    y_gene_train,
    X_gene_val,
    y_gene_val,
    X_gene_test,
    y_gene_test,
) = split_data(gene_df, **_split_kwargs)

2025-01-21 14:15:11,314 - INFO - Loading datasets with sampling...
2025-01-21 14:15:11,316 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_tf.csv...
2025-01-21 14:15:11,550 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_landmark.csv...
2025-01-21 14:15:11,754 - INFO - Sampling 1000 rows from ..\data/processed/preprocessed_best_inferred.csv...
2025-01-21 14:15:14,471 - INFO - Loading ..\data/processed/preprocessed_gene.csv in chunks...
2025-01-21 14:15:21,553 - INFO - Splitting datasets into train/val/test...
2025-01-21 14:15:21,558 - INFO - Train Groups: 22 unique values.
2025-01-21 14:15:21,558 - INFO - Validation Groups: 6 unique values.
2025-01-21 14:15:21,558 - INFO - Test Groups: 4 unique values.
2025-01-21 14:15:21,603 - INFO - Train Groups: 22 unique values.
2025-01-21 14:15:21,604 - INFO - Validation Groups: 6 unique values.
2025-01-21 14:15:21,606 - INFO - Test Groups: 4 unique values.
2025-01-21 14:15:21,718 - INFO - Train Groups: 22 unique va

In [3]:
import networkx as nx


def load_ontology(file_path):
    """
    Build a directed graph from a tab-separated ontology file.

    Each non-blank line must hold exactly three tab-separated fields:
    ``parent``, ``child`` and a relationship label. Every line becomes one
    edge ``parent -> child`` whose ``relationship`` attribute stores the label.
    Blank lines (such as a trailing newline at the end of the file) are
    skipped.

    Args:
        file_path (str): Path to the ontology text file.

    Returns:
        nx.DiGraph: Graph containing one edge per parsed line.

    Raises:
        ValueError: If a non-blank line does not contain exactly three
            tab-separated fields; the message names the file and line number.
    """
    dG = nx.DiGraph()
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            record = line.strip()
            if not record:
                continue  # tolerate blank lines instead of failing on unpack

            fields = record.split("\t")
            if len(fields) != 3:
                raise ValueError(
                    f"{file_path}: line {line_no}: expected 3 tab-separated "
                    f"fields (parent, child, relationship), got {len(fields)}"
                )

            parent, child, rel_type = fields
            dG.add_edge(parent, child, relationship=rel_type)
    return dG


# Build the ontology graph from the raw ontology file
ontology_file = "../data/raw/drugcell_ont.txt"
dG = load_ontology(ontology_file)

# Sanity-check the size of the resulting graph
print(f"Number of nodes: {dG.number_of_nodes()}")
print(f"Number of edges: {dG.number_of_edges()}")

Number of nodes: 5094
Number of edges: 62920


In [9]:
# Leaf nodes of the ontology graph (no outgoing edges) are taken to be genes.
genes_in_ontology = {node for node, out_deg in dG.out_degree() if out_deg == 0}

# Columns of gene_df that do not appear among the ontology leaves.
# NOTE(review): every column of gene_df is compared, so non-gene columns such as
# "viability" and "cell_mfc_name" are presumably counted here too - confirm.
missing_genes = set(gene_df.columns).difference(genes_in_ontology)
if len(missing_genes) > 0:
    print(f"Warning: {len(missing_genes)} genes are missing from the ontology.")

# Number of gene_df columns that are not in missing_genes
print(len(gene_df.columns) - len(missing_genes))

2683
