In [1]:
import os
import sys

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join("..", "src")))

import pandas as pd

# Load primary data files
viability_data = pd.read_csv('../data/raw/CTRP/v20.data.per_cpd_post_qc.txt', sep='\t')
auc_data = pd.read_csv("../data/raw/CTRP/v20.data.curves_post_qc.txt", sep="\t")

# Load metadata files
compound_meta = pd.read_csv("../data/raw/CTRP/v20.meta.per_compound.txt", sep="\t")
cell_line_meta = pd.read_csv("../data/raw/CTRP/v20.meta.per_cell_line.txt", sep="\t")
experiment_meta = pd.read_csv("../data/raw/CTRP/v20.meta.per_experiment.txt", sep="\t")

# Load perturbation metadata
pert_meta = pd.read_csv("../data/raw/compound_perturbation_metadata.txt", sep="\t")

In [2]:
# Merge viability data with compound metadata on master_cpd_id
pv_comp = viability_data.merge(compound_meta, on="master_cpd_id", how="left")

# Merge experiment metadata with cell line meta data on master_ccl_id
exp_cell = experiment_meta.merge(cell_line_meta, on="master_ccl_id", how="left")

In [3]:
# Deduplicate the experiment metadata by experiment_id
exp_cell_deduplicated = exp_cell.drop_duplicates(subset=["experiment_id"])

In [4]:
# Check shape of the dataframes before merging
print(pv_comp.shape)
print(exp_cell_deduplicated.shape)

(6171005, 16)
(907, 14)


In [5]:
# Merge the dataframes on experiment_id
final_data = pv_comp.merge(exp_cell_deduplicated, on="experiment_id", how="left")

# Display the resulting dataframe
final_data.shape

(6171005, 29)

In [6]:
final_data

Unnamed: 0,experiment_id,cpd_pv_errorbar,cpd_pred_pv,cpd_avg_pv,cpd_conc_umol,master_cpd_id,cpd_name,broad_cpd_id,top_test_conc_umol,cpd_status,...,baseline_signal,cells_per_well,growth_mode,snp_fp_status,master_ccl_id,ccl_name,ccl_availability,ccle_primary_site,ccle_primary_hist,ccle_hist_subtype_1
0,1,0.000058,1.0000,0.9303,0.00030,1788,CIL55,BRD-K46556387,10.0,probe,...,0.2225,500,adherent,SNP-matched-reference,130,CAS1,ccle;public,central_nervous_system,glioma,astrocytoma_Grade_IV
1,1,0.000058,1.0000,0.8337,0.00061,1788,CIL55,BRD-K46556387,10.0,probe,...,0.2225,500,adherent,SNP-matched-reference,130,CAS1,ccle;public,central_nervous_system,glioma,astrocytoma_Grade_IV
2,1,0.000058,1.0000,1.0460,0.00120,1788,CIL55,BRD-K46556387,10.0,probe,...,0.2225,500,adherent,SNP-matched-reference,130,CAS1,ccle;public,central_nervous_system,glioma,astrocytoma_Grade_IV
3,1,0.000058,1.0000,1.0910,0.00240,1788,CIL55,BRD-K46556387,10.0,probe,...,0.2225,500,adherent,SNP-matched-reference,130,CAS1,ccle;public,central_nervous_system,glioma,astrocytoma_Grade_IV
4,1,0.000058,1.0000,1.0190,0.00490,1788,CIL55,BRD-K46556387,10.0,probe,...,0.2225,500,adherent,SNP-matched-reference,130,CAS1,ccle;public,central_nervous_system,glioma,astrocytoma_Grade_IV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6171000,907,0.000000,0.9187,1.0330,4.20000,710154,AT-406,BRD-K02834582,66.0,probe,...,0.0548,500,adherent,SNP-not-tested,155826,YAMATO,collaborator,,,
6171001,907,0.000000,0.9187,0.8442,8.30000,710154,AT-406,BRD-K02834582,66.0,probe,...,0.0548,500,adherent,SNP-not-tested,155826,YAMATO,collaborator,,,
6171002,907,0.000000,0.9187,0.9270,17.00000,710154,AT-406,BRD-K02834582,66.0,probe,...,0.0548,500,adherent,SNP-not-tested,155826,YAMATO,collaborator,,,
6171003,907,0.000000,0.9187,0.9251,33.00000,710154,AT-406,BRD-K02834582,66.0,probe,...,0.0548,500,adherent,SNP-not-tested,155826,YAMATO,collaborator,,,


In [7]:
# Rename columns in final_data for clarity
final_data = final_data.rename(
    columns={
        "cpd_conc_umol": "pert_dose",  # Dose column for merging
        "ccl_name": "cell_mfc_name",  # Match naming convention in pert_meta
        "broad_cpd_id": "pert_mfc_id",  # Match compound identifier
        "cpd_pred_pv": "viability_score",  # Viability score for model training
    }
)

In [9]:
# Filter out irrelevant drugs and cell lines
relevant_drugs = pert_meta["pert_mfc_id"].unique()
relevant_cell_lines = pert_meta["cell_mfc_name"].unique()

In [10]:
# Filter the final data
filtered_final_data = final_data[
    (final_data["pert_mfc_id"].isin(relevant_drugs))
    & (final_data["cell_mfc_name"].isin(relevant_cell_lines))
]

print(f"Original final_data shape: {final_data.shape}")
print(f"Filtered final_data shape: {filtered_final_data.shape}")

Original final_data shape: (6171005, 29)
Filtered final_data shape: (660133, 29)


In [12]:
# Perform the direct match
direct_matches = pert_meta.merge(
    filtered_final_data,
    left_on=["pert_dose", "cell_mfc_name", "pert_mfc_id"],
    right_on=["pert_dose", "cell_mfc_name", "pert_mfc_id"],
    how="inner",
)

In [13]:
print(f"Total rows in pert_meta: {len(pert_meta)}")
print(f"Direct matches: {len(direct_matches)}")

Total rows in pert_meta: 1311972
Direct matches: 5997


In [14]:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit


# Sigmoid function
def sigmoid(x, x0, k, ymin, ymax):
    return ymin + (ymax - ymin) / (1 + np.exp(-k * (x - x0)))


def fit_sigmoid(dose, viability):
    """
    Fits a sigmoid curve to the dose-response data.

    Args:
        dose (array-like): Dose values.
        viability (array-like): Viability scores corresponding to doses.

    Returns:
        popt (array): Optimized parameters [x0, k, ymin, ymax].
        success (bool): Whether the fit was successful.
    """
    try:
        # Initial guesses: IC50 (midpoint), slope, min viability, max viability
        initial_guess = [np.median(dose), 1, np.min(viability), np.max(viability)]
        popt, _ = curve_fit(sigmoid, dose, viability, p0=initial_guess, maxfev=10000)
        return popt, True
    except RuntimeError:
        # Curve fitting failed
        return None, False

In [16]:
# Group data by cell line and compound
grouped = filtered_final_data.groupby(["cell_mfc_name", "pert_mfc_id"])

# Store fitted parameters
sigmoid_params = {}

for (cell, drug), group in grouped:
    dose = group["pert_dose"].values
    viability = group["viability_score"].values

    # Fit sigmoid curve
    popt, success = fit_sigmoid(dose, viability)
    if success:
        sigmoid_params[(cell, drug)] = popt

  return ymin + (ymax - ymin) / (1 + np.exp(-k * (x - x0)))
  popt, _ = curve_fit(sigmoid, dose, viability, p0=initial_guess, maxfev=10000)


KeyboardInterrupt: 

In [None]:
def predict_viability(row):
    """
    Predicts viability using fitted sigmoid parameters.

    Args:
        row (Series): A row from pert_meta.

    Returns:
        float: Predicted viability score or NaN if no fit is available.
    """
    params = sigmoid_params.get((row["cell_mfc_name"], row["pert_mfc_id"]))
    if params:
        x0, k, ymin, ymax = params
        return sigmoid(row["pert_dose"], x0, k, ymin, ymax)
    return np.nan


# Apply predictions to unmatched rows
pert_meta["viability_score"] = pert_meta.apply(
    predict_viability, axis=1
)

In [None]:
# Combine direct matches and interpolated rows
final_pert_meta = pd.concat([direct_matches, unmatched_pert_meta], ignore_index=True)