In [None]:
import os
import sys

# Add the src directory to the Python path.
# os.path.abspath resolves the relative "../src" against the current working
# directory, so the entry that gets appended depends on where the kernel was
# started.  NOTE(review): presumably this exists so project modules under src/
# can be imported; no such import is visible in this section -- confirm.
sys.path.append(os.path.abspath(os.path.join("..", "src")))

import pandas as pd

def _read_tsv(path):
    """Read the tab-separated text file at ``path`` into a DataFrame."""
    return pd.read_csv(path, sep="\t")


# Primary data files (stored under ../data/raw/CTRP)
viability_data = _read_tsv("../data/raw/CTRP/v20.data.per_cpd_post_qc.txt")
auc_data = _read_tsv("../data/raw/CTRP/v20.data.curves_post_qc.txt")

# Metadata files: one table each for compounds, cell lines and experiments
compound_meta = _read_tsv("../data/raw/CTRP/v20.meta.per_compound.txt")
cell_line_meta = _read_tsv("../data/raw/CTRP/v20.meta.per_cell_line.txt")
experiment_meta = _read_tsv("../data/raw/CTRP/v20.meta.per_experiment.txt")

# Perturbation metadata
pert_meta = _read_tsv("../data/raw/compound_perturbation_metadata.txt")

In [2]:
# Attach compound metadata to the viability rows. The left join keeps every
# row of viability_data, including rows whose master_cpd_id has no metadata.
pv_comp = pd.merge(viability_data, compound_meta, how="left", on="master_cpd_id")

# Attach cell line metadata to the experiment rows, keyed on master_ccl_id.
exp_cell = pd.merge(experiment_meta, cell_line_meta, how="left", on="master_ccl_id")

In [3]:
# Keep only the first row encountered for each experiment_id, which makes
# experiment_id a unique key of the experiment / cell line table.
exp_cell_deduplicated = exp_cell.drop_duplicates(subset="experiment_id", keep="first")

In [4]:
# Report (rows, columns) of both merge inputs, one per line, before joining them
print(pv_comp.shape, exp_cell_deduplicated.shape, sep="\n")

(6171005, 16)
(907, 14)


In [5]:
# Attach experiment / cell line metadata to every viability row.
# exp_cell_deduplicated holds at most one row per experiment_id, so this left
# join has to be many-to-one.  validate= makes pandas raise a MergeError if the
# right-hand key ever stops being unique, instead of silently multiplying
# viability rows.
final_data = pv_comp.merge(
    exp_cell_deduplicated,
    on="experiment_id",
    how="left",
    validate="many_to_one",
)

# A many-to-one left join keeps exactly the rows of the left frame.
assert len(final_data) == len(pv_comp), "merge changed the number of viability rows"

# Display the shape of the resulting dataframe
final_data.shape

(6171005, 29)

In [6]:
# Give the columns the names that pert_meta uses, so both tables can be
# filtered and joined on identically named columns.
column_aliases = {
    "cpd_conc_umol": "pert_dose",  # dose column, used as a join key
    "ccl_name": "cell_mfc_name",  # cell line name, as named in pert_meta
    "broad_cpd_id": "pert_mfc_id",  # compound identifier
    "cpd_pred_pv": "viability_score",  # viability score for model training
}
final_data = final_data.rename(columns=column_aliases)

In [7]:
# Collect the distinct compound ids and cell line names that occur in
# pert_meta; these define which rows of the viability table are relevant.
relevant_drugs, relevant_cell_lines = (
    pert_meta[column].unique() for column in ("pert_mfc_id", "cell_mfc_name")
)

In [8]:
# Keep a row only when both its compound and its cell line occur in pert_meta
drug_is_relevant = final_data["pert_mfc_id"].isin(relevant_drugs)
cell_line_is_relevant = final_data["cell_mfc_name"].isin(relevant_cell_lines)
filtered_final_data = final_data[drug_is_relevant & cell_line_is_relevant]

# Show how many rows survive the filter
print(f"Original final_data shape: {final_data.shape}")
print(f"Filtered final_data shape: {filtered_final_data.shape}")

Original final_data shape: (6171005, 29)
Filtered final_data shape: (660133, 29)


In [9]:
# Inner-join pert_meta with the filtered viability table. A pair of rows is
# kept only when dose, cell line name and compound id are equal on both sides.
# NOTE(review): pert_dose is compared by exact equality -- if the two sources
# record doses as floats with different precision, pairs that should match
# will be missed; confirm.
join_keys = ["pert_dose", "cell_mfc_name", "pert_mfc_id"]
direct_matches = pert_meta.merge(filtered_final_data, how="inner", on=join_keys)

In [10]:
# Compare the size of pert_meta with the number of rows produced by the direct match
print(
    "\n".join(
        [
            f"Total rows in pert_meta: {len(pert_meta)}",
            f"Direct matches: {len(direct_matches)}",
        ]
    )
)

Total rows in pert_meta: 1311972
Direct matches: 5997


In [11]:
# Distinct (cell line, compound, dose) combinations that occur in pert_meta
triplet_columns = ["cell_mfc_name", "pert_mfc_id", "pert_dose"]
unique_triplets = pert_meta[triplet_columns].drop_duplicates()

# One row per distinct combination, so the row count is the number of triplets
num_unique_triplets = unique_triplets.shape[0]

print(
    f"Number of unique (cell_mfc_name, pert_mfc_id, pert_dose) triplets: {num_unique_triplets}"
)

Number of unique (cell_mfc_name, pert_mfc_id, pert_dose) triplets: 491368


In [14]:
# Load the tab-separated Y table (read_table uses "\t" as its default separator)
Y = pd.read_table("../data/raw/Y.tsv")

In [15]:
# Display the (rows, columns) shape of the Y table loaded from ../data/raw/Y.tsv
Y.shape

(31567, 7)

In [3]:
# Add a column to pert_meta for the viability score and set it to 1 for all rows.
# The same constant is written to every row; none of the CTRP matches computed
# in the cells above are used here.
# NOTE(review): looks like a placeholder value -- confirm before relying on it.
pert_meta["viability"] = 1

# Save the pert_meta dataframe to the original file location and overwrite the file.
# This is the same path pert_meta was read from, so the raw input is replaced on
# disk and any later load of it will already contain the "viability" column.
# NOTE(review): consider keeping an untouched copy of the raw file -- confirm
# that overwriting it is intended.
pert_meta.to_csv("../data/raw/compound_perturbation_metadata.txt", sep="\t", index=False)