# Create Drug–Target Pair Dataset
Build a dataset of samples where each row is:
- (drug_graph, target_embedding) → binding_label


In [1]:
#Import Libraries
import pandas as pd
import os

In [3]:
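# Function to prepare the drug-target pair dataset.
# This function reads the activity data and protein embeddings, filters them based on certain criteria,
# assigns a binary binding label, and saves the resulting (drug_id, target_id, label) pairs to a CSV file.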
def prepare_pair_dataset(
    activity_file="../data/step1_kinase_inhibitors_raw.csv",
    protein_embeddings_file="../data/step4_onehot_embeddings.csv",
    output_file="../data/step6_training_pairs.csv",
    ic50_cutoff=1000,  # nM (1 µM)
    graphs_dir="../data/graphs/",
):
    """Build and save the labelled drug-target pair table.

    Reads the activity records and the protein embedding table, keeps only
    IC50/Ki/Kd measurements whose target has an embedding and whose molecule
    has a saved ``<molecule_chembl_id>.pt`` graph file, assigns a binary
    label, and writes the de-duplicated pairs to ``output_file``.

    Args:
        activity_file: CSV with at least the columns ``activity_value``,
            ``activity_type``, ``target_chembl_id`` and ``molecule_chembl_id``.
        protein_embeddings_file: CSV with an ``id`` column formatted like
            ``P00519|CHEMBL1862|Tyrosine-protein``; the second ``|``-separated
            field is used as the target ChEMBL ID.
        output_file: Destination CSV path. Its parent directory is created
            if it does not exist.
        ic50_cutoff: Threshold applied to ``activity_value``; values strictly
            below it get label 1, all others label 0. Assumed to be in the
            same unit as ``activity_value`` (nM per the default's comment;
            units are not checked here).
        graphs_dir: Directory scanned for ``*.pt`` drug graph files.

    Returns:
        DataFrame with columns ``drug_id``, ``target_id`` and ``label``.
    """
    print("Loading data...")

    # Load files
    df = pd.read_csv(activity_file)
    protein_df = pd.read_csv(protein_embeddings_file)

    # Drop missing values and keep only the supported measurement types.
    # The explicit copy avoids assigning new columns onto a filtered view.
    df = df.dropna(subset=["activity_value", "target_chembl_id", "molecule_chembl_id"])
    df = df[df["activity_type"].isin(["IC50", "Ki", "Kd"])].copy()

    # Parse activity values in one vectorized pass. Unparseable entries become
    # NaN and are dropped (and reported) instead of aborting the whole run.
    df["activity_value"] = pd.to_numeric(df["activity_value"], errors="coerce")
    unparseable = int(df["activity_value"].isna().sum())
    if unparseable:
        print(f"Skipping {unparseable} rows with non-numeric activity_value")
        df = df.dropna(subset=["activity_value"])

    # Create binary label: 1 when the value is strictly below the cutoff.
    df["label"] = (df["activity_value"] < ic50_cutoff).astype(int)

    # Keep only valid protein IDs that we have embeddings for.
    # Extract ChEMBL IDs from the protein embeddings
    # (format: P00519|CHEMBL1862|Tyrosine-protein).
    valid_proteins = set(protein_df["id"].str.split("|").str[1].dropna())
    df = df[df["target_chembl_id"].isin(valid_proteins)]

    # Keep only drugs for which we have graphs. splitext removes just the
    # trailing ".pt" extension rather than every ".pt" inside the file name.
    available_drug_ids = {
        os.path.splitext(fname)[0]
        for fname in os.listdir(graphs_dir)
        if fname.endswith(".pt")
    }
    df = df[df["molecule_chembl_id"].isin(available_drug_ids)]

    # Final cleaned pair dataset.
    # NOTE(review): duplicates are removed on (drug, target, label), so a pair
    # measured on both sides of the cutoff keeps one row per label - confirm
    # that this is intended.
    pair_df = (
        df[["molecule_chembl_id", "target_chembl_id", "label"]]
        .drop_duplicates()
        .rename(columns={"molecule_chembl_id": "drug_id", "target_chembl_id": "target_id"})
    )

    # Create the directory the output file actually lives in, not a "data"
    # folder relative to the current working directory.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    pair_df.to_csv(output_file, index=False)
    print(f"✓ Saved {len(pair_df)} drug-target pairs to {output_file}")

    return pair_df

# Entry point: when executed directly (script or notebook cell), build the
# pair dataset using the default input/output paths and the default cutoff.
if __name__ == "__main__":
    prepare_pair_dataset()


Loading data...
✓ Saved 10584 drug-target pairs to ../data/step6_training_pairs.csv
