In [8]:
import pandas as pd

path = "../data/chembl/chembl_raw_pralsetinib_targets.csv"
df = pd.read_csv(path, sep=";")   # <-- key fix
df
#df.isna().mean().sort_values(ascending=False).head(15)

Unnamed: 0,ChEMBL ID,Name,Accessions,Type,Organism,Compounds,Activities,Tax ID,Species Group Flag
0,CHEMBL612558,ADMET,,ADMET,,142422,496635,,False
1,CHEMBL612545,Unchecked,,UNCHECKED,,782405,2291439,,False
2,CHEMBL4523582,Replicase polyprotein 1ab,P0DTD1,SINGLE PROTEIN,Severe acute respiratory syndrome coronavirus 2,11766,17929,2697049.0,False
3,CHEMBL4303835,SARS-CoV-2,,ORGANISM,Severe acute respiratory syndrome coronavirus 2,11952,40586,2697049.0,False
4,CHEMBL395,HepG2,,CELL-LINE,Homo sapiens,182794,201750,9606.0,False
5,CHEMBL3430904,Coiled-coil domain-containing protein 6/Tyrosi...,P07949|Q16204,CHIMERIC PROTEIN,Homo sapiens,65,91,9606.0,False
6,CHEMBL1974,Receptor-type tyrosine-protein kinase FLT3,P36888,SINGLE PROTEIN,Homo sapiens,8546,19473,9606.0,False
7,CHEMBL3430888,Kinesin-1 heavy chain/ Tyrosine-protein kinase...,P07949|P33176,CHIMERIC PROTEIN,Homo sapiens,146,173,9606.0,False
8,CHEMBL2971,Tyrosine-protein kinase JAK2,O60674,SINGLE PROTEIN,Homo sapiens,17559,56207,9606.0,False
9,CHEMBL614524,BaF3,,CELL-LINE,Mus musculus,2557,6945,10090.0,False


In [9]:
df_protein = df[df["Type"].str.contains("SINGLE PROTEIN", case=False, na=False)]
df_human = df_protein[df_protein["Organism"].str.contains("Homo sapiens", na=False)]
df_human.shape

df_human[["ChEMBL ID", "Name", "Organism", "Type"]]

Unnamed: 0,ChEMBL ID,Name,Organism,Type
6,CHEMBL1974,Receptor-type tyrosine-protein kinase FLT3,Homo sapiens,SINGLE PROTEIN
8,CHEMBL2971,Tyrosine-protein kinase JAK2,Homo sapiens,SINGLE PROTEIN
10,CHEMBL2041,Proto-oncogene tyrosine-protein kinase recepto...,Homo sapiens,SINGLE PROTEIN


After filtering, left with three human protein targets that have measured bioactivity data for pralsetinib in ChEMBL. These are real, protein-level interactions, not cell lines or ADMET panels.
- Target type = SINGLE PROTEIN
- Organism = Homo sapiens


**Receptor-type tyrosine-protein kinase FLT3**
- A receptor tyrosine kinase (like RET)
- Known oncogenic kinase
- Common off-target for kinase inhibitors
- This is a plausible off-target

**Tyrosine-protein kinase JAK2**
- Central to cytokine and inflammatory signaling
- Frequently linked to: cardiovascular effects, blood pressure regulation, immune-mediated toxicity
- Interesting off-target especially for hypertension

**Proto-oncogene tyrosine-protein kinase receptor RET**
- The intended on-target
- Confirms your pipeline is correct; validates  approach

In [13]:
df_human[["ChEMBL ID", "Name", "Accessions"]]

import re

def parse_uniprot(accessions):
    if pd.isna(accessions):
        return None
    # split on common delimiters
    parts = re.split(r"[,\|;\s]+", str(accessions).strip())
    parts = [p for p in parts if p]
    return parts[0] if parts else None

df_human["uniprot_id"] = df_human["Accessions"].apply(parse_uniprot)
df_human[["ChEMBL ID", "Name", "uniprot_id"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_human["uniprot_id"] = df_human["Accessions"].apply(parse_uniprot)


Unnamed: 0,ChEMBL ID,Name,uniprot_id
6,CHEMBL1974,Receptor-type tyrosine-protein kinase FLT3,P36888
8,CHEMBL2971,Tyrosine-protein kinase JAK2,O60674
10,CHEMBL2041,Proto-oncogene tyrosine-protein kinase recepto...,P07949


In [15]:
targets = df_human[["ChEMBL ID", "Name", "Organism", "Type", "uniprot_id"]].copy()
targets = targets.drop_duplicates().reset_index(drop=True)

targets_path = "../data/chembl/pralsetinib_targets_clean.csv"
targets.to_csv(targets_path, index=False)
targets_path

'../data/chembl/pralsetinib_targets_clean.csv'

In [20]:
import gzip

goa_path = "../data/ontologies/goa_human.gaf.gz"
targets_to_check = {"P36888", "P07949", "O60674"}

found = {t: 0 for t in targets_to_check}

with gzip.open(goa_path, "rt") as f:
    for line in f:
        if line.startswith("!"):
            continue
        parts = line.rstrip("\n").split("\t")
        acc = parts[1].strip()  # DB_Object_ID
        if acc in found:
            found[acc] += 1

found

{'O60674': 435, 'P36888': 141, 'P07949': 132}

In [17]:
import gzip

goa_path = "../data/ontologies/goa_human.gaf.gz"
uniprots = set(targets["uniprot_id"].dropna().tolist())

rows = []
with gzip.open(goa_path, "rt") as f:
    for line in f:
        if line.startswith("!"):
            continue
        parts = line.strip().split("\t")
        db_object_id = parts[1]      # UniProt accession
        go_id = parts[4]             # GO:xxxxxxx
        aspect = parts[8]            # P/F/C
        if db_object_id in uniprots:
            rows.append({
                "uniprot_id": db_object_id,
                "go_id": go_id,
                "aspect": aspect
            })

go_df = pd.DataFrame(rows).drop_duplicates()
go_df.head(), go_df.shape

(  uniprot_id       go_id aspect
 0     O60674  GO:0032760      P
 2     O60674  GO:1901731      P
 3     O60674  GO:0007167      P
 4     O60674  GO:0007259      P
 5     O60674  GO:0008285      P,
 (240, 3))

In [22]:
go_out = "../data/ontologies/processed/pralsetinib_targets_goa.csv"
go_df.to_csv(go_out, index=False)
go_out

'../data/ontologies/processed/pralsetinib_targets_goa.csv'

In [24]:
phenos = pd.DataFrame([
    {"mesh_term": "Hypertension", "category": "cardiovascular"},
    {"mesh_term": "Drug-Induced Liver Injury", "category": "hepatic"},
])
phenos_path = "../data/ontologies/processed/phenotypes_of_interest.csv"
phenos.to_csv(phenos_path, index=False)
phenos_path

'../data/ontologies/processed/phenotypes_of_interest.csv'

**Nodes:**
- pralsetinib (CHEMBL4582651)
- RET, FLT3, JAK2 (proteins)
- GO terms 
- phenotypes

**Edges:**
- drug -> protein (from ChEMBL target list)
- protein -> GO (from GOA / UniProt)
- (later) GO -> phenotype

In [26]:
drug_id = "CHEMBL4582651"
drug_name = "PRALSETINIB"

edges = []
for _, r in targets.iterrows():
    edges.append({
        "source_id": drug_id,
        "source_type": "drug",
        "edge_type": "binds_or_inhibits",
        "target_id": r["uniprot_id"],
        "target_type": "protein",
        "target_name": r["Name"],
        "evidence": "ChEMBL targets export"
    })

edges_df = pd.DataFrame(edges)
edges_path = "../data/processed/kg_edges_drug_protein.csv"
edges_df.to_csv(edges_path, index=False)
edges_path

'../data/processed/kg_edges_drug_protein.csv'

In [27]:
df_human[["Name","Accessions","uniprot_id"]]

Unnamed: 0,Name,Accessions,uniprot_id
6,Receptor-type tyrosine-protein kinase FLT3,P36888,P36888
8,Tyrosine-protein kinase JAK2,O60674,O60674
10,Proto-oncogene tyrosine-protein kinase recepto...,P07949,P07949
