In [1]:
from pathlib import Path
import xml.etree.ElementTree as ET
import pandas as pd

XML_PATH = Path("../data/raw/drugbank_full_database.xml")
TARGET_ID = "DB15822"

def strip_ns(tag: str) -> str:
    return tag.split("}", 1)[-1] if "}" in tag else tag

def find_drug_by_id(xml_path: Path, target_id: str):
    context = ET.iterparse(str(xml_path), events=("start", "end"))
    _, root = next(context)  # root element

    for event, elem in context:
        if event == "end" and strip_ns(elem.tag) == "drug":
            # collect drugbank ids
            ids = []
            for child in elem:
                if strip_ns(child.tag) == "drugbank-id" and child.text:
                    ids.append(child.text.strip())

            if target_id in ids:
                return elem  # found!

            root.clear()  # free memory

    return None

drug_elem = find_drug_by_id(XML_PATH, TARGET_ID)
if drug_elem is None:
    raise FileNotFoundError(f"Could not find {TARGET_ID} in {XML_PATH}")
print("Found drug element.")

Found drug element.


In [2]:
def child_text(elem, tag_name, default=None):
    for c in list(elem):
        if strip_ns(c.tag) == tag_name:
            return c.text.strip() if c.text else default
    return default

def find_child(elem, tag_name):
    for c in list(elem):
        if strip_ns(c.tag) == tag_name:
            return c
    return None

def iter_children(elem, tag_name):
    for c in list(elem):
        if strip_ns(c.tag) == tag_name:
            yield c

### Core “drug” table

In [3]:
pd.set_option('display.max_colwidth', 500)

def parse_drug_core(drug):
    primary_id = None
    all_ids = []
    for c in list(drug):
        if strip_ns(c.tag) == "drugbank-id" and c.text:
            did = c.text.strip()
            all_ids.append(did)
            if c.attrib.get("primary") == "true":
                primary_id = did
    if primary_id is None and all_ids:
        primary_id = all_ids[0]

    groups = []
    groups_elem = find_child(drug, "groups")
    if groups_elem is not None:
        for g in list(groups_elem):
            if strip_ns(g.tag) == "group" and g.text:
                groups.append(g.text.strip())

    return {
        "drugbank_id": primary_id,
        "name": child_text(drug, "name"),
        "type": drug.attrib.get("type"),
        "description": child_text(drug, "description"),
        "indication": child_text(drug, "indication"),
        "pharmacodynamics": child_text(drug, "pharmacodynamics"),
        "mechanism_of_action": child_text(drug, "mechanism-of-action"),
        "toxicity": child_text(drug, "toxicity"),
        "groups": groups,
    }

df_drug = pd.DataFrame([parse_drug_core(drug_elem)])
df_drug

Unnamed: 0,drugbank_id,name,type,description,indication,pharmacodynamics,mechanism_of_action,toxicity,groups
0,DB15822,Pralsetinib,small molecule,"Pralsetinib, similar to the previously approved [selpercatinib], is a kinase inhibitor with enhanced specificity for RET tyrosine kinase receptors (RTKs) over other RTK classes.[A202055, A219751, L15986] Enhanced RET (Rearranged during transfection) oncogene expression is a hallmark of many cancers, including non-small cell lung cancer. Although multikinase inhibitors, including [cabozantinib], [ponatinib], [sorafenib], [sunitinib], and [vandetanib], have shown efficacy in RET-driven cancers...","Pralsetinib is indicated for the treatment of metastatic non-small cell lung cancer (NSCLC) in adult patients who are confirmed to possess a rearranged during transfection (RET) gene fusion, as determined by an FDA approved test.[L15986] It is also indicated in adult and pediatric patients 12 years of age and older for the treatment of advanced or metastatic _RET_ fusion-positive thyroid cancer who require systemic therapy and for whom radioactive iodine is not appropriate.[L47905] The indic...","Pralsetinib exerts an anti-tumour effect through specific inhibition of the rearranged during transfection (RET) tyrosine kinase, including multiple distinct oncogenic RET fusions, mutated RET kinase domains harbouring gatekeeper mutations, and in RET kinases with a variety of activating single point mutations.[A202049, A202046, A202055, A219746, L15986] Due to pralsetinib's high selectivity for RET over other kinases, both _in vitro_ and _in vivo_,[A219751] pralsetinib has been described as...","Rearranged during transfection (RET) is a transmembrane receptor tyrosine kinase containing extracellular, transmembrane, and intracellular domains whose activity is required for normal kidney and nervous system development.[A202061, A202055] Constitutive RET activation is achieved through chromosomal rearrangements producing 5' fusions of dimerizable domains to the 3' _RET_ tyrosine kinase domain leading to constitutive dimerization and subsequent autophosphorylation; the most common fusion...","Pralsetinib administered to rats at 20 mg/kg (roughly 2.5-3.6 times the recommended human exposure) resulted in resorption of litters in pregnant female mice in 92% of pregnancies (82% complete resorption); resorption occurred at doses as low as 5 mg/kg (0.3 times the recommended human exposure). Both male and female rats given 10 mg/kg pralsetinib or more had observable degeneration within the testis/ovaries. In 28-day rat and monkey studies, once-daily pralsetinib resulted in histological ...","[approved, investigational]"


### Targets / enzymes / transporters / carriers (as entities + edges)

In [4]:
def parse_polypeptide(polypep):
    # polypeptide can contain gene-name, protein-name, organism, external-identifiers, etc.
    gene = None
    prot_name = None
    organism = None
    uniprot = None

    for c in list(polypep):
        t = strip_ns(c.tag)
        if t == "gene-name" and c.text:
            gene = c.text.strip()
        elif t == "protein-name" and c.text:
            prot_name = c.text.strip()
        elif t == "organism" and c.text:
            organism = c.text.strip()
        elif t == "external-identifiers":
            for ei in list(c):
                if strip_ns(ei.tag) == "external-identifier":
                    res = child_text(ei, "resource")
                    ident = child_text(ei, "identifier")
                    if res and ident and res.lower() == "uniprotkb":
                        uniprot = ident

    return gene, prot_name, organism, uniprot

def parse_partner_section(drug, section_tag):
    sec = find_child(drug, section_tag)
    rows = []
    if sec is None:
        return pd.DataFrame(rows)

    for item in list(sec):
        item_tag = strip_ns(item.tag)  # target/enzyme/transporter/carrier
        if item_tag not in ("target", "enzyme", "transporter", "carrier"):
            continue

        partner_id = child_text(item, "id")
        partner_name = child_text(item, "name")

        # actions
        actions = []
        actions_elem = find_child(item, "actions")
        if actions_elem is not None:
            for a in list(actions_elem):
                if strip_ns(a.tag) == "action" and a.text:
                    actions.append(a.text.strip())

        # polypeptide
        gene = prot_name = organism = uniprot = None
        polypep = find_child(item, "polypeptide")
        if polypep is not None:
            gene, prot_name, organism, uniprot = parse_polypeptide(polypep)

        rows.append({
            "drugbank_id": df_drug.loc[0, "drugbank_id"],
            "section": section_tag,
            "partner_type": item_tag,
            "partner_id": partner_id,
            "partner_name": partner_name,
            "gene": gene,
            "protein_name": prot_name,
            "organism": organism,
            "uniprot": uniprot,
            "actions": actions,
        })

    return pd.DataFrame(rows)

df_targets = parse_partner_section(drug_elem, "targets")
df_enzymes = parse_partner_section(drug_elem, "enzymes")
df_transporters = parse_partner_section(drug_elem, "transporters")
df_carriers = parse_partner_section(drug_elem, "carriers")

df_targets

Unnamed: 0,drugbank_id,section,partner_type,partner_id,partner_name,gene,protein_name,organism,uniprot,actions
0,DB15822,targets,target,BE0002411,Proto-oncogene tyrosine-protein kinase receptor Ret,RET,,Humans,P07949,[inhibitor]
1,DB15822,targets,target,BE0001124,Epithelial discoidin domain-containing receptor 1,DDR1,,Humans,Q08345,[inhibitor]
2,DB15822,targets,target,BE0009501,NT-3 growth factor receptor,NTRK3,,Humans,Q16288,[inhibitor]
3,DB15822,targets,target,BE0000147,Receptor-type tyrosine-protein kinase FLT3,FLT3,,Humans,P36888,[inhibitor]
4,DB15822,targets,target,BE0004145,Tyrosine-protein kinase JAK1,JAK1,,Humans,P23458,[inhibitor]
5,DB15822,targets,target,BE0002408,Tyrosine-protein kinase JAK2,JAK2,,Humans,O60674,[inhibitor]
6,DB15822,targets,target,BE0001039,High affinity nerve growth factor receptor,NTRK1,,Humans,P04629,[inhibitor]
7,DB15822,targets,target,BE0000369,Vascular endothelial growth factor receptor 2,KDR,,Humans,P35968,[inhibitor]
8,DB15822,targets,target,BE0000205,Platelet-derived growth factor receptor beta,PDGFRB,,Humans,P09619,[inhibitor]
9,DB15822,targets,target,BE0002131,Fibroblast growth factor receptor 1,FGFR1,,Humans,P11362,[inhibitor]


### Drug–drug interactions (DDIs)

In [5]:
def parse_ddi(drug):
    sec = find_child(drug, "drug-interactions")
    rows = []
    if sec is None:
        return pd.DataFrame(rows)

    for di in list(sec):
        if strip_ns(di.tag) != "drug-interaction":
            continue
        rows.append({
            "drugbank_id": df_drug.loc[0, "drugbank_id"],
            "interact_drugbank_id": child_text(di, "drugbank-id"),
            "interact_name": child_text(di, "name"),
            "description": child_text(di, "description"),
        })
    return pd.DataFrame(rows)

df_ddi = parse_ddi(drug_elem)
df_ddi.head()

Unnamed: 0,drugbank_id,interact_drugbank_id,interact_name,description
0,DB15822,DB06616,Bosutinib,The serum concentration of Bosutinib can be increased when it is combined with Pralsetinib.
1,DB15822,DB08870,Brentuximab vedotin,The serum concentration of Brentuximab vedotin can be increased when it is combined with Pralsetinib.
2,DB15822,DB00175,Pravastatin,Pralsetinib may decrease the excretion rate of Pravastatin which could result in a higher serum level.
3,DB15822,DB00176,Fluvoxamine,The serum concentration of Pralsetinib can be increased when it is combined with Fluvoxamine.
4,DB15822,DB00222,Glimepiride,Pralsetinib may decrease the excretion rate of Glimepiride which could result in a higher serum level.


## KNOWLEDGE GRAPH

### NODES

In [6]:
df_partners = pd.concat([df_targets, df_enzymes, df_transporters, df_carriers], ignore_index=True)
df_partners

Unnamed: 0,drugbank_id,section,partner_type,partner_id,partner_name,gene,protein_name,organism,uniprot,actions
0,DB15822,targets,target,BE0002411,Proto-oncogene tyrosine-protein kinase receptor Ret,RET,,Humans,P07949,[inhibitor]
1,DB15822,targets,target,BE0001124,Epithelial discoidin domain-containing receptor 1,DDR1,,Humans,Q08345,[inhibitor]
2,DB15822,targets,target,BE0009501,NT-3 growth factor receptor,NTRK3,,Humans,Q16288,[inhibitor]
3,DB15822,targets,target,BE0000147,Receptor-type tyrosine-protein kinase FLT3,FLT3,,Humans,P36888,[inhibitor]
4,DB15822,targets,target,BE0004145,Tyrosine-protein kinase JAK1,JAK1,,Humans,P23458,[inhibitor]
5,DB15822,targets,target,BE0002408,Tyrosine-protein kinase JAK2,JAK2,,Humans,O60674,[inhibitor]
6,DB15822,targets,target,BE0001039,High affinity nerve growth factor receptor,NTRK1,,Humans,P04629,[inhibitor]
7,DB15822,targets,target,BE0000369,Vascular endothelial growth factor receptor 2,KDR,,Humans,P35968,[inhibitor]
8,DB15822,targets,target,BE0000205,Platelet-derived growth factor receptor beta,PDGFRB,,Humans,P09619,[inhibitor]
9,DB15822,targets,target,BE0002131,Fibroblast growth factor receptor 1,FGFR1,,Humans,P11362,[inhibitor]


In [7]:
def make_nodes(df_drug, df_partners, df_ddi):
    nodes = []

    # drug node
    drug_id = df_drug.loc[0, "drugbank_id"]
    nodes.append({"node_id": drug_id, "node_type": "Drug", "label": df_drug.loc[0, "name"]})

    # partner nodes
    for _, r in df_partners.iterrows():
        # prefer uniprot if present; else gene; else partner_id/name fallback
        pid = r["uniprot"] or r["gene"] or r["partner_id"] or r["partner_name"]
        if pd.isna(pid) or pid is None:
            continue
        nodes.append({
            "node_id": str(pid),
            "node_type": r["partner_type"].capitalize(),  # Target/Enzyme/Transporter/Carrier
            "label": r["gene"] or r["partner_name"] or str(pid),
        })

    # interacting drug nodes
    for _, r in df_ddi.iterrows():
        if r["interact_drugbank_id"]:
            nodes.append({
                "node_id": r["interact_drugbank_id"],
                "node_type": "Drug",
                "label": r["interact_name"] or r["interact_drugbank_id"],
            })

    df_nodes = pd.DataFrame(nodes).drop_duplicates(subset=["node_id"]).reset_index(drop=True)
    return df_nodes

df_nodes = make_nodes(df_drug, df_partners, df_ddi)
df_nodes.head()

Unnamed: 0,node_id,node_type,label
0,DB15822,Drug,Pralsetinib
1,P07949,Target,RET
2,Q08345,Target,DDR1
3,Q16288,Target,NTRK3
4,P36888,Target,FLT3


### EDGES

In [8]:
def make_edges(df_drug, df_partners, df_ddi):
    edges = []
    drug_id = df_drug.loc[0, "drugbank_id"]

    # drug -> partners
    for _, r in df_partners.iterrows():
        pid = r["uniprot"] or r["gene"] or r["partner_id"] or r["partner_name"]
        if pd.isna(pid) or pid is None:
            continue
        rel = {
            "targets": "TARGETS",
            "enzymes": "AFFECTS_ENZYME",
            "transporters": "AFFECTS_TRANSPORTER",
            "carriers": "AFFECTS_CARRIER",
        }.get(r["section"], "RELATED_TO")

        # if actions exist, attach as edge attribute
        edges.append({
            "source": drug_id,
            "target": str(pid),
            "relation": rel,
            "actions": ";".join(r["actions"]) if isinstance(r["actions"], list) else None,
        })

    # drug -> interacting drugs
    for _, r in df_ddi.iterrows():
        if r["interact_drugbank_id"]:
            edges.append({
                "source": drug_id,
                "target": r["interact_drugbank_id"],
                "relation": "DRUG_INTERACTION",
                "actions": None,
                "description": r["description"],
            })

    return pd.DataFrame(edges)

df_edges = make_edges(df_drug, df_partners, df_ddi)
df_edges

Unnamed: 0,source,target,relation,actions,description
0,DB15822,P07949,TARGETS,inhibitor,
1,DB15822,Q08345,TARGETS,inhibitor,
2,DB15822,Q16288,TARGETS,inhibitor,
3,DB15822,P36888,TARGETS,inhibitor,
4,DB15822,P23458,TARGETS,inhibitor,
...,...,...,...,...,...
907,DB15822,DB16852,DRUG_INTERACTION,,The serum concentration of Pralsetinib can be increased when it is combined with Remibrutinib.
908,DB15822,DB16277,DRUG_INTERACTION,,The metabolism of Paltusotine can be decreased when combined with Pralsetinib.
909,DB15822,DB21667,DRUG_INTERACTION,,The serum concentration of Pralsetinib can be increased when it is combined with Sevabertinib.
910,DB15822,DB12817,DRUG_INTERACTION,,The metabolism of Zoliflodacin can be decreased when combined with Pralsetinib.


Exploring toxicity

In [9]:
pd.set_option('display.max_colwidth', None)

tox_fields = [
    "toxicity",
    "black-box-warning",
    "contraindications",
    "adverse-reactions",
    "pharmacodynamics",
    "mechanism-of-action"
]

tox_data = {}

for field in tox_fields:
    tox_data[field] = child_text(drug_elem, field)

tox_data

df_toxicity = pd.DataFrame([
    {"section": k, "text": v}
    for k, v in tox_data.items()
    if v is not None
])

df_toxicity

Unnamed: 0,section,text
0,toxicity,"Pralsetinib administered to rats at 20 mg/kg (roughly 2.5-3.6 times the recommended human exposure) resulted in resorption of litters in pregnant female mice in 92% of pregnancies (82% complete resorption); resorption occurred at doses as low as 5 mg/kg (0.3 times the recommended human exposure). Both male and female rats given 10 mg/kg pralsetinib or more had observable degeneration within the testis/ovaries. In 28-day rat and monkey studies, once-daily pralsetinib resulted in histological necrosis at doses 1.1 or more times the recommended human dose and myocardial hemorrhage at doses 2.6 or more times the recommended human dose. Also, pralsetinib induced hyperphosphatemia (rats only, dose 2.4-3.5 times the recommended human dose) and multi-organ mineralization (dose 0.11 or more times the recommended human dose).[L15986]"
1,pharmacodynamics,"Pralsetinib exerts an anti-tumour effect through specific inhibition of the rearranged during transfection (RET) tyrosine kinase, including multiple distinct oncogenic RET fusions, mutated RET kinase domains harbouring gatekeeper mutations, and in RET kinases with a variety of activating single point mutations.[A202049, A202046, A202055, A219746, L15986] Due to pralsetinib's high selectivity for RET over other kinases, both _in vitro_ and _in vivo_,[A219751] pralsetinib has been described as having a better safety profile compared to previously used multi-kinase inhibitors.[A202049, A202046, A202055, A219746] Despite this, pralsetinib use may increase the risk of hypertension, hemorrhagic events, impaired wound healing, hepatotoxicity, interstitial lung disease/pneumonitis, and embryo-fetal toxicity.[L15986]"
2,mechanism-of-action,"Rearranged during transfection (RET) is a transmembrane receptor tyrosine kinase containing extracellular, transmembrane, and intracellular domains whose activity is required for normal kidney and nervous system development.[A202061, A202055] Constitutive RET activation is achieved through chromosomal rearrangements producing 5' fusions of dimerizable domains to the 3' _RET_ tyrosine kinase domain leading to constitutive dimerization and subsequent autophosphorylation; the most common fusions are _KIF5B-RET_ and _CCDC6-RET_, although more than 35 genes have been reported to fuse with _RET_.[A202055, A202049, A202073] Constitutive activation leads to increased downstream signalling and is associated with tumour invasion, migration, and proliferation.[A202046]\r\n\r\nPralsetinib (formerly referred to as BLU-667) was developed through screening more than 10,000 agnostically designed kinase inhibitors followed by extensive chemical modification to improve its properties. Pralsetinib displays _in vitro_ IC<sub>50</sub> values for both WT RET as well as several mutant forms, including CCDC6-RET, in the range of 0.3-0.4 nmol/L, and is 100-fold more selective for RET kinase over 96% of 371 kinases tested.[A219751] It is this specific inhibition of RET kinase that is associated with anti-tumour activity and clinical benefit in patients.[A219751, A219756, L15986]\r\n\r\nDespite increased selectivity for RET over other kinases, pralsetinib has been reported to inhibit DDR1, TRKC, FLT3, JAK1-2, TRKA, VEGFR2, PDGFRb, and FGFR1-2 at clinically relevant concentrations. The significance of these findings remains uncertain.[L15986]"


In [9]:
from pathlib import Path
import copy
import xml.etree.ElementTree as ET

XML_PATH = Path("../data/raw/drugbank_full_database.xml")
TARGET_ID = "DB15822"

def strip_ns(tag: str) -> str:
    return tag.split("}", 1)[-1] if "}" in tag else tag

def find_drug_by_id(xml_path: Path, target_id: str):
    context = ET.iterparse(str(xml_path), events=("start", "end"))
    _, root = next(context)  # root element (drugbank)
    root_attrib = dict(root.attrib)

    # detect namespace (DrugBank uses a default ns)
    ns_uri = None
    if "}" in root.tag:
        ns_uri = root.tag.split("}", 1)[0].strip("{")

    for event, elem in context:
        if event == "end" and strip_ns(elem.tag) == "drug":
            ids = []
            for child in elem:
                if strip_ns(child.tag) == "drugbank-id" and child.text:
                    ids.append(child.text.strip())

            if target_id in ids:
                # IMPORTANT: copy before we clear to free memory
                drug_copy = copy.deepcopy(elem)
                return drug_copy, ns_uri, root_attrib

            root.clear()

    return None, ns_uri, root_attrib

drug_elem, ns_uri, root_attrib = find_drug_by_id(XML_PATH, TARGET_ID)
if drug_elem is None:
    raise ValueError(f"Could not find {TARGET_ID} in {XML_PATH}")

# Write a minimal XML containing ONLY that drug
out_dir = Path("../data/interim/drugbank/drugbank_pralstinib_only")
out_dir.mkdir(parents=True, exist_ok=True)

out_xml = out_dir / f"{TARGET_ID}_pralsetinib.xml"

if ns_uri:
    # wrap in a drugbank root so the file is still a valid-ish DrugBank-shaped XML
    root_out = ET.Element(f"{{{ns_uri}}}drugbank", root_attrib)
else:
    root_out = ET.Element("drugbank", root_attrib)

root_out.append(drug_elem)
ET.ElementTree(root_out).write(out_xml, encoding="utf-8", xml_declaration=True)

print("Wrote:", out_xml)

Wrote: ../data/interim/drugbank/drugbank_pralstinib_only/DB15822_pralsetinib.xml


In [None]:
import pandas as pd

faers_path = "../data/raw/faers_pralsetinib_reports.xlsx"
faers = pd.read_excel(faers_path)

faers.shape, faers.columns



((1011, 24),
 Index(['Case ID', 'Suspect Product Names',
        'Suspect Product Active Ingredients', 'Reason for Use', 'Reactions',
        'Serious', 'Outcomes', 'Sex', 'Event Date', 'Latest FDA Received Date',
        'Case Priority', 'Patient Age', 'Patient Weight', 'Sender',
        'Reporter Type', 'Report Source', 'Concomitant Product Names',
        'Latest Manufacturer Received Date', 'Initial FDA Received Date',
        'Country where Event occurred', 'Reported to Manufacturer?',
        'Manufacturer Control Number', 'Literature Reference',
        'Compounded Flag'],
       dtype='object'))

In [11]:
df_partners.to_csv("../data/interim/drugbank_pralsetinib_partners.csv", index=False)

In [13]:
partners = pd.read_csv("../data/processed/drugbank_pralsetinib_partners.csv")

# keep only human proteins with a UniProt ID
partners = partners[(partners["organism"].str.contains("Human", na=False)) & (partners["uniprot"].notna())].copy()

partners[["section","gene","uniprot","partner_name","actions"]].head(), partners["section"].value_counts()


(   section   gene uniprot  \
 0  targets    RET  P07949   
 1  targets   DDR1  Q08345   
 2  targets  NTRK3  Q16288   
 3  targets   FLT3  P36888   
 4  targets   JAK1  P23458   
 
                                           partner_name        actions  
 0  Proto-oncogene tyrosine-protein kinase receptor Ret  ['inhibitor']  
 1    Epithelial discoidin domain-containing receptor 1  ['inhibitor']  
 2                          NT-3 growth factor receptor  ['inhibitor']  
 3           Receptor-type tyrosine-protein kinase FLT3  ['inhibitor']  
 4                         Tyrosine-protein kinase JAK1  ['inhibitor']  ,
 section
 targets         11
 transporters     8
 enzymes          6
 Name: count, dtype: int64)

In [15]:
def pick_reaction_col(df):
    candidates = ["Reactions", "reactions", "reaction", "pt", "PT", "Preferred Term"]
    for c in candidates:
        if c in df.columns:
            return c
    return None

rx_col = pick_reaction_col(faers)
rx_col

import re

faers_rx = faers.copy()

if rx_col is None:
    raise ValueError("Couldn't find a reactions column. Print faers.columns and tell me what you see.")

# If reactions are comma/semicolon separated in one cell:
faers_rx[rx_col] = faers_rx[rx_col].astype(str)

# split on ; or , (common)
faers_rx["reaction_term"] = faers_rx[rx_col].apply(lambda s: [t.strip() for t in re.split(r"[;,]", s) if t.strip()])
faers_rx = faers_rx.explode("reaction_term").dropna(subset=["reaction_term"])

top_rx = faers_rx["reaction_term"].value_counts().head(25)
top_rx

reaction_term
Product Prescribing Issue           131
Death                                81
Fatigue                              67
Off Label Use                        65
Diarrhoea                            52
Anaemia                              49
Asthenia                             47
Constipation                         43
Blood Pressure Increased             43
Hypertension                         42
Disease Progression                  39
White Blood Cell Count Decreased     35
Pneumonia                            33
No Adverse Event                     33
Pyrexia                              31
Hospitalisation                      30
Cough                                28
Dizziness                            26
Nausea                               26
Dyspnoea                             25
Pneumonitis                          24
Dry Mouth                            24
Platelet Count Decreased             23
Arthralgia                           21
Pain                      

In [18]:
TOX_BUCKETS = {
    "cardiovascular": ["hypertension", "blood pressure", "tachycard", "bradycard", "arrhythm", "qt", "cardiac"],
    "hepatic": ["liver", "hepatic", "hepat", "bilirubin", "alt", "ast", "jaundice", "cholest"],
    "bleeding": ["hemorrhage", "bleed", "thrombocyt", "coagul", "hematoma"],
    "infection_immune": ["infection", "sepsis", "pneumonia", "neutrop", "immun", "fever"],
    "wound_healing": ["wound", "healing", "dehisc", "surgery", "incision"],
    "pulmonary": ["pneumonitis", "interstitial", "dyspnea", "pulmonary", "hypoxia"],
    "gi": ["diarrhea", "nausea", "vomit", "abdominal", "constipation"],
    "renal": ["renal", "kidney", "creatinine", "neph"],
}

def bucket_reaction(term: str):
    t = str(term).lower()
    for bucket, kws in TOX_BUCKETS.items():
        if any(kw in t for kw in kws):
            return bucket
    return "other"

faers_rx["tox_bucket"] = faers_rx["reaction_term"].apply(bucket_reaction)

bucket_counts = faers_rx["tox_bucket"].value_counts()
bucket_counts

faers_rx.groupby("tox_bucket")["reaction_term"].value_counts().groupby(level=0).head(8)


tox_bucket      reaction_term           
bleeding        Thrombocytopenia             6
                Coagulopathy                 1
                Gingival Bleeding            1
cardiovascular  Blood Pressure Increased    43
                Hypertension                42
                                            ..
renal           Nephrolithiasis              1
                Nephrostomy                  1
wound_healing   Surgery                      6
                Impaired Healing             2
                Spinal Fusion Surgery        2
Name: count, Length: 62, dtype: int64

In [24]:
ot_path = "../data/raw/open_targets_target_disease_long.csv"  # adjust if needed
ot = pd.read_csv(ot_path)

ot.shape, ot.columns

ot

Unnamed: 0,target_symbol,disease_id,disease_name,score
0,RET,MONDO_0008234,multiple endocrine neoplasia type 2A,0.859057
1,RET,MONDO_0015277,medullary thyroid gland carcinoma,0.856603
2,RET,MONDO_0008082,multiple endocrine neoplasia type 2B,0.819577
3,RET,MONDO_0018309,Hirschsprung disease,0.791767
4,RET,MONDO_0008233,pheochromocytoma,0.778019
...,...,...,...,...
70,JAK2,MONDO_0013730,graft versus host disease,0.488734
71,JAK2,HP_0001873,Thrombocytopenia,0.483634
72,JAK2,EFO_0003767,inflammatory bowel disease,0.482697
73,JAK2,EFO_0000676,psoriasis,0.477655


In [25]:
def pick_col(cols, options):
    for c in options:
        if c in cols:
            return c
    return None

gene_col = pick_col(ot.columns, ["target_symbol","geneSymbol","symbol","gene"])
disease_col = pick_col(ot.columns, ["disease_id","disease","disease_name","name"])
score_col = pick_col(ot.columns, ["score","associationScore","overall_score"])

gene_col, disease_col, score_col


('target_symbol', 'disease_id', 'score')

In [26]:
genes = set(partners["gene"].dropna().astype(str).unique())
ot_sub = ot[ot[gene_col].astype(str).isin(genes)].copy()

ot_sub.shape, sorted(list(genes))[:10]


((75, 4),
 ['ABCB1',
  'ABCB11',
  'ABCG2',
  'CYP1A2',
  'CYP2C8',
  'CYP2C9',
  'CYP2D6',
  'CYP3A4',
  'CYP3A5',
  'DDR1'])

In [27]:
DISEASE_BUCKETS = {
    "cardiovascular": ["hypertension", "cardiac", "heart", "arrhythm", "blood pressure", "vascular"],
    "hepatic": ["liver", "hepatic", "hepatitis", "cholest", "cirrhos", "jaundice"],
    "bleeding": ["hemorrhage", "bleeding", "thrombocyt", "coagul"],
    "infection_immune": ["infection", "sepsis", "pneumonia", "immune", "neutrop", "inflamm"],
    "wound_healing": ["wound", "healing", "dehisc", "ulcer"],
    "pulmonary": ["pneumonitis", "pulmonary", "interstitial", "lung"],
    "gi": ["diarrhea", "nausea", "vomit", "colitis", "abdominal"],
    "renal": ["renal", "kidney", "neph"],
}

def bucket_disease(name: str):
    t = str(name).lower()
    for bucket, kws in DISEASE_BUCKETS.items():
        if any(kw in t for kw in kws):
            return bucket
    return "other"

ot_sub["tox_bucket"] = ot_sub[disease_col].apply(bucket_disease)

# If you have a score column, keep strongest associations
if score_col is not None:
    ot_sub = ot_sub.sort_values(score_col, ascending=False)

protein_bucket = (
    ot_sub.groupby([gene_col, "tox_bucket"])
    .size()
    .reset_index(name="n_diseases")
    .sort_values(["n_diseases"], ascending=False)
)

protein_bucket.head(20)


Unnamed: 0,target_symbol,tox_bucket,n_diseases
0,FLT3,other,25
1,JAK2,other,25
2,RET,other,25


In [29]:
faers_bucket_set = set(bucket_counts.index)

pred_bucket_counts = protein_bucket["tox_bucket"].value_counts()
overlap = sorted(set(pred_bucket_counts.index) & set(bucket_counts.index))

overlap, pred_bucket_counts, bucket_counts

summary = pd.DataFrame({
    "faers_reports_in_bucket": bucket_counts,
}).join(pd.DataFrame({
    "proteins_linked_to_bucket": protein_bucket.groupby("tox_bucket")[gene_col].nunique()
}), how="outer").fillna(0).astype(int).sort_values("faers_reports_in_bucket", ascending=False)

summary

Unnamed: 0_level_0,faers_reports_in_bucket,proteins_linked_to_bucket
tox_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
other,2155,3
hepatic,190,0
infection_immune,141,0
gi,112,0
cardiovascular,109,0
pulmonary,76,0
renal,38,0
wound_healing,10,0
bleeding,8,0


In [30]:
summary = pd.DataFrame({
    "faers_reports_in_bucket": bucket_counts,
}).join(pd.DataFrame({
    "proteins_linked_to_bucket": protein_bucket.groupby("tox_bucket")[gene_col].nunique()
}), how="outer").fillna(0).astype(int).sort_values("faers_reports_in_bucket", ascending=False)

summary

Unnamed: 0_level_0,faers_reports_in_bucket,proteins_linked_to_bucket
tox_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
other,2155,3
hepatic,190,0
infection_immune,141,0
gi,112,0
cardiovascular,109,0
pulmonary,76,0
renal,38,0
wound_healing,10,0
bleeding,8,0


In [31]:
prot_edges = protein_bucket.copy()
prot_edges = prot_edges[prot_edges["tox_bucket"] != "other"].copy()

prot_edges = prot_edges.assign(
    source_id=prot_edges[gene_col].astype(str),
    source_type="gene",
    edge_type="associated_with_toxicity_domain",
    target_id=prot_edges["tox_bucket"],
    target_type="toxicity_domain",
    evidence="Open Targets disease associations (keyword bucketed)",
)[["source_id","source_type","edge_type","target_id","target_type","n_diseases","evidence"]]

prot_edges.to_csv("../data/processed/kg_edges_gene_toxicitydomain_opentargets.csv", index=False)