In [1]:
import pandas as pd
import json
import requests
import time
import tqdm

In [2]:
# Path of the male FAERS drug / adverse-event table read by the next cell.
input_file = "data/drugsto_faers_male.csv"

In [3]:
# The file is parsed as tab-delimited even though it is named .csv; the
# column at position 2 becomes the row index.
df = pd.read_csv(
    input_file,
    index_col=2,
    sep="\t",
)
df.head()

Unnamed: 0_level_0,name,smiles,struct_id,meddra_name,meddra_code,level,llr_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2808625,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Atrial septal defect,10003664,PT,76.772102
3475252,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Ventricular septal defect,10047298,PT,53.619698
3289318,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Patent ductus arteriosus,10034130,PT,43.883318
2808640,folic acid,NC1=NC(=O)C2=NC(CNC3=CC=C(C=C3)C(=O)N[C@@H](CC...,1231,Atrial septal defect,10003664,PT,23.922752
3340913,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Pulmonary valve stenosis,10037450,PT,21.589727


In [4]:
def get_hpo(term, timeout=30):
    """Look up an ontology id for a free-text term via the EBI OLS API.

    Queries the OLS ``select`` endpoint restricted to the ``hp`` ontology and
    returns the ``obo_id`` of the first hit.

    Args:
        term: Label to search for (the notebook passes MedDRA names).
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The ``obo_id`` of the first result, or ``None`` when the request is
        not OK, nothing matches, or the first hit carries no ``obo_id``.
    """
    # Hand the query to requests via ``params`` so it is URL-encoded;
    # interpolating the raw term into the URL breaks on characters such as
    # "&", "#" or "+".
    res = requests.get(
        "https://www.ebi.ac.uk/ols/api/select",
        params={"q": term, "ontology": "hp"},
        timeout=timeout,
    )
    if not res.ok:
        return None

    response = res.json().get("response", {})
    docs = response.get("docs") or []
    if response.get("numFound", 0) > 0 and docs:
        return docs[0].get("obo_id")
    return None

In [5]:
# Map every distinct MedDRA name to the id returned by get_hpo(), falling
# back to a "meddra_<code>" identifier when the lookup returns nothing.
disease_names = {}
for row_id in tqdm.tqdm(df.index):
    disease = df.at[row_id, 'meddra_name']
    if disease in disease_names:
        continue  # already resolved, no need for another request
    hpo_id = get_hpo(disease)
    time.sleep(0.2)  # pause after each API request
    disease_names[disease] = hpo_id or "meddra_%s" % df.at[row_id, 'meddra_code']

100%|██████████| 179/179 [00:31<00:00,  5.71it/s]


In [6]:
def get_cid(term, timeout=30):
    """Resolve a compound name to a PubChem compound id (CID).

    Calls the PubChem PUG REST ``compound/name/<name>/JSON`` endpoint and
    reads the CID of the first returned compound record.

    Args:
        term: Compound name to look up.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The CID of the first compound, or ``None`` when the request is not
        OK, no compound is returned, or the record carries no CID.
    """
    from urllib.parse import quote

    # The name is a URL *path segment*, so percent-encode it completely
    # (including "/"); otherwise names containing "/", "?" or "#" change the
    # meaning of the URL instead of being looked up.
    url = (
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/JSON?list_return=flat"
        % quote(str(term), safe="")
    )
    res = requests.get(url, timeout=timeout)
    if not res.ok:
        return None

    compounds = res.json().get("PC_Compounds") or []
    if compounds:
        return compounds[0].get("id", {}).get("id", {}).get("cid")
    return None

In [7]:
# Map every distinct drug name to the CID returned by get_cid(), falling back
# to a "DC_<struct_id>" identifier when the lookup returns nothing.
drug_names = {}
for row_id in tqdm.tqdm(df.index):
    drug = df.at[row_id, 'name']
    if drug in drug_names:
        continue  # already resolved, no need for another request
    pubchem_cid = get_cid(drug)
    time.sleep(0.2)  # pause after each API request
    drug_names[drug] = pubchem_cid or "DC_%s" % df.at[row_id, 'struct_id']

100%|██████████| 179/179 [00:46<00:00,  3.88it/s]


In [8]:
# Show the disease-name -> identifier mapping filled in by the lookup loop.
disease_names

{'Atrial septal defect': 'HP:0001631',
 'Ventricular septal defect': 'HP:0001629',
 'Patent ductus arteriosus': 'HP:0001643',
 'Pulmonary valve stenosis': 'meddra_10037450',
 'Coarctation of the aorta': 'HP:0011647',
 'Bicuspid aortic valve': 'HP:0001647',
 'Hypoplastic left heart syndrome': 'meddra_10021076',
 'Cerebellar atrophy': 'HP:0001272',
 'Aortic dissection': 'HP:0002647',
 'Trisomy 21': 'meddra_10044688',
 'Low set ears': 'HP:0000369',
 'Double outlet right ventricle': 'HP:0001719',
 'Cleft palate': 'HP:0000175',
 'Microtia': 'HP:0008551',
 'Spinal cord compression': 'HP:0002176',
 'Cleft lip': 'HP:0410030',
 'Aortic aneurysm': 'HP:0004942',
 'Anencephaly': 'HP:0002323',
 'Diaphragmatic hernia': 'HP:0025194',
 'Trisomy 18': 'meddra_10053884',
 'Spina bifida': 'HP:0002414',
 'Hydrocephalus': 'HP:0000238',
 'Cerebral atrophy': 'HP:0002059',
 'Right aortic arch': 'HP:0012020',
 'Aortic valve prolapse': 'HP:0025578',
 'Facial asymmetry': 'HP:0000324',
 'Aortic rupture': 'HP:00316

In [8]:
# Build the male graph: one node per drug id and per disease id (first
# occurrence wins) and one edge per row of the table.
nodes = {}
edges = []
sex = "male"
for i in df.index:
    drug_name = df.at[i, "name"]
    drug_id = str(drug_names[drug_name])
    drug_smiles = df.at[i, "smiles"]
    drugcentral_id = str(df.at[i, "struct_id"])

    if drug_id not in nodes:
        drug_properties = {
            "id": drug_id,
            "label": drug_name,
            "drugcentral_id": drugcentral_id,
            "smiles": drug_smiles,
        }
        # "DC_<struct_id>" ids are the fallback used when no PubChem CID was
        # found, so a PubChem compound link is only meaningful for other ids.
        if not drug_id.startswith("DC_"):
            drug_properties["uri"] = "https://pubchem.ncbi.nlm.nih.gov/compound/%s" % drug_id
        nodes[drug_id] = {
            "type": "Drug",
            "properties": drug_properties,
        }

    disease_name = df.at[i, "meddra_name"]
    disease_id = str(disease_names[disease_name])
    meddra_code = str(df.at[i, "meddra_code"])

    if disease_id not in nodes:
        disease_properties = {
            "id": disease_id,
            "label": disease_name,
            "meddra_code": meddra_code,
        }
        # The link used to be formatted with an undefined name (`uri`), which
        # raised NameError on the first row; it is built from the disease id.
        # Only "HP:" ids get the link: "meddra_<code>" fallbacks have no HPO page.
        if disease_id.startswith("HP:"):
            disease_properties["uri"] = "https://hpo.jax.org/app/browse/term/%s" % disease_id
        nodes[disease_id] = {
            "type": "BirthDefect",
            "properties": disease_properties,
        }

    edges.append({
        "source": drug_id,
        "relation": "FDA Adverse Event Reporting System (Male)",
        "target": disease_id,
        "properties": {
            "id": i,
            "source_label": drug_name,
            "target_label": disease_name,
            "level": df.at[i, "level"],
            "llr_ratio": df.at[i, "llr_ratio"],
            "sex": sex,
            "directed": True,
        }
    })
    

In [9]:
def isNumber(value):
    """Try to interpret ``value`` as a finite number.

    Args:
        value: Anything ``float()`` may accept (the notebook passes strings).

    Returns:
        ``{"@type": "number", "@value": <float>}`` when ``value`` converts to
        a finite float, otherwise ``False``.
    """
    import math

    try:
        v = float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False
    # "nan", "inf" and "infinity" parse as floats but cannot be written as
    # valid JSON numbers, so they are not reported as numbers.
    if not math.isfinite(v):
        return False
    return {"@type": "number", "@value": v}

def typer(value):
    """Recursively wrap a value in the ``{"@type": ..., "@value": ...}`` form.

    Args:
        value: A bool, number, string, list or dict (possibly nested).

    Returns:
        A dict tagging the value as "boolean", "number", "string", "array" or
        "object". Anything whose string form parses as a number is tagged
        "number", including numeric-looking strings. Inside a dict, the keys
        "id", "target" and "source" are always tagged "string" and their
        values are passed through unchanged. Returns ``None`` for values of
        any other kind (for example ``None`` itself).
    """
    # Booleans must be handled first: str(True) is not numeric and bool is
    # none of str/list/dict, so they used to fall through and come back as
    # None (losing e.g. the edge property "directed").
    # NOTE(review): "boolean" is a new @type tag - confirm that the consumer
    # of the version-2 format accepts it.
    if isinstance(value, bool):
        return {
            "@type": "boolean",
            "@value": value
        }

    numb = isNumber(str(value))
    if numb:
        return numb
    if isinstance(value, str):
        return {
            "@type": "string",
            "@value": value
        }
    if isinstance(value, list):
        return {
            "@type": "array",
            "@value": [typer(item) for item in value]
        }
    if isinstance(value, dict):
        type_dict = {}
        for k, v in value.items():
            if k in ("id", "target", "source"):
                # Identifiers stay tagged as strings even when they look
                # numeric; the value itself is not converted.
                type_dict[k] = {
                    "@type": "string",
                    "@value": v
                }
            else:
                type_dict[k] = typer(v)
        return {
            "@type": "object",
            "@value": type_dict
        }
    # Unsupported kinds keep mapping to None, as before.
    return None
    

In [10]:
# Write the male graph twice: version 1 with plain values, version 2 with
# every node and edge passed through typer().
serialization_v1 = {"version": "1", "nodes": nodes, "edges": edges}
with open("results/drugsto_faers_male.v1.json", "w") as v1_file:
    v1_file.write(json.dumps(serialization_v1, indent=2))

nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge) for edge in edges]

serialization_v2 = {"version": "2", "nodes": nodes_v2, "edges": edges_v2}
with open("results/drugsto_faers_male.v2.json", "w") as v2_file:
    v2_file.write(json.dumps(serialization_v2, indent=2))

In [11]:
# Switch to the female table. This rebinds `input_file` and `df`, so every
# cell executed after this one works on the female data.
input_file = "data/drugsto_faers_female.csv"
df = pd.read_csv(
    input_file,
    index_col=2,
    sep="\t",
)
df.head()

Unnamed: 0_level_0,name,smiles,struct_id,meddra_name,meddra_code,level,llr_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1913328,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Atrial septal defect,10003664,PT,73.28254
2687052,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Ventricular septal defect,10047298,PT,39.521558
2468561,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Patent ductus arteriosus,10034130,PT,37.840621
2526066,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Pulmonary valve stenosis,10037450,PT,20.47905
1913434,lamotrigine,NC1=NN=C(C(N)=N1)C1=C(Cl)C(Cl)=CC=C1,1540,Atrial septal defect,10003664,PT,17.524641


In [12]:
# Extend the existing disease_names mapping: only names not already present
# trigger a lookup, with the same "meddra_<code>" fallback.
for row_id in tqdm.tqdm(df.index):
    disease = df.at[row_id, 'meddra_name']
    if disease in disease_names:
        continue  # already resolved, no need for another request
    hpo_id = get_hpo(disease)
    time.sleep(0.2)  # pause after each API request
    disease_names[disease] = hpo_id or "meddra_%s" % df.at[row_id, 'meddra_code']

100%|██████████| 193/193 [00:03<00:00, 49.82it/s] 


In [13]:
# Extend the existing drug_names mapping: only names not already present
# trigger a lookup, with the same "DC_<struct_id>" fallback.
for row_id in tqdm.tqdm(df.index):
    drug = df.at[row_id, 'name']
    if drug in drug_names:
        continue  # already resolved, no need for another request
    pubchem_cid = get_cid(drug)
    time.sleep(0.2)  # pause after each API request
    drug_names[drug] = pubchem_cid or "DC_%s" % df.at[row_id, 'struct_id']

100%|██████████| 193/193 [00:34<00:00,  5.55it/s]


In [14]:
# Build the female graph: one node per drug id and per disease id (first
# occurrence wins) and one edge per row of the table.
# Nodes carry a "uri" property so that the output has the same node schema
# as the male graph cell, which defines that property.
nodes = {}
edges = []
sex = "female"
for i in df.index:
    drug_name = df.at[i, "name"]
    drug_id = str(drug_names[drug_name])
    drug_smiles = df.at[i, "smiles"]
    drugcentral_id = str(df.at[i, "struct_id"])

    if drug_id not in nodes:
        drug_properties = {
            "id": drug_id,
            "label": drug_name,
            "drugcentral_id": drugcentral_id,
            "smiles": drug_smiles,
        }
        # "DC_<struct_id>" ids are the fallback used when no PubChem CID was
        # found, so a PubChem compound link is only meaningful for other ids.
        if not drug_id.startswith("DC_"):
            drug_properties["uri"] = "https://pubchem.ncbi.nlm.nih.gov/compound/%s" % drug_id
        nodes[drug_id] = {
            "type": "Drug",
            "properties": drug_properties,
        }

    disease_name = df.at[i, "meddra_name"]
    disease_id = str(disease_names[disease_name])
    meddra_code = str(df.at[i, "meddra_code"])

    if disease_id not in nodes:
        disease_properties = {
            "id": disease_id,
            "label": disease_name,
            "meddra_code": meddra_code,
        }
        # Only "HP:" ids get the HPO browser link: "meddra_<code>" fallbacks
        # have no HPO page.
        if disease_id.startswith("HP:"):
            disease_properties["uri"] = "https://hpo.jax.org/app/browse/term/%s" % disease_id
        nodes[disease_id] = {
            "type": "BirthDefect",
            "properties": disease_properties,
        }

    edges.append({
        "source": drug_id,
        "relation": "FDA Adverse Event Reporting System (Female)",
        "target": disease_id,
        "properties": {
            "id": i,
            "source_label": drug_name,
            "target_label": disease_name,
            "level": df.at[i, "level"],
            "llr_ratio": df.at[i, "llr_ratio"],
            "sex": sex,
            "directed": True,
        }
    })
    

In [15]:
# Write the female graph twice: version 1 with plain values, version 2 with
# every node and edge passed through typer().
serialization_v1 = {"version": "1", "nodes": nodes, "edges": edges}
with open("results/drugsto_faers_female.v1.json", "w") as v1_file:
    v1_file.write(json.dumps(serialization_v1, indent=2))

nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge) for edge in edges]

serialization_v2 = {"version": "2", "nodes": nodes_v2, "edges": edges_v2}
with open("results/drugsto_faers_female.v2.json", "w") as v2_file:
    v2_file.write(json.dumps(serialization_v2, indent=2))