In [1]:
import pandas as pd
import json
import requests
import time
import tqdm

In [2]:
# Input table for the male cohort. Despite the .csv extension it is read as
# tab-separated (sep="\t") by the loading cell that follows.
input_file = "data/drugsto_faers_male.csv"

In [3]:
# Load the pre-resolved name -> identifier lookup tables. get_cid and get_hpo
# consult these before falling back to a web query.
with open("data/drug_ids.json") as drug_fh:
    drug_ids = json.load(drug_fh)

with open("data/birth_defect_ids.json") as defect_fh:
    birth_defect_ids = json.load(defect_fh)

In [4]:
# The file is tab-delimited; the third column (position 2) becomes the row index.
df = pd.read_csv(
    input_file,
    sep="\t",
    index_col=2,
)
df.head()

Unnamed: 0_level_0,name,smiles,struct_id,meddra_name,meddra_code,level,llr_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2808625,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Atrial septal defect,10003664,PT,76.772102
3475252,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Ventricular septal defect,10047298,PT,53.619698
3289318,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Patent ductus arteriosus,10034130,PT,43.883318
2808640,folic acid,NC1=NC(=O)C2=NC(CNC3=CC=C(C=C3)C(=O)N[C@@H](CC...,1231,Atrial septal defect,10003664,PT,23.922752
3340913,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Pulmonary valve stenosis,10037450,PT,21.589727


In [5]:
def get_hpo(term, timeout=30):
    """Resolve a birth-defect term name to an ontology identifier.

    The local ``birth_defect_ids`` table is consulted first. On a miss the
    EBI OLS ``select`` endpoint is queried, restricted to the ``hp``
    ontology, and the ``obo_id`` of the first hit is returned.

    Args:
        term: Term name to look up (e.g. a ``meddra_name`` value from the
            input table).
        timeout: Seconds to wait for the OLS server before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            notebook indefinitely.

    Returns:
        The identifier, or ``None`` when the HTTP response is not OK or OLS
        reports no match.
    """
    if term in birth_defect_ids:
        return birth_defect_ids[term]
    # Pass the query through ``params`` so that characters such as '&', '#'
    # or '+' inside the term are percent-encoded instead of corrupting the
    # query string.
    res = requests.get(
        "https://www.ebi.ac.uk/ols/api/select",
        params={"q": term, "ontology": "hp"},
        timeout=timeout,
    )
    if res.ok:
        result = res.json()
        if result["response"]["numFound"] > 0:
            return result["response"]["docs"][0]["obo_id"]
    return None

In [6]:
# Map every distinct MedDRA name in the table to an identifier: the value
# returned by get_hpo when there is one, otherwise "meddra_<code>".
disease_names = {}
for row_id in tqdm.tqdm(df.index):
    term = df.at[row_id, 'meddra_name']
    if term in disease_names:
        continue  # already resolved on an earlier row
    hpo_id = get_hpo(term)
    time.sleep(0.2)  # pause between lookups
    if not hpo_id:
        hpo_id = "meddra_%s" % df.at[row_id, 'meddra_code']
    disease_names[term] = hpo_id

100%|██████████| 179/179 [00:27<00:00,  6.46it/s]


In [7]:
def get_cid(term, timeout=30):
    """Resolve a drug name to a PubChem compound identifier.

    The local ``drug_ids`` table is consulted first. On a miss the PubChem
    PUG REST ``compound/name`` endpoint is queried and the CID of the first
    returned compound is used.

    Args:
        term: Drug name to look up.
        timeout: Seconds to wait for PubChem before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The cached identifier, or a ``"CID:<cid>"`` string built from the
        PubChem response, or ``None`` when the HTTP response is not OK or
        contains no compounds.
    """
    # Function-scope import so this cell does not depend on editing the
    # notebook's import cell.
    from urllib.parse import quote

    if term in drug_ids:
        return drug_ids[term]
    # The name is a URL *path* segment here, so characters such as '/', '?'
    # or '#' must be percent-encoded or they change the meaning of the URL.
    res = requests.get(
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/JSON?list_return=flat"
        % quote(str(term), safe=""),
        timeout=timeout,
    )
    if res.ok:
        # Tolerate an OK response that lacks the expected key instead of
        # raising KeyError in the middle of the lookup loop.
        compounds = res.json().get("PC_Compounds") or []
        if compounds:
            return "CID:%s" % compounds[0]["id"]["id"]["cid"]
    return None

In [8]:
# Map every distinct drug name in the table to an identifier: the value
# returned by get_cid when there is one, otherwise "DC_<struct_id>".
drug_names = {}
for row_id in tqdm.tqdm(df.index):
    drug = df.at[row_id, 'name']
    if drug in drug_names:
        continue  # already resolved on an earlier row
    compound_id = get_cid(drug)
    time.sleep(0.2)  # pause between lookups
    if not compound_id:
        compound_id = "DC_%s" % df.at[row_id, 'struct_id']
    drug_names[drug] = compound_id

100%|██████████| 179/179 [00:22<00:00,  7.93it/s]


In [9]:
# Build the male-cohort graph: one node per distinct drug / birth-defect id
# and one directed edge per input row.
# NOTE(review): the "uri" values are built the same way for fallback ids
# ("DC_..." / "meddra_..."), which are not PubChem / HPO identifiers — confirm
# whether those URIs are acceptable downstream.
nodes = {}
edges = []
sex = "male"
for i in df.index:
    # --- drug endpoint ---
    drug_name = df.at[i, "name"]
    drug_id = str(drug_names[drug_name])
    if drug_id not in nodes:
        drug_properties = {
            "id": drug_id,
            "label": drug_name,
            "drugcentral_id": str(df.at[i, "struct_id"]),
            "smiles": df.at[i, "smiles"],
            "uri": "https://pubchem.ncbi.nlm.nih.gov/compound/%s" % drug_id,
        }
        nodes[drug_id] = {"type": "Drug", "properties": drug_properties}

    # --- birth-defect endpoint ---
    disease_name = df.at[i, "meddra_name"]
    disease_id = str(disease_names[disease_name])
    if disease_id not in nodes:
        disease_properties = {
            "id": disease_id,
            "label": disease_name,
            "meddra_code": str(df.at[i, "meddra_code"]),
            "uri": "https://hpo.jax.org/app/browse/term/%s" % disease_id,
        }
        nodes[disease_id] = {"type": "BirthDefect", "properties": disease_properties}

    # --- edge for this row; the row's index value doubles as the edge id ---
    edge_properties = {
        "id": i,
        "source_label": drug_name,
        "target_label": disease_name,
        "level": df.at[i, "level"],
        "llr_ratio": df.at[i, "llr_ratio"],
        "sex": sex,
        "directed": True,
    }
    edges.append({
        "source": drug_id,
        "relation": "FDA Adverse Event Reporting System (Male)",
        "target": disease_id,
        "properties": edge_properties,
    })
    

In [10]:
# Spot-check the first edge record built above.
edges[0]

{'source': 'CID:10947895',
 'relation': 'FDA Adverse Event Reporting System (Male)',
 'target': 'HP:0001631',
 'properties': {'id': 2808625,
  'source_label': 'paroxetine',
  'target_label': 'Atrial septal defect',
  'level': 'PT',
  'llr_ratio': 76.77210179076344,
  'sex': 'male',
  'directed': True}}

In [11]:
def isNumber(value):
    """Try to interpret ``value`` as a float.

    Returns:
        ``{"@type": "number", "@value": <float>}`` when ``float(value)``
        succeeds, otherwise ``False``.
    """
    try:
        v = float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; a bare ``except``
        # would also swallow things like KeyboardInterrupt.
        return False
    return {"@type": "number", "@value": v}


def typer(value):
    """Recursively wrap a plain value in ``{"@type": ..., "@value": ...}`` form.

    Rules, in order:
      * ``bool`` -> ``"boolean"`` (kept as-is).
      * anything whose ``str()`` parses as a float -> ``"number"``; this
        includes numeric-looking strings such as ``"10003664"``.
      * ``str`` -> ``"string"``.
      * ``list`` -> ``"array"`` of recursively typed items.
      * ``dict`` -> ``"object"``; the keys ``id``, ``target`` and ``source``
        are always tagged ``"string"`` with their value left untouched, all
        other values are typed recursively.

    Returns:
        The typed wrapper, or ``None`` for values matching none of the rules
        (e.g. ``None`` itself).
    """
    # Booleans previously fell through every branch and came back as None,
    # so flags like "directed": True were serialised as null.
    # NOTE(review): the "boolean" tag is chosen by analogy with the other
    # tags — confirm the v2 consumer accepts it.
    if isinstance(value, bool):
        return {"@type": "boolean", "@value": value}

    numb = isNumber(str(value))
    if numb:
        return numb

    if isinstance(value, str):
        return {
            "@type": "string",
            "@value": value
        }

    if isinstance(value, list):
        return {
            "@type": "array",
            "@value": [typer(item) for item in value]
        }

    if isinstance(value, dict):
        type_dict = {}
        for k, v in value.items():
            if k in ("id", "target", "source"):
                # Identifier fields stay strings even when they look numeric.
                type_dict[k] = {
                    "@type": "string",
                    "@value": v
                }
            else:
                type_dict[k] = typer(v)
        return {
            "@type": "object",
            "@value": type_dict
        }

    return None
    

In [12]:
# --- v1: nodes and edges written exactly as built ---
serialization_v1 = {"version": "1", "nodes": nodes, "edges": edges}
v1_text = json.dumps(serialization_v1, indent=2)
with open("results/drugsto_faers_male.v1.json", "w") as out_v1:
    out_v1.write(v1_text)

# --- v2: every node and edge passed through typer() ---
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge) for edge in edges]

serialization_v2 = {"version": "2", "nodes": nodes_v2, "edges": edges_v2}
v2_text = json.dumps(serialization_v2, indent=2)
with open("results/drugsto_faers_male.v2.json", "w") as out_v2:
    out_v2.write(v2_text)

In [13]:
# Switch to the female cohort file: tab-delimited, third column (position 2)
# used as the row index.
input_file = "data/drugsto_faers_female.csv"
df = pd.read_csv(
    input_file,
    sep="\t",
    index_col=2,
)
df.head()

Unnamed: 0_level_0,name,smiles,struct_id,meddra_name,meddra_code,level,llr_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1913328,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Atrial septal defect,10003664,PT,73.28254
2687052,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Ventricular septal defect,10047298,PT,39.521558
2468561,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Patent ductus arteriosus,10034130,PT,37.840621
2526066,paroxetine,FC1=CC=C(C=C1)[C@@H]1CCNC[C@H]1COC1=CC2=C(OCO2...,2068,Pulmonary valve stenosis,10037450,PT,20.47905
1913434,lamotrigine,NC1=NN=C(C(N)=N1)C1=C(Cl)C(Cl)=CC=C1,1540,Atrial septal defect,10003664,PT,17.524641


In [14]:
# Extend the existing disease_names mapping with any MedDRA names in this
# table that have not been resolved yet.
for row_id in tqdm.tqdm(df.index):
    term = df.at[row_id, 'meddra_name']
    if term in disease_names:
        continue  # already resolved
    hpo_id = get_hpo(term)
    time.sleep(0.2)  # pause between lookups
    if not hpo_id:
        hpo_id = "meddra_%s" % df.at[row_id, 'meddra_code']
    disease_names[term] = hpo_id

100%|██████████| 193/193 [00:03<00:00, 51.00it/s] 


In [15]:
# Extend the existing drug_names mapping with any drug names in this table
# that have not been resolved yet.
for row_id in tqdm.tqdm(df.index):
    drug = df.at[row_id, 'name']
    if drug in drug_names:
        continue  # already resolved
    compound_id = get_cid(drug)
    time.sleep(0.2)  # pause between lookups
    if not compound_id:
        compound_id = "DC_%s" % df.at[row_id, 'struct_id']
    drug_names[drug] = compound_id

100%|██████████| 193/193 [00:15<00:00, 12.81it/s]


In [16]:
# Build the female-cohort graph: one node per distinct drug / birth-defect id
# and one directed edge per input row.
#
# Node properties include "uri", built exactly as in the male-cohort cell, so
# that both output files share the same node schema (this cell previously
# omitted it).
# NOTE(review): as in the male cell, fallback ids ("DC_..." / "meddra_...")
# get a PubChem / HPO style URI even though they are not such identifiers —
# confirm whether that is acceptable downstream.
nodes = {}
edges = []
sex = "female"
for i in df.index:
    drug_name = df.at[i, "name"]
    drug_id = str(drug_names[drug_name])
    drug_smiles = df.at[i, "smiles"]
    drugcentral_id = str(df.at[i, "struct_id"])

    # Register the drug node the first time its id is seen.
    if drug_id not in nodes:
        nodes[drug_id] = {
            "type": "Drug",
            "properties": {
                "id": drug_id,
                "label": drug_name,
                "drugcentral_id": drugcentral_id,
                "smiles": drug_smiles,
                "uri": "https://pubchem.ncbi.nlm.nih.gov/compound/%s"%str(drug_id)
            }
        }

    disease_name = df.at[i, "meddra_name"]
    disease_id = str(disease_names[disease_name])
    meddra_code = str(df.at[i, "meddra_code"])

    # Register the birth-defect node the first time its id is seen.
    if disease_id not in nodes:
        nodes[disease_id] = {
            "type": "BirthDefect",
            "properties": {
                "id": disease_id,
                "label": disease_name,
                "meddra_code": meddra_code,
                "uri": "https://hpo.jax.org/app/browse/term/%s"%disease_id
            }
        }

    # One edge per row; the row's index value doubles as the edge id.
    edges.append({
        "source": drug_id,
        "relation": "FDA Adverse Event Reporting System (Female)",
        "target": disease_id,
        "properties": {
            "id": i,
            "source_label": drug_name,
            "target_label": disease_name,
            "level": df.at[i, "level"],
            "llr_ratio": df.at[i, "llr_ratio"],
            "sex": sex,
            "directed": True,
        }
    })
    

In [17]:
# Spot-check the first edge record built above.
edges[0]

{'source': 'CID:10947895',
 'relation': 'FDA Adverse Event Reporting System (Female)',
 'target': 'HP:0001631',
 'properties': {'id': 1913328,
  'source_label': 'paroxetine',
  'target_label': 'Atrial septal defect',
  'level': 'PT',
  'llr_ratio': 73.28254032667141,
  'sex': 'female',
  'directed': True}}

In [18]:
# --- v1: nodes and edges written exactly as built ---
serialization_v1 = {"version": "1", "nodes": nodes, "edges": edges}
v1_text = json.dumps(serialization_v1, indent=2)
with open("results/drugsto_faers_female.v1.json", "w") as out_v1:
    out_v1.write(v1_text)

# --- v2: every node and edge passed through typer() ---
nodes_v2 = {node_id: typer(node) for node_id, node in nodes.items()}
edges_v2 = [typer(edge) for edge in edges]

serialization_v2 = {"version": "2", "nodes": nodes_v2, "edges": edges_v2}
v2_text = json.dumps(serialization_v2, indent=2)
with open("results/drugsto_faers_female.v2.json", "w") as out_v2:
    out_v2.write(v2_text)

In [77]:
# Example gene list, one symbol per line. A later cell splits it on "\n" and
# POSTs it to the local knowledge-graph enrichment endpoint.
# NOTE(review): the entry "39509" is purely numeric — it looks like a gene
# symbol mangled by a spreadsheet date conversion; confirm against the
# original list before relying on it.
gene_str = '''Nsun3
Polrmt
Nlrx1
Sfxn5
Zc3h12c
Slc25a39
Arsg
Defb29
Ndufb6
Zfand1
Tmem77
5730403B10Rik
RP23-195K8.6
Tlcd1
Psmc6
Slc30a6
LOC100047292
Lrrc40
Orc5l
Mpp7
Unc119b
Prkaca
Tcn2
Psmc3ip
Pcmtd2
Acaa1a
Lrrc1
2810432D09Rik
Sephs2
Sac3d1
Tmlhe
LOC623451
Tsr2
Plekha7
Gys2
Arhgef12
Hibch
Lyrm2
Zbtb44
Entpd5
Rab11fip2
Lipt1
Intu
Anxa13
Klf12
Sat2
Gal3st2
Vamp8
Fkbpl
Aqp11
Trap1
Pmpcb
Tm7sf3
Rbm39
Bri3
Kdr
Zfp748
Nap1l1
Dhrs1
Lrrc56
Wdr20a
Stxbp2
Klf1
Ufc1
Ccdc16
9230114K14Rik
Rwdd3
2610528K11Rik
Aco1
Cables1
LOC100047214
Yars2
Lypla1
Kalrn
Gyk
Zfp787
Zfp655
Rabepk
Zfp650
4732466D17Rik
Exosc4
Wdr42a
Gphn
2610528J11Rik
1110003E01Rik
Mdh1
1200014M14Rik
AW209491
Mut
1700123L14Rik
2610036D13Rik
Cox15
Tmem30a
Nsmce4a
Tm2d2
Rhbdd3
Atxn2
Nfs1
3110001I20Rik
BC038156
LOC100047782
2410012H22Rik
Rilp
A230062G08Rik
Pttg1ip
Rab1
Afap1l1
Lyrm5
2310026E23Rik
C330002I19Rik
Zfyve20
Poli
Tomm70a
Slc7a6os
Mat2b
4932438A13Rik
Lrrc8a
Smo
Nupl2
Trpc2
Arsk
D630023B12Rik
Mtfr1
5730414N17Rik
Scp2
Zrsr1
Nol7
C330018D20Rik
Ift122
LOC100046168
D730039F16Rik
Scyl1
1700023B02Rik
1700034H14Rik
Fbxo8
Paip1
Tmem186
Atpaf1
LOC100046254
LOC100047604
Coq10a
Fn3k
Sipa1l1
Slc25a16
Slc25a40
Rps6ka5
Trim37
Lrrc61
Abhd3
Gbe1
Parp16
Hsd3b2
Esm1
Dnajc18
Dolpp1
Lass2
Wdr34
Rfesd
Cacnb4
2310042D19Rik
Srr
Bpnt1
6530415H11Rik
Clcc1
Tfb1m
4632404H12Rik
D4Bwg0951e
Med14
Adhfe1
Thtpa
Cat
Ell3
Akr7a5
Mtmr14
Timm44
Sf1
Ipp
Iah1
Trim23
Wdr89
Gstz1
Cradd
2510006D16Rik
Fbxl6
LOC100044400
Zfp106
Cd55
0610013E23Rik
Afmid
Tmem86a
Aldh6a1
Dalrd3
Smyd4
Nme7
Fars2
Tasp1
Cldn10
A930005H10Rik
Slc9a6
Adk
Rbks
2210016F16Rik
Vwce
4732435N03Rik
Zfp11
Vldlr
9630013D21Rik
4933407N01Rik
Fahd1
Mipol1
1810019D21Rik
1810049H13Rik
Tfam
Paics
1110032A03Rik
LOC100044139
Dnajc19
BC016495
A930041I02Rik
Rqcd1
Usp34
Zcchc3
H2afj
Phf7
4921508D12Rik
Kmo
Prpf18
Mcat
Txndc4
4921530L18Rik
Vps13b
Scrn3
Tor1a
AI316807
Acbd4
Fah
Apool
Col4a4
Lrrc19
Gnmt
Nr3c1
Sip1
Ascc1
Fech
Abhd14a
Arhgap18
2700046G09Rik
Yme1l1
Gk5
Glo1
Sbk1
Cisd1
2210011C24Rik
Nxt2
Notum
Ankrd42
Ube2e1
Ndufv1
Slc33a1
Cep68
Rps6kb1
Hyi
Aldh1a3
Mynn
3110048L19Rik
Rdh14
Proz
Gorasp1
LOC674449
Zfp775
5430437P03Rik
Npy
Adh5
Sybl1
4930432O21Rik
Nat9
LOC100048387
Mettl8
Eny2
2410018G20Rik
Pgm2
Fgfr4
Mobkl2b
Atad3a
4932432K03Rik
Dhtkd1
Ubox5
A530050D06Rik
Zdhhc5
Mgat1
Nudt6
Tpmt
Wbscr18
LOC100041586
Cdk5rap1
4833426J09Rik
Myo6
Cpt1a
Gadd45gip1
Tmbim4
2010309E21Rik
Asb9
2610019F03Rik
7530414M10Rik
Atp6v1b2
2310068J16Rik
Ddt
Klhdc4
Hpn
Lifr
Ovol1
Nudt12
Cdan1
Fbxo9
Fbxl3
Hoxa7
Aldh8a1
3110057O12Rik
Abhd11
Psmb1
ENSMUSG00000074286
Chpt1
Oxsm
2310009A05Rik
1700001L05Rik
Zfp148
39509
Mrpl9
Tmem80
9030420J04Rik
Naglu
Plscr2
Agbl3
Pex1
Cno
Neo1
Asf1a
Tnfsf5ip1
Pkig
AI931714
D130020L05Rik
Cntd1
Clec2h
Zkscan1
1810044D09Rik
Mettl7a
Siae
Fbxo3
Fzd5
Tmem166
Tmed4
Gpr155
Rnf167
Sptlc1
Riok2
Tgds
Pms1
Pitpnc1
Pcsk7
4933403G14Rik
Ei24
Crebl2
Tln1
Mrpl35
2700038C09Rik
Ubie
Osgepl1
2410166I05Rik
Wdr24
Ap4s1
Lrrc44
B3bp
Itfg1
Dmxl1
C1d'''

In [116]:
# gene_str holds one gene symbol per line.
genes = gene_str.split("\n")

description = 'Example gene list KG'
library = 'Reactome_2016'

# Request body for the locally running knowledge-graph enrichment service.
payload = {
    "genes": genes,
    "description": description,
    "library": library,
    "term_limit": 10,
}
res = requests.post(
    "http://localhost:3000/api/knowledge_graph/enrichment",
    json=payload,
)


In [114]:
# Decode the enrichment response body as JSON.
# NOTE(review): this cell's execution count (114) is lower than that of the
# cell creating `res` (116) — re-run in order to make sure `results` matches
# the request above.
results = res.json()

In [115]:
# Display the decoded response (the saved output below is cut off mid-record).
results

[{'data': {'id': 'ATP-synt_ab',
   'kind': 'Protein Family',
   'label': 'ATP-synt ab',
   'properties': {'id': 'ATP-synt_ab', 'label': 'ATP-synt ab'},
   'color': '#B8255F',
   'node_type': 0}},
 {'data': {'source': 'ATP-synt_ab',
   'target': 'ATP6V1B2',
   'kind': 'Relation',
   'relation': 'protein family-gene (Pfam)',
   'label': 'protein family-gene (Pfam)',
   'properties': {'id': 'ATP-synt ab_protein family-gene (Pfam)_ATP6V1B2',
    'label': 'protein family-gene (Pfam)',
    'source_label': 'ATP-synt ab',
    'target_label': 'ATP6V1B2',
    'resource': 'Pfam'},
   'lineColor': '#e0e0e0',
   'directed': 'none'}},
 {'data': {'id': 'ATP6V1B2',
   'kind': 'Gene',
   'label': 'ATP6V1B2',
   'properties': {'id': 'ATP6V1B2', 'label': 'ATP6V1B2'},
   'color': '#AED581',
   'node_type': 0}},
 {'data': {'id': 'ATP-synt_ab_N',
   'kind': 'Protein Family',
   'label': 'ATP-synt ab N',
   'properties': {'id': 'ATP-synt_ab_N', 'label': 'ATP-synt ab N'},
   'color': '#B8255F',
   'node_type'

In [91]:
# NOTE(review): gene_nodes is not defined in any visible cell and this cell's
# execution count (91) is out of order — it will fail on Restart & Run All
# unless the defining cell is restored.
gene_nodes

['MAT2B', 'AFMID', 'KMO', 'MUT', 'MDH1', 'ADK', 'GNMT', 'OXSM', 'MCAT', 'TFAM']

In [87]:
# Keep only the part of each term label that precedes the first " WP".
# NOTE(review): terms_nodes is not defined in any visible cell — it will fail
# on Restart & Run All unless the defining cell is restored.
[i.split(" WP")[0] for i in terms_nodes]

['Methionine metabolism leading to sulfur amino acids and related disorders',
 'Mitochondrial fatty acid synthesis pathway',
 'Mitochondrial Gene Expression',
 'NAD Biosynthesis II (from tryptophan)',
 'Methylation Pathways',
 'Vitamin B12 Disorders',
 'Amino Acid metabolism',
 'TCA Cycle and Deficiency of Pyruvate Dehydrogenase complex (PDHc)',
 'Tryptophan catabolism leading to NAD+ production',
 'Mitochondrial LC-Fatty Acid Beta-Oxidation']

In [88]:
# NOTE(review): the saved output (20) disagrees with the 10 items displayed
# for gene_nodes in the earlier cell — the outputs are stale; re-run in order.
len(gene_nodes)

20