# Data cleaning and mapping

In [8]:
import warnings
warnings.filterwarnings("ignore")

import requests
import os
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

import pandas as pd

from grapharm._utils import download_and_extract

# Directory for artifacts produced by this notebook.
os.makedirs("../outputs", exist_ok=True)

# Root of the local DRKG copy. Every DRKG path below is derived from it, so
# relocating the data only requires changing this one line.
datadir = "../data/drkg"
# Project helper; presumably downloads and unpacks DRKG into `datadir` — TODO confirm.
download_and_extract(path=datadir)

# Folder that receives the per-database mapping tables written by later cells.
os.makedirs(f"{datadir}/map", exist_ok=True)

# The file is read without a header row: column 0 is named "entity", column 1 "id".
entities = pd.read_csv(
    f"{datadir}/embed/entities.tsv",
    header=None,
    names=["entity", "id"],
    sep="\t",
)
entities.head()

Unnamed: 0,entity,id
0,Gene::2157,0
1,Gene::5264,1
2,Gene::2158,2
3,Gene::3309,3
4,Gene::28912,4


## Compounds

In [2]:
# Keep only compound entities. Entity labels have the form "<Type>::<identifier>"
# (e.g. "Gene::2157"), so the type is matched as a literal prefix instead of an
# unanchored regex search that would accept "Compound" anywhere inside a label.
compounds = entities[entities["entity"].str.startswith("Compound::")]
print("Number of compounds: {}".format(len(compounds)))

Number of compounds: 24313


### ZINC

In [3]:
# Project crawler for ZINC compound entities.
from grapharm.webcrawl import crawl_zinc

# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_zinc).
savepath = "../data/drkg/map/compounds/ZINC.tsv"

# Build the ZINC mapping table from the full entity list and preview the first rows.
zinc = crawl_zinc(entities, savepath)
zinc.head()

Total: 63
Processed: 63
Need to process: 0


Unnamed: 0,entity,entity_id,InChIKey,chemical_formula
0,Compound::zinc:ZINC000003581289,ZINC000003581289,FELUFXCUIYHAPB-INIZCTEOSA-N,C17H16Br2N2O2
1,Compound::zinc:ZINC000005551645,ZINC000005551645,ZNFFMCYSMBXZQU-NSHDSACASA-N,C16H20ClN3S
2,Compound::zinc:ZINC000052956156,ZINC000052956156,GZBLEJZADHZBBZ-HOTGVXAUSA-N,C17H22N2O
3,Compound::zinc:ZINC000002020240,ZINC000002020240,CJPLEFFCVDQQFZ-INIZCTEOSA-N,C17H16Cl2N2O2
4,Compound::zinc:ZINC000003832004,ZINC000003832004,YCFJZPGDTZVVSM-UHFFFAOYSA-N,C15H14N4O3


### DrugBank

In [4]:
# Project crawler for DrugBank compound entities.
from grapharm.webcrawl import crawl_drugbank

# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_drugbank).
savepath = "../data/drkg/map/compounds/DrugBank.tsv"
# Build the DrugBank mapping table from the full entity list and preview the first rows.
drbk = crawl_drugbank(entities, savepath)
drbk.head()

Total: 10551
Processed: 10551
Need to process: 0


Unnamed: 0,entity,entity_id,name,InChIKey,type,phase,weight,chemical_formula
0,Compound::DB02573,DB02573,"2'-deoxycytidine-2'-deoxyadenosine-3',5'-monop...",LYWWDKIADIGKTH-IDMWBNCISA-N,Small Molecule,Experimental,Average: 540.4238 Monoisotopic: 540.14821095,C19H25N8O9P
1,Compound::DB05105,DB05105,Pleconaril,KQOXLKOJHVFTRN-UHFFFAOYSA-N,Small Molecule,Investigational,Average: 381.349 Monoisotopic: 381.130026072,C18H18F3N3O3
2,Compound::DB00244,DB00244,Mesalazine,KBOPZPXVLCULAV-UHFFFAOYSA-N,Small Molecule,Approved,Average: 153.1354 Monoisotopic: 153.042593095,C7H7NO3
3,Compound::DB00684,DB00684,Tobramycin,NLVFBUXFDBBNBW-PBSUHMDJSA-N,Small Molecule,"Approved, Investigational",Average: 467.5145 Monoisotopic: 467.259127807,C18H37N5O9
4,Compound::DB03118,DB03118,(2Z)-1-(5-Chloro-1H-indol-3-yl)-3-hydroxy-3-(1...,LKVXXMOMTRBUQI-WCIBSUBMSA-N,Small Molecule,Experimental,Average: 289.677 Monoisotopic: 289.036652232,C12H8ClN5O2


### ChEBI

In [5]:
# Project crawler for ChEBI compound entities.
from grapharm.webcrawl import crawl_chebi
# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_chebi).
savepath = "../data/drkg/map/compounds/ChEBI.tsv"

# Build the ChEBI mapping table from the full entity list and preview the first rows.
chebi = crawl_chebi(entities, savepath)
chebi.head()

Total: 1088
Processed: 1088
Need to process: 0


Unnamed: 0,entity,entity_id,name,InChIKey
0,Compound::chebi:141552,CHEBI:141552,forasartan,YONOBYIBNBCDSJ-UHFFFAOYSA-N
1,Compound::chebi:143117,CHEBI:143117,lorlatinib,IIXWYSCJSQVBQM-LLVKDONJSA-N
2,Compound::chebi:47414,CHEBI:47414,"(Z)-3',5'-dibromo-2',4,4',6'-tetrahydroxyaurone",BRPKBUNFOZFULQ-SGAXSIHGSA-N
3,Compound::chebi:138865,CHEBI:138865,AMD 070,WVLHHLRVNDMIAR-IBGZPJMESA-N
4,Compound::chebi:141474,CHEBI:141474,diethyl 2-[(dimethoxyphosphorothioyl)thio]succ...,JXSJBGJIGXNWCI-UHFFFAOYSA-N


### MolPort

In [6]:
# Project crawler for MolPort compound entities.
from grapharm.webcrawl import crawl_molport
# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_molport).
savepath = "../data/drkg/map/compounds/MolPort.tsv"

# Build the MolPort mapping table from the full entity list and preview the first rows.
molport = crawl_molport(entities, savepath)
molport.head()


Total: 228
Processed: 228
Need to process: 0


Unnamed: 0,entity,entity_id,name,weight,chemical_formula,InChIKey
0,Compound::molport:MolPort-046-762-962,MolPort-046-762-962,"4-{[(6Z)-6-[(2,4,6-trimethylphenyl)imino]-1,6-...",329.407,C20H19N5,ILAYIAGXTHKHNT-UHFFFAOYSA-N
1,Compound::molport:MolPort-046-766-257,MolPort-046-766-257,"9-{[(1,3-dihydroxypropan-2-yl)oxy]methyl}-2-im...",255.234,C9H13N5O4,IRSCQMHQWWYFCW-UHFFFAOYSA-N
2,Compound::molport:MolPort-006-110-129,MolPort-006-110-129,"(9R)-10-methyl-10-azatetracyclo[7.7.1.0²,⁷.0¹³...",267.328,C17H17NO2,VMWNQDUVQKEIOC-CYBMUJFWSA-N
3,Compound::molport:MolPort-044-561-436,MolPort-044-561-436,"({[({[(2S,5R)-5-(6-amino-9H-purin-9-yl)oxolan-...",475.183,C10H16N5O11P3,OAKPWEUQDVLTCN-NKWVEPMBSA-N
4,Compound::molport:MolPort-046-763-659,MolPort-046-763-659,"3-chloro-5-({1-[(5-hydroxy-4-methyl-4H-1,2,4-t...",425.75,C17H11ClF3N5O3,ZIAOVIPSKUPPQW-UHFFFAOYSA-N


### DrugCentral

In [7]:
# Project crawler for DrugCentral compound entities.
from grapharm.webcrawl import crawl_drugcentral
# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_drugcentral).
savepath="../data/drkg/map/compounds/DrugCentral.tsv"

# Build the DrugCentral mapping table from the full entity list and preview the first rows.
# NOTE(review): the recorded preview exposes an InChI column rather than the InChIKey column
# shown for the other sources — confirm how this table is meant to be joined with them.
drugcentral = crawl_drugcentral(entities, savepath)
drugcentral.head()

Total: 18
Processed: 18
Need to process: 0


Unnamed: 0,entity,entity_id,InChI
0,Compound::drugcentral:243,243,"InChI=1S/C15H21N3O2S3/c1-15(2,3)17-6-9(19)7-21..."
1,Compound::drugcentral:483,483,"InChI=1S/C15H21N3O2S3/c1-15(2,3)17-6-9(19)7-21..."
2,Compound::drugcentral:4789,4789,InChI=1S/Hg.2HI/h;2*1H/q+2;;/p-2
3,Compound::drugcentral:2584,2584,InChI=1S/C21H18F3N3O3/c1-11-9-26(5-4-25-11)19-...
4,Compound::drugcentral:4947,4947,InChI=1S/C35H35F2N8O5S/c1-22(33-42-30(18-51-33...


### BindingDB

In [8]:
# Project crawler for BindingDB compound entities.
from grapharm.webcrawl import crawl_bindingdb

# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_bindingdb).
savepath="../data/drkg/map/compounds/BindingDB.tsv"

# Build the BindingDB mapping table from the full entity list and preview the first rows.
bindingdb = crawl_bindingdb(entities, savepath)
bindingdb.head()

Total: 144
Processed: 144
Need to process: 0


Unnamed: 0,entity,entity_id,InChIKey
0,Compound::bindingdb:50225285,50225285,GJSURZIOUXUGAL-UHFFFAOYSA-N
1,Compound::bindingdb:50180655,50180655,KJHKTHWMRKYKJE-SUGCFTRWSA-N
2,Compound::bindingdb:50215393,50215393,YMARZQAQMVYCKC-OEMFJLHTSA-N
3,Compound::bindingdb:50062551,50062551,RHWKPHLQXYSBKR-BMIGLBTASA-N
4,Compound::bindingdb:50088493,50088493,ZSZFUDFOPOMEET-UHFFFAOYSA-N


### HMDB

In [9]:
# Project crawler for HMDB compound entities.
from grapharm.webcrawl import crawl_hmdb

# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_hmdb).
savepath="../data/drkg/map/compounds/HMDB.tsv"
# Build the HMDB mapping table from the full entity list and preview the first rows.
# NOTE(review): in the recorded preview most rows carry the literal string "Error" in both
# `name` and `InChIKey`; these look like failed lookups stored as data — re-crawl or filter
# them before using this table downstream.
hmdb = crawl_hmdb(entities, savepath)
hmdb.head()


Total: 12
Processed: 12
Need to process: 0


Unnamed: 0,entity,entity_id,name,InChIKey
0,Compound::hmdb:HMDB0015612,HMDB0015612,Error,Error
1,Compound::hmdb:HMDB0014658,HMDB0014658,Error,Error
2,Compound::hmdb:HMDB0062709,HMDB0062709,S-Adenosylmethionine,MEFKEPWMEQBLKI-AIRLBKTGSA-O
3,Compound::hmdb:HMDB0014609,HMDB0014609,Error,Error
4,Compound::hmdb:HMDB0015436,HMDB0015436,Error,Error


### BRENDA

In [10]:
# Project crawler for BRENDA compound entities.
from grapharm.webcrawl import crawl_brenda

# TSV path handed to the crawler (presumably where results are cached/written — TODO confirm against crawl_brenda).
savepath="../data/drkg/map/compounds/Brenda.tsv"
# Build the BRENDA mapping table from the full entity list and preview the first rows.
# NOTE(review): the recorded run reports 730 of 731 entries processed, with id 169533 still
# listed as pending — confirm whether that entry is expected to stay unresolved.
brenda = crawl_brenda(entities, savepath)
brenda.head()

Total: 731
Processed: 730
Need to process: 1
731. 169533


Unnamed: 0,entity,entity_id,name,InChIKey,chemical_formula
0,Compound::brenda:207529,207529,actinomycin D,RJURFGZVJUQBHK-IIXSONLDSA-N,C62H86N12O16
1,Compound::brenda:6989,6989,BILN 2061,PJZPDFUUXKKDNB-KNINVFKUSA-N,C40H50N6O8S
2,Compound::brenda:102188,102188,quercetin 3-O-alpha-D-glucopyranoside,OVSQVDMCBVZWGM-CAWYGJOUSA-N,C21H20O12
3,Compound::brenda:61880,61880,(2Z)-2-(benzoylamino)-3-[4-(2-bromophenoxy)phe...,WLPJLQNKCJWAFL-RGEXLXHISA-N,C22H16BrNO4
4,Compound::brenda:223510,223510,apigenin,KZNIFHPLKGYRTM-UHFFFAOYSA-N,C15H10O5


### FDASRS

In [11]:
# Exploratory probe for FDA SRS (GSRS) compounds: prepares the entity table and
# fetches one substance page to inspect what a plain HTTP request returns.
savepath = "../data/drkg/map/compounds/FDASRS.tsv"

os.makedirs(os.path.dirname(savepath), exist_ok=True)

if os.path.isfile(savepath):
    # Reuse a previously saved mapping table.
    df = pd.read_csv(savepath, sep="\t")
else:
    # .copy() gives an independent frame, so the column assignment below does not
    # write into a slice of `entities` (chained assignment).
    df = entities[entities.entity.str.contains("Compound::fdasrs")].copy()
    # "Compound::fdasrs:<ID>".split(":") -> ["Compound", "", "fdasrs", "<ID>"], so the ID is field 3.
    df["entity_id"] = df.entity.str.split(":", expand=True)[3]

# Only consumed by the disabled Selenium fallback at the bottom of this cell.
options = webdriver.FirefoxOptions()
options.add_argument('--headless')

# Single hard-coded substance used for the probe. Named `substance_id` so the
# builtin id() is not shadowed for the rest of the kernel session.
substance_id = "VT98O71A2D"
url = f"https://gsrs.ncats.nih.gov/ginas/app/beta/substances/{substance_id}"

# Bounded wait and explicit status check: a hung connection or an HTTP error
# page should fail here instead of being parsed as if it were the substance page.
response = requests.get(url, allow_redirects=True, timeout=30)
response.raise_for_status()
doc = response.content

# Name the parser explicitly so the parse tree does not depend on which optional
# parsers are installed; this matches the parser used in the Selenium code below.
soup = BeautifulSoup(doc, features="html.parser")

# NOTE(review): the recorded output of this cell shows a <body> holding only
# <app-root> and <script> tags, i.e. the page content is rendered client-side and
# is not present in the static HTML fetched here.
soup.find_all("body")

# Disabled browser-based fallback, kept for reference.
# NOTE(review): the locator below passes two space-separated classes to
# By.CLASS_NAME, and the second one looks auto-generated — verify the selector
# before re-enabling this code.
# driver = webdriver.Firefox(options=options)
# timeout = 30

# with driver:
#     driver.get(url)

#     # Set timeout
#     element_present = EC.presence_of_element_located((By.CLASS_NAME,  "mat-expansion-panel-body ng-tns-c168-11"))
#     WebDriverWait(driver, timeout).until(element_present)

#     soup = BeautifulSoup(driver.page_source,features="html.parser")
#     driver.close()

[<body>
 <app-root></app-root>
 <script src="runtime.71e4f26d68389ddc.js" type="module"></script><script src="polyfills.13822991a2edc31a.js" type="module"></script><script src="main.f79212cd8e956815.js" type="module"></script>
 </body>]

In [34]:
# Display the entire parsed document held in `soup`.
# NOTE(review): this writes the full page markup (including inline font CSS) into the
# notebook output; consider displaying only the element of interest.
soup

<!DOCTYPE html>
<html lang="en"><head><link crossorigin="" href="https://fonts.gstatic.com" rel="preconnect"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<style type="text/css">@font-face{font-family:'Material Icons';font-style:normal;font-weight:400;src:url(https://fonts.gstatic.com/s/materialicons/v139/flUhRq6tzZclQEJ-Vdg-IuiaDsNcIhQ8tQ.woff2) format('woff2');}.material-icons{font-family:'Material Icons';font-weight:normal;font-style:normal;font-size:24px;line-height:1;letter-spacing:normal;text-transform:none;display:inline-block;white-space:nowrap;word-wrap:normal;direction:ltr;-webkit-font-feature-settings:'liga';-webkit-font-smoothing:antialiased;}</style>
<style type="text/css">@font-face{font-family:'Roboto';font-style:normal;font-weight:300;src:url(https://fonts.gstatic.com/s/roboto/v30/KFOlCnqEu92Fr1MmSU5fCRc4AMP6lbBP.woff2) format('woff2');unicode-range:U+0460-052F, U+1C80-1C88, U+20B4, U+2DE0-2DFF, U+A640-A69F, U+FE2E-FE2F;}@font-face{font-family:'Roboto';font-s

In [3]:
import sys
# Extend the module search path with the parent directory; presumably this is what
# makes the `ultra` package imported below resolvable — TODO confirm.
sys.path.append("..")
from ultra import tasks, util
from ultra.models import Ultra

In [5]:
import pandas as pd
# Load the tab-separated standardized-relations table and display it.
# read_table uses a tab as its default field separator.
df = pd.read_table("../assets/clean-drkg/standardized_relations.tsv")
df

Unnamed: 0,Relation-name,Data-source,Connected entity-types,Interaction-type,Standardized-type,Description,Reference for the description
0,DGIDB::ACTIVATOR::Gene:Compound,DGIDB,Compound:Gene,activation,upregulates,An activator interaction is when a drug activa...,http://www.dgidb.org/getting_started
1,DGIDB::AGONIST::Gene:Compound,DGIDB,Compound:Gene,agonism,upregulates,An agonist interaction occurs when a drug bind...,http://www.dgidb.org/getting_started
2,DGIDB::ALLOSTERIC MODULATOR::Gene:Compound,DGIDB,Compound:Gene,allosteric modulation,modulates,An allosteric modulator interaction occurs whe...,http://www.dgidb.org/getting_started
3,DGIDB::ANTAGONIST::Gene:Compound,DGIDB,Compound:Gene,antagonism,blocks,An antagonist interaction occurs when a drug b...,http://www.dgidb.org/getting_started
4,DGIDB::ANTIBODY::Gene:Compound,DGIDB,Compound:Gene,antibody,binds,An antibody interaction occurs when an antibod...,http://www.dgidb.org/getting_started
...,...,...,...,...,...,...,...
102,bioarx::Covid2_acc_host_gene::Disease:Gene,BIBLIOGRAPHY,Disease:Gene,interaction,associates,"Interactions between 27 viral proteins, and ...",
103,bioarx::DrugHumGen:Compound:Gene,BIBLIOGRAPHY,Compound:Gene,interaction,interacts,,
104,bioarx::DrugVirGen:Compound:Gene,BIBLIOGRAPHY,Compound:Gene,interaction,interacts,,
105,bioarx::HumGenHumGen:Gene:Gene,BIBLIOGRAPHY,Gene:Gene,interaction,interacts,Protein-protein interaction,
