In [1]:
# raw_ad_triples
#
# created by LuYF-Lemon-love <luyanfeng_nlp@qq.com>  on January 11, 2023
#
# 该脚本展示了如何提取 DRKG 中药物治疗 Alzheimer's disease 的三元组.
#
# 需要的包:
#          numpy
#          pandas
#          time
#
# 需要的文件:
#          ../data/drkg/drkg.tsv
#
# 输出的文件:
#          ./drug_treat_ad.tsv
#          ./ad_drugs.txt

### 导入需要的库

In [2]:
import numpy as np
import pandas as pd
import time

### Alzheimer's disease

In [3]:
AD_disease_list = [
'Disease::DOID:10652',
'Disease::MESH:C536599',
'Disease::MESH:D000544'
]

In [4]:
len(AD_disease_list)

3

In [5]:
AD_disease_list

['Disease::DOID:10652', 'Disease::MESH:C536599', 'Disease::MESH:D000544']

### Treatment relation

In [6]:
treatment_list = [
'DRUGBANK::treats::Compound:Disease',
'GNBR::T::Compound:Disease',
'Hetionet::CtD::Compound:Disease'
]

In [7]:
treatment_list

['DRUGBANK::treats::Compound:Disease',
 'GNBR::T::Compound:Disease',
 'Hetionet::CtD::Compound:Disease']

### 读取知识图谱

In [8]:
df = pd.read_csv('../data/drkg/drkg.tsv', sep='\t', names = ['h', 'r', 't'])

In [9]:
df

Unnamed: 0,h,r,t
0,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::2157
1,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::5264
2,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::2158
3,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::3309
4,Gene::2157,bioarx::HumGenHumGen:Gene:Gene,Gene::28912
...,...,...,...
5874256,Gene::29099,STRING::REACTION::Gene:Gene,Gene::1643
5874257,Gene::51645,STRING::REACTION::Gene:Gene,Gene::3183
5874258,Gene::865,STRING::CATALYSIS::Gene:Gene,Gene::983
5874259,Gene::1066,STRING::BINDING::Gene:Gene,Gene::7365


### 提取药物治疗 AD 的三元组

In [10]:
start = time.perf_counter()

triples = []

for index, Series in df.iterrows():
    if Series['h'].startswith('Compound::DB') and Series['r'] in treatment_list and Series['t'] in AD_disease_list:
        triples.append([Series['h'], Series['r'], Series['t']])
        
end = time.perf_counter()

print("运行时间为", round(end-start), 'seconds')

运行时间为 241 seconds


In [11]:
for triple in triples:
    print(f"{triple[0]}\t{triple[1]}\t{triple[2]}")

Compound::DB00656	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB00674	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB00843	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB01043	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB01403	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB09148	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB09149	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB09151	DRUGBANK::treats::Compound:Disease	Disease::MESH:D000544
Compound::DB03929	GNBR::T::Compound:Disease	Disease::MESH:D000544
Compound::DB01750	GNBR::T::Compound:Disease	Disease::MESH:D000544
Compound::DB02543	GNBR::T::Compound:Disease	Disease::MESH:D000544
Compound::DB12116	GNBR::T::Compound:Disease	Disease::MESH:D000544
Compound::DB00244	GNBR::T::Compound:Disease	Disease::MESH:D000544
Compound::DB03575	GNBR::T::Compound:Disease	Disease::MESH:D000544
Comp

In [12]:
len(triples)

131

In [13]:
# 输出结果到文件

f = open('./drug_treat_ad.tsv', 'w')
for triple in triples:
    f.write('%s\t%s\t%s\n' % (triple[0], triple[1], triple[2]))
f.close()

### 提取所有治疗药物

In [14]:
# 利用集合进行药物去重
ad_drugs = set()

for triple in triples:
    ad_drugs.add(triple[0])

In [15]:
ad_drugs

{'Compound::DB00091',
 'Compound::DB00115',
 'Compound::DB00122',
 'Compound::DB00126',
 'Compound::DB00134',
 'Compound::DB00136',
 'Compound::DB00158',
 'Compound::DB00207',
 'Compound::DB00215',
 'Compound::DB00244',
 'Compound::DB00313',
 'Compound::DB00325',
 'Compound::DB00328',
 'Compound::DB00331',
 'Compound::DB00337',
 'Compound::DB00368',
 'Compound::DB00382',
 'Compound::DB00393',
 'Compound::DB00422',
 'Compound::DB00459',
 'Compound::DB00472',
 'Compound::DB00490',
 'Compound::DB00533',
 'Compound::DB00564',
 'Compound::DB00571',
 'Compound::DB00635',
 'Compound::DB00641',
 'Compound::DB00656',
 'Compound::DB00674',
 'Compound::DB00682',
 'Compound::DB00694',
 'Compound::DB00712',
 'Compound::DB00734',
 'Compound::DB00741',
 'Compound::DB00746',
 'Compound::DB00809',
 'Compound::DB00843',
 'Compound::DB00859',
 'Compound::DB00877',
 'Compound::DB00928',
 'Compound::DB00945',
 'Compound::DB00981',
 'Compound::DB00987',
 'Compound::DB00989',
 'Compound::DB00993',
 'Compound

In [16]:
len(ad_drugs)

126

In [17]:
# 输出结果到文件

f = open('./ad_drugs.txt', 'w')
for drug in ad_drugs:
    f.write('%s\n' % (drug))
f.close()