In [128]:
import numpy as np
import pandas as pd

In [129]:
# Load the BioKG triples; the first CSV column holds the saved row index.
biokg = pd.read_csv('data/BioKG/kg.csv', index_col=0)

# Append the DTI triples and discard exact duplicate rows
# (original author's note: "to avoid information leakage").
dti = pd.read_csv('data/BioKG/dti.csv')
biokg = pd.concat([biokg, dti]).drop_duplicates()
biokg.head()

Unnamed: 0,head,relation,tail
0,P61981,RELATED_GENETIC_DISORDER,MIM:617665
1,Q14738,RELATED_GENETIC_DISORDER,MIM:616355
2,P30153,RELATED_GENETIC_DISORDER,MIM:616362
3,Q00005,RELATED_GENETIC_DISORDER,MIM:604326
4,Q9H2F3,RELATED_GENETIC_DISORDER,MIM:607765


In [130]:
# Load the PrimeKG edge table.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# so each column's dtype is inferred from the whole column.
primekg = pd.read_csv('data/PrimeKG/kg.csv', low_memory=False)
primekg.head()

Unnamed: 0,relation,display_relation,x_index,x_id,x_type,x_name,x_source,y_index,y_id,y_type,y_name,y_source
0,protein_protein,ppi,0,9796,gene/protein,PHYHIP,NCBI,8889,56992,gene/protein,KIF15,NCBI
1,protein_protein,ppi,1,7918,gene/protein,GPANK1,NCBI,2798,9240,gene/protein,PNMA1,NCBI
2,protein_protein,ppi,2,8233,gene/protein,ZRSR2,NCBI,5646,23548,gene/protein,TTC33,NCBI
3,protein_protein,ppi,3,4899,gene/protein,NRF1,NCBI,11592,11253,gene/protein,MAN1B1,NCBI
4,protein_protein,ppi,4,5297,gene/protein,PI4KA,NCBI,2122,8601,gene/protein,RGS20,NCBI


In [131]:
# Load the disease mapping file (ontology_id / ontology -> mondo_id)
map_data = pd.read_csv('data/PrimeKG/mondo_references.csv')
map_data.head()

Unnamed: 0.1,Unnamed: 0,ontology_id,ontology,mondo_id
0,0,4,DOID,1
1,1,D004194,MESH,1
2,2,0000408,EFO,1
3,3,C0012634,UMLS,1
4,4,799.9,ICD9,1


In [132]:
# Keep only the MeSH rows of the mapping and expose them as (mesh_id, mondo_id) pairs.
# mondo_id corresponds to the x_id / y_id of the disease nodes.
mesh_mondo = (
    map_data.loc[map_data['ontology'] == 'MESH', ['ontology_id', 'mondo_id']]
    .rename(columns={'ontology_id': 'mesh_id'})
)
mesh_mondo.head()

Unnamed: 0,mesh_id,mondo_id
1,D004194,1
18,D000309,4
44,D053206,22
104,D056887,82
116,D065706,87


### protein_protein: gene_gene, GGI

In [133]:
# Pull the protein-protein edges and expose their endpoint names as head / tail.
# NOTE(review): the endpoints here are names taken from x_name / y_name (e.g. PHYHIP),
# while the BioKG rows shown earlier use accession-style IDs (e.g. P61981) -
# confirm that mixing the two identifier styles is intended.
primekg_ggi = (
    primekg.loc[primekg['relation'] == 'protein_protein', ['x_name', 'y_name']]
    .rename(columns={'x_name': 'head', 'y_name': 'tail'})
)
primekg_ggi.head()

Unnamed: 0,head,tail
0,PHYHIP,KIF15
1,GPANK1,PNMA1
2,ZRSR2,TTC33
3,NRF1,MAN1B1
4,PI4KA,RGS20


In [134]:
# Label every edge as 'GGI' and arrange the columns as (head, relation, tail).
primekg_ggi = primekg_ggi.assign(relation='GGI')[['head', 'relation', 'tail']]
primekg_ggi.head()

Unnamed: 0,head,relation,tail
0,PHYHIP,GGI,KIF15
1,GPANK1,GGI,PNMA1
2,ZRSR2,GGI,TTC33
3,NRF1,GGI,MAN1B1
4,PI4KA,GGI,RGS20


In [135]:
# Append the GGI edges to biokg and renumber the rows 0..n-1 in the same step.
# Note: running this cell twice appends the same rows twice.
biokg = pd.concat([biokg, primekg_ggi], axis=0, ignore_index=True)

### drug_drug

In [136]:
# Pull the drug-drug edges and expose their endpoint IDs as head / tail.
primekg_ddi = (
    primekg.loc[primekg['relation'] == 'drug_drug', ['x_id', 'y_id']]
    .rename(columns={'x_id': 'head', 'y_id': 'tail'})
)
primekg_ddi.head()

Unnamed: 0,head,tail
389359,DB00001,DB06605
389360,DB00005,DB06605
389361,DB00006,DB06605
389362,DB00008,DB06605
389363,DB00009,DB06605


In [137]:
# Inspect the last rows of the drug-drug edge selection.
primekg_ddi.tail()

Unnamed: 0,head,tail
3061982,DB14443,DB14521
3061983,DB14583,DB11074
3061984,DB14845,DB13153
3061985,DB14895,DB13272
3061986,DB15305,DB08837


In [138]:
# Label every edge as 'DDI' and arrange the columns as (head, relation, tail).
primekg_ddi = primekg_ddi.assign(relation='DDI')[['head', 'relation', 'tail']]
primekg_ddi.head()

Unnamed: 0,head,relation,tail
389359,DB00001,DDI,DB06605
389360,DB00005,DDI,DB06605
389361,DB00006,DDI,DB06605
389362,DB00008,DDI,DB06605
389363,DB00009,DDI,DB06605


In [139]:
# Append the DDI edges to biokg and renumber the rows 0..n-1 in the same step.
# Note: running this cell twice appends the same rows twice.
biokg = pd.concat([biokg, primekg_ddi], axis=0, ignore_index=True)

### pathway_pathway

In [140]:
# Pull the pathway-pathway edges and expose their endpoint IDs as head / tail.
primekg_papa = (
    primekg.loc[primekg['relation'] == 'pathway_pathway', ['x_id', 'y_id']]
    .rename(columns={'x_id': 'head', 'y_id': 'tail'})
)
primekg_papa.head()

Unnamed: 0,head,tail
3789497,R-HSA-109581,R-HSA-109606
3789498,R-HSA-109581,R-HSA-169911
3789499,R-HSA-109581,R-HSA-5357769
3789500,R-HSA-109581,R-HSA-75153
3789501,R-HSA-109582,R-HSA-140877


In [141]:
# Inspect the last rows of the pathway-pathway edge selection.
primekg_papa.tail()

Unnamed: 0,head,tail
6505725,R-HSA-5690714,R-HSA-983705
6505726,R-HSA-983695,R-HSA-983705
6505727,R-HSA-2672351,R-HSA-983712
6505728,R-HSA-936837,R-HSA-983712
6505729,R-HSA-997272,R-HSA-991365


In [142]:
# Label every edge as 'PATHWAY_PATHWAY' and arrange the columns as (head, relation, tail).
primekg_papa = primekg_papa.assign(relation='PATHWAY_PATHWAY')[['head', 'relation', 'tail']]
primekg_papa.head()

Unnamed: 0,head,relation,tail
3789497,R-HSA-109581,PATHWAY_PATHWAY,R-HSA-109606
3789498,R-HSA-109581,PATHWAY_PATHWAY,R-HSA-169911
3789499,R-HSA-109581,PATHWAY_PATHWAY,R-HSA-5357769
3789500,R-HSA-109581,PATHWAY_PATHWAY,R-HSA-75153
3789501,R-HSA-109582,PATHWAY_PATHWAY,R-HSA-140877


In [143]:
# Append the pathway-pathway edges to biokg and renumber the rows 0..n-1 in the same step.
# Note: running this cell twice appends the same rows twice.
biokg = pd.concat([biokg, primekg_papa], axis=0, ignore_index=True)

### disease_disease

In [144]:
# Re-display the MeSH <-> MONDO mapping used for the disease joins below.
mesh_mondo.head()

Unnamed: 0,mesh_id,mondo_id
1,D004194,1
18,D000309,4
44,D053206,22
104,D056887,82
116,D065706,87


In [145]:
# Check the column dtypes of mesh_mondo (the join key mondo_id must match the dtype of the IDs joined against it)
mesh_mondo.dtypes

mesh_id     object
mondo_id     int64
dtype: object

In [146]:
# Select disease-disease edges whose two endpoints both have source 'MONDO',
# keeping only the two ID columns.
# The IDs are cast to int64 so they match the dtype of mesh_mondo['mondo_id'] in the join
# that follows. The cast is done with astype() on the selection, which returns a new frame;
# assigning columns into the result of chained indexing on primekg would instead raise
# SettingWithCopyWarning.
primekg_didi = (
    primekg.loc[
        (primekg['relation'] == 'disease_disease')
        & (primekg['x_source'] == 'MONDO')
        & (primekg['y_source'] == 'MONDO'),
        ['x_id', 'y_id'],
    ]
    .astype({'x_id': 'int64', 'y_id': 'int64'})
)
primekg_didi.head()

Unnamed: 0,x_id,y_id
3315993,2816,4
3315994,21034,5
3315995,2243,9
3315996,2245,9
3315997,3847,9


In [147]:
# Translate both endpoints from MONDO IDs to MeSH IDs using mesh_mondo:
# first x_id, then y_id. Left joins keep edges that have no MeSH match;
# the unmatched side ends up as NaN.
primekg_didi = (
    primekg_didi
    .merge(mesh_mondo, left_on='x_id', right_on='mondo_id', how='left')
    .drop(columns=['x_id', 'mondo_id'])   # discard the MONDO-side keys
    .rename(columns={'mesh_id': 'x_id'})  # the MeSH ID becomes the new x_id
    .merge(mesh_mondo, left_on='y_id', right_on='mondo_id', how='left')
    .drop(columns=['y_id', 'mondo_id'])
    .rename(columns={'mesh_id': 'y_id'})  # the MeSH ID becomes the new y_id
)

primekg_didi.head()

Unnamed: 0,x_id,y_id
0,D000303,D000309
1,,
2,D006474,
3,D001791,
4,D030342,


In [148]:
# Row/column count after the two left joins, before dropping unmapped edges.
primekg_didi.shape

(50660, 2)

In [149]:
# Discard every edge where either endpoint is missing (NaN in x_id or y_id).
primekg_didi = primekg_didi.dropna(axis=0, how='any')
primekg_didi.shape

(8974, 2)

In [150]:
# Inspect the remaining edges after rows with missing IDs were dropped.
primekg_didi.head()

Unnamed: 0,x_id,y_id
0,D000303,D000309
12,D030342,C567144
34,D001145,D014693
46,D000073605,D000073605
53,D003384,C536166


In [151]:
# Expose the two endpoint columns under the triple names head / tail.
primekg_didi = primekg_didi.rename(columns=dict(x_id='head', y_id='tail'))
primekg_didi.head()

Unnamed: 0,head,tail
0,D000303,D000309
12,D030342,C567144
34,D001145,D014693
46,D000073605,D000073605
53,D003384,C536166


In [152]:
# Label every edge as 'DISEASE_DISEASE' and arrange the columns as (head, relation, tail).
primekg_didi = primekg_didi.assign(relation='DISEASE_DISEASE')[['head', 'relation', 'tail']]
primekg_didi.head()

Unnamed: 0,head,relation,tail
0,D000303,DISEASE_DISEASE,D000309
12,D030342,DISEASE_DISEASE,C567144
34,D001145,DISEASE_DISEASE,D014693
46,D000073605,DISEASE_DISEASE,D000073605
53,D003384,DISEASE_DISEASE,C536166


In [153]:
# Append the disease-disease edges to biokg and renumber the rows 0..n-1 in the same step.
# Note: running this cell twice appends the same rows twice.
biokg = pd.concat([biokg, primekg_didi], axis=0, ignore_index=True)

### drug_protein: drug_gene

注意，primekg 中 x 既有可能是 drug，也有可能是 protein，因为它把正边和反边都包含进来了，我们在处理的时候只要包含正边就行了。

In [154]:
# Keep only the drug_protein rows whose x side is the drug, so each edge is taken once
# in the drug -> gene direction; head is the drug ID, tail is the y-side name.
primekg_dgi = (
    primekg.loc[
        (primekg['relation'] == 'drug_protein') & (primekg['x_type'] == 'drug'),
        ['x_id', 'y_name'],
    ]
    .rename(columns={'x_id': 'head', 'y_name': 'tail'})
)
primekg_dgi.tail()

Unnamed: 0,head,tail
346723,DB14548,SLC39A11
346724,DB14533,SLC30A6
346725,DB14548,SLC30A6
346726,DB09462,AQP7
346727,DB09462,AQP9


In [155]:
# Disabled: this would select the drug_protein rows whose x side is a gene/protein and
# flip them into (drug, gene) order. It is left commented out; only the rows with a
# drug on the x side are used.
# primekg_gdi = primekg[(primekg['relation'] == 'drug_protein') & (primekg['x_type'] == 'gene/protein')][['x_name','y_id']]
# primekg_gdi.tail()
# # Swap the column order
# primekg_gdi = primekg_gdi[['y_id','x_name']]
# # Rename
# primekg_gdi = primekg_gdi.rename(columns={"y_id": "head", "x_name": "tail"})
# primekg_gdi.head()

In [156]:
# Disabled: this would append the flipped gene/protein-side rows (primekg_gdi) to primekg_dgi.
# # Merge primekg_dgi and primekg_gdi
# primekg_dgi = pd.concat([primekg_dgi, primekg_gdi], axis=0)
# primekg_dgi.tail()

In [157]:
# Label every edge as 'DRUG_GENE' and arrange the columns as (head, relation, tail).
primekg_dgi = primekg_dgi.assign(relation='DRUG_GENE')[['head', 'relation', 'tail']]
primekg_dgi.head()

Unnamed: 0,head,relation,tail
321075,DB09130,DRUG_GENE,F8
321076,DB09130,DRUG_GENE,F5
321077,DB09140,DRUG_GENE,HBA2
321078,DB00180,DRUG_GENE,SERPINA6
321079,DB00240,DRUG_GENE,SERPINA6


In [158]:
# Append the drug-gene edges to biokg and renumber the rows 0..n-1 in the same step.
# Note: running this cell twice appends the same rows twice.
biokg = pd.concat([biokg, primekg_dgi], axis=0, ignore_index=True)

In [159]:
# Size of the merged graph after adding the drug-gene edges.
biokg.shape

(5457351, 3)

### pathway_protein: pathway_gene

In [160]:
# Keep only the pathway_protein rows whose x side is the pathway;
# head is the pathway ID, tail is the y-side name.
primekg_pagi = (
    primekg.loc[
        (primekg['relation'] == 'pathway_protein') & (primekg['x_type'] == 'pathway'),
        ['x_id', 'y_name'],
    ]
    .rename(columns={'x_id': 'head', 'y_name': 'tail'})
)
primekg_pagi.head()

Unnamed: 0,head,tail
6505730,R-HSA-114608,A1BG
6505731,R-HSA-6798695,A1BG
6505732,R-HSA-156582,NAT2
6505733,R-HSA-74217,ADA
6505734,R-HSA-381426,CDH2


In [161]:
# Label every edge as 'PATHWAY_GENE' and arrange the columns as (head, relation, tail).
primekg_pagi = primekg_pagi.assign(relation='PATHWAY_GENE')[['head', 'relation', 'tail']]
primekg_pagi.head()

Unnamed: 0,head,relation,tail
6505730,R-HSA-114608,PATHWAY_GENE,A1BG
6505731,R-HSA-6798695,PATHWAY_GENE,A1BG
6505732,R-HSA-156582,PATHWAY_GENE,NAT2
6505733,R-HSA-74217,PATHWAY_GENE,ADA
6505734,R-HSA-381426,PATHWAY_GENE,CDH2


In [162]:
# Append the pathway-gene edges to biokg and renumber the rows 0..n-1 in the same step.
# Note: running this cell twice appends the same rows twice.
biokg = pd.concat([biokg, primekg_pagi], axis=0, ignore_index=True)
biokg.shape

(5499997, 3)

### disease_protein: disease_gene

In [163]:
# Select disease_protein rows whose x side is a disease with source 'MONDO',
# keeping the disease ID and the y-side name.
# Only x_id is cast (to int64, matching mesh_mondo['mondo_id'] for the join that follows);
# y_name is left untouched. The cast uses astype() on the selection, which returns a new
# frame; assigning a column into the result of chained indexing on primekg would instead
# raise SettingWithCopyWarning.
primekg_digi = (
    primekg.loc[
        (primekg['relation'] == 'disease_protein')
        & (primekg['x_source'] == 'MONDO')
        & (primekg['x_type'] == 'disease'),
        ['x_id', 'y_name'],
    ]
    .astype({'x_id': 'int64'})
)
primekg_digi.head()

Unnamed: 0,x_id,y_name
5950600,11401,A2M
5950602,5942,A2M
5950603,8243,A2M
5950605,11401,ACHE
5950607,5942,ACHE


In [164]:
# Translate the disease side from MONDO IDs to MeSH IDs using mesh_mondo.
# The left join keeps rows that have no MeSH match; their x_id ends up as NaN.
primekg_digi = (
    primekg_digi
    .merge(mesh_mondo, left_on='x_id', right_on='mondo_id', how='left')
    .drop(columns=['x_id', 'mondo_id'])   # discard the MONDO-side keys
    .rename(columns={'mesh_id': 'x_id'})  # the MeSH ID becomes the new x_id
)
primekg_digi.head()

Unnamed: 0,y_name,x_id
0,A2M,C536599
1,A2M,D012202
2,A2M,D020774
3,ACHE,C536599
4,ACHE,D012202


In [165]:
# Row/column count after the left join, before dropping rows with missing values.
primekg_digi.shape

(64804, 2)

In [166]:
# Discard every row that has a missing value in any column
# (this includes diseases that found no MeSH match in the join).
primekg_digi = primekg_digi.dropna(axis=0, how='any')
primekg_digi.shape

(31238, 2)

In [167]:
# Turn the frame into (head, relation, tail) triples in one pass:
# rename the endpoints, label every edge 'DISEASE_GENE', then fix the column order.
primekg_digi = (
    primekg_digi
    .rename(columns={'x_id': 'head', 'y_name': 'tail'})
    .assign(relation='DISEASE_GENE')
    .loc[:, ['head', 'relation', 'tail']]
)
primekg_digi.head()

Unnamed: 0,head,relation,tail
0,C536599,DISEASE_GENE,A2M
1,D012202,DISEASE_GENE,A2M
2,D020774,DISEASE_GENE,A2M
3,C536599,DISEASE_GENE,ACHE
4,D012202,DISEASE_GENE,ACHE


In [168]:
# Append the disease-gene edges to biokg and renumber the rows 0..n-1 in the same step.
# Note: running this cell twice appends the same rows twice.
biokg = pd.concat([biokg, primekg_digi], axis=0, ignore_index=True)
biokg.shape

(5531235, 3)

In [169]:
# Write the merged graph to disk. With to_csv defaults the row index is saved as the
# first (unnamed) column.
# NOTE(review): duplicates were dropped only once, right after the DTI rows were added;
# the edge sets appended afterwards are not de-duplicated before saving - confirm this is intended.
biokg.to_csv('./data/biokg_primekg.csv')