In [202]:
import pandas as pd
import csv
from tqdm import tqdm
import os

In [203]:
data_dir = '../dd_data/17July2024DD_csvs/'

## Prepare Code DF

In [3]:
codes = pd.read_csv(data_dir + "CODEs.csv", index_col=0)

  codes = pd.read_csv(data_dir + "CODEs.csv", index_col=0)


In [4]:
concept_code = pd.read_csv(data_dir + "CUI-CODEs.csv")
concept_code.columns = ["id", "CodeID:ID"]

In [5]:
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')


In [6]:
concept_code.columns = ["id", "code_id", "SAB", "CODE", "value:float","lowerbound:float","upperbound:float","unit"]

In [7]:
concept_code.head()

Unnamed: 0,id,code_id,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000039,LNC:LP15542-1,LNC,LP15542-1,,,,
1,C0000107,MSH:D015059,MSH,D015059,,,,
2,C0000119,MSH:D015062,MSH,D015062,,,,
3,C0000163,CSP:0059-6844,CSP,0059-6844,,,,
4,C0000248,LNC:MTHU027462,LNC,MTHU027462,,,,


## Prepare Label DF

In [8]:
labels = pd.read_csv(data_dir + "CUI-SUIs.csv")
labels.columns = ['id', 'label']
labels = labels.set_index('id')
labels.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000005,(131)I-Macroaggregated Albumin
C0000139,"16,16-Dimethylprostaglandin E2"
C0000163,17-Hydroxycorticosteroids
C0000165,17-Hydroxysteroid Dehydrogenases
C0000190,2'-CMP


In [9]:
code_labels = pd.read_csv(data_dir + "CODE-SUIs.csv")
code_labels.columns = ["label", "id", "type", "cui"]
code_labels.head()

Unnamed: 0,label,id,type,cui
0,Dipalmitoyl Phosphatidylcholine,MSH:D015060,ET,C0000039
1,Cyperquat,MSH:D015655,ET,C0000098
2,1-Sar-8-Ile Angiotensin II,MSH:D015059,ET,C0000107
3,SAR ILE ANGIOTENSIN 02 01 08,MSH:D015059,DSV,C0000107
4,"RNA, 15S",MSH:D012335,PM,C0000137


In [10]:
code_labels = code_labels.groupby('cui').first()

In [11]:
cl = set(code_labels.index) 
l=set(labels.index)
missing = cl - l

In [12]:
labels = pd.concat([labels, code_labels.loc[list(missing), ["label"]]])

In [13]:
labels.loc['C0038351']

label    Stomach
Name: C0038351, dtype: object

In [14]:
sabs = set()
counter = 0
with open(data_dir + 'CUI-CUIs.csv') as o:
	csv_reader = csv.reader(o)
	header = None
	rows = []
	for row in tqdm(csv_reader):
		if counter == 0:
			header = ['source', 'target', 'relation', 'SAB', 'evidence_class', 'dcc']
			counter += 1
		else:
			sabs.add(row[3])

173212741it [02:18, 1250687.08it/s]


In [15]:
with open("out/sabs.txt") as o:
	old_sabs = set(o.read().strip().split("\n"))

In [16]:
sabs - old_sabs

{'BIOMARKER', 'DGN', 'UBKGSOURCE'}

In [17]:
def get_sab_df(sab, dcc=''):
	counter = 0
	with open(data_dir + 'CUI-CUIs.csv') as o:
		filename = 'out/sab/%s.edges.csv'%sab
		if os.path.isfile(filename): 
			print("%s exists!"%filename)
			return pd.read_csv(filename, index_col=0)
		else:
			csv_reader = csv.reader(o)
			header = None
			rows = []
			for row in tqdm(csv_reader):
				if counter == 0:
					header = ['source', 'target', 'relation', 'SAB', 'evidence_class', 'dcc']
				else:
					if sab == row[3]:
						rows.append(row + [dcc])
				counter += 1
			df = pd.DataFrame(rows, columns=header)
			df.dcc = dcc
			df.to_csv(filename)
			return df

def get_nodes(df, sab, relations):
	# Get Node Ids
	node_ids = {}
	for relation, v in relations.items():
		tmp = df[df["relation"] == relation]
		for key, node_type in v.items():
			node_index = tmp[key]
			if node_type not in node_ids:
				node_ids[node_type] = set()
			node_ids[node_type] = node_ids[node_type].union(node_index)
	# Get DF
	nodes = {}
	for node_type, node_index in node_ids.items():
		print(node_type)
		filename = 'out/sab/%s.%s.nodes.csv'%(sab, node_type)
		if (os.path.isfile(filename)):
			print('%s found'%filename)
			node_df = pd.read_csv(filename, index_col=0)
			nodes[node_type] = node_df
		else:
			node_index = list(node_index)
			node_df = pd.DataFrame(index=node_index, columns=["label", "type"])
			node_df.index.name = "id"
			node_df["type"] = node_type
			node_df["label"] = node_df.index
			with_label = list(set(labels.index).intersection(node_index))
			node_df.loc[with_label, 'label'] = labels.loc[with_label].reset_index().groupby("index").first().loc[with_label, "label"]
			if node_type == "Gene":
				node_df.loc[with_label, 'label'] = [i.replace(" gene", "") for i in node_df.loc[with_label, 'label']]
			# CODE
			filtered = concept_code[concept_code["id"].isin(node_df.index)]
			filtered = filtered[filtered.CODE != 'NOCODE']
			# filter keys
			code_keys = filtered.SAB.value_counts()[filtered.SAB.value_counts() > node_df.shape[0]/2].index
			grouped_concept = concept_code[concept_code["id"].isin(node_df.index)].groupby("id")
			score_df = pd.DataFrame(index=filtered.id.unique(), columns=code_keys)
			score_df.index.name = 'id'
			grouped_concept = filtered.groupby("id")
			for group in score_df.index:
				g = grouped_concept.get_group(group)
				g = g[g.SAB.isin(code_keys)]
				cols = g.SAB
				values = g.CODE
				score_df.loc[group]=pd.Series(list(values), index=cols).groupby('SAB').first()
			node_df = node_df.merge(score_df, on="id")
			node_df.to_csv(filename)
			nodes[node_type] = node_df
	return nodes

# LINCS

In [18]:
sab = 'LINCS'
dcc = 'LINCS'
df = get_sab_df(sab, dcc)

out/sab/LINCS.edges.csv exists!


In [19]:
df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
225509,PUBCHEM:10109823 CUI,PUBCHEM:11338033 CUI,in_similarity_relationship_with,LINCS,0.55667,LINCS
225510,PUBCHEM:2062 CUI,PUBCHEM:1379247 CUI,in_similarity_relationship_with,LINCS,0.254851,LINCS
225511,PUBCHEM:5517 CUI,PUBCHEM:5281912 CUI,in_similarity_relationship_with,LINCS,0.251117,LINCS
225512,PUBCHEM:4658599 CUI,PUBCHEM:5202 CUI,in_similarity_relationship_with,LINCS,0.251333,LINCS
225513,PUBCHEM:44623840 CUI,PUBCHEM:44602407 CUI,in_similarity_relationship_with,LINCS,0.307563,LINCS


In [20]:
df.relation.unique()

array(['in_similarity_relationship_with', 'positively_regulates',
       'negatively_regulates'], dtype=object)

In [21]:
relations = {
	"negatively_regulates": {
		"target": "Gene",
		"source": "Compound",
	},
	"positively_regulates": {
		"target": "Gene",
		"source": "Compound",
	},
	"in_similarity_relationship_with": {
		"source": "Compound",
		"target": "Compound",
	}
}

In [22]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [23]:
nodes = get_nodes(df, sab, relations)

Gene
out/sab/LINCS.Gene.nodes.csv found
Compound
out/sab/LINCS.Compound.nodes.csv found


In [24]:
nodes["Gene"].head()

Unnamed: 0_level_0,label,type,ENSEMBL,HGNC,ENTREZ,OMIM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1412629,ATP1A2,Gene,ENSG00000018625,800.0,477.0,182340.0
C1413228,CD53,Gene,ENSG00000143119,1686.0,963.0,151525.0
C1335648,RRM1,Gene,ENSG00000167325,10451.0,6240.0,180410.0
C1419846,SCD,Gene,ENSG00000099194,10571.0,6319.0,604031.0
C1414375,ELF1,Gene,ENSG00000120690,3316.0,1997.0,189973.0


## 4DN

In [25]:
sab = '4DN'
dcc = '4DN'
df = get_sab_df(sab, dcc)

173212741it [02:52, 1004212.33it/s]


In [26]:
df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,4DND:4DNESWST3UBH CUI,EFO:0008808 CUI,has_assay_type,4DN,,4DN
1,4DND:4DNES21D8SP8 CUI,EFO:0008808 CUI,has_assay_type,4DN,,4DN
2,4DND:4DNESW1SPPTD CUI,EFO:0008808 CUI,has_assay_type,4DN,,4DN
3,4DND:4DNESN49VY8X CUI,EFO:0009974 CUI,has_assay_type,4DN,,4DN
4,4DND:4DNES2R6PUEK CUI,EFO:0009974 CUI,has_assay_type,4DN,,4DN


In [27]:
df.relation.unique()

array(['has_assay_type', 'dataset_involves_cell_type', 'dataset_has_file',
       'file_has_loop', 'loop_us_start', 'loop_us_end', 'loop_ds_start',
       'loop_ds_end', 'loop_has_qvalue_bin', 'inverse_has_assay_type',
       'inverse_dataset_involves_cell_type', 'inverse_dataset_has_file',
       'inverse_file_has_loop', 'inverse_loop_us_start',
       'inverse_loop_us_end', 'inverse_loop_ds_start',
       'inverse_loop_ds_end', 'inverse_loop_has_qvalue_bin'], dtype=object)

In [28]:
relations = {
	"has_assay_type": {
		"source": "4DN Dataset",
		"target": "Assay",
	},
	"dataset_involves_cell_type": {
		"source": "4DN Dataset",
		"target": "Anatomy", # Uberon, EFO
	},
	"dataset_has_file": {
		"source": "4DN Dataset",
		"target": "4DN File",
	},
	"file_has_loop": {
		"source": "4DN File",
		"target": "4DN Loop",
	},
	"loop_has_qvalue_bin": {
		"source": "4DN Loop",
		"target": "4DN QVal Bin",
	},
	"loop_us_start": {
		"source": "4DN Loop",
		"target": "HSCLO",
	},
	"loop_us_end": {
		"source": "4DN Loop",
		"target": "HSCLO",
	},
	"loop_ds_start": {
		"source": "4DN Loop",
		"target": "HSCLO",
	},
	"loop_ds_end": {
		"source": "4DN Loop",
		"target": "HSCLO",
	}
}

In [29]:
set(df.relation) - set(relations.keys())

{'inverse_dataset_has_file',
 'inverse_dataset_involves_cell_type',
 'inverse_file_has_loop',
 'inverse_has_assay_type',
 'inverse_loop_ds_end',
 'inverse_loop_ds_start',
 'inverse_loop_has_qvalue_bin',
 'inverse_loop_us_end',
 'inverse_loop_us_start'}

In [30]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [31]:
nodes = get_nodes(df, sab, relations)

4DN Dataset
Assay
Anatomy
4DN File
4DN Loop
4DN QVal Bin
HSCLO


In [32]:
nodes['4DN Dataset'].head()

Unnamed: 0_level_0,label,type,4DND
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4DND:4DNES1JP4KZ1 CUI,in situ Hi-C on HCT116 cells (containing AID-t...,4DN Dataset,4DNES1JP4KZ1
4DND:4DNESW1SPPTD CUI,Micro-C on H1 cells differentiated to definiti...,4DN Dataset,4DNESW1SPPTD
4DND:4DNES21D8SP8 CUI,Micro-C on H1-ESC cells.H1-ESC,4DN Dataset,4DNES21D8SP8
4DND:4DNESN49VY8X CUI,in situ Hi-C on HFFc6 cells.HFFc6,4DN Dataset,4DNESN49VY8X
4DND:4DNES3JX38V5 CUI,in situ Hi-C on GM12878 with MboI and bio-dATP...,4DN Dataset,4DNES3JX38V5


## ERCC

In [33]:
sab = "ERCCRBP"
dcc = "ERCC"
df = get_sab_df(sab, dcc)
df.head()

out/sab/ERCCRBP.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,UNIPROTKB:P49588 CUI,ENCODE.RBS.150.NO.OVERLAP:chr5.134924513.13492...,molecularly_interacts_with,ERCCRBP,,ERCC
1,UNIPROTKB:Q9NY61 CUI,ENCODE.RBS.150.NO.OVERLAP:chr1.1341093.1341096...,molecularly_interacts_with,ERCCRBP,,ERCC
2,UNIPROTKB:Q9NY61 CUI,ENCODE.RBS.150.NO.OVERLAP:chr1.1341218.1341221...,molecularly_interacts_with,ERCCRBP,,ERCC
3,UNIPROTKB:Q9NY61 CUI,ENCODE.RBS.150.NO.OVERLAP:chr1.28508550.285085...,molecularly_interacts_with,ERCCRBP,,ERCC
4,UNIPROTKB:Q9NY61 CUI,ENCODE.RBS.150.NO.OVERLAP:chr1.30935491.309355...,molecularly_interacts_with,ERCCRBP,,ERCC


In [34]:
df.relation.unique()

array(['molecularly_interacts_with', 'predicted_in', 'not_predicted_in',
       'correlated_in', 'not_correlated_in', 'overlaps',
       'inverse_molecularly_interacts_with', 'inverse_predicted_in',
       'inverse_not_predicted_in', 'inverse_correlated_in',
       'inverse_not_correlated_in', 'inverse_overlaps'], dtype=object)

In [35]:
# rels = ['molecularly_interacts_with', 'is_subsequence_of', 'predicted_in',
#        'not_predicted_in', 'correlated_in', 'not_correlated_in',
#        'overlaps']
# df[df.relation.isin(rels)].to_csv("out/sab/%s.edges.csv"%sab)

In [36]:
set([i.split(":")[0] for i in df[df.relation == "overlaps"].source])

{'ENCODE.RBS.150.NO.OVERLAP'}

In [37]:
set([i.split(":")[0] for i in df.source if not i.startswith("C")])

{'ENCODE.RBS.150.NO.OVERLAP', 'HGNC', 'UNIPROTKB'}

In [38]:
relation = 'overlaps'
change = [i for i in df[df.relation == relation].source if 'RBS.150' in i]
ind = df[(df.source.isin(change)) & (df.relation == relation)].index
df.loc[ind, 'relation'].unique()
# df.loc[ind, 'relation'] = 'overlaps_exrna'

array(['overlaps'], dtype=object)

In [39]:
set([i.split(":")[0] for i in df[df.relation == 'correlated_in'].source])

{'ENCODE.RBS.150.NO.OVERLAP'}

In [40]:
set([i.split(":")[0] for i in df[df.relation == 'correlated_in'].target])

{'C0007806', 'C0032105', 'C0036087', 'C0042036', 'C0229671'}

In [41]:
df.relation.unique()

array(['molecularly_interacts_with', 'predicted_in', 'not_predicted_in',
       'correlated_in', 'not_correlated_in', 'overlaps',
       'inverse_molecularly_interacts_with', 'inverse_predicted_in',
       'inverse_not_predicted_in', 'inverse_correlated_in',
       'inverse_not_correlated_in', 'inverse_overlaps'], dtype=object)

In [42]:
relations = {
	"overlaps": {
		"source": "exRNA Loci",
		"target": "Gene",
	},
	"molecularly_interacts_with": {
		"source": "Protein",
		"target": "exRNA Loci",
	},
	"predicted_in": {
		"source": "Protein",
		"target": "Biofluid"
	},
	"not_predicted_in": {
		"source": "Protein",
		"target": "Biofluid"
	},
	"correlated_in": {
		"source": "exRNA Loci",
		"target": "Biofluid"
	},
	"not_correlated_in": {
		"source": "exRNA Loci",
		"target": "Biofluid"
	},	
}

In [43]:
set(relations.keys()) - set(df.relation)

set()

In [44]:
# relations = {
# 	"overlaps": {
# 		"source": "RBP Binding Loci",
# 		"target": "Gene",
# 	},
# 	"overlaps_exrna": {
# 		"source": "exRNA Loci",
# 		"target": "Gene", # Uberon, EFO
# 	},
# 	"molecularly_interacts_with": {
# 		"source": "Protein",
# 		"target": "RBP Binding Loci",
# 	},
# 	"is_subsequence": {
# 		"source": "exRNA Loci",
# 		"target": "RBP Binding Loci",
# 	},
# 	"is_subsequence_of": {
# 		"source": "exRNA Loci",
# 		"target": "RBP Binding Loci",
# 	},
# 	"correlated_in": {
# 		"source": "exRNA Loci",
# 		"target": "Biofluid"
# 	},
# 	"not_correlated_in": {
# 		"source": "exRNA Loci",
# 		"target": "Biofluid"
# 	},
# 	"predicted_in": {
# 		"source": "Protein",
# 		"target": "Biofluid"
# 	},
# 	"not_predicted_in": {
# 		"source": "Protein",
# 		"target": "Biofluid"
# 	}
# }

In [45]:
df[df.relation.isin(relations.keys())].to_csv("out/sab/%s.edges.csv"%sab)

In [46]:
nodes = get_nodes(df, sab, relations)

exRNA Loci
Gene
Protein
Biofluid


In [18]:
sab = 'ERCCREG'
dcc = "ERCC"
df = get_sab_df(sab, dcc)
df.head()

173212741it [05:38, 510969.26it/s] 


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C0038351,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000094...,part_of,ERCCREG,,ERCC
1,ENCODE.CCRE:EH38E0064571 CUI,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000094...,part_of,ERCCREG,,ERCC
2,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000094...,C1333687,regulates,ERCCREG,,ERCC
3,C0007461,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000187...,part_of,ERCCREG,,ERCC
4,ENCODE.CCRE:EH38E0064571 CUI,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000187...,part_of,ERCCREG,,ERCC


In [19]:
df.relation.unique()

array(['part_of', 'regulates', 'isa', 'located_in',
       'negatively_regulates', 'positively_regulates', 'has_part',
       'regulated_by', 'inverse_isa', 'location_of',
       'negatively_regulated_by', 'positively_regulated_by'], dtype=object)

In [36]:
df.relation.unique()

array(['part_of', 'regulates', 'isa', 'located_in',
       'negatively_regulates', 'positively_regulates'], dtype=object)

In [20]:
rels = ['part_of', 'regulates', 'isa', 'located_in',
       'negatively_regulates', 'positively_regulates']
df[df.relation.isin(rels)].to_csv("out/sab/%s.edges.csv"%sab)

In [21]:
source_pref = set([i.split(":")[0] for i in df[df.relation == 'part_of'].source if ":" in i])
source_pref

{'CLINGEN.ALLELE.REGISTRY', 'ENCODE.CCRE', 'UBERON'}

In [22]:
target_pref = set([i.split(":")[0] for i in df[df.relation == 'part_of'].target])
target_pref

{'ENCODE.CCRE.ACTIVITY', 'GTEXEQTL'}

In [23]:
relation = "part_of"
s = 'ENCODE.CCRE'
t = 'ENCODE.CCRE.ACTIVITY'
source_ids = [i for i in df[df.relation == relation].source if i.startswith(s)]
target_ids = [i for i in df[df.relation == relation].target if i.startswith(t)]
ind = df[(df.relation == relation) & (df.source.isin(source_ids)) & (df.target.isin(target_ids))].index
df.loc[ind, 'relation'] = 'part_of_ccre'

In [24]:
relation = "part_of"
s = 'CLINGEN.ALLELE.REGISTRY'
t = 'GTEXEQTL'
source_ids = [i for i in df[df.relation == relation].source if i.startswith(s)]
target_ids = [i for i in df[df.relation == relation].target if i.startswith(t)]
ind = df[(df.relation == relation) & (df.source.isin(source_ids)) & (df.target.isin(target_ids))].index
print(len(ind))
# df.loc[ind, 'relation'] = 'part_of_clingen'

265965


In [25]:
df.loc[ind, 'relation'] = 'part_of_clingen'

In [26]:
relation = "part_of"
t = 'GTEXEQTL'
target_ids = [i for i in df[df.relation == relation].target if i.startswith(t)]
ind = df[(df.relation == relation) & (df.target.isin(target_ids))].index
print(len(ind))
# df.loc[ind, 'relation'] = 'part_of_clingen'

265965


In [27]:
df.loc[ind, 'relation'] = 'part_of_uberon'

In [28]:
df.relation.unique()

array(['part_of', 'part_of_ccre', 'regulates', 'isa', 'located_in',
       'part_of_clingen', 'part_of_uberon', 'negatively_regulates',
       'positively_regulates', 'has_part', 'regulated_by', 'inverse_isa',
       'location_of', 'negatively_regulated_by',
       'positively_regulated_by'], dtype=object)

In [29]:
df.relation.unique()

array(['part_of', 'part_of_ccre', 'regulates', 'isa', 'located_in',
       'part_of_clingen', 'part_of_uberon', 'negatively_regulates',
       'positively_regulates', 'has_part', 'regulated_by', 'inverse_isa',
       'location_of', 'negatively_regulated_by',
       'positively_regulated_by'], dtype=object)

In [None]:
'located_in',
'negatively_regulates',
'positively_regulates'

In [30]:
relations = {
	"isa": {
		"source": "Regulatory Element Activity",
		"target": "ENCODE CCRE Data Matrix",
	},
	"part_of": {
		"source": "Anatomy",
		"target": "Regulatory Element Activity", # Uberon, EFO
	},
	"part_of_ccre": {
		"source": "ENCODE CCRE",
		"target": "Regulatory Element Activity",
	},
	"part_of_clingen": {
		"source": "CLINGEN ALLELE REGISTRY",
		"target": "GTEXEQTL",
	},
	"part_of_uberon": {
		"source": "Anatomy",
		"target": "GTEXEQTL"
	},
	"regulates": {
		"source": "Regulatory Element Activity",
		"target": "Gene"
	},
	"negatively_regulates": {
		"source": "GTEXEQTL",
		"target": "Gene"
	},
	"positively_regulates": {
		"source": "GTEXEQTL",
		"target": "Gene"
	},
	"located_in": {
		"source": "CLINGEN ALLELE REGISTRY",
		"target": "ENCODE CCRE"
	}
}

In [31]:
nodes = get_nodes(df, sab, relations)

Regulatory Element Activity
ENCODE CCRE Data Matrix
Anatomy
ENCODE CCRE
CLINGEN ALLELE REGISTRY
GTEXEQTL
Gene


In [32]:
nodes["Gene"].head()

Unnamed: 0_level_0,label,type,ENSEMBL,HGNC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSEMBL:ENSG00000270020 CUI,ENSG00000270020,Gene,ENSG00000270020,
C1822930,SNORD32A,Gene,ENSG00000201675,10159.0
ENSEMBL:ENSG00000207420 CUI,Y_RNA,Gene,ENSG00000207420,
C1421369,UQCRBP1,Gene,ENSG00000237748,12583.0
C2239655,LRIT3,Gene,ENSG00000183423,24783.0


## Glygen

In [47]:
sab = 'PROTEOFORM'
dcc = "GlyGen"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:21, 1228168.15it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,UNIPROTKB:O14490 CUI,UNIPROTKB.ISOFORM:O14490-1 CUI,has_isoform,PROTEOFORM,,GlyGen
1,UNIPROTKB:E9Q7T7 CUI,UNIPROTKB.ISOFORM:E9Q7T7-1 CUI,has_isoform,PROTEOFORM,,GlyGen
2,UNIPROTKB:O14513 CUI,UNIPROTKB.ISOFORM:O14513-1 CUI,has_isoform,PROTEOFORM,,GlyGen
3,UNIPROTKB:O15355 CUI,UNIPROTKB.ISOFORM:O15355-1 CUI,has_isoform,PROTEOFORM,,GlyGen
4,UNIPROTKB:O15488 CUI,UNIPROTKB.ISOFORM:O15488-1 CUI,has_isoform,PROTEOFORM,,GlyGen


In [48]:
df.relation.unique()

array(['has_isoform', 'has_evidence', 'sequence', 'citation',
       'has_pro_entry', 'glycosylated_at', 'location', 'has_saccharide',
       'has_amino_acid', 'inverse_has_isoform', 'is_evidence_for',
       'inverse_sequence', 'inverse_citation', 'inverse_has_pro_entry',
       'inverse_glycosylated_at', 'inverse_location',
       'inverse_has_saccharide', 'inverse_has_amino_acid'], dtype=object)

In [221]:
df.relation.unique()

array(['has_isoform', 'has_evidence', 'sequence', 'citation',
       'has_pro_entry', 'glycosylated_at', 'location', 'has_saccharide',
       'has_amino_acid', 'inverse_has_isoform', 'is_evidence_for',
       'inverse_sequence', 'inverse_citation', 'inverse_has_pro_entry',
       'inverse_glycosylated_at', 'inverse_location',
       'inverse_has_saccharide', 'inverse_has_amino_acid'], dtype=object)

In [49]:
relations = {
	"has_isoform": {
		"source": "Protein",
		"target": "Isoform",
	},
	"has_evidence": {
		"source": "Glycoprotein",
		"target": "Glycoprotein Evidence", # Uberon, EFO
	},
	"sequence": {
		"source": "Glycoprotein",
		"target": "Isoform",
	},
	"citation": {
		"source": "Glycoprotein Evidence",
		"target": "Glycoprotein Citation",
	},
	"has_pro_entry": {
		"source": "Glycoprotein",
		"target": "GP ID2PRO"
	},
	"glycosylated_at": {
		"source": "Glycoprotein",
		"target": "Glycosylation Site"
	},
	"location": {
		"source": "Glycosylation Site",
		"target": "GlyGen Location"
	},
	"has_saccharide": {
		"source": "Glycosylation Site",
		"target": "Glytoucan"
	},
	"has_amino_acid": {
		"source": "GlyGen Location",
		"target": "Amino Acid"
	}
}

In [50]:
df[df.relation.isin(relations.keys())].to_csv('out/sab/PROTEOFORM.edges.csv')

In [51]:
nodes = get_nodes(df, sab, relations)

Protein
Isoform
Glycoprotein
Glycoprotein Evidence
Glycoprotein Citation
GP ID2PRO
Glycosylation Site
GlyGen Location
Glytoucan
Amino Acid


In [52]:
sab = 'GLYCANS'
dcc = "GlyGen"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:25, 1190978.68it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,GLYGEN.GLYCOSYLATION:RXN00000038 CUI,UNIPROTKB:P38649 CUI,has_enzyme_protein,GLYCANS,,GlyGen
1,GLYCOSYLTRANSFERASE.REACTION:RXN00000011 CUI,UNIPROTKB:P38649 CUI,has_enzyme_protein,GLYCANS,,GlyGen
2,GLYTOUCAN:G85555HC CUI,GLYGEN.SRC:G85555HC-SRC00000461 CUI,is_from_source,GLYCANS,,GlyGen
3,GLYTOUCAN:G89102AG CUI,GLYGEN.SRC:G89102AG-SRC00000121 CUI,is_from_source,GLYCANS,,GlyGen
4,GLYTOUCAN:G94192DA CUI,GLYGEN.GLYCOSEQUENCE:G94192DA-GLYCOSEQ00002593...,has_glycosequence,GLYCANS,,GlyGen


In [53]:
df.relation.unique()

array(['has_enzyme_protein', 'is_from_source', 'has_glycosequence',
       'attached_by', 'synthesized_by', 'has_motif',
       'has_canonical_residue', 'has_parent',
       'inverse_has_enzyme_protein', 'inverse_is_from_source',
       'inverse_has_glycosequence', 'inverse_attached_by',
       'inverse_synthesized_by', 'inverse_has_motif',
       'inverse_has_canonical_residue', 'inverse_has_parent'],
      dtype=object)

In [229]:
df.relation.unique()

array(['has_enzyme_protein', 'is_from_source', 'has_glycosequence',
       'attached_by', 'synthesized_by', 'has_motif',
       'has_canonical_residue', 'has_parent',
       'inverse_has_enzyme_protein', 'inverse_is_from_source',
       'inverse_has_glycosequence', 'inverse_attached_by',
       'inverse_synthesized_by', 'inverse_has_motif',
       'inverse_has_canonical_residue', 'inverse_has_parent'],
      dtype=object)

In [54]:
set([i.split(":")[0] for i in df[df.relation == "has_enzyme_protein"].source])

{'GLYCOSYLTRANSFERASE.REACTION', 'GLYGEN.GLYCOSYLATION'}

In [56]:
relation = "has_enzyme_protein"
s = 'GLYCOSYLTRANSFERASE.REACTION'
source_ids = [i for i in df[df.relation == relation].source if i.startswith(s)]
ind = df[(df.relation == relation) & (df.source.isin(source_ids))].index
print(len(ind))
df.loc[ind, 'relation'] = 'has_enzyme_protein_gr'

91


In [57]:
relations = {
	"has_enzyme_protein": {
		"source": "Glycosylation",
		"target": "Protein",
	},
	"has_enzyme_protein_gr": {
		"source": "Glycosyltransferase Reaction",
		"target": "Protein",
	},
	"is_from_source": {
		"source": "Glytoucan",
		"target": "GlyGen src", # Uberon, EFO
	},
	"has_glycosequence": {
		"source": "Glytoucan",
		"target": "GlyGen Glycosequence",
	},
	"attached_by": {
		"source": "GlyGen Residue",
		"target": "Glycosylation",
	},
	"synthesized_by": {
		"source": "Glytoucan",
		"target": "Glycosyltransferase Reaction"
	},
	"has_motif": {
		"source": "Glytoucan",
		"target": "Glycan Motif"
	},
	"has_canonical_residue": {
		"source": "Glytoucan",
		"target": "GlyGen Residue"
	},
	"has_parent": {
		"source": "GlyGen Residue",
		"target": "GlyGen Residue"
	}
}

In [58]:
set(df.relation) - set(relations.keys())

{'inverse_attached_by',
 'inverse_has_canonical_residue',
 'inverse_has_enzyme_protein',
 'inverse_has_glycosequence',
 'inverse_has_motif',
 'inverse_has_parent',
 'inverse_is_from_source',
 'inverse_synthesized_by'}

In [59]:
df[df.relation.isin(relations.keys())].to_csv('out/sab/GLYCANS.edges.csv')

In [60]:
nodes = get_nodes(df, sab, relations)

Glycosylation
Protein
Glycosyltransferase Reaction
Glytoucan
GlyGen src
GlyGen Glycosequence
GlyGen Residue
Glycan Motif


## GTEx

In [61]:
sab = 'GTEXEXP'
dcc = "GTEx"
df = get_sab_df(sab, dcc)
df.head()

173212741it [04:10, 691604.61it/s] 


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,GTEXEXP:ENSG00000223972-5-Testis CUI,EXPBINS:0.1.0.2 CUI,has_expression,GTEXEXP,,GTEx
1,GTEXEXP:ENSG00000227232-5-Adipose-Subcutaneous...,EXPBINS:4.0.5.0 CUI,has_expression,GTEXEXP,,GTEx
2,GTEXEXP:ENSG00000227232-5-Adrenal-Gland CUI,EXPBINS:2.0.3.0 CUI,has_expression,GTEXEXP,,GTEx
3,GTEXEXP:ENSG00000227232-5-Artery-Aorta CUI,EXPBINS:4.0.5.0 CUI,has_expression,GTEXEXP,,GTEx
4,GTEXEXP:ENSG00000227232-5-Artery-Coronary CUI,EXPBINS:3.0.4.0 CUI,has_expression,GTEXEXP,,GTEx


In [62]:
len(df[df.relation == 'has_expression'].source), len(df[df.relation == 'has_expression'].source.unique())

(1573380, 1573380)

In [63]:
relation = "expressed_in"
targets = df[df.relation == relation].target.unique()
# s = 'GLYCOSYLTRANSFERASE.REACTION'
# source_ids = [i for i in df[df.relation == relation].source if i.startswith(s)]
# ind = df[(df.relation == relation) & (df.source.isin(source_ids))].index
# print(len(ind))
# df.loc[ind, 'relation'] = 'has_enzyme_protein_gr'

In [64]:
tmp = concept_code[concept_code.SAB.isin(['HGNC', 'ENSEMBLE', 'UBERON', 'EFO'])]

In [65]:
tmp = tmp.groupby("id").first()
tmp.head()

Unnamed: 0_level_0,code_id,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C0000696,UBERON:0006135,UBERON,6135,,,,
C0000726,UBERON:0000916,UBERON,916,,,,
C0000739,UBERON:0002378,UBERON,2378,,,,
C0000741,UBERON:0001646,UBERON,1646,,,,
C0000773,EFO:0009565,EFO,9565,,,,


In [66]:
anatomy_index = set(targets).intersection(tmp[tmp.SAB.isin(['UBERON', 'EFO'])].index)
gene_index = set(targets).intersection(tmp[tmp.SAB.isin(['HGNC', 'ENSEMBLE'])].index)
len(targets), len(anatomy_index), len(gene_index)

(34623, 44, 34579)

In [67]:
s = 'expressed_in'
ind = df[(df.relation == relation) & (df.target.isin(gene_index))].index
print(len(ind))

1573380


In [68]:
df.loc[ind, 'relation'] = 'expressed_in_gene'

In [69]:
s = 'expressed_in'
ind = df[(df.relation == relation) & (df.target.isin(anatomy_index))].index
print(len(ind))

1573380


In [70]:
df.loc[ind, 'relation'] = 'expressed_in_anatomy'

In [71]:
relations = {
	"expressed_in_gene": {
		"source": "GTEXEXP",
		"target": "Gene",
	},
	"expressed_in_anatomy": {
		"source": "GTEXEXP",
		"target": "Anatomy",
	},
	"has_expression": {
		"source": "GTEXEXP",
		"target": "EXPBINS",
	},
}

In [72]:
df.relation.unique()

array(['has_expression', 'expressed_in_gene', 'expressed_in_anatomy',
       'inverse_has_expression', 'expresses'], dtype=object)

In [73]:
nodes = get_nodes(df, sab, relations)

GTEXEXP
Gene
Anatomy
EXPBINS


In [74]:
gtexexp = df[df.relation == "has_expression"].source.unique()

In [75]:
len(gtexexp)

1573380

In [76]:
bins_dict = {}
for i, row in df[df.relation == "has_expression"].iterrows():
	source = row["source"]
	target = row["target"]
	tmp = target.replace(" CUI", "").split(":")[1]
	score = ".".join(tmp.split(".")[2:])
	bins_dict[source] = float(score)

In [77]:
anatomy_dict = {}
for i, row in df[df.relation == "expressed_in_anatomy"].iterrows():
	source = row["source"]
	target = row["target"]
	anatomy_dict[source] = target

gene_dict = {}
for i, row in df[df.relation == "expressed_in_gene"].iterrows():
	source = row["source"]
	target = row["target"]
	gene_dict[source] = target
len(anatomy_dict), len(gene_dict), len(bins_dict)

(1573380, 1573380, 1573380)

In [78]:
# source	target	relation	SAB	evidence_class	dcc

ind = len(df.index)
rows = {}
for k, score in bins_dict.items():
	anatomy = anatomy_dict[k]
	gene = gene_dict[k]
	row = [gene, anatomy, "expressed_in", "GTEXEXP", score, "GTEx"]
	rows[ind] = row
	ind += 1

In [79]:
concat_df = pd.DataFrame.from_dict(rows, orient="index", columns=df.columns)
concat_df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
9440280,C2239334,C0039597,expressed_in,GTEXEXP,0.2,GTEx
9440281,C2829144,C0222331,expressed_in,GTEXEXP,5.0,GTEx
9440282,C2829144,C0001625,expressed_in,GTEXEXP,3.0,GTEx
9440283,C2829144,C0003956,expressed_in,GTEXEXP,5.0,GTEx
9440284,C2829144,C0205042,expressed_in,GTEXEXP,4.0,GTEx


In [80]:
df = pd.concat([df, concat_df])

In [81]:
ind = df[df.relation == 'expressed_in_gene'].index
df.loc[ind, 'relation'] = 'expressed_in'
ind = df[df.relation == 'expressed_in_anatomy'].index
df.loc[ind, 'relation'] = 'expressed_in'

In [82]:
df = pd.concat([df, concat_df])

In [83]:
df.to_csv('out/sab/GTEXEXP.edges.csv')

In [84]:
df.relation.unique()

array(['has_expression', 'expressed_in', 'inverse_has_expression',
       'expresses'], dtype=object)

In [218]:
df.relation.unique()

array(['has_expression', 'expressed_in', 'inverse_has_expression',
       'expresses'], dtype=object)

In [85]:
df[df.relation.isin(['expressed_in', 'has_expression'])].to_csv('out/sab/GTEXEXP.edges.csv')

In [86]:
sab = 'GTEXEQTL'
dcc = "GTEx"
df = get_sab_df(sab, dcc)
df.head()

173212741it [04:17, 671529.63it/s] 


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,GTEXEQTL:eQTL.chr1.1434243.G.A.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
1,GTEXEQTL:eQTL.chr1.1497758.C.T.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
2,GTEXEQTL:eQTL.chr1.1499000.C.A.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
3,GTEXEQTL:eQTL.chr1.1499128.C.T.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
4,GTEXEQTL:eQTL.chr1.1499639.G.T.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx


In [87]:
df.relation.unique()

array(['located_in', 'p_value', 'location_of', 'inverse_p_value'],
      dtype=object)

In [88]:
df[df.relation == 'located_in'].target.unique()

array(['C1428335', 'C3146465', 'C1823788', ...,
       'HSCLO:chr12.69390001-69391000 CUI',
       'HSCLO:chr12.69391001-69392000 CUI',
       'HSCLO:chr12.69396001-69397000 CUI'], dtype=object)

In [89]:
targets = df[df.relation == 'located_in'].target.unique()

In [90]:
tmp = concept_code[concept_code.SAB.isin(['UBERON', 'EFO', 'HSCLO', 'HGNC'])]
hsclo_index = set(targets).intersection(tmp[tmp.SAB.isin(['HSCLO'])]["id"])
gene_index = set(targets).intersection(tmp[tmp.SAB.isin(['HGNC'])]["id"])
anatomy_index = set(targets).intersection(tmp[tmp.SAB.isin(['UBERON', 'EFO'])]["id"])
len(targets), len(anatomy_index), len(gene_index), len(hsclo_index)

(12814, 48, 301, 12465)

In [91]:
relation = 'located_in'
ind = df[(df.relation == relation) & (df.target.isin(hsclo_index))].index
print(len(ind))

1240810


In [92]:
df.loc[ind, 'relation'] = 'located_in_hsclo'

In [93]:
relation = 'located_in'
ind = df[(df.relation == relation) & (df.target.isin(anatomy_index))].index
print(len(ind))

1240810


In [94]:
df.loc[ind, 'relation'] = 'located_in_anatomy'

In [95]:
relation = 'located_in'
ind = df[(df.relation == relation) & (df.target.isin(gene_index))].index
print(len(ind))

2041208


In [96]:
df.loc[ind, 'relation'] = 'located_in_gene'

In [97]:
relations = {
	"located_in_gene": {
		"source": "GTEXEQTL",
		"target": "Gene",
	},
	"located_in_anatomy": {
		"source": "GTEXEQTL",
		"target": "Anatomy",
	},
	"located_in_hsclo": {
		"source": "GTEXEQTL",
		"target": "HSCLO",
	},
	"p_value": {
		"source": "GTEXEQTL",
		"target": "GTEXPVALUEBIN",
	},
}

In [98]:
nodes = get_nodes(df, sab, relations)

GTEXEQTL
Gene
Anatomy
HSCLO
GTEXPVALUEBIN


In [99]:
df.relation.unique()

array(['located_in_gene', 'located_in_anatomy', 'located_in_hsclo',
       'p_value', 'location_of', 'inverse_p_value'], dtype=object)

In [100]:
df = df[df.relation.isin(relations.keys())]
ind = df[df.relation == 'located_in_gene'].index
df.loc[ind, 'relation'] = 'located_in'
ind = df[df.relation == 'located_in_anatomy'].index
df.loc[ind, 'relation'] = 'located_in'
ind = df[df.relation == 'located_in_hsclo'].index
df.loc[ind, 'relation'] = 'located_in'
df.relation.unique()

array(['located_in', 'p_value'], dtype=object)

In [101]:
df.to_csv('out/sab/GTEXEQTL.edges.csv')

## HubMAP

In [102]:
sab = 'AZ'
dcc = "HuBMAP"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:12, 1306013.67it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,AZ:0000000 CUI,AZ:9000000 CUI,isa,AZ,,HuBMAP
1,AZ:0100000 CUI,AZ:9000000 CUI,isa,AZ,,HuBMAP
2,C0018787,AZ:0100000 CUI,isa,AZ,,HuBMAP
3,C0022646,AZ:0100000 CUI,isa,AZ,,HuBMAP
4,C0024109,AZ:0100000 CUI,isa,AZ,,HuBMAP


In [103]:
df.relation.unique()

array(['isa', 'located_in', 'inverse_isa', 'location_of'], dtype=object)

In [104]:
relations = {
	"isa": {
		"source": "Anatomy",
		"target": "Anatomy",
	},
	"located_in": {
		"source": "Anatomy",
		"target": "Anatomy",
	}
}

In [105]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [106]:
nodes = get_nodes(df, sab, relations)

Anatomy


In [107]:
sab = 'HMAZ'
dcc = "HuBMAP"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:11, 1315917.32it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C0206131,C1539593,has_marker_gene_in_heart,HMAZ,,HuBMAP
1,C0206131,C1412105,has_marker_gene_in_heart,HMAZ,,HuBMAP
2,C0206131,C1424138,has_marker_gene_in_heart,HMAZ,,HuBMAP
3,C0206131,C3470887,has_marker_gene_in_heart,HMAZ,,HuBMAP
4,C0206131,C1367441,has_marker_gene_in_heart,HMAZ,,HuBMAP


In [108]:
df.relation.unique()

array(['has_marker_gene_in_heart', 'has_marker_gene_in_kidney',
       'has_marker_gene_in_liver', 'inverse_has_marker_gene_in_heart',
       'inverse_has_marker_gene_in_kidney',
       'inverse_has_marker_gene_in_liver'], dtype=object)

In [109]:
relations = {
	"has_marker_gene_in_heart": {
		"source": "Anatomy",
		"target": "Gene",
	},
	"has_marker_gene_in_kidney": {
		"source": "Anatomy",
		"target": "Gene",
	},
	"has_marker_gene_in_liver": {
		"source": "Anatomy",
		"target": "Gene",
	}
}

In [110]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [111]:
nodes = get_nodes(df, sab, relations)

Anatomy
Gene


## IDG

In [112]:
sab = 'IDGP'
dcc = "IDG"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:52, 1006565.06it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,PUBCHEM:286 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
1,PUBCHEM:185909 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
2,PUBCHEM:439742 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
3,PUBCHEM:6436272 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
4,PUBCHEM:1312632 CUI,UNIPROTKB:Q969H4 CUI,bioactivity,IDGP,Kd,IDG


In [113]:
df.relation.unique()

array(['bioactivity', 'inverse_bioactivity'], dtype=object)

In [114]:
relations = {
	"bioactivity": {
		"source": "Compound",
		"target": "Protein",
	},
}

In [115]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [116]:
nodes = get_nodes(df, sab, relations)

Compound
Protein


In [117]:
sab = 'IDGD'
dcc = "IDG"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:11, 1322187.94it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,PUBCHEM:54684141 CUI,SNOMEDCT_US:33339001 CUI,indication,IDGD,,IDG
1,PUBCHEM:5311101 CUI,C0155880,indication,IDGD,,IDG
2,PUBCHEM:456201 CUI,C0036508,indication,IDGD,,IDG
3,PUBCHEM:4744 CUI,C0003950,indication,IDGD,,IDG
4,PUBCHEM:6067 CUI,C0030920,indication,IDGD,,IDG


In [118]:
df.relation.unique()

array(['indication', 'inverse_indication'], dtype=object)

In [119]:
relations = {
	"indication": {
		"source": "Compound",
		"target": "Disease or Phenotype",
	},
}

In [120]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [121]:
nodes = get_nodes(df, sab, relations)

Compound
Disease or Phenotype


## Kid's First

In [122]:
sab = 'KF'
dcc = "KF"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:10, 1323484.41it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,KFPT:PT-9X741E8Z CUI,KFCOHORT:SD-0TYVY1TW CUI,belongs_to_cohort,KF,,KF
1,KFPT:PT-0AQN56EH CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF
2,KFPT:PT-1HNTASHD CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF
3,KFPT:PT-2KE662T4 CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF
4,KFPT:PT-2Q0TYD81 CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF


In [123]:
df.relation.unique()

array(['belongs_to_cohort', 'has_phenotype', 'gene_has_variants',
       'inverse_belongs_to_cohort', 'phenotype_of',
       'inverse_gene_has_variants'], dtype=object)

In [124]:
relations = {
	"has_phenotype": {
		"source": "KFPT",
		"target": "Disease or Phenotype",
	},
	"belongs_to_cohort": {
		"source": "KFGENEBIN",
		"target": "KFCOHORT",
	},
	"gene_has_variants": {
		"source": "KFGENEBIN",
		"target": "Gene",
	},
	"belongs_to_cohort": {
		"source": "KFPT",
		"target": "KFCOHORT",
	},
}

In [125]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [126]:
set([i.split(":")[0] for i in df[df.relation == 'belongs_to_cohort'].source.unique()])

{'KFGENEBIN', 'KFPT'}

In [127]:
kfpt = []
genebin = []
for i in df[df.relation == 'belongs_to_cohort'].source.unique():
	if 'KFPT:' in i:
		kfpt.append(i)
	else:
		genebin.append(i)


In [128]:
ind = df[(df.relation == 'belongs_to_cohort') & (df.source.isin(genebin))].index
len(ind)

13375

In [129]:
df.loc[ind, 'relation'] = 'belongs_to_cohort_bin'

In [130]:
relations = {
	"has_phenotype": {
		"source": "KFPT",
		"target": "Disease or Phenotype",
	},
	"belongs_to_cohort_bin": {
		"source": "KFGENEBIN",
		"target": "KFCOHORT",
	},
	"gene_has_variants": {
		"source": "KFGENEBIN",
		"target": "Gene",
	},
	"belongs_to_cohort": {
		"source": "KFPT",
		"target": "KFCOHORT",
	},
}

In [131]:
nodes = get_nodes(df, sab, relations)

KFPT
Disease or Phenotype
KFGENEBIN
KFCOHORT
Gene


## MoTrPAC

In [133]:
sab = 'MOTRPAC'
dcc = "MoTrPAC"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:15, 1274441.50it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,MOTRPAC:ENSRNOG00000000012-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000012 CUI,associated_with,MOTRPAC,,MoTrPAC
1,MOTRPAC:ENSRNOG00000000073-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000073 CUI,associated_with,MOTRPAC,,MoTrPAC
2,MOTRPAC:ENSRNOG00000000130-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000130 CUI,associated_with,MOTRPAC,,MoTrPAC
3,MOTRPAC:ENSRNOG00000000165-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000165 CUI,associated_with,MOTRPAC,,MoTrPAC
4,MOTRPAC:ENSRNOG00000000245-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000245 CUI,associated_with,MOTRPAC,,MoTrPAC


In [134]:
df.relation.unique()

array(['associated_with', 'located_in', 'sex', 'inverse_associated_with',
       'location_of', 'inverse_sex'], dtype=object)

In [135]:
relations = {
	"associated_with": {
		"source": "MOTRPAC",
		"target": "Gene",
	},
	"located_in": {
		"source": "MOTRPAC",
		"target": "Anatomy",
	},
	"sex": {
		"source": "MOTRPAC",
		"target": "Sex",
	}
}

In [136]:
set(df.relation) - set(relations.keys())

{'inverse_associated_with', 'inverse_sex', 'location_of'}

In [137]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [138]:
nodes = get_nodes(df, sab, relations)

MOTRPAC
Gene
Anatomy
Sex


## MW

In [139]:
sab = 'MW'
dcc = "Metabolomics"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:13, 1294193.52it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C0021853,C0030234,produces,MW,,Metabolomics
1,C0021853,C0070210,produces,MW,,Metabolomics
2,C0021853,C0023139,produces,MW,,Metabolomics
3,C0021853,PUBCHEM:3893 CUI,produces,MW,,Metabolomics
4,C0021853,C0027138,produces,MW,,Metabolomics


In [140]:
df.relation.unique()

array(['produces', 'causally_influences', 'correlated_with_condition',
       'produced_by', 'causally_influenced_by',
       'inverse_correlated_with_condition'], dtype=object)

In [141]:
target = df[df.relation == 'produces'].target.unique()

In [142]:
labels.loc[list(set(target).intersection(labels.index))]

Unnamed: 0,label
PUBCHEM:4914 CUI,Procaine
PUBCHEM:8096 CUI,"3,3'-Thiobispropanoic acid"
PUBCHEM:33255 CUI,Cefazolin
C0061298,glaucarubinone
C0061414,glucose-1-phosphate
...,...
PUBCHEM:1833 CUI,2-(5-Methoxy-1H-indol-3-yl)-ethylamine
C0002139,Alloisoleucine
C0007299,carteolol
PUBCHEM:5280882 CUI,PGC2


In [143]:
from glob import glob

In [144]:
relations = {
	"produces": {
		"source": "Anatomy",
		"target": "Metabolite",
	},
	"causally_influences": {
		"source": "Gene",
		"target": "Metabolite",
	},
	"correlated_with_condition": {
		"source": "Metabolite",
		"target": "Disease or Phenotype",
	}
}

In [145]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [146]:
df = df[df.relation.isin(list(relations.keys()))]

In [147]:
nodes = get_nodes(df, sab, relations)

Anatomy
Metabolite
Gene
Disease or Phenotype


In [148]:
nodes["Metabolite"] = nodes["Metabolite"].loc[[i for i in nodes['Metabolite'].index if i != 'C0035298']]

In [149]:
nodes["Metabolite"].loc['PUBCHEM:638015 CUI']

label      all-<i>trans</i>-retinal
type                     Metabolite
PUBCHEM                      638015
Name: PUBCHEM:638015 CUI, dtype: object

In [150]:
nodes["Metabolite"].loc['C1426339', 'label'] = 'COQ4'

In [151]:
nodes['Metabolite'].loc[['C3495801', 'C0033036', 'C0038454']]

Unnamed: 0_level_0,label,type,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C3495801,Granulomatosis with polyangiitis,Metabolite,145459098
C0033036,Atrial Premature Complexes,Metabolite,145459091
C0038454,Cerebrovascular accident,Metabolite,6131


In [152]:
new_index = []
id_mapper = {}
for k,v in nodes['Metabolite'].iterrows():
	if k in ['C3495801', 'C0033036', 'C0038454']:
		index = "PUBMED:%d CUI"%int(v["PUBCHEM"])
		new_index.append(index)
		id_mapper[k] = index
		print(index)
	else:
		new_index.append(k)
len(new_index)

PUBMED:6131 CUI
PUBMED:145459091 CUI
PUBMED:145459098 CUI


9725

In [153]:
nodes['Metabolite'].loc['PUBMED:145459098 CUI', 'label'] = "(3'-5')-Guanylyladenosine"
nodes['Metabolite'].loc['PUBMED:145459091 CUI', 'label'] = "(3'-5')-Adenylylcytidine"
nodes['Metabolite'].loc['PUBMED:6131 CUI', 'label'] = "5'-Cytidylic acid"

In [154]:
id_mapper

{'C0038454': 'PUBMED:6131 CUI',
 'C0033036': 'PUBMED:145459091 CUI',
 'C3495801': 'PUBMED:145459098 CUI'}

In [155]:
for k,v in df[df.source.isin(['C3495801', 'C0033036', 'C0038454'])].iterrows():
	source = id_mapper[v['source']]
	df.at[k, 'source'] = source

In [156]:
for k,v in df[df.target.isin(['C3495801', 'C0033036', 'C0038454'])].iterrows():
	target = id_mapper[v['target']]
	df.at[k, 'target'] = target

In [157]:
nodes["Metabolite"].loc[[i for i in nodes["Metabolite"].index if i not in ['C3495801', 'C0033036', 'C0038454']]].to_csv('out/sab/%s.%s.nodes.csv'%(sab,'Metabolite'))

In [158]:
ind = df[df.target == 'C0035298'].index

df.loc[ind, 'target'] = 'PUBCHEM:638015 CUI'

In [159]:
df.to_csv('out/sab/%s.edges.csv'%sab)

## SPARC

In [160]:
sab = 'NPO'
dcc = "SPARC"
df = get_sab_df(sab, dcc)
df.head()

173212741it [02:11, 1319807.60it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C0228398,C2331300,isa,NPO,,SPARC
1,C0224520,C0224518,isa,NPO,,SPARC
2,C0036700,C0752060,part_of,NPO,,SPARC
3,C0038246,C0446846,isa,NPO,,SPARC
4,NIFSTD:sao925531236 CUI,NIFSTD:sao1593305396 CUI,isa,NPO,,SPARC


In [161]:
df.relation.unique()

array(['isa', 'part_of', 'contributes_to_morphology_of', 'is_part_of',
       'delineates', 'isdelineatedby', 'has_role', 'inverse_isa',
       'has_part', 'inverse_contributes_to_morphology_of',
       'inverse_is_part_of', 'inverse_delineates',
       'inverse_isdelineatedby', 'role_of'], dtype=object)

In [162]:
df.shape

(7974, 6)

In [163]:
ids = [i for i in df.source.unique()] + [i for i in df.target.unique()]
ids = list(set(ids))
len(ids)

1912

In [164]:
concept_code[concept_code.id.isin(ids)].SAB.unique()

array(['LCH_NW', 'MTH', 'NCI', 'LNC', 'CSP', 'UWDA', 'MSH', 'SNOMEDCT_US',
       'PSY', 'FMA', 'CHV', 'ICF-CY', 'ICF', 'NEU', 'OMIM', 'MMSL',
       'HL7V2.5', 'NCBI', 'MEDCIN', 'MEDLINEPLUS', 'DRUGBANK', 'CPM',
       'MTHICD9', 'GO', 'SNOMEDCT_VET', 'GS', 'RXNORM', 'VANDF',
       'ICD10AM', 'MTHSPL', 'PDQ', 'NDDF', 'SRC', 'MDR', 'AOT', 'ATC',
       'USP', 'ICPC2ICD10ENG', 'UBERON', 'CHEBI', 'PATO', 'PR', 'CL',
       'SO', 'OBI', 'CARO', 'MONDO', 'EFO', 'MA', 'ZFA', 'AZ', 'PUBCHEM',
       'ENTREZ', 'NIFSTD', 'ILX', 'ILX.TR', 'NIFSTD.NIFEXT', 'NIFSTD.NLX',
       'NIFSTD.NLX.MOL', 'NIFSTD.NLX.ORG', 'PAX.PAXSPN', 'PAX.PAXRAT'],
      dtype=object)

In [165]:
tmp = concept_code[concept_code.id.isin(ids)].groupby('id')

In [166]:
len(ids)

1912

In [167]:
def is_substring(l, substring):
	for i in l:
		if substring in i:
			return True
	else:
		return False

In [168]:
mapping = {}
for g in tmp.groups:
	group = tmp.get_group(g)
	if 'UBERON' in list(group.SAB):
		gr = "Anatomy"
	elif 'ENTREZ' in list(group.SAB):
		gr = "Gene"
	elif 'PUBCHEM' in list(group.SAB):
		gr = "Compound"
	elif is_substring(list(group.SAB), 'NIFSTD'):
		gr = "NIFSTD"
	elif is_substring(list(group.SAB), 'PAX'):
		gr = "Anatomy"
	elif 'FMA' in list(group.SAB):
		gr = "Anatomy"
	elif 'PATO' in list(group.SAB):
		gr = "PATO"
	elif 'PR' in list(group.SAB):
		gr = "Protein"
	elif 'NCBI' in list(group.SAB):
		gr = "Taxon"
	elif 'CHEBI' in list(group.SAB):
		gr = "Compound"
	elif 'LNC' in list(group.SAB):
		gr = "Anatomy"
	elif 'OBI' in list(group.SAB):
		gr = "Taxon"
	elif 'CL' in list(group.SAB):
		gr = "Anatomy"
	elif 'FMA' in list(group.SAB):
		gr = "Anatomy"
	elif 'ILX' in list(group.SAB):
		gr = "ILX"
	elif 'ILX.TR' in list(group.SAB):
		gr = "ILX"
	elif 'UWDA' in list(group.SAB):
		gr = "Anatomy"
	else:
		gr = list(group.SAB)[0]
	if gr not in mapping:
		mapping[gr] = []
	mapping[gr].append(g)

In [169]:
for k,v in mapping.items():
	print(k, len(v))

Anatomy 1225
Compound 122
Taxon 67
GO 9
Gene 122
ILX 227
NIFSTD 83
PATO 12
Protein 41
SO 4


In [170]:
nodes = {}
for node_type, node_index in mapping.items():
	print(node_type)
	filename = 'out/sab/%s.%s.nodes.csv'%(sab, node_type)
	if (os.path.isfile(filename)):
		print('%s found'%filename)
		node_df = pd.read_csv(filename, index_col=0)
		nodes[node_type] = node_df
	else:
		node_index = list(node_index)
		node_df = pd.DataFrame(index=node_index, columns=["label", "type"])
		node_df.index.name = "id"
		node_df["type"] = node_type
		node_df["label"] = node_df.index
		with_label = list(set(labels.index).intersection(node_index))
		node_df.loc[with_label, 'label'] = labels.loc[with_label, "label"]
		if node_type == "Gene":
			node_df.loc[with_label, 'label'] = [i.replace(" gene", "") for i in node_df.loc[with_label, 'label']]
		# CODE
		filtered = concept_code[concept_code["id"].isin(node_df.index)]
		filtered = filtered[filtered.CODE != 'NOCODE']
		# filter keys
		code_keys = filtered.SAB.value_counts()[filtered.SAB.value_counts() > node_df.shape[0]/2].index
		grouped_concept = concept_code[concept_code["id"].isin(node_df.index)].groupby("id")
		score_df = pd.DataFrame(index=filtered.id.unique(), columns=code_keys)
		score_df.index.name = 'id'
		grouped_concept = filtered.groupby("id")
		for group in score_df.index:
			g = grouped_concept.get_group(group)
			g = g[g.SAB.isin(code_keys)]
			cols = g.SAB
			values = g.CODE
			score_df.loc[group]=pd.Series(list(values), index=cols).groupby('SAB').first()
		node_df = node_df.merge(score_df, on="id")
		node_df.to_csv(filename)
		nodes[node_type] = node_df

Anatomy
Compound
Taxon
GO
Gene
ILX
NIFSTD
PATO
Protein
SO


In [171]:
df.relation.unique()

array(['isa', 'part_of', 'contributes_to_morphology_of', 'is_part_of',
       'delineates', 'isdelineatedby', 'has_role', 'inverse_isa',
       'has_part', 'inverse_contributes_to_morphology_of',
       'inverse_is_part_of', 'inverse_delineates',
       'inverse_isdelineatedby', 'role_of'], dtype=object)

In [172]:
relations = [
'isa',
'part_of',
'is_part_of',
'contributes_to_morphology_of',
'delineates',
'isdelineatedby',
'has_role',
]

In [173]:
df[df.relation.isin(relations)].to_csv("out/sab/%s.edges.csv"%sab)

In [None]:
# for dcc, filenames in dcc_filename.items():
# 	print(dcc)
# 	compress(filenames, dcc)

IDG
File Paths:
['IDGD.edges.csv', 'IDGP.Protein.nodes.csv', 'IDGP.edges.csv', 'IDGP.Compound.nodes.csv', 'IDGD.Compound.nodes.csv', 'IDGD.Disease.nodes.csv']
GlyGen
File Paths:
['GLYCANS.Glytoucan.nodes.csv', 'PROTEOFORM.Glycoprotein.nodes.csv', 'PROTEOFORM.Isoform.nodes.csv', 'PROTEOFORM.Glytoucan.nodes.csv', 'PROTEOFORM.GlyGen Location.nodes.csv', 'PROTEOFORM.Amino Acid.nodes.csv', 'GLYCANS.edges.csv', 'PROTEOFORM.Glycoprotein Citation.nodes.csv', 'GLYCANS.GlyGen Residue.nodes.csv', 'PROTEOFORM.Protein.nodes.csv', 'GLYCANS.Glycan Motif.nodes.csv', 'PROTEOFORM.Glycoprotein Evidence.nodes.csv', 'GLYCANS.Glycosyltransferase Reaction.nodes.csv', 'GLYCANS.Glycosylation.nodes.csv', 'PROTEOFORM.GP ID2PRO.nodes.csv', 'GLYCANS.GlyGen src.nodes.csv', 'GLYCANS.GlyGen Glycosequence.nodes.csv', 'GLYCANS.Protein.nodes.csv', 'PROTEOFORM.edges.csv', 'PROTEOFORM.Glycosylation Site.nodes.csv']
4DN
File Paths:
['4DN.4DN File.nodes.csv', '4DN.Anatomy.nodes.csv', '4DN.edges.csv', '4DN.4DN QVal Bin.nodes

## Other Files
### CLINVAR

In [204]:
sab = 'CLINVAR'
df = get_sab_df(sab)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../dd_data/17July2024DD_csvs/CUI-CUIs.csv'

In [206]:
df = pd.read_csv("out/sab/CLINVAR.edges.csv", index_col=0)
df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,41c1d77f-da64-5851-a9a1-64b97a2c2755,ade0b23c-d4fa-52c1-b202-b5f6461e8358,gene_associated_with_disease_or_phenotype,CLINVAR,,
1,41c1d77f-da64-5851-a9a1-64b97a2c2755,362612f1-2149-5e02-91a3-854fb72da454,gene_associated_with_disease_or_phenotype,CLINVAR,,
2,de85472e-53a1-50f7-a20d-7ec256eff945,73fa4088-748c-5396-91b8-f6b92d895d10,gene_associated_with_disease_or_phenotype,CLINVAR,,
3,de85472e-53a1-50f7-a20d-7ec256eff945,ebe6f385-4bd7-5d11-ba46-171a40fdc947,gene_associated_with_disease_or_phenotype,CLINVAR,,
4,de85472e-53a1-50f7-a20d-7ec256eff945,5ba85480-3d16-512d-ad51-22132b49e374,gene_associated_with_disease_or_phenotype,CLINVAR,,


In [175]:
df.relation.unique()

array(['gene_associated_with_disease_or_phenotype',
       'inverse_gene_associated_with_disease_or_phenotype'], dtype=object)

In [176]:
set(i.split(":")[0] for i in df[df.relation == 'gene_associated_with_disease_or_phenotype'].target if ":" in i)

{'DOID', 'EFO', 'HP', 'MEDGEN', 'MONDO', 'MSH', 'OMIM'}

In [177]:
relations = ["gene_associated_with_disease_or_phenotype"]
df[df.relation.isin(relations)].to_csv("out/sab/%s.edges.csv"%sab)

In [178]:
labels.loc['C1825487']

label    A2ML1 gene
Name: C1825487, dtype: object

In [179]:
relations = {
	"gene_associated_with_disease_or_phenotype": {
		"source": "Gene",
		"target": "Disease or Phenotype",
	}
}

In [180]:
nodes = get_nodes(df, sab, relations)

Gene
Disease or Phenotype


In [181]:
ind = df[df.target == 'C1417848'].index
ind

Index([ 68277,  68278, 104175, 104176, 104177, 104178, 104179, 104180, 104181,
       104182, 104183, 104184, 104185],
      dtype='int64')

In [182]:
df.loc[[i for i in df.index if i not in ind]].to_csv("out/sab/%s.edges.csv"%sab)

In [183]:
nodes['Disease or Phenotype'].loc['C1417848']

KeyError: 'C1417848'

In [184]:
nodes['Disease or Phenotype'].loc[[i for i in nodes['Disease or Phenotype'].index if i != 'C1417848']].to_csv("out/sab/%s.%s.nodes.csv"%(sab, 'Disease or Phenotype'))

In [209]:
df[df.relation == 'gene_associated_with_disease_or_phenotype'].to_csv('out/sab/CLINVAR.edges.csv')

### HGNCHPO

In [185]:
sab = 'HGNCHPO'
df = get_sab_df(sab)
df.head()

173212741it [03:14, 891596.23it/s] 


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C1826605,C4025901,associated_with,HGNCHPO,,
1,C1826605,C4021817,associated_with,HGNCHPO,,
2,C1826605,C0026633,associated_with,HGNCHPO,,
3,C1826605,C2183966,associated_with,HGNCHPO,,
4,C1826605,C4025887,associated_with,HGNCHPO,,


In [186]:
df.relation.unique()

array(['associated_with', 'inverse_associated_with'], dtype=object)

In [187]:
hp = df[df.relation == 'associated_with'].target.unique()

In [188]:
df[df.relation == 'associated_with'].to_csv("out/sab/%s.edges.csv"%sab)

In [189]:
relations = {
	"associated_with": {
		"source": "Gene",
		"target": "Disease or Phenotype",
	}
}

In [190]:
nodes = get_nodes(df, sab, relations)

Gene
Disease or Phenotype


### MSIGDB

In [191]:
sab = 'MSIGDB'
df = get_sab_df(sab)
df.head()

173212741it [03:04, 936987.95it/s] 


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,MSIGDB:M40835 CUI,C1424141,targets_expression_of_gene,MSIGDB,,
1,MSIGDB:M1556 CUI,C1424200,chr_band_contains_gene,MSIGDB,,
2,MSIGDB:M1884 CUI,C1424528,pathway_associated_with_gene,MSIGDB,,
3,MSIGDB:M39822 CUI,C1424528,pathway_associated_with_gene,MSIGDB,,
4,MSIGDB:M752 CUI,C1424528,pathway_associated_with_gene,MSIGDB,,


In [192]:
df.relation.unique()

array(['targets_expression_of_gene', 'chr_band_contains_gene',
       'pathway_associated_with_gene', 'has_marker_gene',
       'has_signature_gene', 'inverse_targets_expression_of_gene',
       'inverse_chr_band_contains_gene',
       'inverse_pathway_associated_with_gene', 'inverse_has_marker_gene',
       'inverse_has_signature_gene'], dtype=object)

In [193]:
d = df[df.relation == 'has_signature_gene']

In [194]:
set([i.split(":")[0] for i in d.source])

{'MSIGDB'}

In [195]:
concept_code[concept_code.id.isin(d.target.unique())].SAB.value_counts()

SAB
ENSEMBL        4384
MTH            4384
HGNC           4384
ENTREZ         4380
OMIM           4349
NCI            2376
ORDO           1576
LNC             346
CHV              46
PDQ              38
MONDO            30
MSH              22
CSP              16
EFO              13
LCH_NW            5
SNOMEDCT_US       4
CHEBI             2
AOT               1
DOID              1
Name: count, dtype: int64

In [196]:
relations = {
	'targets_expression_of_gene': {
		"source": "MSIGDB",
		"target": "Gene"
	},
	"chr_band_contains_gene": {
		"source": "MSIGDB",
		"target": "Gene"
	},
	"pathway_associated_with_gene": {
		"source": "MSIGDB",
		"target": "Gene"
	},
	"has_marker_gene": {
		"source": "MSIGDB",
		"target": "Gene"
	},
	"has_signature_gene": {
		"source": "MSIGDB",
		"target": "Gene"
	},
	"has_signature_gene": {
		"source": "MSIGDB",
		"target": "Gene"
	}
}

In [197]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [198]:
nodes = get_nodes(df, sab, relations)

MSIGDB
Gene


## Biomarker Partnersip

In [199]:
sab = 'BIOMARKER'
df = get_sab_df(sab)
df.head()

173212741it [02:18, 1246522.43it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,BIOMARKER:AN3902-1 CUI,DBSNP:rs7785013 CUI,indicated_by_presence_of,BIOMARKER,,
1,BIOMARKER:AN3902-1 CUI,C0005767,determined_using_sample_from,BIOMARKER,,
2,BIOMARKER:AN3902-1 CUI,OBCI:0000008 CUI,has_best_classification,BIOMARKER,,
3,BIOMARKER:AN3902-1 CUI,C0684249,_indicates_risk_of_developing,BIOMARKER,,
4,BIOMARKER:AN3903-1 CUI,DBSNP:rs73159014 CUI,indicated_by_presence_of,BIOMARKER,,


In [201]:
df.relation.unique()

array(['indicated_by_presence_of', 'determined_using_sample_from',
       'has_best_classification', '_indicates_risk_of_developing',
       'indicated_by_above_normal_level_of', 'diagnostic_for',
       'indicated_by_below_normal_level_of', 'prognostic_for',
       'monitors_status_of', 'inverse_indicated_by_presence_of',
       'inverse_determined_using_sample_from',
       'inverse_has_best_classification',
       'inverse__indicates_risk_of_developing',
       'inverse_indicated_by_above_normal_level_of',
       'inverse_diagnostic_for',
       'inverse_indicated_by_below_normal_level_of',
       'inverse_prognostic_for', 'inverse_monitors_status_of'],
      dtype=object)

In [3]:
from glob import glob

In [30]:
for filename in tqdm(glob('out/sab/*nodes.csv')):
	df = pd.read_csv(filename, index_col=0)
	df.astype(str).to_csv(filename)


  0%|          | 0/85 [00:00<?, ?it/s]

100%|██████████| 85/85 [00:34<00:00,  2.45it/s]


In [48]:
for filename in tqdm(glob('out/sab/*nodes.csv')):
	df = pd.read_csv(filename, index_col=0)
	
	columns = ['label', 'type']
	for col in df.columns:
		if col not in columns or col != 'NCI':
			ind = df[~df[col].isna()].index
			df.loc[ind, col] = df.loc[ind, col].apply(lambda x: '%s:%d'%(col, int(x)) if (type(x) == float or type(x) == int) else x)
			columns.append(col)
	df.to_csv(filename)

  0%|          | 0/85 [00:00<?, ?it/s]

100%|██████████| 85/85 [00:40<00:00,  2.09it/s]


In [18]:
sab_dict = {
	"LINCS": ["LINCS"],
	"4DN": ["4DN"],
	"ERCC": ["ERCCRBP", "ERCCREG"],
	"GlyGen": ["PROTEOFORM", "GLYCANS"],
	"GTEx": ["GTEXEXP", "GTEXEQTL"],
	"HuBMAP": ["AZ", "HMAZ"],
	"IDG": ["IDGP", "IDGD"],
	"KF": ["KF"],
	"MoTrPAC": ["MOTRPAC"],
	"MW": ["MW"],
	"SPARC": ["NPO"],
	"CLINVAR": ["CLINVAR"],
	"HGNCHPO": ["HGNCHPO"],
	"MSIGDB": ["MSIGDB"],
	"HGNCUNIPROT": ["HGNCUNIPROT"],
	"HGNCENZ": ["HGNCENZ"]
	
}


dcc_sabs = set()
for i in sab_dict.values():
	dcc_sabs = dcc_sabs.union(i)
len(dcc_sabs)

21

In [34]:
df = pd.read_csv('out/sab/hgnc_enzyme.Enzyme.nodes.csv', index_col=0)
df.head()

Unnamed: 0,label,type,HGNC,ENSEMBL,OMIM,ORDO,ENTREZ,NCI
5c17d548-700e-5f58-af5a-e1d3fe19e691,ADSS1,Gene,20093,ENSG00000185100,612498.0,469928.0,122622.0,
824f348c-029c-5a95-a0a2-7aac6ed3339a,MAPK8,Gene,6881,ENSG00000107643,601158.0,,5599.0,
04b1db5c-26ef-52b0-8330-d35da65847a1,QTRT1,Gene,23797,ENSG00000213339,609615.0,,81890.0,
346a68f6-3a95-5f06-9eb5-3c485ebc1973,PTGS1,Gene,9604,ENSG00000095303,176805.0,,5742.0,
6969bb6a-32b9-5b4e-ac1c-56c7f64776fe,HM13,Gene,16435,ENSG00000101294,607106.0,,81502.0,


In [46]:
df[~df.ORDO.isna()].ORDO.apply(lambda x: 'blah' if (type(x) == float or type(x) == int) else 'ORDO:%d'%int(x))

5c17d548-700e-5f58-af5a-e1d3fe19e691    blah
d4513ef4-e71a-5350-b138-b45e9735fa1d    blah
a728b6a9-7ee0-56b2-b87d-c6d99f69cb07    blah
c55f6180-5f5c-5395-98d4-4d731e20a07f    blah
5dfa2a20-0fe6-54c2-87c4-6103155f3ac4    blah
                                        ... 
2052cb2d-1b4f-5570-878a-c4f8662d9780    blah
382fa520-924a-5d05-ae90-b467d1ae7985    blah
772b77ea-de31-53cb-92b0-96536be9ed47    blah
32ab3dca-3319-5551-b028-61bc2cfdcded    blah
77e9a191-cb6f-5fea-868d-a8a3dc394446    blah
Name: ORDO, Length: 761, dtype: object

In [19]:
file_sabs = set()
for i in glob('out/sab/*.csv'):
	s = i.split(".")[0].replace("out/sab/", "")
	file_sabs.add(s)

In [20]:
len(file_sabs), len(dcc_sabs), len(file_sabs.intersection(dcc_sabs))

(21, 21, 21)

In [21]:
import zlib
import zipfile

def compress(file_names, zip_name):
    print("File Paths:")
    print(file_names)

    path = "out/sab/"

    # Select the compression mode ZIP_DEFLATED for compression
    # or zipfile.ZIP_STORED to just store the file
    compression = zipfile.ZIP_DEFLATED

    # create the zip file first parameter path/name, second mode
    zf = zipfile.ZipFile("out/compressed/%s.zip"%zip_name, mode="w")
    try:
        for file_name in file_names:
            # Add file to the zip file
            # first parameter file to zip, second filename in zip
            zf.write(path + file_name, file_name, compress_type=compression)

    except FileNotFoundError:
        print("An error occurred")
    finally:
        # Don't forget to close the file!
        zf.close()

In [22]:
dict_sab = {}
for k,v in sab_dict.items():
	for i in v:
		dict_sab[i] = k

In [23]:
dcc_filename = {}
for i in glob('out/sab/*.csv'):
	filename = i.replace("out/sab/", "")
	sab = filename.split(".")[0]
	dcc = dict_sab[sab]
	if dcc not in dcc_filename:
		dcc_filename[dcc] = []
	dcc_filename[dcc].append(filename)

In [24]:
for dcc, filenames in dcc_filename.items():
	compress(filenames, dcc)

File Paths:
['HGNCUNIPROT.Protein.edges.csv', 'HGNCUNIPROT.Gene.edges.csv', 'HGNCUNIPROT.edges.csv']
File Paths:
['IDGD.edges.csv', 'IDGP.Protein.nodes.csv', 'IDGP.edges.csv', 'IDGD.Disease or Phenotype.nodes.csv', 'IDGP.Compound.nodes.csv', 'IDGD.Compound.nodes.csv']
File Paths:
['GLYCANS.Glytoucan.nodes.csv', 'PROTEOFORM.Glycoprotein.nodes.csv', 'PROTEOFORM.Isoform.nodes.csv', 'PROTEOFORM.Glytoucan.nodes.csv', 'PROTEOFORM.GlyGen Location.nodes.csv', 'PROTEOFORM.Amino Acid.nodes.csv', 'GLYCANS.edges.csv', 'PROTEOFORM.Glycoprotein Citation.nodes.csv', 'GLYCANS.GlyGen Residue.nodes.csv', 'PROTEOFORM.Protein.nodes.csv', 'GLYCANS.Glycan Motif.nodes.csv', 'PROTEOFORM.Glycoprotein Evidence.nodes.csv', 'GLYCANS.Glycosyltransferase Reaction.nodes.csv', 'GLYCANS.Glycosylation.nodes.csv', 'PROTEOFORM.GP ID2PRO.nodes.csv', 'GLYCANS.GlyGen src.nodes.csv', 'GLYCANS.GlyGen Glycosequence.nodes.csv', 'GLYCANS.Protein.nodes.csv', 'PROTEOFORM.edges.csv', 'PROTEOFORM.Glycosylation Site.nodes.csv']
File 

In [516]:
concept_code[concept_code.id == 'C1417848']

Unnamed: 0,id,code_id,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
1503118,C1417848,MTH:NOCODE,MTH,NOCODE,,,,
3184031,C1417848,OMIM:162080,OMIM,162080,,,,
4303000,C1417848,HGNC:8002,HGNC,8002,,,,
4965231,C1417848,ORDO:123930,ORDO,123930,,,,
5032941,C1417848,MONDO:0005283,MONDO,0005283,,,,
5088526,C1417848,EFO:0003839,EFO,0003839,,,,
5139873,C1417848,ENSEMBL:ENSG00000129535,ENSEMBL,ENSG00000129535,13.0,24078662.0,24115010.0,
5593888,C1417848,ENTREZ:4901,ENTREZ,4901,,24078662.0,24114949.0,


In [8]:
for filename in glob('out/sab/*.nodes.csv'):
	df = pd.read_csv(filename, index_col=0)
	new_col = [i for i in df.columns if "SNOMED" not in i.upper()]
	if len(new_col) < len(df.columns):
		print(filename)
		df[new_col].to_csv(filename)

out/sab/GTEXEXP.Anatomy.nodes.csv
out/sab/GTEXEQTL.Anatomy.nodes.csv
out/sab/ERCCRBP.Biofluid.nodes.csv
out/sab/MOTRPAC.Anatomy.nodes.csv
out/sab/NPO.Taxon.nodes.csv
out/sab/ERCCREG.Anatomy.nodes.csv
out/sab/IDGD.Disease or Phenotype.nodes.csv
out/sab/MW.Disease or Phenotype.nodes.csv
out/sab/MW.Anatomy.nodes.csv
out/sab/KF.Disease or Phenotype.nodes.csv
