In [1]:
import pandas as pd
import csv
from tqdm import tqdm
import os

In [2]:
data_dir = '../dd_data/DD28Mar2024CSV/'

## Prepare Code DF

In [3]:
# Code table (one row per code), indexed by the file's first column.
# NOTE(review): the echoed source line under this cell looks like a pandas
# DtypeWarning (mixed types from chunked parsing), and numeric-looking codes show
# up later without leading zeros. low_memory=False makes pandas infer one dtype
# per column from the whole file - confirm downstream consumers expect that.
codes = pd.read_csv(data_dir + "CODEs.csv", index_col=0, low_memory=False)

  codes = pd.read_csv(data_dir + "CODEs.csv", index_col=0)


In [4]:
# Concept (CUI) -> code mapping. The columns are renamed positionally so that the
# second one carries the "CodeID:ID" name used as the merge key against `codes`.
# assumes the file has exactly two columns - TODO confirm
concept_code = pd.read_csv(data_dir + "CUI-CODEs.csv")
concept_code.columns = ["id", "CodeID:ID"]

In [5]:
# Attach the code attributes to every concept/code pair. how='left' keeps pairs
# whose code has no row in `codes` (their attribute columns become NaN).
concept_code = pd.merge(concept_code, codes, on="CodeID:ID", how='left')


In [6]:
# Positional rename of the merged frame.
# assumes the merge produced exactly these eight columns in this order - TODO confirm
concept_code.columns = ["id", "code_id", "SAB", "CODE", "value:float","lowerbound:float","upperbound:float","unit"]

In [7]:
concept_code.head()

Unnamed: 0,id,code_id,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,C0000039,LNC:LP15542-1,LNC,LP15542-1,,,,
1,C0000107,MSH:D015059,MSH,D015059,,,,
2,C0000119,MSH:D015062,MSH,D015062,,,,
3,C0000163,CSP:0059-6844,CSP,0059-6844,,,,
4,C0000248,LNC:MTHU027462,LNC,MTHU027462,,,,


## Prepare Label DF

In [8]:
# Concept labels: two columns renamed positionally to id/label, then indexed by id.
# The index is not de-duplicated here, so an id may appear on several rows.
labels = pd.read_csv(data_dir + "CUI-SUIs.csv")
labels.columns = ['id', 'label']
labels = labels.set_index('id')
labels.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000005,(131)I-Macroaggregated Albumin
C0000139,"16,16-Dimethylprostaglandin E2"
C0000163,17-Hydroxycorticosteroids
C0000165,17-Hydroxysteroid Dehydrogenases
C0000190,2'-CMP


In [9]:
# Code-level labels; the four columns are renamed positionally.
# `id` here is a code id (e.g. "MSH:D015060"), `cui` is the concept it belongs to.
code_labels = pd.read_csv(data_dir + "CODE-SUIs.csv")
code_labels.columns = ["label", "id", "type", "cui"]
code_labels.head()

Unnamed: 0,label,id,type,cui
0,Dipalmitoyl Phosphatidylcholine,MSH:D015060,ET,C0000039
1,Cyperquat,MSH:D015655,ET,C0000098
2,1-naththylamine (substance),SNOMEDCT_US:13579002,FN,C0000102
3,1-Sar-8-Ile Angiotensin II,MSH:D015059,ET,C0000107
4,SAR ILE ANGIOTENSIN 02 01 08,MSH:D015059,DSV,C0000107


In [10]:
# Collapse to one row per concept: groupby(...).first() keeps, per column, the
# first non-null value in file order. The result is indexed by `cui`.
# Not idempotent: re-running this cell fails because `cui` is no longer a column.
code_labels = code_labels.groupby('cui').first()

In [11]:
# Concepts that have a code-level label but no row in `labels`.
cl = set(code_labels.index) 
l=set(labels.index)
missing = cl - l

In [12]:
# Backfill `labels` with the code-level label of every concept found only in
# `code_labels`. Not idempotent: re-running appends nothing new only as long as
# `missing` is recomputed first.
labels = pd.concat([labels, code_labels.loc[list(missing), ["label"]]])

In [13]:
labels.loc['PUBCHEM:5496659 CUI']

label    otamixaban
Name: PUBCHEM:5496659 CUI, dtype: object

In [348]:
def get_sab_df(sab, dcc=''):
	"""Return the edge table for one source vocabulary (SAB), cached on disk.

	If 'out/sab/<sab>.edges.csv' exists it is loaded and returned as-is.
	Otherwise data_dir + 'CUI-CUIs.csv' is streamed row by row, rows whose
	fourth column equals `sab` are kept, a constant `dcc` column is appended,
	and the result is written to the cache file before being returned.

	Parameters
	----------
	sab : str
		Value matched against the fourth column of CUI-CUIs.csv.
	dcc : str
		Value stored in the added 'dcc' column of every kept row.

	Returns
	-------
	pandas.DataFrame
		Columns: source, target, relation, SAB, evidence_class, dcc.
		A freshly parsed table holds strings only; a cached table has whatever
		dtypes pandas infers when reading the CSV back.

	Notes
	-----
	Other cells in this notebook overwrite the same cache file with a filtered
	or relabelled table, so a cache hit may not contain every relation of the
	raw file.
	"""
	filename = 'out/sab/%s.edges.csv'%sab
	# Check the cache first so the large source file is only opened when needed.
	if os.path.isfile(filename):
		print("%s exists!"%filename)
		return pd.read_csv(filename, index_col=0)

	header = ['source', 'target', 'relation', 'SAB', 'evidence_class', 'dcc']
	rows = []
	# newline='' is what the csv module asks for when it is handed a file object.
	with open(data_dir + 'CUI-CUIs.csv', newline='') as o:
		csv_reader = csv.reader(o)
		next(csv_reader, None)  # skip the file's own header row
		for row in tqdm(csv_reader):
			# Blank or truncated lines have no SAB column to compare against.
			if len(row) > 3 and sab == row[3]:
				rows.append(row + [dcc])
	df = pd.DataFrame(rows, columns=header)
	# Create the cache directory so a multi-minute scan is not lost on a
	# missing folder.
	os.makedirs(os.path.dirname(filename), exist_ok=True)
	df.to_csv(filename)
	return df

def get_nodes(df, sab, relations):
	"""Build (or load from cache) one node table per node type used by `relations`.

	Parameters
	----------
	df : pandas.DataFrame
		Edge table with at least 'relation', 'source' and 'target' columns.
	sab : str
		Used only to build the cache file names 'out/sab/<sab>.<type>.nodes.csv'.
	relations : dict
		Maps a relation name to a dict of {edge column: node type}, e.g.
		{"bioactivity": {"source": "Compound", "target": "Protein"}}.

	Returns
	-------
	dict
		node type -> DataFrame indexed by node id with 'label', 'type' and one
		column per code vocabulary (SAB) kept for that node type.

	Notes
	-----
	Reads the notebook-level `labels` and `concept_code` frames. An existing
	cache file is returned untouched, even if `df` or `relations` changed.
	"""
	# Get Node Ids: every id seen in an edge column is filed under the node type
	# that `relations` assigns to that column.
	node_ids = {}
	for relation, v in relations.items():
		tmp = df[df["relation"] == relation]
		for key, node_type in v.items():
			node_ids.setdefault(node_type, set()).update(tmp[key])
	# Get DF
	nodes = {}
	for node_type, node_index in node_ids.items():
		print(node_type)
		filename = 'out/sab/%s.%s.nodes.csv'%(sab, node_type)
		if os.path.isfile(filename):
			print('%s found'%filename)
			nodes[node_type] = pd.read_csv(filename, index_col=0)
			continue
		node_index = list(node_index)
		node_df = pd.DataFrame(index=node_index, columns=["label", "type"])
		node_df.index.name = "id"
		node_df["type"] = node_type
		# Default label is the id itself; overwritten below where a label exists.
		node_df["label"] = node_df.index
		with_label = list(set(labels.index).intersection(node_index))
		if with_label:
			# `labels` may hold several rows per id; keep the first non-null one.
			# Grouping on the index level works whatever the index is named.
			node_df.loc[with_label, 'label'] = labels.loc[with_label, "label"].groupby(level=0).first()
		if node_type == "Gene":
			node_df.loc[with_label, 'label'] = [i.replace(" gene", "") for i in node_df.loc[with_label, 'label']]
		# CODE
		filtered = concept_code[concept_code["id"].isin(node_df.index)]
		filtered = filtered[filtered.CODE != 'NOCODE']
		# filter keys: keep a vocabulary only if it has more code rows than half
		# the number of nodes of this type.
		sab_counts = filtered.SAB.value_counts()
		code_keys = sab_counts[sab_counts > node_df.shape[0]/2].index
		coded_ids = filtered.id.unique()
		coded = filtered[filtered.SAB.isin(code_keys)]
		if coded.empty:
			score_df = pd.DataFrame(index=coded_ids, columns=code_keys)
		else:
			# One pass instead of a per-id loop: first non-null CODE per
			# (id, SAB), spread into one column per SAB.
			score_df = (
				coded.groupby(["id", "SAB"])["CODE"].first()
				.unstack("SAB")
				.reindex(index=coded_ids, columns=code_keys)
			)
		score_df.index.name = 'id'
		# NOTE(review): merge defaults to an inner join, so ids without any
		# non-NOCODE row in concept_code are dropped from the node table -
		# confirm this is intended.
		node_df = node_df.merge(score_df, on="id")
		os.makedirs(os.path.dirname(filename), exist_ok=True)
		node_df.to_csv(filename)
		nodes[node_type] = node_df
	return nodes

## LINCS

In [249]:
sab = 'LINCS'
dcc = 'LINCS'
df = get_sab_df(sab, dcc)

out/sab/LINCS.edges.csv exists!


In [250]:
df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C1412480,PUBCHEM:9830191 CUI,positively_regulated_by,LINCS,0.003757,LINCS
1,C1412234,PUBCHEM:334007 CUI,negatively_regulated_by,LINCS,-0.010076,LINCS
2,C1416933,PUBCHEM:6481236 CUI,negatively_regulated_by,LINCS,-0.006697,LINCS
3,C1416717,PUBCHEM:771910 CUI,negatively_regulated_by,LINCS,-0.020809,LINCS
4,C1423844,PUBCHEM:122718 CUI,positively_regulated_by,LINCS,0.015338,LINCS


In [251]:
# Forward LINCS relations to keep, with the node type found in each edge column.
# Key order inside each entry is preserved because get_nodes walks it in order.
_gene_from_compound = {"target": "Gene", "source": "Compound"}
relations = {
	"negatively_regulates": dict(_gene_from_compound),
	"positively_regulates": dict(_gene_from_compound),
	"in_similarity_relationship_with": dict(source="Compound", target="Compound"),
}

In [253]:
df.relation.unique()

array(['positively_regulated_by', 'negatively_regulated_by',
       'in_similarity_relationship_with', 'positively_regulates',
       'negatively_regulates', 'inverse_in_similarity_relationship_with'],
      dtype=object)

In [254]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [18]:
nodes = get_nodes(df, sab, relations)

Gene
out/sab/LINCS.Gene.nodes.csv found
Compound
out/sab/LINCS.Compound.nodes.csv found


In [19]:
nodes["Gene"].head()

Unnamed: 0_level_0,label,type,ENSEMBL,HGNC,ENTREZ,OMIM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C1422180,FSD1,Gene,ENSG00000105255,13745.0,79187.0,609828.0
C1539265,DEGS1,Gene,ENSG00000143753,13709.0,8560.0,615843.0
C1413066,CACNA2D2,Gene,ENSG00000007402,1400.0,9254.0,607082.0
C1412062,ABCA3,Gene,ENSG00000167972,33.0,21.0,601615.0
C1421623,ZNF131,Gene,ENSG00000172262,12915.0,7690.0,604073.0


## 4DN

In [243]:
sab = '4DN'
dcc = '4DN'
df = get_sab_df(sab, dcc)

out/sab/4DN.edges.csv exists!


In [244]:
df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,4DND:4DNESWST3UBH CUI,EFO:0008808 CUI,has_assay_type,4DN,,4DN
1,4DND:4DNES21D8SP8 CUI,EFO:0008808 CUI,has_assay_type,4DN,,4DN
2,4DND:4DNESW1SPPTD CUI,EFO:0008808 CUI,has_assay_type,4DN,,4DN
3,4DND:4DNESN49VY8X CUI,EFO:0009974 CUI,has_assay_type,4DN,,4DN
4,4DND:4DNES2R6PUEK CUI,EFO:0009974 CUI,has_assay_type,4DN,,4DN


In [245]:
# 4DN relations to keep: (relation, source node type, target node type).
relations = {
	name: {"source": src, "target": dst}
	for name, src, dst in (
		("has_assay_type", "4DN Dataset", "Assay"),
		("dataset_involves_cell_type", "4DN Dataset", "Anatomy"),
		("dataset_has_file", "4DN Dataset", "4DN File"),
		("file_has_loop", "4DN File", "4DN Loop"),
		("loop_has_qvalue_bin", "4DN Loop", "4DN QVal Bin"),
		("loop_us_start", "4DN Loop", "HSCLO"),
		("loop_us_end", "4DN Loop", "HSCLO"),
		("loop_ds_start", "4DN Loop", "HSCLO"),
		("loop_ds_end", "4DN Loop", "HSCLO"),
	)
}

In [246]:
set(df.relation) - set(relations.keys())

{'inverse_dataset_has_file',
 'inverse_dataset_involves_cell_type',
 'inverse_file_has_loop',
 'inverse_has_assay_type',
 'inverse_loop_ds_end',
 'inverse_loop_ds_start',
 'inverse_loop_has_qvalue_bin',
 'inverse_loop_us_end',
 'inverse_loop_us_start'}

In [248]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [23]:
nodes = get_nodes(df, sab, relations)

4DN Dataset
out/sab/4DN.4DN Dataset.nodes.csv found
Assay
out/sab/4DN.Assay.nodes.csv found
Anatomy
out/sab/4DN.Anatomy.nodes.csv found
4DN File
out/sab/4DN.4DN File.nodes.csv found
4DN Loop
out/sab/4DN.4DN Loop.nodes.csv found
4DN QVal Bin
out/sab/4DN.4DN QVal Bin.nodes.csv found
HSCLO
out/sab/4DN.HSCLO.nodes.csv found


In [24]:
nodes['4DN Dataset'].head()

Unnamed: 0_level_0,label,type,4DND
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4DND:4DNESNSTBMBY CUI,in situ Hi-C on HCT116 cells (containing AID-t...,4DN Dataset,4DNESNSTBMBY
4DND:4DNESGTHHJAC CUI,in situ Dnase Hi-C on RUES2 cells differentiat...,4DN Dataset,4DNESGTHHJAC
4DND:4DNES21D8SP8 CUI,Micro-C on H1-ESC cells.H1-ESC,4DN Dataset,4DNES21D8SP8
4DND:4DNESWST3UBH CUI,Micro-C on HFFc6 cells.HFFc6,4DN Dataset,4DNESWST3UBH
4DND:4DNESN49VY8X CUI,in situ Hi-C on HFFc6 cells.HFFc6,4DN Dataset,4DNESN49VY8X


## ERCC

In [21]:
sab = "ERCCRBP"
dcc = "ERCC"
df = get_sab_df(sab, dcc)
df.head()

out/sab/ERCCRBP.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,UNIPROTKB:Q92667 CUI,ENCODE.RBS.HEPG2:chr1.1055031.1055131.plus.b38...,molecularly_interacts_with,ERCCRBP,,ERCC
1,UNIPROTKB:Q92667 CUI,ENCODE.RBS.HEPG2:chr1.1055457.1055531.plus.b38...,molecularly_interacts_with,ERCCRBP,,ERCC
2,UNIPROTKB:Q92667 CUI,ENCODE.RBS.HEPG2:chr1.1055748.1055814.plus.b38...,molecularly_interacts_with,ERCCRBP,,ERCC
3,UNIPROTKB:Q92667 CUI,ENCODE.RBS.HEPG2:chr1.1055843.1055885.plus.b38...,molecularly_interacts_with,ERCCRBP,,ERCC
4,UNIPROTKB:Q92667 CUI,ENCODE.RBS.HEPG2:chr1.1216991.1217090.minus.b3...,molecularly_interacts_with,ERCCRBP,,ERCC


In [22]:
df.relation.unique()

array(['molecularly_interacts_with', 'is_subsequence_of', 'predicted_in',
       'not_predicted_in', 'correlated_in', 'not_correlated_in',
       'overlaps'], dtype=object)

In [23]:
rels = ['molecularly_interacts_with', 'is_subsequence_of', 'predicted_in',
       'not_predicted_in', 'correlated_in', 'not_correlated_in',
       'overlaps']
df[df.relation.isin(rels)].to_csv("out/sab/%s.edges.csv"%sab)

In [24]:
relation = 'overlaps'
# Relabel the 'overlaps' edges whose source id contains 'RBS.150'. A single
# vectorised mask replaces the Python list of ids plus a second isin() pass.
ind = df.index[(df.relation == relation) & df.source.str.contains('RBS.150', regex=False, na=False)]
df.loc[ind, 'relation'] = 'overlaps_exrna'

In [25]:
set([i.split(":")[0] for i in df[df.relation == 'molecularly_interacts_with'].target])

{'ENCODE.RBS.HEPG2', 'ENCODE.RBS.HEPG2.K562', 'ENCODE.RBS.K562'}

In [26]:
df.relation.unique()

array(['molecularly_interacts_with', 'is_subsequence_of', 'predicted_in',
       'not_predicted_in', 'correlated_in', 'not_correlated_in',
       'overlaps', 'overlaps_exrna'], dtype=object)

In [28]:
# ERCCRBP relations: (relation, source node type, target node type).
# NOTE(review): "is_subsequence" is not among the relation names printed for this
# table (only "is_subsequence_of" is), so that entry matches no edges.
relations = {
	name: {"source": src, "target": dst}
	for name, src, dst in (
		("overlaps", "RBP Binding Loci", "Gene"),
		("overlaps_exrna", "exRNA Loci", "Gene"),
		("molecularly_interacts_with", "Protein", "RBP Binding Loci"),
		("is_subsequence", "exRNA Loci", "RBP Binding Loci"),
		("is_subsequence_of", "exRNA Loci", "RBP Binding Loci"),
		("correlated_in", "exRNA Loci", "Biofluid"),
		("not_correlated_in", "exRNA Loci", "Biofluid"),
		("predicted_in", "Protein", "Biofluid"),
		("not_predicted_in", "Protein", "Biofluid"),
	)
}

In [29]:
nodes = get_nodes(df, sab, relations)

RBP Binding Loci
Gene
exRNA Loci
Protein
Biofluid


In [30]:
sab = 'ERCCREG'
dcc = "ERCC"
df = get_sab_df(sab, dcc)
df.head()

out/sab/ERCCREG.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C0038351,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000094...,part_of,ERCCREG,,ERCC
1,ENCODE.CCRE:EH38E0064571 CUI,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000094...,part_of,ERCCREG,,ERCC
2,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000094...,C1333687,regulates,ERCCREG,,ERCC
3,C0007461,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000187...,part_of,ERCCREG,,ERCC
4,ENCODE.CCRE:EH38E0064571 CUI,ENCODE.CCRE.ACTIVITY:EH38E0064571.UBERON000187...,part_of,ERCCREG,,ERCC


In [31]:
df.relation.unique()

array(['part_of', 'regulates', 'isa', 'located_in',
       'negatively_regulates', 'positively_regulates'], dtype=object)

In [32]:
rels = ['part_of', 'regulates', 'isa', 'located_in',
       'negatively_regulates', 'positively_regulates']
df[df.relation.isin(rels)].to_csv("out/sab/%s.edges.csv"%sab)

In [55]:
source_pref = set([i.split(":")[0] for i in df[df.relation == 'part_of'].source if ":" in i])
source_pref

{'CLINGEN.ALLELE.REGISTRY', 'ENCODE.CCRE', 'UBERON'}

In [56]:
target_pref = set([i.split(":")[0] for i in df[df.relation == 'part_of'].target])
target_pref

{'ENCODE.CCRE.ACTIVITY', 'GTEXEQTL'}

In [61]:
relation = "part_of"
s = 'ENCODE.CCRE'
t = 'ENCODE.CCRE.ACTIVITY'
# Relabel 'part_of' edges that go from an id starting with `s` to an id starting
# with `t`. One vectorised mask replaces two Python id lists plus isin() passes.
ind = df.index[
	(df.relation == relation)
	& df.source.str.startswith(s, na=False)
	& df.target.str.startswith(t, na=False)
]
df.loc[ind, 'relation'] = 'part_of_ccre'

In [62]:
relation = "part_of"
s = 'CLINGEN.ALLELE.REGISTRY'
t = 'GTEXEQTL'
# Select (but do not yet relabel) the 'part_of' edges from an id starting with
# `s` to an id starting with `t`; the count is printed as a sanity check.
ind = df.index[
	(df.relation == relation)
	& df.source.str.startswith(s, na=False)
	& df.target.str.startswith(t, na=False)
]
print(len(ind))

265965


In [63]:
df.loc[ind, 'relation'] = 'part_of_clingen'

In [64]:
relation = "part_of"
t = 'GTEXEQTL'
# Select the edges still labelled 'part_of' whose target id starts with `t`,
# whatever their source; the count is printed as a sanity check.
ind = df.index[(df.relation == relation) & df.target.str.startswith(t, na=False)]
print(len(ind))

265965


In [68]:
df.loc[ind, 'relation'] = 'part_of_uberon'

In [33]:
rels

['part_of',
 'regulates',
 'isa',
 'located_in',
 'negatively_regulates',
 'positively_regulates']

In [69]:
# ERCCREG relations after the part_of split: (relation, source type, target type).
relations = {
	name: {"source": src, "target": dst}
	for name, src, dst in (
		("isa", "Regulatory Element Activity", "ENCODE CCRE Data Matrix"),
		("part_of", "Anatomy", "Regulatory Element Activity"),
		("part_of_ccre", "ENCODE CCRE", "Regulatory Element Activity"),
		("part_of_clingen", "CLINGEN ALLELE REGISTRY", "GTEXEQTL"),
		("part_of_uberon", "Anatomy", "GTEXEQTL"),
		("regulates", "Regulatory Element Activity", "Gene"),
		("negatively_regulates", "GTEXEQTL", "Gene"),
		("positively_regulates", "GTEXEQTL", "Gene"),
		("located_in", "CLINGEN ALLELE REGISTRY", "ENCODE CCRE"),
	)
}

In [71]:
nodes = get_nodes(df, sab, relations)

Regulatory Element Activity
ENCODE CCRE Data Matrix
Anatomy
ENCODE CCRE
CLINGEN ALLELE REGISTRY
GTEXEQTL
Gene


In [78]:
nodes["Gene"].head()

Unnamed: 0_level_0,label,type,ENSEMBL,HGNC
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C1823499,TRMT5,Gene,ENSG00000126814,23141.0
C5576639,LINC03000,Gene,ENSG00000241956,56116.0
C3890090,SAP18P2,Gene,ENSG00000223873,51568.0
C1826005,MIR181A2,Gene,ENSG00000207595,31549.0
ENSEMBL:ENSG00000271781 CUI,ENSG00000271781,Gene,ENSG00000271781,


## GlyGen

In [220]:
sab = 'PROTEOFORM'
dcc = "GlyGen"
df = get_sab_df(sab, dcc)
df.head()

out/sab/PROTEOFORM.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,UNIPROTKB:O14490 CUI,UNIPROTKB.ISOFORM:O14490-1 CUI,has_isoform,PROTEOFORM,,GlyGen
1,UNIPROTKB:E9Q7T7 CUI,UNIPROTKB.ISOFORM:E9Q7T7-1 CUI,has_isoform,PROTEOFORM,,GlyGen
2,UNIPROTKB:O14513 CUI,UNIPROTKB.ISOFORM:O14513-1 CUI,has_isoform,PROTEOFORM,,GlyGen
3,UNIPROTKB:O15355 CUI,UNIPROTKB.ISOFORM:O15355-1 CUI,has_isoform,PROTEOFORM,,GlyGen
4,UNIPROTKB:O15488 CUI,UNIPROTKB.ISOFORM:O15488-1 CUI,has_isoform,PROTEOFORM,,GlyGen


In [221]:
df.relation.unique()

array(['has_isoform', 'has_evidence', 'sequence', 'citation',
       'has_pro_entry', 'glycosylated_at', 'location', 'has_saccharide',
       'has_amino_acid', 'inverse_has_isoform', 'is_evidence_for',
       'inverse_sequence', 'inverse_citation', 'inverse_has_pro_entry',
       'inverse_glycosylated_at', 'inverse_location',
       'inverse_has_saccharide', 'inverse_has_amino_acid'], dtype=object)

In [222]:
# PROTEOFORM relations to keep: (relation, source node type, target node type).
relations = {
	name: {"source": src, "target": dst}
	for name, src, dst in (
		("has_isoform", "Protein", "Isoform"),
		("has_evidence", "Glycoprotein", "Glycoprotein Evidence"),
		("sequence", "Glycoprotein", "Isoform"),
		("citation", "Glycoprotein Evidence", "Glycoprotein Citation"),
		("has_pro_entry", "Glycoprotein", "GP ID2PRO"),
		("glycosylated_at", "Glycoprotein", "Glycosylation Site"),
		("location", "Glycosylation Site", "GlyGen Location"),
		("has_saccharide", "Glycosylation Site", "Glytoucan"),
		("has_amino_acid", "GlyGen Location", "Amino Acid"),
	)
}

In [227]:
df[df.relation.isin(relations.keys())].to_csv('out/sab/PROTEOFORM.edges.csv')

In [79]:
nodes = get_nodes(df, sab, relations)

Protein
Isoform
Glycoprotein
Glycoprotein Evidence
Glycoprotein Citation
GP ID2PRO
Glycosylation Site
GlyGen Location
Glytoucan
Amino Acid


In [228]:
sab = 'GLYCANS'
dcc = "GlyGen"
df = get_sab_df(sab, dcc)
df.head()

out/sab/GLYCANS.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,GLYGEN.GLYCOSYLATION:RXN00000038 CUI,UNIPROTKB:P38649 CUI,has_enzyme_protein,GLYCANS,,GlyGen
1,GLYCOSYLTRANSFERASE.REACTION:RXN00000011 CUI,UNIPROTKB:P38649 CUI,has_enzyme_protein,GLYCANS,,GlyGen
2,GLYTOUCAN:G85555HC CUI,GLYGEN.SRC:G85555HC-SRC00000461 CUI,is_from_source,GLYCANS,,GlyGen
3,GLYTOUCAN:G89102AG CUI,GLYGEN.SRC:G89102AG-SRC00000121 CUI,is_from_source,GLYCANS,,GlyGen
4,GLYTOUCAN:G94192DA CUI,GLYGEN.GLYCOSEQUENCE:G94192DA-GLYCOSEQ00002593...,has_glycosequence,GLYCANS,,GlyGen


In [229]:
df.relation.unique()

array(['has_enzyme_protein', 'is_from_source', 'has_glycosequence',
       'attached_by', 'synthesized_by', 'has_motif',
       'has_canonical_residue', 'has_parent',
       'inverse_has_enzyme_protein', 'inverse_is_from_source',
       'inverse_has_glycosequence', 'inverse_attached_by',
       'inverse_synthesized_by', 'inverse_has_motif',
       'inverse_has_canonical_residue', 'inverse_has_parent'],
      dtype=object)

In [82]:
set([i.split(":")[0] for i in df[df.relation == "has_enzyme_protein"].source])

{'GLYCOSYLTRANSFERASE.REACTION', 'GLYGEN.GLYCOSYLATION'}

In [84]:
relation = "has_enzyme_protein"
s = 'GLYCOSYLTRANSFERASE.REACTION'
# Relabel the 'has_enzyme_protein' edges whose source id starts with `s`, so the
# two source prefixes of this relation can be given different node types.
# One vectorised mask replaces the Python id list plus a second isin() pass.
ind = df.index[(df.relation == relation) & df.source.str.startswith(s, na=False)]
print(len(ind))
df.loc[ind, 'relation'] = 'has_enzyme_protein_gr'

91


In [230]:
# GLYCANS relations to keep: (relation, source node type, target node type).
relations = {
	name: {"source": src, "target": dst}
	for name, src, dst in (
		("has_enzyme_protein", "Glycosylation", "Protein"),
		("has_enzyme_protein_gr", "Glycosyltransferase Reaction", "Protein"),
		("is_from_source", "Glytoucan", "GlyGen src"),
		("has_glycosequence", "Glytoucan", "GlyGen Glycosequence"),
		("attached_by", "GlyGen Residue", "Glycosylation"),
		("synthesized_by", "Glytoucan", "Glycosyltransferase Reaction"),
		("has_motif", "Glytoucan", "Glycan Motif"),
		("has_canonical_residue", "Glytoucan", "GlyGen Residue"),
		("has_parent", "GlyGen Residue", "GlyGen Residue"),
	)
}

In [231]:
set(df.relation) - set(relations.keys())

{'inverse_attached_by',
 'inverse_has_canonical_residue',
 'inverse_has_enzyme_protein',
 'inverse_has_glycosequence',
 'inverse_has_motif',
 'inverse_has_parent',
 'inverse_is_from_source',
 'inverse_synthesized_by'}

In [232]:
df[df.relation.isin(relations.keys())].to_csv('out/sab/GLYCANS.edges.csv')

In [87]:
nodes = get_nodes(df, sab, relations)

Glycosylation
Protein
Glycosyltransferase Reaction
Glytoucan
GlyGen src
GlyGen Glycosequence
GlyGen Residue
Glycan Motif


## GTEx

In [217]:
sab = 'GTEXEXP'
dcc = "GTEx"
df = get_sab_df(sab, dcc)
df.head()

out/sab/GTEXEXP.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,GTEXEXP:ENSG00000223972-5-Testis CUI,EXPBINS:0.1.0.2 CUI,has_expression,GTEXEXP,,GTEx
1,GTEXEXP:ENSG00000227232-5-Adipose-Subcutaneous...,EXPBINS:4.0.5.0 CUI,has_expression,GTEXEXP,,GTEx
2,GTEXEXP:ENSG00000227232-5-Adrenal-Gland CUI,EXPBINS:2.0.3.0 CUI,has_expression,GTEXEXP,,GTEx
3,GTEXEXP:ENSG00000227232-5-Artery-Aorta CUI,EXPBINS:4.0.5.0 CUI,has_expression,GTEXEXP,,GTEx
4,GTEXEXP:ENSG00000227232-5-Artery-Coronary CUI,EXPBINS:3.0.4.0 CUI,has_expression,GTEXEXP,,GTEx


In [127]:
len(df[df.relation == 'has_expression'].source), len(df[df.relation == 'has_expression'].source.unique())

(1573380, 1573380)

In [128]:
# Distinct target ids of the 'expressed_in' edges; later cells split them into
# gene and anatomy targets. `relation` is also read by those later cells.
relation = "expressed_in"
targets = df[df.relation == relation].target.unique()

In [129]:
# Code rows from the vocabularies used to tell gene targets from anatomy targets.
# NOTE(review): 'ENSEMBLE' looks like a typo for 'ENSEMBL' (the spelling that
# appears as a code column in the Gene node tables), so no Ensembl rows are
# selected here. If they are wanted, fix the spelling here and in the gene_index
# filter together: the next step keeps only the first row per id, so changing
# this list alone changes which SAB that row carries.
tmp = concept_code[concept_code.SAB.isin(['HGNC', 'ENSEMBLE', 'UBERON', 'EFO'])]

In [130]:
tmp = tmp.groupby("id").first()
tmp.head()

Unnamed: 0_level_0,code_id,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C0000696,UBERON:0006135,UBERON,6135,,,,
C0000726,UBERON:0000916,UBERON,916,,,,
C0000739,UBERON:0002378,UBERON,2378,,,,
C0000741,UBERON:0001646,UBERON,1646,,,,
C0000773,EFO:0009565,EFO,9565,,,,


In [131]:
# Split the 'expressed_in' targets by the vocabulary of their (first) code row.
# 'ENSEMBL' replaces the misspelt 'ENSEMBLE', which matched no vocabulary name.
anatomy_index = set(targets).intersection(tmp[tmp.SAB.isin(['UBERON', 'EFO'])].index)
gene_index = set(targets).intersection(tmp[tmp.SAB.isin(['HGNC', 'ENSEMBL'])].index)
len(targets), len(anatomy_index), len(gene_index)

(34623, 44, 34579)

In [132]:
# Set `relation` here (the original assigned an unused `s` and relied on a
# value left over from an earlier cell).
relation = 'expressed_in'
# 'expressed_in' edges whose target is a gene concept.
ind = df[(df.relation == relation) & (df.target.isin(gene_index))].index
print(len(ind))

1573380


In [133]:
df.loc[ind, 'relation'] = 'expressed_in_gene'

In [134]:
# Set `relation` here (the original assigned an unused `s` and relied on a
# value left over from an earlier cell).
relation = 'expressed_in'
# Edges still labelled 'expressed_in' whose target is an anatomy concept.
ind = df[(df.relation == relation) & (df.target.isin(anatomy_index))].index
print(len(ind))

1573380


In [135]:
df.loc[ind, 'relation'] = 'expressed_in_anatomy'

In [136]:
# GTEXEXP relations after the expressed_in split; every edge starts at a
# GTEXEXP node, only the target node type differs.
relations = {
	name: {"source": "GTEXEXP", "target": dst}
	for name, dst in (
		("expressed_in_gene", "Gene"),
		("expressed_in_anatomy", "Anatomy"),
		("has_expression", "EXPBINS"),
	)
}

In [137]:
df.relation.unique()

array(['has_expression', 'expressed_in_gene', 'expressed_in_anatomy',
       'inverse_has_expression', 'expresses'], dtype=object)

In [138]:
nodes = get_nodes(df, sab, relations)

GTEXEXP
out/sab/GTEXEXP.GTEXEXP.nodes.csv found
Gene
out/sab/GTEXEXP.Gene.nodes.csv found
Anatomy
out/sab/GTEXEXP.Anatomy.nodes.csv found
EXPBINS
out/sab/GTEXEXP.EXPBINS.nodes.csv found


In [139]:
gtexexp = df[df.relation == "has_expression"].source.unique()

In [140]:
len(gtexexp)

1573380

In [141]:
# Map each GTEXEXP node to a numeric score parsed from its expression-bin id.
# A target such as "EXPBINS:0.1.0.2 CUI" becomes 0.2: drop " CUI", take the part
# after ":", and keep everything after the second "." (presumably the bin's
# upper edge - TODO confirm).
# zip over the two columns avoids iterrows() on every row and no longer
# overwrites the notebook-level `tmp` frame with a string.
has_expression = df.loc[df.relation == "has_expression", ["source", "target"]]
bins_dict = {
	source: float(".".join(target.replace(" CUI", "").split(":")[1].split(".")[2:]))
	for source, target in zip(has_expression["source"], has_expression["target"])
}

In [142]:
# GTEXEXP node -> anatomy target and GTEXEXP node -> gene target.
# dict(zip(...)) keeps the last target per source, like the original loops, but
# without iterrows() on every row.
anatomy_edges = df[df.relation == "expressed_in_anatomy"]
anatomy_dict = dict(zip(anatomy_edges["source"], anatomy_edges["target"]))

gene_edges = df[df.relation == "expressed_in_gene"]
gene_dict = dict(zip(gene_edges["source"], gene_edges["target"]))
len(anatomy_dict), len(gene_dict), len(bins_dict)

(1573380, 1573380, 1573380)

In [143]:
# source	target	relation	SAB	evidence_class	dcc

# Build direct gene -> anatomy 'expressed_in' edges carrying the bin score.
# Column order: source, target, relation, SAB, evidence_class, dcc.

# Start the new row labels after the largest existing one. len(df.index) is only
# safe for a 0..n-1 index and collides when df has gaps (e.g. a filtered cache).
ind = int(df.index.max()) + 1 if len(df.index) else 0
rows = {}
for k, score in bins_dict.items():
	rows[ind] = [gene_dict[k], anatomy_dict[k], "expressed_in", "GTEXEXP", score, "GTEx"]
	ind += 1

In [144]:
concat_df = pd.DataFrame.from_dict(rows, orient="index", columns=df.columns)
concat_df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
9440280,C2239334,C0039597,expressed_in,GTEXEXP,0.2,GTEx
9440281,C2829144,C0222331,expressed_in,GTEXEXP,5.0,GTEx
9440282,C2829144,C0001625,expressed_in,GTEXEXP,3.0,GTEx
9440283,C2829144,C0003956,expressed_in,GTEXEXP,5.0,GTEx
9440284,C2829144,C0205042,expressed_in,GTEXEXP,4.0,GTEx


In [145]:
df = pd.concat([df, concat_df])

In [146]:
# Fold the temporary gene/anatomy labels back into plain 'expressed_in'.
for staged in ('expressed_in_gene', 'expressed_in_anatomy'):
	ind = df[df.relation == staged].index
	df.loc[ind, 'relation'] = 'expressed_in'

In [147]:
# The derived gene -> anatomy edges in concat_df were already appended to df
# earlier; concatenating them a second time duplicated every one of those rows.
# Only verify that they are present.
assert concat_df.index.isin(df.index).all(), "concat_df has not been appended to df yet"

In [148]:
df.to_csv('out/sab/GTEXEXP.edges.csv')

In [218]:
df.relation.unique()

array(['has_expression', 'expressed_in', 'inverse_has_expression',
       'expresses'], dtype=object)

In [219]:
df[df.relation.isin(['expressed_in', 'has_expression'])].to_csv('out/sab/GTEXEXP.edges.csv')

In [199]:
sab = 'GTEXEQTL'
dcc = "GTEx"
df = get_sab_df(sab, dcc)
df.head()

out/sab/GTEXEQTL.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,GTEXEQTL:eQTL.chr1.1434243.G.A.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
1,GTEXEQTL:eQTL.chr1.1497758.C.T.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
2,GTEXEQTL:eQTL.chr1.1499000.C.A.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
3,GTEXEQTL:eQTL.chr1.1499128.C.T.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx
4,GTEXEQTL:eQTL.chr1.1499639.G.T.b38.Cells.Cultu...,C1428335,located_in,GTEXEQTL,,GTEx


In [200]:
df.relation.unique()

array(['located_in', 'p_value', 'location_of', 'inverse_p_value'],
      dtype=object)

In [201]:
df[df.relation == 'located_in'].target.unique()

array(['C1428335', 'C3146465', 'C1823788', ...,
       'HSCLO:chr12.69390001-69391000 CUI',
       'HSCLO:chr12.69391001-69392000 CUI',
       'HSCLO:chr12.69396001-69397000 CUI'], dtype=object)

In [202]:
targets = df[df.relation == 'located_in'].target.unique()

In [203]:
# Classify the 'located_in' targets by the vocabulary of their code rows.
tmp = concept_code[concept_code.SAB.isin(['UBERON', 'EFO', 'HSCLO', 'HGNC'])]
target_set = set(targets)
hsclo_index = target_set & set(tmp.loc[tmp.SAB.isin(['HSCLO']), "id"])
gene_index = target_set & set(tmp.loc[tmp.SAB.isin(['HGNC']), "id"])
anatomy_index = target_set & set(tmp.loc[tmp.SAB.isin(['UBERON', 'EFO']), "id"])
len(targets), len(anatomy_index), len(gene_index), len(hsclo_index)

(12814, 48, 301, 12465)

In [204]:
relation = 'located_in'
ind = df[(df.relation == relation) & (df.target.isin(hsclo_index))].index
print(len(ind))

1240810


In [205]:
df.loc[ind, 'relation'] = 'located_in_hsclo'

In [206]:
relation = 'located_in'
ind = df[(df.relation == relation) & (df.target.isin(anatomy_index))].index
print(len(ind))

1240810


In [207]:
df.loc[ind, 'relation'] = 'located_in_anatomy'

In [208]:
relation = 'located_in'
ind = df[(df.relation == relation) & (df.target.isin(gene_index))].index
print(len(ind))

2041208


In [209]:
df.loc[ind, 'relation'] = 'located_in_gene'

In [212]:
# GTEXEQTL relations after the located_in split; every edge starts at a
# GTEXEQTL node, only the target node type differs.
relations = {
	name: {"source": "GTEXEQTL", "target": dst}
	for name, dst in (
		("located_in_gene", "Gene"),
		("located_in_anatomy", "Anatomy"),
		("located_in_hsclo", "HSCLO"),
		("p_value", "GTEXPVALUEBIN"),
	)
}

In [213]:
nodes = get_nodes(df, sab, relations)

GTEXEQTL
out/sab/GTEXEQTL.GTEXEQTL.nodes.csv found
Gene
out/sab/GTEXEQTL.Gene.nodes.csv found
Anatomy
out/sab/GTEXEQTL.Anatomy.nodes.csv found
HSCLO
out/sab/GTEXEQTL.HSCLO.nodes.csv found
GTEXPVALUEBIN


In [214]:
df.relation.unique()

array(['located_in_gene', 'located_in_anatomy', 'located_in_hsclo',
       'p_value', 'location_of', 'inverse_p_value'], dtype=object)

In [215]:
# Keep only the mapped relations. .copy() makes the result an independent frame,
# so the assignment below does not write into a slice of the previous df
# (SettingWithCopyWarning).
df = df[df.relation.isin(relations.keys())].copy()
# Fold the temporary per-target-type labels back into plain 'located_in'.
staged = ['located_in_gene', 'located_in_anatomy', 'located_in_hsclo']
df.loc[df.relation.isin(staged), 'relation'] = 'located_in'
df.relation.unique()

array(['located_in', 'p_value'], dtype=object)

In [216]:
df.to_csv('out/sab/GTEXEQTL.edges.csv')

## HuBMAP

In [255]:
sab = 'AZ'
dcc = "HuBMAP"
df = get_sab_df(sab, dcc)
df.head()

159010399it [02:04, 1277341.64it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,AZ:0000000 CUI,AZ:9000000 CUI,isa,AZ,,HuBMAP
1,AZ:0100000 CUI,AZ:9000000 CUI,isa,AZ,,HuBMAP
2,C0018787,AZ:0100000 CUI,isa,AZ,,HuBMAP
3,UBERON:0002113 CUI,AZ:0100000 CUI,isa,AZ,,HuBMAP
4,UBERON:0002048 CUI,AZ:0100000 CUI,isa,AZ,,HuBMAP


In [256]:
df.relation.unique()

array(['isa', 'located_in', 'inverse_isa', 'location_of'], dtype=object)

In [278]:
# AZ relations to keep; both ends of every edge are Anatomy nodes.
relations = {
	name: {"source": "Anatomy", "target": "Anatomy"}
	for name in ("isa", "located_in")
}

In [279]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [280]:
nodes = get_nodes(df, sab, relations)

Anatomy


In [291]:
sab = 'HMAZ'
dcc = "HuBMAP"
df = get_sab_df(sab, dcc)
df.head()

159010399it [02:03, 1289854.75it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C0206131,C1539593,has_marker_gene_in_heart,HMAZ,,HuBMAP
1,C0206131,C1412105,has_marker_gene_in_heart,HMAZ,,HuBMAP
2,C0206131,C1424138,has_marker_gene_in_heart,HMAZ,,HuBMAP
3,C0206131,C3470887,has_marker_gene_in_heart,HMAZ,,HuBMAP
4,C0206131,C1367441,has_marker_gene_in_heart,HMAZ,,HuBMAP


In [292]:
df.relation.unique()

array(['has_marker_gene_in_heart', 'has_marker_gene_in_kidney',
       'has_marker_gene_in_liver', 'inverse_has_marker_gene_in_heart',
       'inverse_has_marker_gene_in_kidney',
       'inverse_has_marker_gene_in_liver'], dtype=object)

In [293]:
# HMAZ relations to keep: one marker-gene relation per organ, Anatomy -> Gene.
relations = {
	"has_marker_gene_in_%s" % organ: {"source": "Anatomy", "target": "Gene"}
	for organ in ("heart", "kidney", "liver")
}

In [296]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [297]:
nodes = get_nodes(df, sab, relations)

Anatomy
Gene


## IDG

In [298]:
sab = 'IDGP'
dcc = "IDG"
df = get_sab_df(sab, dcc)
df.head()

159010399it [02:07, 1243819.84it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,PUBCHEM:286 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
1,PUBCHEM:185909 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
2,PUBCHEM:439742 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
3,PUBCHEM:6436272 CUI,UNIPROTKB:P32929 CUI,bioactivity,IDGP,IC50,IDG
4,PUBCHEM:1312632 CUI,UNIPROTKB:Q969H4 CUI,bioactivity,IDGP,Kd,IDG


In [315]:
df.relation.unique()

array(['bioactivity', 'inverse_bioactivity'], dtype=object)

In [316]:
# IDGP: only the forward 'bioactivity' relation is kept (Compound -> Protein).
relations = {"bioactivity": dict(source="Compound", target="Protein")}

In [317]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [318]:
nodes = get_nodes(df, sab, relations)

Compound
Protein


In [437]:
sab = 'IDGD'
dcc = "IDG"
df = get_sab_df(sab, dcc)
df.head()

out/sab/IDGD.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,PUBCHEM:54684141 CUI,SNOMEDCT_US:33339001 CUI,indication,IDGD,,IDG
1,PUBCHEM:5311101 CUI,C0155880,indication,IDGD,,IDG
2,PUBCHEM:456201 CUI,C0036508,indication,IDGD,,IDG
3,PUBCHEM:4744 CUI,C0003950,indication,IDGD,,IDG
4,PUBCHEM:6067 CUI,C0030920,indication,IDGD,,IDG


In [438]:
df.relation.unique()

array(['indication'], dtype=object)

In [439]:
relations = {
	"indication": {
		"source": "Compound",
		"target": "Disease or Phenotype",
	},
}

In [440]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [441]:
nodes = get_nodes(df, sab, relations)

Compound
out/sab/IDGD.Compound.nodes.csv found
Disease or Phenotype


## Kids First

In [463]:
sab = 'KF'
dcc = "KF"
df = get_sab_df(sab, dcc)
df.head()

213950it [00:00, 896843.09it/s]

159010399it [02:04, 1280409.21it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,KFPT:PT-9X741E8Z CUI,KFCOHORT:SD-0TYVY1TW CUI,belongs_to_cohort,KF,,KF
1,KFPT:PT-0AQN56EH CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF
2,KFPT:PT-1HNTASHD CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF
3,KFPT:PT-2KE662T4 CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF
4,KFPT:PT-2Q0TYD81 CUI,KFCOHORT:SD-NMVV8A1Y CUI,belongs_to_cohort,KF,,KF


In [464]:
df.relation.unique()

array(['belongs_to_cohort', 'has_phenotype', 'gene_has_variants',
       'inverse_belongs_to_cohort', 'phenotype_of',
       'inverse_gene_has_variants'], dtype=object)

In [465]:
# Relation name -> node types; the keys select which KF edges are exported below.
# A dict literal can hold only one "belongs_to_cohort" entry: the original
# listed it twice and the later KFPT -> KFCOHORT value silently replaced the
# KFGENEBIN -> KFCOHORT one. Only the surviving entry is written here, in the
# position the key first occupied, so the resulting dict is unchanged.
# The KFGENEBIN rows get their own relation name ("belongs_to_cohort_bin") in a
# later cell of this section.
relations = {
	"has_phenotype": {
		"source": "KFPT",
		"target": "Disease or Phenotype",
	},
	"belongs_to_cohort": {
		"source": "KFPT",
		"target": "KFCOHORT",
	},
	"gene_has_variants": {
		"source": "KFGENEBIN",
		"target": "Gene",
	},
}

In [466]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [467]:
# Distinct prefixes (text before the first ':') of the 'belongs_to_cohort' sources.
{source_id.split(":")[0] for source_id in df.loc[df.relation == 'belongs_to_cohort', 'source'].unique()}

{'KFGENEBIN', 'KFPT'}

In [468]:
# Split the distinct 'belongs_to_cohort' sources in two: ids containing 'KFPT:'
# go to `kfpt`, every other id goes to `genebin`.
cohort_sources = df[df.relation == 'belongs_to_cohort'].source.unique()
kfpt = [source_id for source_id in cohort_sources if 'KFPT:' in source_id]
genebin = [source_id for source_id in cohort_sources if 'KFPT:' not in source_id]


In [469]:
ind = df[(df.relation == 'belongs_to_cohort') & (df.source.isin(genebin))].index
len(ind)

13375

In [470]:
# Give the rows selected in `ind` (belongs_to_cohort edges whose source is in
# `genebin`) their own relation name, so 'belongs_to_cohort' keeps only the
# remaining sources.
# NOTE(review): out/sab/KF.edges.csv was written a few cells earlier, before this
# relabel, and no cell shown in this section writes it again — confirm whether
# the edges file should be exported once more after this step.
df.loc[ind, 'relation'] = 'belongs_to_cohort_bin'

In [471]:
relations = {
	"has_phenotype": {
		"source": "KFPT",
		"target": "Disease or Phenotype",
	},
	"belongs_to_cohort_bin": {
		"source": "KFGENEBIN",
		"target": "KFCOHORT",
	},
	"gene_has_variants": {
		"source": "KFGENEBIN",
		"target": "Gene",
	},
	"belongs_to_cohort": {
		"source": "KFPT",
		"target": "KFCOHORT",
	},
}

In [472]:
nodes = get_nodes(df, sab, relations)

KFPT
Disease or Phenotype
KFGENEBIN
KFCOHORT
Gene


## MoTrPAC

In [343]:
sab = 'MOTRPAC'
dcc = "MoTrPAC"
df = get_sab_df(sab, dcc)
df.head()

159010399it [02:04, 1277317.99it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,MOTRPAC:ENSRNOG00000000012-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000012 CUI,associated_with,MOTRPAC,,MoTrPAC
1,MOTRPAC:ENSRNOG00000000073-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000073 CUI,associated_with,MOTRPAC,,MoTrPAC
2,MOTRPAC:ENSRNOG00000000130-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000130 CUI,associated_with,MOTRPAC,,MoTrPAC
3,MOTRPAC:ENSRNOG00000000165-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000165 CUI,associated_with,MOTRPAC,,MoTrPAC
4,MOTRPAC:ENSRNOG00000000245-gastrocnemius-male CUI,ENSEMBL:ENSRNOG00000000245 CUI,associated_with,MOTRPAC,,MoTrPAC


In [344]:
df.relation.unique()

array(['associated_with', 'located_in', 'sex', 'inverse_associated_with',
       'location_of', 'inverse_sex'], dtype=object)

In [345]:
# MOTRPAC relations kept for export: the source is always a MOTRPAC node and
# only the target node type varies per relation.
relations = {
	relation: {"source": "MOTRPAC", "target": target_type}
	for relation, target_type in (
		("associated_with", "Gene"),
		("located_in", "Anatomy"),
		("sex", "Sex"),
	)
}

In [346]:
set(df.relation) - set(relations.keys())

{'inverse_associated_with', 'inverse_sex', 'location_of'}

In [347]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [348]:
nodes = get_nodes(df, sab, relations)

MOTRPAC
Gene
Anatomy
Sex


## MW

In [580]:
sab = 'MW'
dcc = "Metabolomics"
df = get_sab_df(sab, dcc)
df.head()

out/sab/MW.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C0021853,C0030234,produces,MW,,Metabolomics
1,C0021853,C0070210,produces,MW,,Metabolomics
2,C0021853,C0023139,produces,MW,,Metabolomics
3,C0021853,PUBCHEM:3893 CUI,produces,MW,,Metabolomics
4,C0021853,C0027138,produces,MW,,Metabolomics


In [581]:
df.relation.unique()

array(['produces', 'causally_influences', 'correlated_with_condition',
       'produced_by', 'causally_influenced_by',
       'inverse_correlated_with_condition'], dtype=object)

In [582]:
target = df[df.relation == 'produces'].target.unique()

In [583]:
labels.loc[list(set(target).intersection(labels.index))]

Unnamed: 0,label
PUBCHEM:449093 CUI,trans-Zeatin
C0041942,urea
PUBCHEM:145453498 CUI,Phe-Thr
C0008174,clorazepate
PUBCHEM:7408124 CUI,Pro-Trp
...,...
C0033405,promethazine
PUBCHEM:5312890 CUI,12-Oxo-10Z-dodecenoic acid
PUBCHEM:25217593 CUI,Ala-Val-Arg
C0073082,resistomycin


In [584]:
from glob import glob

In [585]:
relations = {
	"produces": {
		"source": "Anatomy",
		"target": "Metabolite",
	},
	"causally_influences": {
		"source": "Gene",
		"target": "Metabolite",
	},
	"correlated_with_condition": {
		"source": "Metabolite",
		"target": "Disease or Phenotype",
	}
}

In [586]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [609]:
df = df[df.relation.isin(list(relations.keys()))]

In [598]:
nodes = get_nodes(df, sab, relations)

Anatomy
out/sab/MW.Anatomy.nodes.csv found
Metabolite
out/sab/MW.Metabolite.nodes.csv found
Gene
out/sab/MW.Gene.nodes.csv found
Disease or Phenotype
out/sab/MW.Disease or Phenotype.nodes.csv found


In [599]:
# Remove the C0035298 row from the Metabolite node table (all other rows are kept).
nodes["Metabolite"] = nodes["Metabolite"].loc[nodes["Metabolite"].index != 'C0035298']

In [600]:
nodes["Metabolite"].loc['PUBCHEM:638015 CUI']

label      all-<i>trans</i>-retinal
type                     Metabolite
PUBCHEM                    638015.0
Name: PUBCHEM:638015 CUI, dtype: object

In [601]:
nodes["Metabolite"].loc['C1426339', 'label'] = 'COQ4'

In [602]:
nodes['Metabolite'].loc[['C3495801', 'C0033036', 'C0038454']]

Unnamed: 0_level_0,label,type,PUBCHEM
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C3495801,Granulomatosis with polyangiitis,Metabolite,145459098.0
C0033036,Atrial Premature Complexes,Metabolite,145459091.0
C0038454,Cerebrovascular accident,Metabolite,6131.0


In [603]:
# Derive a replacement id for three Metabolite rows from their PUBCHEM column.
# `id_mapper` maps old id -> new id; `new_index` is the full index with those
# three ids swapped.
# NOTE(review): the new ids use the prefix "PUBMED" although the number comes
# from the PUBCHEM column — confirm the prefix is intended.
# NOTE(review): in the cells visible here `new_index` is only used for its
# length; it is never assigned back as the table's index — confirm.
new_index = []
id_mapper = {}
for node_id, row in nodes['Metabolite'].iterrows():
	if node_id not in ['C3495801', 'C0033036', 'C0038454']:
		new_index.append(node_id)
		continue
	index = "PUBMED:%d CUI"%int(row["PUBCHEM"])
	new_index.append(index)
	id_mapper[node_id] = index
	print(index)
len(new_index)

PUBMED:145459091 CUI
PUBMED:145459098 CUI
PUBMED:6131 CUI


9627

In [604]:
# Create the replacement Metabolite rows and give them their corrected labels.
# old id -> (new id, corrected label); the id pairs are the ones recorded in
# `id_mapper`.
metabolite_fixes = {
	'C3495801': ('PUBMED:145459098 CUI', "(3'-5')-Guanylyladenosine"),
	'C0033036': ('PUBMED:145459091 CUI', "(3'-5')-Adenylylcytidine"),
	'C0038454': ('PUBMED:6131 CUI', "5'-Cytidylic acid"),
}
for old_id, (new_id, new_label) in metabolite_fixes.items():
	# Assigning to a missing row label through .loc appends a row whose other
	# columns are NaN. Copy the source row first so the new row keeps its
	# `type` and `PUBCHEM` values; skip the copy when the new row already
	# exists or the old one is gone (e.g. the cell is run a second time).
	if new_id not in nodes['Metabolite'].index and old_id in nodes['Metabolite'].index:
		nodes['Metabolite'].loc[new_id] = nodes['Metabolite'].loc[old_id]
	nodes['Metabolite'].loc[new_id, 'label'] = new_label

In [616]:
id_mapper

{'C0033036': 'PUBMED:145459091 CUI',
 'C3495801': 'PUBMED:145459098 CUI',
 'C0038454': 'PUBMED:6131 CUI'}

In [618]:
# Rewrite edge sources that still use one of the three replaced ids, using the
# old id -> new id pairs in `id_mapper`. One vectorised assignment replaces the
# row-by-row iterrows()/.at loop.
remap_mask = df.source.isin(['C3495801', 'C0033036', 'C0038454'])
df.loc[remap_mask, 'source'] = df.loc[remap_mask, 'source'].map(id_mapper)

In [619]:
# Rewrite edge targets that still use one of the three replaced ids, using the
# old id -> new id pairs in `id_mapper`. One vectorised assignment replaces the
# row-by-row iterrows()/.at loop.
remap_mask = df.target.isin(['C3495801', 'C0033036', 'C0038454'])
df.loc[remap_mask, 'target'] = df.loc[remap_mask, 'target'].map(id_mapper)

In [626]:
# Export the Metabolite nodes, leaving out the three ids that were replaced.
metabolite_nodes = nodes["Metabolite"]
metabolite_nodes[~metabolite_nodes.index.isin(['C3495801', 'C0033036', 'C0038454'])].to_csv('out/sab/%s.%s.nodes.csv'%(sab,'Metabolite'))

In [621]:
# Point every edge that targets C0035298 at 'PUBCHEM:638015 CUI' instead.
ind = df.index[df.target == 'C0035298']
df.loc[ind, 'target'] = 'PUBCHEM:638015 CUI'

In [622]:
df.to_csv('out/sab/%s.edges.csv'%sab)

## SPARC

In [63]:
sab = 'NPO'
dcc = "SPARC"
df = get_sab_df(sab, dcc)
df.head()

159010399it [02:02, 1294940.66it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,NIFSTD.NIFEXT:5252 CUI,NIFSTD.NIFEXT:5239 CUI,isa,NPO,,SPARC
1,C2337482,C0037303,part_of,NPO,,SPARC
2,C1265533,C0039005,isa,NPO,,SPARC
3,UBERON:0001807 CUI,UBERON:0007134 CUI,isa,NPO,,SPARC
4,C0228973,UBERON:0007134 CUI,isa,NPO,,SPARC


In [64]:
df.relation.unique()

array(['isa', 'part_of', 'is_part_of', 'contributes_to_morphology_of',
       'delineates', 'isdelineatedby', 'has_role', 'inverse_isa',
       'has_part', 'inverse_is_part_of',
       'inverse_contributes_to_morphology_of', 'inverse_delineates',
       'inverse_isdelineatedby', 'role_of'], dtype=object)

In [65]:
df.shape

(7956, 6)

In [165]:
# Every distinct id that appears as the source or the target of an edge in df.
ids = list(set(df.source.unique()).union(df.target.unique()))
len(ids)

1913

In [166]:
concept_code[concept_code.id.isin(ids)].SAB.unique()

array(['LCH_NW', 'MTH', 'NCI', 'LNC', 'CSP', 'UWDA', 'MSH', 'SNOMEDCT_US',
       'PSY', 'FMA', 'CHV', 'ICF-CY', 'ICF', 'NEU', 'OMIM', 'MMSL',
       'HL7V2.5', 'NCBI', 'MEDCIN', 'MEDLINEPLUS', 'DRUGBANK', 'CPM',
       'MTHICD9', 'GO', 'SNOMEDCT_VET', 'GS', 'RXNORM', 'VANDF',
       'ICD10AM', 'MTHSPL', 'PDQ', 'NDDF', 'SRC', 'MDR', 'AOT', 'ATC',
       'USP', 'ICPC2ICD10ENG', 'UBERON', 'CHEBI', 'CL', 'PATO', 'OBI',
       'PR', 'SO', 'CARO', 'MONDO', 'MA', 'EFO', 'ZFA', 'AZ', 'PUBCHEM',
       'ENTREZ', 'ILX.TR', 'NIFSTD.NIFEXT', 'ILX', 'NIFSTD',
       'NIFSTD.NLX.MOL', 'PAX.PAXSPN', 'NIFSTD.NLX', 'NIFSTD.NLX.ORG',
       'PAX.PAXRAT'], dtype=object)

In [167]:
tmp = concept_code[concept_code.id.isin(ids)].groupby('id')

In [168]:
len(ids)

1913

In [242]:
def is_substring(l, substring):
	"""Return True when `substring` occurs inside at least one element of `l`.

	Returns False when no element contains it, including when `l` is empty.
	"""
	return any(substring in item for item in l)

In [267]:
# Ordered SAB -> node-type rules. The first rule that matches a concept wins,
# so the order below is significant.
#   "exact":     some code of the concept has exactly this SAB
#   "substring": some code of the concept has an SAB containing this text
# The original if/elif chain tested 'FMA' twice; the second test could never
# be reached and is not repeated here.
sab_type_rules = [
	("UBERON", "exact", "Anatomy"),
	("ENTREZ", "exact", "Gene"),
	("PUBCHEM", "exact", "Compound"),
	("NIFSTD", "substring", "NIFSTD"),
	("PAX", "substring", "Anatomy"),
	("FMA", "exact", "Anatomy"),
	("PATO", "exact", "PATO"),
	("PR", "exact", "Protein"),
	("NCBI", "exact", "Taxon"),
	("CHEBI", "exact", "Compound"),
	("LNC", "exact", "Anatomy"),
	("OBI", "exact", "Taxon"),
	("CL", "exact", "Anatomy"),
	("ILX", "exact", "ILX"),
	("ILX.TR", "exact", "ILX"),
	("UWDA", "exact", "Anatomy"),
]

# node type -> list of concept ids assigned to it
mapping = {}
for g in tmp.groups:
	group = tmp.get_group(g)
	# Build the SAB list once per concept instead of once per tested rule.
	sabs = list(group.SAB)
	# Fallback when no rule matches: the concept's first SAB is used as its type.
	gr = sabs[0]
	for sab_key, match_kind, node_type in sab_type_rules:
		if match_kind == "exact":
			matched = sab_key in sabs
		else:
			matched = any(sab_key in s for s in sabs)
		if matched:
			gr = node_type
			break
	mapping.setdefault(gr, []).append(g)

In [268]:
for k,v in mapping.items():
	print(k, len(v))

Anatomy 1226
Compound 122
Taxon 67
GO 9
Gene 122
ILX 227
NIFSTD 83
PATO 12
Protein 41
SO 4


In [272]:
# Build (or reload from disk) one node table per node type in `mapping`.
# Each table is indexed by concept id and has a label, a type and one column
# per frequently occurring code system (SAB).
nodes = {}
for node_type, node_index in mapping.items():
	print(node_type)
	filename = 'out/sab/%s.%s.nodes.csv'%(sab, node_type)
	if (os.path.isfile(filename)):
		# A table written by an earlier run is reused as-is.
		print('%s found'%filename)
		node_df = pd.read_csv(filename, index_col=0)
		nodes[node_type] = node_df
	else:
		node_index = list(node_index)
		node_df = pd.DataFrame(index=node_index, columns=["label", "type"])
		node_df.index.name = "id"
		node_df["type"] = node_type
		# Default each label to the node id, then overwrite it where `labels` has an entry.
		node_df["label"] = node_df.index
		with_label = list(set(labels.index).intersection(node_index))
		node_df.loc[with_label, 'label'] = labels.loc[with_label, "label"]
		if node_type == "Gene":
			node_df.loc[with_label, 'label'] = [i.replace(" gene", "") for i in node_df.loc[with_label, 'label']]
		# CODE: codes of these nodes, ignoring the NOCODE placeholder
		filtered = concept_code[concept_code["id"].isin(node_df.index)]
		filtered = filtered[filtered.CODE != 'NOCODE']
		# filter keys: keep only SABs that occur more often than half the node count
		# (value_counts() is computed once; the original computed it twice)
		sab_counts = filtered.SAB.value_counts()
		code_keys = sab_counts[sab_counts > node_df.shape[0]/2].index
		score_df = pd.DataFrame(index=filtered.id.unique(), columns=code_keys)
		score_df.index.name = 'id'
		# The original also grouped the unfiltered codes here and then overwrote
		# that result before using it; only the grouping that is used remains.
		grouped_concept = filtered.groupby("id")
		for group in score_df.index:
			g = grouped_concept.get_group(group)
			g = g[g.SAB.isin(code_keys)]
			cols = g.SAB
			values = g.CODE
			# One value per SAB: the first code listed for that SAB.
			score_df.loc[group]=pd.Series(list(values), index=cols).groupby('SAB').first()
		# NOTE(review): merge() defaults to an inner join, so nodes with no row in
		# `score_df` (no code other than NOCODE) are dropped — confirm this is intended.
		node_df = node_df.merge(score_df, on="id")
		node_df.to_csv(filename)
		nodes[node_type] = node_df

Anatomy
Compound
Taxon
GO
Gene
ILX
NIFSTD
PATO
Protein
SO


In [275]:
df.relation.unique()

array(['isa', 'part_of', 'is_part_of', 'contributes_to_morphology_of',
       'delineates', 'isdelineatedby', 'has_role', 'inverse_isa',
       'has_part', 'inverse_is_part_of',
       'inverse_contributes_to_morphology_of', 'inverse_delineates',
       'inverse_isdelineatedby', 'role_of'], dtype=object)

In [276]:
relations = [
'isa',
'part_of',
'is_part_of',
'contributes_to_morphology_of',
'delineates',
'isdelineatedby',
'has_role',
]

In [279]:
df[df.relation.isin(relations)].to_csv("out/sab/%s.edges.csv"%sab)

In [301]:
# Print each DCC name and pass its list of exported file names to compress().
# NOTE(review): `dcc_filename` and `compress` are defined in cells that appear
# further down in this notebook, so this cell fails on a fresh top-to-bottom run.
for dcc, filenames in dcc_filename.items():
	print(dcc)
	compress(filenames, dcc)

IDG
File Paths:
['IDGD.edges.csv', 'IDGP.Protein.nodes.csv', 'IDGP.edges.csv', 'IDGP.Compound.nodes.csv', 'IDGD.Compound.nodes.csv', 'IDGD.Disease.nodes.csv']
GlyGen
File Paths:
['GLYCANS.Glytoucan.nodes.csv', 'PROTEOFORM.Glycoprotein.nodes.csv', 'PROTEOFORM.Isoform.nodes.csv', 'PROTEOFORM.Glytoucan.nodes.csv', 'PROTEOFORM.GlyGen Location.nodes.csv', 'PROTEOFORM.Amino Acid.nodes.csv', 'GLYCANS.edges.csv', 'PROTEOFORM.Glycoprotein Citation.nodes.csv', 'GLYCANS.GlyGen Residue.nodes.csv', 'PROTEOFORM.Protein.nodes.csv', 'GLYCANS.Glycan Motif.nodes.csv', 'PROTEOFORM.Glycoprotein Evidence.nodes.csv', 'GLYCANS.Glycosyltransferase Reaction.nodes.csv', 'GLYCANS.Glycosylation.nodes.csv', 'PROTEOFORM.GP ID2PRO.nodes.csv', 'GLYCANS.GlyGen src.nodes.csv', 'GLYCANS.GlyGen Glycosequence.nodes.csv', 'GLYCANS.Protein.nodes.csv', 'PROTEOFORM.edges.csv', 'PROTEOFORM.Glycosylation Site.nodes.csv']
4DN
File Paths:
['4DN.4DN File.nodes.csv', '4DN.Anatomy.nodes.csv', '4DN.edges.csv', '4DN.4DN QVal Bin.nodes

## Other Files
### CLINVAR

In [537]:
sab = 'CLINVAR'
df = get_sab_df(sab)
df.head()

out/sab/CLINVAR.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C1825487,C0747085,gene_associated_with_disease_or_phenotype,CLINVAR,,
1,C1825487,C0029882,gene_associated_with_disease_or_phenotype,CLINVAR,,
2,C1422135,C0271742,gene_associated_with_disease_or_phenotype,CLINVAR,,
3,C1422135,C1535926,gene_associated_with_disease_or_phenotype,CLINVAR,,
4,C1422135,C0497552,gene_associated_with_disease_or_phenotype,CLINVAR,,


In [518]:
df.relation.unique()

array(['gene_associated_with_disease_or_phenotype'], dtype=object)

In [519]:
set(i.split(":")[0] for i in df[df.relation == 'gene_associated_with_disease_or_phenotype'].target if ":" in i)

{'DOID', 'EFO', 'HP', 'MEDGEN', 'MONDO', 'MSH', 'OMIM'}

In [520]:
relations = ["gene_associated_with_disease_or_phenotype"]
df[df.relation.isin(relations)].to_csv("out/sab/%s.edges.csv"%sab)

In [521]:
labels.loc['C1825487']

label    A2ML1 gene
Name: C1825487, dtype: object

In [522]:
relations = {
	"gene_associated_with_disease_or_phenotype": {
		"source": "Gene",
		"target": "Disease or Phenotype",
	}
}

In [523]:
nodes = get_nodes(df, sab, relations)

Gene
out/sab/CLINVAR.Gene.nodes.csv found


Disease or Phenotype
out/sab/CLINVAR.Disease or Phenotype.nodes.csv found


In [538]:
ind = df[df.target == 'C1417848'].index
ind

Index([], dtype='int64')

In [527]:
# Export the edges, leaving out the rows whose index label is in `ind`.
# A vectorised membership mask replaces the Python-level loop over every row.
df[~df.index.isin(ind)].to_csv("out/sab/%s.edges.csv"%sab)

In [529]:
nodes['Disease or Phenotype'].loc['C1417848']

label                NRL gene
type     Disease or Phenotype
OMIM                   162080
ORDO                 123930.0
MONDO                  5283.0
Name: C1417848, dtype: object

In [536]:
# Export the Disease or Phenotype nodes without C1417848, whose label is a gene name.
disease_nodes = nodes['Disease or Phenotype']
disease_nodes[disease_nodes.index != 'C1417848'].to_csv("out/sab/%s.%s.nodes.csv"%(sab, 'Disease or Phenotype'))

### HGNCHPO

In [414]:
sab = 'HGNCHPO'
df = get_sab_df(sab)
df.head()

out/sab/HGNCHPO.edges.csv exists!


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,C1826605,C4025901,associated_with,HGNCHPO,,
1,C1826605,C4021817,associated_with,HGNCHPO,,
2,C1826605,C0026633,associated_with,HGNCHPO,,
3,C1826605,C2183966,associated_with,HGNCHPO,,
4,C1826605,C4025887,associated_with,HGNCHPO,,


In [415]:
df.relation.unique()

array(['associated_with'], dtype=object)

In [416]:
hp = df[df.relation == 'associated_with'].target.unique()

In [417]:
df[df.relation == 'associated_with'].to_csv("out/sab/%s.edges.csv"%sab)

In [418]:
relations = {
	"associated_with": {
		"source": "Gene",
		"target": "Disease or Phenotype",
	}
}

In [419]:
nodes = get_nodes(df, sab, relations)

Gene
out/sab/HGNCHPO.Gene.nodes.csv found
Disease or Phenotype


### MSIGDB

In [356]:
sab = 'MSIGDB'
df = get_sab_df(sab)
df.head()

159010399it [02:26, 1083406.58it/s]


Unnamed: 0,source,target,relation,SAB,evidence_class,dcc
0,MSIGDB:M40835 CUI,C1424141,targets_expression_of_gene,MSIGDB,,
1,MSIGDB:M1556 CUI,C1424200,chr_band_contains_gene,MSIGDB,,
2,MSIGDB:M1884 CUI,C1424528,pathway_associated_with_gene,MSIGDB,,
3,MSIGDB:M39822 CUI,C1424528,pathway_associated_with_gene,MSIGDB,,
4,MSIGDB:M752 CUI,C1424528,pathway_associated_with_gene,MSIGDB,,


In [357]:
df.relation.unique()

array(['targets_expression_of_gene', 'chr_band_contains_gene',
       'pathway_associated_with_gene', 'has_marker_gene',
       'has_signature_gene', 'inverse_targets_expression_of_gene',
       'inverse_chr_band_contains_gene',
       'inverse_pathway_associated_with_gene', 'inverse_has_marker_gene',
       'inverse_has_signature_gene'], dtype=object)

In [385]:
d = df[df.relation == 'has_signature_gene']

In [386]:
set([i.split(":")[0] for i in d.source])

{'MSIGDB'}

In [387]:
concept_code[concept_code.id.isin(d.target.unique())].SAB.value_counts()

SAB
ENSEMBL        4384
MTH            4384
HGNC           4384
ENTREZ         4380
OMIM           4347
NCI            2348
ORDO           1576
LNC             344
CHV              46
PDQ              38
MONDO            30
MSH              22
CSP              16
EFO              13
LCH_NW            5
SNOMEDCT_US       4
CHEBI             2
AOT               1
DOID              1
Name: count, dtype: int64

In [388]:
# MSIGDB relations kept for export: each one links an MSIGDB node (source) to a
# Gene node (target). The original literal listed "has_signature_gene" twice;
# a dict keeps one entry per key, so it is listed once here.
relations = {
	relation: {"source": "MSIGDB", "target": "Gene"}
	for relation in (
		"targets_expression_of_gene",
		"chr_band_contains_gene",
		"pathway_associated_with_gene",
		"has_marker_gene",
		"has_signature_gene",
	)
}

In [389]:
df[df.relation.isin(list(relations.keys()))].to_csv("out/sab/%s.edges.csv"%sab)

In [390]:
nodes = get_nodes(df, sab, relations)

MSIGDB
Gene


In [630]:
# Archive name -> the SAB prefixes whose exported files belong in that archive.
sab_dict = {
	"LINCS": ["LINCS"],
	"4DN": ["4DN"],
	"ERCC": ["ERCCRBP", "ERCCREG"],
	"GlyGen": ["PROTEOFORM", "GLYCANS"],
	"GTEx": ["GTEXEXP", "GTEXEQTL"],
	"HuBMAP": ["AZ", "HMAZ"],
	"IDG": ["IDGP", "IDGD"],
	"KF": ["KF"],
	"MoTrPAC": ["MOTRPAC"],
	"MW": ["MW"],
	"SPARC": ["NPO"],
	"CLINVAR": ["CLINVAR"],
	"HGNCHPO": ["HGNCHPO"],
	"MSIGDB": ["MSIGDB"],
}

# Flatten all SAB lists into one set and show how many distinct SABs there are.
dcc_sabs = set().union(*sab_dict.values())
len(dcc_sabs)

19

In [631]:
# SAB prefix of every CSV in out/sab/: the path text before its first '.',
# with the leading "out/sab/" removed.
file_sabs = {
	csv_path.split(".")[0].replace("out/sab/", "")
	for csv_path in glob('out/sab/*.csv')
}

In [632]:
len(file_sabs), len(dcc_sabs), len(file_sabs.intersection(dcc_sabs))

(19, 19, 19)

In [633]:
import zlib
import zipfile

def compress(file_names, zip_name):
    """Zip the named files from out/sab/ into out/compressed/<zip_name>.zip.

    Args:
        file_names: names of files inside out/sab/; each file is stored in the
            archive under this same name.
        zip_name: name of the archive, without the ".zip" extension.

    A file that does not exist is reported by name and skipped, and the
    remaining files are still archived. Previously the first missing file
    stopped the loop, so every later file was silently left out.

    Raises:
        FileNotFoundError: if the out/compressed/ directory does not exist.
    """
    print("File Paths:")
    print(file_names)

    path = "out/sab/"

    # ZIP_DEFLATED compresses each member; ZIP_STORED would only store it.
    compression = zipfile.ZIP_DEFLATED

    # The context manager closes the archive even if writing fails part-way.
    with zipfile.ZipFile("out/compressed/%s.zip"%zip_name, mode="w") as zf:
        for file_name in file_names:
            try:
                # First argument: file on disk; second: its name inside the zip.
                zf.write(path + file_name, file_name, compress_type=compression)
            except FileNotFoundError:
                print("An error occurred: %s not found"%(path + file_name))

In [634]:
# Invert sab_dict: SAB prefix -> the archive name it is listed under.
dict_sab = {
	sab_name: archive_name
	for archive_name, sab_names in sab_dict.items()
	for sab_name in sab_names
}

In [635]:
# Group the exported CSV file names by the archive they belong to.
# The SAB is the part of the file name before its first '.', and `dict_sab`
# maps it to the archive name; an SAB missing from `dict_sab` raises KeyError.
# Loop-local names are used so this cell no longer overwrites the notebook-level
# `sab`, `dcc` and `filename` variables that other cells interpolate into
# output paths.
dcc_filename = {}
for csv_path in glob('out/sab/*.csv'):
	csv_name = csv_path.replace("out/sab/", "")
	file_sab = csv_name.split(".")[0]
	dcc_filename.setdefault(dict_sab[file_sab], []).append(csv_name)

In [636]:
for dcc, filenames in dcc_filename.items():
	compress(filenames, dcc)

File Paths:
['IDGD.edges.csv', 'IDGP.Protein.nodes.csv', 'IDGP.edges.csv', 'IDGD.Disease or Phenotype.nodes.csv', 'IDGP.Compound.nodes.csv', 'IDGD.Compound.nodes.csv']
File Paths:
['GLYCANS.Glytoucan.nodes.csv', 'PROTEOFORM.Glycoprotein.nodes.csv', 'PROTEOFORM.Isoform.nodes.csv', 'PROTEOFORM.Glytoucan.nodes.csv', 'PROTEOFORM.GlyGen Location.nodes.csv', 'PROTEOFORM.Amino Acid.nodes.csv', 'GLYCANS.edges.csv', 'PROTEOFORM.Glycoprotein Citation.nodes.csv', 'GLYCANS.GlyGen Residue.nodes.csv', 'PROTEOFORM.Protein.nodes.csv', 'GLYCANS.Glycan Motif.nodes.csv', 'PROTEOFORM.Glycoprotein Evidence.nodes.csv', 'GLYCANS.Glycosyltransferase Reaction.nodes.csv', 'GLYCANS.Glycosylation.nodes.csv', 'PROTEOFORM.GP ID2PRO.nodes.csv', 'GLYCANS.GlyGen src.nodes.csv', 'GLYCANS.GlyGen Glycosequence.nodes.csv', 'GLYCANS.Protein.nodes.csv', 'PROTEOFORM.edges.csv', 'PROTEOFORM.Glycosylation Site.nodes.csv']
File Paths:
['4DN.4DN File.nodes.csv', '4DN.Anatomy.nodes.csv', '4DN.edges.csv', '4DN.4DN QVal Bin.nodes.c

In [516]:
concept_code[concept_code.id == 'C1417848']

Unnamed: 0,id,code_id,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
1503118,C1417848,MTH:NOCODE,MTH,NOCODE,,,,
3184031,C1417848,OMIM:162080,OMIM,162080,,,,
4303000,C1417848,HGNC:8002,HGNC,8002,,,,
4965231,C1417848,ORDO:123930,ORDO,123930,,,,
5032941,C1417848,MONDO:0005283,MONDO,0005283,,,,
5088526,C1417848,EFO:0003839,EFO,0003839,,,,
5139873,C1417848,ENSEMBL:ENSG00000129535,ENSEMBL,ENSG00000129535,13.0,24078662.0,24115010.0,
5593888,C1417848,ENTREZ:4901,ENTREZ,4901,,24078662.0,24114949.0,
