In [4]:
import pandas as pd
import requests
from glob import glob
import re

In [42]:
data_dir = '../dd_data/DD28Mar2024CSV/'

In [43]:
labels = pd.read_csv(data_dir + "CUI-SUIs.csv")
labels.columns = ['id', 'label']
labels = labels.set_index('id')
labels.head()

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
C0000005,(131)I-Macroaggregated Albumin
C0000139,"16,16-Dimethylprostaglandin E2"
C0000163,17-Hydroxycorticosteroids
C0000165,17-Hydroxysteroid Dehydrogenases
C0000190,2'-CMP


In [44]:
code_labels = pd.read_csv(data_dir + "CODE-SUIs.csv")
code_labels.columns = ["label", "id", "type", "cui"]
code_labels.head()

Unnamed: 0,label,id,type,cui
0,Dipalmitoyl Phosphatidylcholine,MSH:D015060,ET,C0000039
1,Cyperquat,MSH:D015655,ET,C0000098
2,1-naththylamine (substance),SNOMEDCT_US:13579002,FN,C0000102
3,1-Sar-8-Ile Angiotensin II,MSH:D015059,ET,C0000107
4,SAR ILE ANGIOTENSIN 02 01 08,MSH:D015059,DSV,C0000107


In [45]:
code_labels = code_labels.groupby('cui').first()

In [46]:
cl = set(code_labels.index) 
l=set(labels.index)
missing = cl - l

labels = pd.concat([labels, code_labels.loc[list(missing), ["label"]]])

In [92]:
pattern = "out/sab/(?P<SAB>.*)\.(?P<node_type>.+)\.nodes\.csv"
node_types = {}
for filename in glob('out/sab/*.nodes.csv'):
	node_type = re.match(pattern, filename).groupdict()['node_type']
	if node_type not in node_types:
		node_types[node_type] = []
	node_types[node_type].append(filename)

In [122]:
len(node_types)

50

In [115]:
node_ids = {}

In [116]:
for k,v in node_types.items():
	node_ids[k] = set()
	for filename in v:
		df = pd.read_csv(filename, index_col=0)
		node_ids[k] = node_ids[k].union(df.index)

In [117]:
for node_type_a, ids_a in node_ids.items():
	for node_type_b, ids_b in node_ids.items():
		if node_type_a != node_type_b:
			if len(ids_a.intersection(ids_b)):
				print(node_type_a, node_type_b, len(ids_a.intersection(ids_b)))

Gene Metabolite 1
Anatomy Disease or Phenotype 1
Anatomy Biofluid 3
Compound Metabolite 1305
Metabolite Gene 1
Metabolite Compound 1305


PATO Sex 2
Disease or Phenotype Anatomy 1
Biofluid Anatomy 3
Sex PATO 2


In [118]:
node_ids["Gene"].intersection(node_ids["Disease or Phenotype"])

set()

In [119]:
'''
Compound Metabolite 1305
Metabolite Compound 1305
'''

'\nCompound Metabolite 1305\nMetabolite Compound 1305\nMetabolite Disease or Phenotype 3\n'

In [120]:
l = list(node_ids["Metabolite"].intersection(node_ids["Disease or Phenotype"]))
labels.loc[l]

Unnamed: 0,label


In [121]:
labels.loc[l]

Unnamed: 0,label


In [124]:
print("\n".join(node_types.keys()))

Glytoucan
Glycoprotein
Isoform
Protein
4DN File
Gene
MOTRPAC
Anatomy
GlyGen Location
SO
ILX
Amino Acid
Compound
HSCLO
Metabolite
Glycoprotein Citation
PATO
MSIGDB
EXPBINS
RBP Binding Loci
NIFSTD
GTEXPVALUEBIN
GlyGen Residue
Disease or Phenotype
exRNA Loci
4DN QVal Bin
Biofluid
GO
Glycan Motif
Assay
Regulatory Element Activity
Glycoprotein Evidence
Taxon
ENCODE CCRE Data Matrix
Glycosyltransferase Reaction
Glycosylation
KFPT
GTEXEQTL
KFGENEBIN
GP ID2PRO
GlyGen src
GlyGen Glycosequence
KFCOHORT
Sex
CLINGEN ALLELE REGISTRY
4DN Loop
ENCODE CCRE
GTEXEXP
4DN Dataset
Glycosylation Site


In [126]:
for i in node_types["Gene"]:
	print(pd.read_csv(i, index_col=0).columns)

Index(['label', 'type', 'ENSEMBL', 'HGNC', 'ENTREZ', 'OMIM'], dtype='object')
Index(['label', 'type', 'HGNC', 'ENSEMBL', 'ENTREZ', 'OMIM'], dtype='object')
Index(['label', 'type', 'ENSEMBL', 'HGNC', 'OMIM', 'ENTREZ', 'ORDO'], dtype='object')
Index(['label', 'type', 'ENSEMBL', 'HGNC', 'ENTREZ', 'OMIM'], dtype='object')
Index(['label', 'type', 'HGNC', 'ENSEMBL', 'ENTREZ'], dtype='object')
Index(['label', 'type', 'HGNC', 'ENSEMBL', 'ENTREZ', 'OMIM', 'NCI'], dtype='object')
Index(['label', 'type', 'HGNC', 'ENSEMBL', 'OMIM', 'ENTREZ', 'ORDO'], dtype='object')
Index(['label', 'type', 'ENSEMBL', 'HGNC'], dtype='object')
Index(['label', 'type', 'ENSEMBL'], dtype='object')
Index(['label', 'type', 'HGNC', 'ENSEMBL', 'ENTREZ', 'OMIM'], dtype='object')
Index(['label', 'type', 'HGNC', 'ENSEMBL', 'ENTREZ'], dtype='object')
Index(['label', 'type', 'HGNC', 'ENSEMBL', 'ENTREZ', 'OMIM'], dtype='object')
Index(['label', 'type', 'ENTREZ'], dtype='object')


In [128]:
for i in node_types["Compound"]:
	print(pd.read_csv(i, index_col=0).columns)

Index(['label', 'type', 'CHEBI'], dtype='object')
Index(['label', 'type', 'PUBCHEM'], dtype='object')


Index(['label', 'type', 'PUBCHEM'], dtype='object')
Index(['label', 'type', 'PUBCHEM'], dtype='object')


In [129]:
for i in node_types["Metabolite"]:
	print(pd.read_csv(i, index_col=0).columns)

Index(['label', 'type', 'PUBCHEM'], dtype='object')


In [130]:
for i in node_types["Anatomy"]:
	print(pd.read_csv(i, index_col=0).columns)

Index(['label', 'type', 'EFO', 'LNC'], dtype='object')
Index(['label', 'type', 'LNC', 'UBERON', 'FMA', 'SNOMEDCT_US', 'NCI', 'CHV',
       'UWDA', 'MSH'],
      dtype='object')
Index(['label', 'type', 'LNC', 'UBERON', 'FMA', 'SNOMEDCT_US', 'NCI', 'CHV',
       'UWDA', 'MSH'],
      dtype='object')
Index(['label', 'type', 'AZ', 'CL'], dtype='object')
Index(['label', 'type', 'LNC', 'UBERON', 'ICF-CY', 'SNOMEDCT_US', 'UWDA',
       'FMA', 'CSP', 'NCI', 'CHV', 'MSH', 'ICF', 'OMIM', 'HL7V2.5', 'LCH_NW',
       'PSY', 'AZ'],
      dtype='object')
Index(['label', 'type', 'LNC', 'UBERON', 'FMA', 'NCI', 'SNOMEDCT_US', 'CHV',
       'UWDA', 'MSH'],
      dtype='object')
Index(['label', 'type', 'UBERON', 'FMA'], dtype='object')
Index(['label', 'type', 'AZ', 'CL'], dtype='object')
Index(['label', 'type', 'LNC', 'UBERON', 'FMA', 'CHV', 'NCI', 'SNOMEDCT_US',
       'CSP', 'MSH', 'UWDA', 'LCH_NW'],
      dtype='object')


In [133]:
for k,v in node_types.items():
	if len(v) > 1:
		print(k, len(v))

Glytoucan 2
Protein 5
Gene 13
Anatomy 9
Compound 4
HSCLO 2
Disease or Phenotype 5
GTEXEQTL 2


In [139]:
for i in node_types["Disease or Phenotype"]:
	print(pd.read_csv(i, index_col=0).columns)

Index(['label', 'type', 'OMIM', 'ORDO', 'MONDO'], dtype='object')
Index(['label', 'type', 'MDR', 'ICPC2ICD10ENG', 'SNOMEDCT_US', 'MONDO',
       'MEDCIN', 'CHV', 'DOID', 'OMIM', 'ICD10CM', 'MSH', 'NCI', 'EFO'],
      dtype='object')
Index(['label', 'type', 'MDR', 'ICPC2ICD10ENG', 'MONDO', 'OMIM', 'ORDO',
       'DOID', 'SNOMEDCT_US', 'MEDCIN', 'MSH', 'CHV', 'ICD10CM', 'NCI', 'EFO',
       'LNC'],
      dtype='object')
Index(['label', 'type', 'OMIM', 'ICPC2ICD10ENG', 'HP', 'MDR', 'MONDO',
       'SNOMEDCT_US', 'MEDCIN', 'DOID', 'CHV'],
      dtype='object')
Index(['label', 'type', 'HP', 'OMIM', 'MDR', 'ICPC2ICD10ENG'], dtype='object')


In [138]:
node_types["GTEXEQTL"]

['out/sab/GTEXEQTL.GTEXEQTL.nodes.csv', 'out/sab/ERCCREG.GTEXEQTL.nodes.csv']