In [1]:
from neo4j import GraphDatabase
import pandas as pd
from dotenv import load_dotenv
from glob import glob
import re
import os
import numpy as np
from tqdm import tqdm
from uuid import uuid5, NAMESPACE_URL

In [2]:
load_dotenv()

True

In [3]:
pattern = "out/sab/(?P<SAB>.*)\.(?P<node_type>.+)\.nodes\.csv"
node_types = {}
for filename in tqdm(glob('out/sab/*.nodes.csv')):
	node_type = re.match(pattern, filename).groupdict()['node_type']
	if node_type not in node_types:
		node_types[node_type] = []
	node_types[node_type].append(filename)

100%|██████████| 87/87 [00:00<00:00, 228493.71it/s]


In [4]:
node_ids = {}
for k,v in node_types.items():
	node_ids[k] = set()
	for filename in v:
		df = pd.read_csv(filename, index_col=0)
		node_ids[k] = node_ids[k].union(df.index)

In [5]:
all_nodes = set()
for k,v in node_ids.items():
	all_nodes = all_nodes.union(v)
len(all_nodes)

8196986

In [6]:
nodes = {}
for node_type_a, ids_a in node_ids.items():
	for node_type_b, ids_b in node_ids.items():
		if node_type_a != node_type_b:
			intersect = ids_a.intersection(ids_b)
			if len(intersect):
				node_ids[node_type_a] = node_ids[node_type_a] - intersect
				node_ids[node_type_b] = node_ids[node_type_b] - intersect
				nodes[(node_type_a, node_type_b)] = intersect
				print((node_type_a, node_type_b), len(intersect))

('Gene', 'Enzyme') 1891
('Gene', 'Metabolite') 1
('Anatomy', 'Disease or Phenotype') 1
('Anatomy', 'Biofluid') 3
('Compound', 'Metabolite') 1305
('PATO', 'Sex') 2


In [7]:
for k,v in node_ids.items():
	nodes[(k)] = v

In [8]:
all_nodes = set()
for k,v in nodes.items():
	print(k)
	all_nodes = all_nodes.union(v)

('Gene', 'Enzyme')
('Gene', 'Metabolite')
('Anatomy', 'Disease or Phenotype')
('Anatomy', 'Biofluid')
('Compound', 'Metabolite')
('PATO', 'Sex')
Glytoucan
Glycoprotein
Isoform
Protein
4DN File
Gene
MOTRPAC
Anatomy
GlyGen Location
SO
ILX
Amino Acid
Compound
Enzyme
HSCLO
Metabolite
Glycoprotein Citation
PATO
MSIGDB
EXPBINS
RBP Binding Loci
NIFSTD
GTEXPVALUEBIN
GlyGen Residue
Disease or Phenotype
exRNA Loci
4DN QVal Bin
Biofluid
GO
Glycan Motif
Assay
Regulatory Element Activity
Glycoprotein Evidence
Taxon
ENCODE CCRE Data Matrix
Glycosyltransferase Reaction
Glycosylation
KFPT
GTEXEQTL
KFGENEBIN
GP ID2PRO
GlyGen src
GlyGen Glycosequence
KFCOHORT
Sex
CLINGEN ALLELE REGISTRY
4DN Loop
ENCODE CCRE
GTEXEXP
4DN Dataset
Glycosylation Site


In [9]:
len(all_nodes)

8196986

In [10]:
len(nodes)

57

In [11]:
os.getenv('NEO4J_URL')

'bolt://localhost:7687'

In [12]:
def index_nodes(node_type, name):
	with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
		with driver.session(database="neo4j") as session:
			tx = session.begin_transaction()
			try:
				tx.run("CREATE CONSTRAINT distillery_unique_id_%s IF NOT EXISTS  FOR (n:%s) REQUIRE n.id IS UNIQUE"%(name, node_type))
				tx.run("CREATE INDEX distillery_index_id_%s IF NOT EXISTS  FOR (n:%s) ON (n.id)"%(name, node_type))
				tx.run("CREATE INDEX distillery_index_label_%s IF NOT EXISTS  FOR (n:%s) ON (n.label)"%(name, node_type))
				tx.commit()
			except Exception as e:
				print(e)
				tx.rollback()
			finally:
				tx.close()

In [13]:
for node_type in node_types:
	n = node_type
	if len(node_type.split(" ")) > 1:
		n = "`%s`"%node_type
	index_nodes(n, node_type.replace(" ", "_"))

In [14]:
def ingest_node(node_type, nodes, limit=10000):
	success = True
	with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
		with driver.session(database="neo4j") as session:
			skip = 0
			print("Ingesting: %s"%(", ".join(node_type)))
			while skip < len(nodes):
				batch = nodes[skip: skip+limit]
				tx = session.begin_transaction()
				try:
					query = '''
						UNWIND $batch as map
						CREATE (n:%s)
						SET n = map
					'''%(":".join(node_type))
					tx.run(query, {"batch": batch})
					skip += limit
					tx.commit()
				except Exception as e:
					print("Error rolling back...")
					print("Exception", e)
					tx.rollback()
					success = False
					break
				finally:
					tx.close()
			else:
				success = True			
	return success

In [15]:

for node_type, node_ids in nodes.items():
	node_type = node_type if type(node_type) == tuple else [node_type]
	node_dict = {}
	for t in node_type:
		filenames = node_types[t]
		for file in filenames:
			df = pd.read_csv(file, index_col=0).astype(str)
			idx = list(node_ids.intersection(df.index))
			df = df.loc[idx]
			for k,row in df.iterrows():
				v = {}
				for i,j in row.items():
					if type(j) == str:
						v[i] = j
					elif not np.isnan(j):
						v[i] = int(j)
				if k not in node_dict:
					node_dict[k] = {
						"id": k,
						**v
					}
				else:
					node_dict[k] = {
						**node_dict[k],
						**v
					}
	n = []
	for i in node_type:
		if len(i.split(" ")) > 1:
			n.append("`%s`"%i)
		else:
			n.append(i)
	
	r = ingest_node(n, list(node_dict.values()))
	if not r:
		break
	

Ingesting: Gene, Enzyme
Ingesting: Gene, Metabolite
Ingesting: Anatomy, `Disease or Phenotype`
Ingesting: Anatomy, Biofluid
Ingesting: Compound, Metabolite
Ingesting: PATO, Sex
Ingesting: Glytoucan
Ingesting: Glycoprotein
Ingesting: Isoform
Ingesting: Protein
Ingesting: `4DN File`
Ingesting: Gene
Ingesting: MOTRPAC
Ingesting: Anatomy
Ingesting: `GlyGen Location`
Ingesting: SO
Ingesting: ILX
Ingesting: `Amino Acid`
Ingesting: Compound
Ingesting: Enzyme
Ingesting: HSCLO
Ingesting: Metabolite
Ingesting: `Glycoprotein Citation`
Ingesting: PATO
Ingesting: MSIGDB
Ingesting: EXPBINS
Ingesting: `RBP Binding Loci`
Ingesting: NIFSTD
Ingesting: GTEXPVALUEBIN
Ingesting: `GlyGen Residue`
Ingesting: `Disease or Phenotype`
Ingesting: `exRNA Loci`
Ingesting: `4DN QVal Bin`
Ingesting: Biofluid
Ingesting: GO
Ingesting: `Glycan Motif`
Ingesting: Assay
Ingesting: `Regulatory Element Activity`
Ingesting: `Glycoprotein Evidence`
Ingesting: Taxon
Ingesting: `ENCODE CCRE Data Matrix`
Ingesting: `Glycosyltrans

In [16]:
pattern = "out/sab/(?P<SAB>.*)\.(?P<node_type>.+)\.nodes\.csv"
node_sabs = {}
for filename in glob('out/sab/*.nodes.csv'):
	sab = re.match(pattern, filename).groupdict()['SAB']
	print(sab)
	if sab not in node_sabs:
		node_sabs[sab] = []
	node_sabs[sab].append(filename)

GLYCANS
PROTEOFORM
PROTEOFORM
PROTEOFORM
IDGP
4DN
ERCCRBP
MOTRPAC
4DN
KF
PROTEOFORM
NPO
NPO
PROTEOFORM
CLINVAR
LINCS
NPO
GTEXEXP
HGNCENZ
GTEXEQTL
GTEXEXP
MW
PROTEOFORM
NPO
HMAZ
MSIGDB
GTEXEXP
GTEXEQTL
ERCCRBP
NPO
GTEXEQTL
GLYCANS
LINCS
CLINVAR
ERCCRBP
4DN
PROTEOFORM
ERCCRBP
HGNCHPO
NPO
GLYCANS
4DN
ERCCREG
HMAZ
ERCCRBP
PROTEOFORM
MOTRPAC
NPO
ERCCREG
ERCCREG
NPO
ERCCREG
IDGD
MOTRPAC
GLYCANS
GLYCANS
IDGP
KF
GTEXEQTL
KF
PROTEOFORM
GLYCANS
IDGD
GLYCANS
KF
4DN
GTEXEQTL
GLYCANS
ERCCREG
MSIGDB
MOTRPAC
MW
MW
ERCCREG
NPO
4DN
AZ
NPO
HGNCUNIPROT
ERCCREG
MW
KF
HGNCHPO
GTEXEXP
HGNCUNIPROT
4DN
PROTEOFORM


In [17]:
def ingest_edges(relation, meta, source, target, edges, limit=10000):
	success = True
	with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
		with driver.session(database="neo4j") as session:
			skip = 0
			while skip < len(edges):
				batch = edges[skip: skip+limit]
				tx = session.begin_transaction()
				try:
					query = '''
						UNWIND $batch as row
						MATCH (n:%s), (m:%s)
						WHERE n.id=row.source and m.id=row.target
						CREATE (n)-[r:%s {
							%s
						}]->(m)

					'''%(source, target, relation, meta)
					tx.run(query, {"batch": batch})
					skip += limit
					tx.commit()
				except Exception as e:
					print("Error rolling back...")
					print("Exception", e)
					tx.rollback()
					success = False
					break
				finally:
					tx.close()
			else:
				success = True			
	return success

In [18]:
import csv

In [19]:
edge_pattern = "out/sab/(?P<SAB>.*)\.edges\.csv"
success = True
for filename in glob('out/sab/*.edges.csv'):
	# GET ID TYPES
	sab = re.match(edge_pattern, filename).groupdict()['SAB']
	print(sab)
	df = pd.read_csv(filename, index_col=0)
	node_typer = {}
	# if sab == 'HGNCUNIPROT':
	# 	for i in df.source.unique():
	# 		node_typer[i] = 'Gene'
	# 	for i in df.target.unique():
	# 		node_typer[i] = 'Protein'
	# else:
	for f in node_sabs[sab]:
		node_type = re.match(pattern, f).groupdict()['node_type']
		if len(node_type.split(" ")) > 1:
			node_type = "`%s`"%node_type
		with open(f) as o:
			csv_reader = csv.reader(o)
			for row in csv_reader:
				node_typer[row[0]] = node_type
	node_typer = pd.Series(node_typer)
	if len(df.index) != len(set(df.index)):
		df = df.reset_index()
		df = df[[i for i in df.columns if i != "index"]]
	# df["id"] = df.apply(lambda x: str(uuid5(NAMESPACE_URL, "%s_%s_%s_%s"%(x[0], x[2], x[1], x[3]))), axis=1)
	for relation in df.relation.unique():
		edges = []
		d = df[df.relation == relation].dropna(axis=1, how='all')
		source_type = "|".join(node_typer[df.source].unique())
		target_type = "|".join(node_typer[df.target].unique())
		print(source_type, target_type)
		meta = []
		for col in d.columns:
			if (col not in ["source", 'target']):
				meta.append("%s:row.%s"%(col, col))
		edges = list(d.to_dict(orient="index").values())
		print("Ingesting %d %s relation of %s"%(len(edges), relation, sab))
		meta = ",\n".join(meta)
		success = ingest_edges(relation, meta, source_type, target_type, edges)
		if not success:
			break
		if not success:
				break

IDGD
Compound `Disease or Phenotype`
Ingesting 7287 indication relation of IDGD
HGNCHPO
Gene `Disease or Phenotype`
Ingesting 657199 associated_with relation of HGNCHPO
GLYCANS
Glycosylation|`Glycosyltransferase Reaction`|Glytoucan|`GlyGen Residue` Protein|`GlyGen src`|`GlyGen Glycosequence`|Glycosylation|`Glycosyltransferase Reaction`|`Glycan Motif`|`GlyGen Residue`
Ingesting 182 has_enzyme_protein relation of GLYCANS
Glycosylation|`Glycosyltransferase Reaction`|Glytoucan|`GlyGen Residue` Protein|`GlyGen src`|`GlyGen Glycosequence`|Glycosylation|`Glycosyltransferase Reaction`|`Glycan Motif`|`GlyGen Residue`
Ingesting 30986 is_from_source relation of GLYCANS
Glycosylation|`Glycosyltransferase Reaction`|Glytoucan|`GlyGen Residue` Protein|`GlyGen src`|`GlyGen Glycosequence`|Glycosylation|`Glycosyltransferase Reaction`|`Glycan Motif`|`GlyGen Residue`
Ingesting 117146 has_glycosequence relation of GLYCANS
Glycosylation|`Glycosyltransferase Reaction`|Glytoucan|`GlyGen Residue` Protein|`GlyG

Failed to read from defunct connection IPv4Address(('localhost', 7687)) (ResolvedIPv6Address(('::1', 7687, 0, 0)))


Error rolling back...
Exception Failed to read from defunct connection IPv4Address(('localhost', 7687)) (ResolvedIPv6Address(('::1', 7687, 0, 0)))
4DN
`4DN Dataset`|`4DN File`|`4DN Loop` Assay|Anatomy|`4DN File`|`4DN Loop`|HSCLO|`4DN QVal Bin`
Ingesting 24 has_assay_type relation of 4DN


ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [Errno 61] Connection refused)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [Errno 61] Connection refused)

In [None]:
node_sabs.keys()

In [None]:
df = pd.read_csv(filename, index_col=0)
df.head()

In [None]:
edge_pattern = "out/sab/(?P<SAB>.*)\.edges\.csv"
success = True
for filename in glob('out/sab/*.edges.csv'):
	# GET ID TYPES
	sab = re.match(edge_pattern, filename).groupdict()['SAB']
	print(sab)
	node_typer = {}
	for f in node_sabs[sab]:
		node_type = re.match(pattern, f).groupdict()['node_type']
		if len(node_type.split(" ")) > 1:
			node_type = "`%s`"%node_type
		with open(f) as o:
			csv_reader = csv.reader(o)
			for row in csv_reader:
				node_typer[row[0]] = node_type
	node_typer = pd.Series(node_typer)
	df = pd.read_csv(filename, index_col=0)
	if len(df.index) != len(set(df.index)):
		df = df.reset_index()
		df = df[[i for i in df.columns if i != "index"]]
	# df["id"] = df.apply(lambda x: str(uuid5(NAMESPACE_URL, "%s_%s_%s_%s"%(x[0], x[2], x[1], x[3]))), axis=1)
	for relation in df.relation.unique():
		try:
			edges = []
			d = df[df.relation == relation].dropna(axis=1, how='all')
			source_type = "|".join(node_typer[df.source].unique())
			target_type = "|".join(node_typer[df.target].unique())
			print(source_type, target_type)
		except Exception as e:
			print(e)
	# 	meta = []
	# 	for col in d.columns:
	# 		if (col not in ["source", 'target']):
	# 			meta.append("%s:row.%s"%(col, col))
	# 	edges = list(d.to_dict(orient="index").values())
	# 	print("Ingesting %d %s relation of %s"%(len(edges), relation, sab))
	# 	meta = ",\n".join(meta)
	# 	success = ingest_edges(relation, meta, source_type, target_type, edges)
	# 	if not success:
	# 		break
	# if not success:
	# 		break

In [None]:
total_edges = 0
for filename in glob('out/sab/*.edges.csv'):
	df = pd.read_csv(filename, index_col=0)
	total_edges += df.shape[0]
total_edges

In [None]:
# with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
# 		with driver.session(database="neo4j") as session:
# 			tx = session.begin_transaction()
# 			try:
# 				tx.run("CREATE CONSTRAINT distillery_unique_id_%s IF NOT EXISTS  FOR (n:%s) REQUIRE n.id IS UNIQUE"%(name, node_type))
# 				tx.run("CREATE INDEX distillery_index_id_%s IF NOT EXISTS  FOR (n:%s) ON (n.id)"%(name, node_type))
# 				tx.run("CREATE INDEX distillery_index_label_%s IF NOT EXISTS  FOR (n:%s) ON (n.label)"%(name, node_type))
# 				tx.commit()
# 			except Exception as e:
# 				print(e)
# 				tx.rollback()
# 			finally:
# 				tx.close()

In [None]:
with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
		with driver.session(database="neo4j") as session:
			tx = session.begin_transaction()
			record = tx.run("MATCH (a:Gene) RETURN a LIMIT 10")
			for i in record:
				a = i['a']

In [None]:
a.items()

In [None]:
enzyme_genes = pd.read_csv('../dd_data/HGNC_genes.txt', sep="\t", index_col=0)

In [None]:
enzyme_genes.head()

In [None]:
enzyme_ids = [int(i.split(":")[1]) for i in enzyme_genes[~enzyme_genes['Enzyme (EC) ID'].isna()].index]
len(enzyme_ids)

In [None]:
records = []
with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
	with driver.session(database="neo4j") as session:
		tx = session.begin_transaction()
		record = tx.run('''
			UNWIND $batch as row
			MATCH (a:Gene)
			WHERE a.HGNC = row	
			RETURN a   
		''', batch=enzyme_ids)
		
		for i in record:
			records.append(i)

In [None]:
len(records)

In [None]:
recs = []
for i in records:
	recs.append(i["a"]["HGNC"])

## Add Enzyme

In [None]:
records = []
with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
	with driver.session(database="neo4j") as session:
		tx = session.begin_transaction()
		tx.run('''
			UNWIND $batch as row
			MATCH (a:Gene)
			WHERE a.HGNC = row	
			set a :Enzyme
		''', batch=enzyme_ids)
		tx.commit()

In [None]:
records = []
with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
	with driver.session(database="neo4j") as session:
		tx = session.begin_transaction()
		results = tx.run('''
			MATCH (a:Enzyme)
			return a
		''')
		for i in results:
			records.append(i)

In [None]:
len(records)

In [None]:
records[0]['a'].items()

In [None]:
recs = {}
for i in records:
	vals = {}
	for k,v in i['a'].items():
		if (k != 'id'):
			vals[k] = v
	recs[i['a']['id']] = vals

In [None]:
len(recs)

In [None]:
df = pd.DataFrame.from_dict(recs, orient="index")

In [None]:
df = df[['label', 'type', 'HGNC', 'ENSEMBL', 'OMIM',  'ORDO', 'ENTREZ', 'NCI']]

In [None]:
df.to_csv('out/sab/hgnc_enzyme.Enzyme.nodes.csv')

In [None]:
mapper = []
with open('../dd_data/idmapping_2023_08_24.tsv') as o:
	csv_reader = csv.reader(o, delimiter="\t")
	header = True
	for row in csv_reader:
		if header:
			header = False
		else:
			mapper.append({
				"protein": row[0],
				"gene": row[1]
			})

In [None]:
mapper[0]

In [None]:
mapper[0]

In [None]:
proteins = []
with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
	with driver.session(database="neo4j") as session:
		tx = session.begin_transaction()
		record = tx.run('''
			UNWIND $batch as row
			MATCH (a:Protein)
			WHERE a.UNIPROTKB = row.protein 
			RETURN a   
		''', batch=mapper)
		
		for i in record:
			proteins.append(i)

In [None]:
genes = []
with GraphDatabase.driver(os.getenv('NEO4J_URL'), auth=(os.getenv('NEO4J_USER'), os.getenv('NEO4J_PASSWORD'))) as driver:
	with driver.session(database="neo4j") as session:
		tx = session.begin_transaction()
		record = tx.run('''
			UNWIND $batch as row
			MATCH (a:Gene)
			WHERE a.HGNC = row.gene 
			RETURN a   
		''', batch=mapper)
		
		for i in record:
			genes.append(i)

In [None]:
len(proteins)

In [None]:
len(genes)

In [None]:
proteins[0]['a']['UNIPROTKB']

In [None]:
protein_id_mapper = {}
protein_label_mapper = {}
for i in proteins:
	protein_id_mapper[i['a']['UNIPROTKB']] = i['a']["id"]
	protein_label_mapper[i['a']['UNIPROTKB']] = i['a']["label"]

In [None]:
gene_id_mapper = {}
gene_label_mapper = {}
for i in genes:
	gene_id_mapper[i['a']['HGNC']] = i['a']["id"]
	gene_label_mapper[i['a']['HGNC']] = i['a']["label"]

In [None]:
protein_gene = pd.read_csv('../dd_data/idmapping_2023_08_24.tsv', sep="\t", index_col=0)

In [None]:
protein_gene.index.name = "protein"
protein_gene.columns = ["gene"]

In [None]:
protein_gene_mapper = protein_gene.to_dict()["gene"]

In [None]:
protein_list = []
gene_list = []
prot_gene = []
for prot, protid in protein_id_mapper.items():
	hgnc = protein_gene_mapper[prot]
	if hgnc in gene_id_mapper:
		prot_gene.append({
			"source": gene_id_mapper[hgnc],
			"target": protid,
			"relation": "is_protein",
			"sab": "HGNCUNIPROT"
		})
		protein_list.append({
			"id": protid,
			"label": protein_label_mapper[prot],
			"UNIPROTKB": prot
		})
		gene_list.append({
			"id": gene_id_mapper[hgnc],
			"label": gene_label_mapper[hgnc],
			"HGNC": hgnc
		})

In [None]:
len(protein_list), len(gene_list)

In [None]:
df = pd.DataFrame.from_records(prot_gene)

In [None]:
protein_df = pd.DataFrame.from_records(protein_list)
gene_df = pd.DataFrame.from_records(gene_list)

In [None]:
protein_df = protein_df.set_index('id')
gene_df = gene_df.set_index('id')
gene_df.head()

In [None]:
protein_df.to_csv("out/sab/HGNCUNIPROT.Protein.nodes.csv")
gene_df.to_csv("out/sab/HGNCUNIPROT.Gene.nodes.csv")

In [None]:
df.to_csv('out/sab/HGNCUNIPROT.edges.csv')