In [1]:
# Version tag (presumably of this notebook / import pipeline — TODO confirm);
# none of the cells visible here read it.
version = "v0.0.5"

In [2]:
# Stretch elements with the `.container` CSS class to 80 % of the window width
# so wide outputs are easier to read.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 6/6

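Code to translate `v2.7.4_PIS-model.xlsx` into a Neo4j database. This part (6/6) adds concatenated string properties for full-text search, creates the full-text indexes, and projects a Graph Data Science graph to check connectivity.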

## Setup

In [3]:
from collections import defaultdict

In [4]:
import pandas as pd
import re
import numpy as np
import os

In [5]:
from py2neo import Graph, Node, Relationship

In [6]:
import helpers

In [7]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [8]:
# Open the py2neo connection to the Neo4j server reachable under the host name
# "neo4j"; the cells below send all their Cypher through this `graph` handle.
graph = Graph(host="neo4j")

In [9]:
from pathlib import Path

# Data locations, expressed relative to the parent of the working directory.
base_path = Path("..")
parsed_path = base_path.joinpath("data", "parsed")

## Add string lists for full text search

In [10]:
# str external links
#
# Flatten every node's `external_links` list into a single space-separated
# string stored as `_external_links`, the property the full-text indexes use.
# Plain (non-f) string on purpose: the query has no placeholders, and an
# f-string would misread any literal Cypher `{...}` added later.
cy = '''
MATCH (m)
WHERE EXISTS(m.external_links)
SET m._external_links = trim(reduce(s='', el IN m.external_links | s + el + ' '))
'''
print(cy)
graph.run(cy)

MATCH (m)
         WHERE EXISTS(m.external_links)
         SET m._external_links = trim(reduce(s='', el IN m.external_links | s + el + ' '))


(No data)

In [12]:
# str identifiers
#
# For every :Family node, join each of the four homologue lists
# (ath / osa / sly / stu) into a space-separated string, falling back to ''
# when the joined value is null, then store the four results joined with
# spaces as `_identifiers`.
cy = '''
MATCH (m:Family)
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))
'''
# Echo the query for the record, then execute it.
print(cy)
graph.run(cy)


    MATCH (m:Family)
    WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
            COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
            COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
            COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
    SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))



(No data)

In [14]:
# str synonyms
#
# Flatten every node's `synonyms` list into a single space-separated string
# stored as `_synonyms`, the property the full-text indexes use.
# Plain (non-f) string on purpose: the query has no placeholders, and an
# f-string would misread any literal Cypher `{...}` added later.
cy = '''
MATCH (m)
WHERE EXISTS(m.synonyms)
SET m._synonyms = trim(reduce(s='', el IN m.synonyms | s + el + ' '))
'''
print(cy)
graph.run(cy)


MATCH (m)
WHERE EXISTS(m.synonyms)
SET m._synonyms = trim(reduce(s='', el IN m.synonyms | s + el + ' '))



(No data)

## Indexes

In [16]:
# metabolites

# Disabled step, kept for reference (would add the :Metabolite label to
# :MetaboliteFamily nodes):
# cy = f'''MATCH (n:MetaboliteFamily) SET n :Metabolite '''
# graph.run(cy)

# Full-text index over the searchable text properties of :Metabolite nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: without it Neo4j raises an error
# when an index with this name already exists.
cy = '''
CREATE FULLTEXT INDEX metabolite_names IF NOT EXISTS
FOR (m:Metabolite)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links
]
'''
graph.run(cy)

(No data)

In [18]:
# complexes

# Full-text index over the searchable text properties of :Complex nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: without it Neo4j raises an error
# when an index with this name already exists.
cy = '''
CREATE FULLTEXT INDEX complex_names IF NOT EXISTS
FOR (m:Complex)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links
]
'''
graph.run(cy)

(No data)

In [20]:
# foreign

# Full-text index over the searchable text properties of :Foreign nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: without it Neo4j raises an error
# when an index with this name already exists.
# NOTE(review): the cells visible in this notebook only populate
# `_identifiers` on :Family nodes — confirm :Foreign nodes actually carry it.
cy = '''
CREATE FULLTEXT INDEX foreign_names IF NOT EXISTS
FOR (m:Foreign)
ON EACH [
    m.name,
    m._synonyms,
    m._identifiers,
    m.description,
    m._external_links
]
'''

graph.run(cy)

(No data)

In [21]:
# process

# Full-text index over the searchable text properties of :Process nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: without it Neo4j raises an error
# when an index with this name already exists.
cy = '''
CREATE FULLTEXT INDEX process_names IF NOT EXISTS
FOR (m:Process)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links
]
'''
graph.run(cy)

(No data)

In [23]:
# identifiers

# Full-text index over the concatenated homologue identifiers (`_identifiers`)
# of :Family nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: without it Neo4j raises an error
# when an index with this name already exists.
cy = '''
CREATE FULLTEXT INDEX identifiers IF NOT EXISTS
FOR (m:Family)
ON EACH [m._identifiers]
'''
graph.run(cy)

(No data)

## Data Science graph

In [10]:
# Project the network into the GDS graph catalog as 'reaction-graph': all
# nodes, plus every relationship that carries a `reaction_type` property.
#
# `gds.graph.create.cypher` fails when a graph with that name is already in
# the catalog (e.g. left over from an earlier run of this cell), so drop any
# existing projection first to keep the cell re-runnable.
already_projected = graph.run(
    "CALL gds.graph.exists('reaction-graph') YIELD exists"
).data()[0]["exists"]
if already_projected:
    # .data() consumes the result so the drop has completed before re-creating.
    graph.run("CALL gds.graph.drop('reaction-graph')").data()

cy = '''
CALL gds.graph.create.cypher(
    'reaction-graph',
    'MATCH (n) RETURN id(n) AS id',
    'MATCH (a)-[r]->(b) WHERE EXISTS(r.reaction_type) RETURN id(a) AS source, id(b) AS target'
)
YIELD graphName, nodeCount, relationshipCount, createMillis;
'''
graph.run(cy)

 graphName      | nodeCount | relationshipCount | createMillis 
----------------|-----------|-------------------|--------------
 reaction-graph |       805 |              1180 |         1549 

In [16]:
# check connected components
#
# Stream the weakly-connected-component (WCC) id of every node in the
# 'reaction-graph' projection and count the nodes per component.

cy = '''
CALL gds.wcc.stream('reaction-graph')
YIELD nodeId, componentId
WITH componentId, count(*) AS componentSize
RETURN componentId, componentSize ORDER BY componentId
'''
# One record per component -> DataFrame with columns componentId, componentSize.
data = graph.run(cy).data()
cc_sizes = pd.DataFrame(data)
cc_sizes

Unnamed: 0,componentId,componentSize
0,0,732
1,3,12
2,49,1
3,59,1
4,63,1
5,65,1
6,66,1
7,71,1
8,74,1
9,77,1


In [17]:
# Same WCC stream as for the component sizes, but one row per node: resolve
# each node id to the node's `name` property and pair it with its component
# id, ordered by component and then by name.
cy = '''
CALL gds.wcc.stream('reaction-graph')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId
ORDER BY componentId, name
'''
# One record per node -> DataFrame with columns name, componentId.
data = graph.run(cy).data()
cc_nodes = pd.DataFrame(data)
cc_nodes

Unnamed: 0,name,componentId
0,&alpha;/&beta; hydroxylase,0
1,"12,13-EOT",0
2,12-OH-JA-Ile,0
3,13-HPOT,0
4,4CLL,0
...,...,...
800,UGT76,295
801,VDAC1,298
802,WD40,301
803,bHLH,304


In [31]:
# Print one tab-separated line (component id, size, member names) for every
# component that is smaller than the largest one.
# The largest size does not change inside the loop, so compute it once.
largest_size = cc_sizes['componentSize'].max()
for componentID, subdf in cc_nodes.groupby('componentId'):
    n_members = len(subdf)
    if n_members < largest_size:
        # str() keeps the join from raising if a node has no name (None).
        member_names = ", ".join(map(str, subdf["name"]))
        print(f'{componentID}\t{n_members}\t{member_names}')

3	12	&beta;-carotene isomerase, 9-cis-&beta;-carotene, 9-cis-10&prime;-apo-&beta;-carotenal, CCD, CL, CLA, CYP, all-trans-&beta;-carotene, rx00314, rx00315, rx00316, rx00317
49	1	NADPH
59	1	H2O
63	1	HO2.
65	1	O3
66	1	OH.
71	1	SL
74	1	UDP glucose
77	1	anthocyanin
107	1	tZRP
110	1	DAMP/HAMP
111	1	PostROS
112	1	PreROS
116	1	elf18
118	1	ch
119	1	6K1
120	1	6K2
122	11	CP, CP|CPIP1,2b, CP|CPIP2a, HSP, HSP90|RAR1|SGT1, RAR1, SGT1, rx00053, rx00201, rx00202, rx00203
124	1	NIa-Pro
125	1	NIb
126	1	P1
128	1	P3N-PIPO
130	1	oomycete
132	1	trichous-bacteria
140	1	X2
141	1	X3
161	8	ARF, AUX-signalling, MIR390, TAS3, rx00111, rx00112, rx00113, rx00121
167	1	BR1
185	1	CYP71B
213	1	HLS
215	1	HMGR
219	1	IEGT
241	1	MKK
249	4	NDR, NDR1|RIN4, RIN4, rx00055
263	1	PPL
277	1	RPL17
278	1	RPL7
295	1	UGT76
298	1	VDAC1
301	1	WD40
304	1	bHLH
318	1	ROS-production


In [None]:
# Ad-hoc Cypher kept as a bare string (this cell does not send it through
# `graph.run`): starting from the named node, it optionally walks up to three
# :Reaction nodes and their neighbours (m - p - q - r - s - t - u) and returns
# everything that was matched.
'''MATCH (m {name:"9-cis-10&prime;-apo-&beta;-carotenal"}) 
OPTIONAL MATCH (m)--(p:Reaction)
OPTIONAL MATCH (p)--(q)
OPTIONAL MATCH (q)--(r:Reaction)
OPTIONAL MATCH (r)--(s)
OPTIONAL MATCH (s)--(t:Reaction) 
OPTIONAL MATCH (t)--(u)

RETURN m, p, q, r, s, t, u'''

# END 