In [1]:
#%pip install monotonic openpyxl

In [2]:
# Version tag for this notebook; none of the visible cells reads it.
# NOTE(review): presumably identifies the release of the import pipeline — confirm.
version = "v0.0.5"

In [3]:
# Widen the classic-notebook page container to 80% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` is deprecated and emits a
# DeprecationWarning on current IPython releases.
from IPython.display import HTML, display

display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 6/6

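Code to translate `v2.7.4_PIS-model.xlsx` into a Neo4j database. This part (6/6) adds the string properties and full-text indexes used for search, and builds a Graph Data Science projection to check the connected components.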

## Setup

In [4]:
from collections import defaultdict

In [5]:
import pandas as pd
import re
import numpy as np
import os

In [6]:
from py2neo import Graph, Node, Relationship

In [7]:
import helpers

In [8]:
from importlib import reload

Connect to the graph database via the docker-compose link. The Neo4j Browser is available at http://localhost:7474/browser/

In [9]:
# Open the py2neo connection that every `graph.run(...)` call below uses.
# NOTE(review): "neo4j" is presumably the docker-compose service name of the
# database container — confirm against the compose file.
graph = Graph(host="neo4j")

In [10]:
from pathlib import Path

# Base directory is one level above the working directory; the parsed data
# directory is <base>/data/parsed.
base_path = Path("..")
parsed_path = base_path.joinpath("data", "parsed")

## Add string lists for full text search

In [11]:
# str external links
# Flatten the `external_links` list property into one space-separated string
# (`_external_links`) on every node that has it, so the full-text indexes
# created further down in this notebook can index it.
#
# Plain string rather than an f-string: nothing is interpolated, and with an
# f-prefix any Cypher map literal `{...}` added later would be parsed as a
# format field. The query text itself is unchanged.
# NOTE(review): `EXISTS(property)` is deprecated in recent Neo4j releases in
# favour of `IS NOT NULL` — confirm the server version before switching.

cy = '''
MATCH (m)
WHERE EXISTS(m.external_links)
SET m._external_links = trim(reduce(s='', el IN m.external_links | s + el + ' '))
'''
print(cy)
graph.run(cy)


MATCH (m)
WHERE EXISTS(m.external_links)
SET m._external_links = trim(reduce(s='', el IN m.external_links | s + el + ' '))



(No data)

In [12]:
# str identifiers
# Build `_identifiers` on every :Family node. Each of the four homologue list
# properties (ath/osa/sly/stu) is flattened into a space-separated string,
# then the four strings are concatenated (space-separated, trimmed) into one.
# COALESCE(..., '') substitutes an empty string when a flattened value is null
# (presumably when the homologue property is missing — confirm), so the final
# concatenation never receives a null operand.
cy = '''
MATCH (m:Family)
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))
'''
# Echo the query so the executed statement is recorded in the cell output.
print(cy)
graph.run(cy)


MATCH (m:Family)
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))



(No data)

In [13]:
# str synonyms
# Flatten the `synonyms` list property into one space-separated string
# (`_synonyms`) on every node that has it, so the full-text indexes created
# further down in this notebook can index it.
#
# Plain string rather than an f-string: nothing is interpolated, and with an
# f-prefix any Cypher map literal `{...}` added later would be parsed as a
# format field. The query text itself is unchanged.
# NOTE(review): `EXISTS(property)` is deprecated in recent Neo4j releases in
# favour of `IS NOT NULL` — confirm the server version before switching.
cy = '''
MATCH (m)
WHERE EXISTS(m.synonyms)
SET m._synonyms = trim(reduce(s='', el IN m.synonyms | s + el + ' '))
'''
print(cy)
graph.run(cy)


MATCH (m)
WHERE EXISTS(m.synonyms)
SET m._synonyms = trim(reduce(s='', el IN m.synonyms | s + el + ' '))



(No data)

## Indexes

In [14]:
# metabolites
# Full-text index over the searchable text properties of :Metabolite nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: the index lives in the database,
# so a plain CREATE raises when the notebook is run a second time.
# Caveat: an already existing index is left untouched even if its property
# list differs — run `DROP INDEX metabolite_names` first after editing it.

cy = '''
CREATE FULLTEXT INDEX metabolite_names IF NOT EXISTS
FOR (m:Metabolite)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links
]
'''
graph.run(cy)

(No data)

In [15]:
# complexes
# Full-text index over the searchable text properties of :Complex nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: the index lives in the database,
# so a plain CREATE raises when the notebook is run a second time.
# Caveat: an already existing index is left untouched even if its property
# list differs — run `DROP INDEX complex_names` first after editing it.

cy = '''
CREATE FULLTEXT INDEX complex_names IF NOT EXISTS
FOR (m:Complex)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links
]
'''
graph.run(cy)

(No data)

In [16]:
# foreign
# Full-text index over the searchable text properties of :Foreign nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: the index lives in the database,
# so a plain CREATE raises when the notebook is run a second time.
# Caveat: an already existing index is left untouched even if its property
# list differs — run `DROP INDEX foreign_names` first after editing it.
# NOTE(review): in the visible cells `_identifiers` is only set on :Family
# nodes — confirm that :Foreign nodes carry it, otherwise the entry is inert.

cy = '''
CREATE FULLTEXT INDEX foreign_names IF NOT EXISTS
FOR (m:Foreign)
ON EACH [
    m.name,
    m._synonyms,
    m._identifiers,
    m.description,
    m._external_links
]
'''

graph.run(cy)

(No data)

In [17]:
# process
# Full-text index over the searchable text properties of :Process nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: the index lives in the database,
# so a plain CREATE raises when the notebook is run a second time.
# Caveat: an already existing index is left untouched even if its property
# list differs — run `DROP INDEX process_names` first after editing it.

cy = '''
CREATE FULLTEXT INDEX process_names IF NOT EXISTS
FOR (m:Process)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links
]
'''
graph.run(cy)

(No data)

In [18]:
# identifiers
# Full-text index over the flattened homologue identifiers of :Family nodes
# (the `_identifiers` property written earlier in this notebook).
# `IF NOT EXISTS` keeps the cell re-runnable: the index lives in the database,
# so a plain CREATE raises when the notebook is run a second time.
# Caveat: an already existing index is left untouched even if its definition
# differs — run `DROP INDEX identifiers` first after editing it.

cy = '''
CREATE FULLTEXT INDEX identifiers IF NOT EXISTS
FOR (m:Family)
ON EACH [m._identifiers]
'''
graph.run(cy)

(No data)

In [19]:
# plant parts
# Full-text index over the searchable text properties of :Plant nodes.
# `IF NOT EXISTS` keeps the cell re-runnable: the index lives in the database,
# so a plain CREATE raises when the notebook is run a second time.
# Caveat: an already existing index is left untouched even if its property
# list differs — run `DROP INDEX plant_names` first after editing it.
# NOTE(review): in the visible cells `_identifiers` is only set on :Family
# nodes — confirm that :Plant nodes carry it, otherwise the entry is inert.

cy = '''
CREATE FULLTEXT INDEX plant_names IF NOT EXISTS
FOR (m:Plant)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links,
    m._identifiers
]
'''
graph.run(cy)

(No data)

In [20]:
# everything
# Step 1: tag every node with the additional label :Node, giving the
# catch-all index below a single label to target. Setting a label that is
# already present is a no-op, so this statement is safe to repeat.
cy = '''
MATCH (n)
SET n :Node
'''
graph.run(cy)

# Step 2: one full-text index across all nodes (via :Node).
# `IF NOT EXISTS` keeps the cell re-runnable: the index lives in the database,
# so a plain CREATE raises when the notebook is run a second time.
# Caveat: an already existing index is left untouched even if its property
# list differs — run `DROP INDEX general_text` first after editing it.
cy = '''
CREATE FULLTEXT INDEX general_text IF NOT EXISTS
FOR (m:Node)
ON EACH [
    m.name,
    m._synonyms,
    m.description,
    m._external_links,
    m._identifiers
]
'''
graph.run(cy)

(No data)

## Data Science graph

In [21]:
# (Re)build the in-memory GDS projection 'reaction-graph': all nodes, plus
# every relationship that has a `reaction_type` property.
#
# The GDS graph catalog is held by the database server, not by this kernel,
# so a projection from an earlier run is still there after a kernel restart
# and creating a graph under a taken name fails. Drop any existing projection
# first so the cell can be re-run and always reflects the current database.
gds_params = {"name": "reaction-graph"}
if graph.evaluate("CALL gds.graph.exists($name) YIELD exists RETURN exists", gds_params):
    graph.run("CALL gds.graph.drop($name)", gds_params)

cy = '''
CALL gds.graph.create.cypher(
    'reaction-graph',
    'MATCH (n) RETURN id(n) AS id',
    'MATCH (a)-[r]->(b) WHERE EXISTS(r.reaction_type) RETURN id(a) AS source, id(b) AS target'
)
YIELD graphName, nodeCount, relationshipCount, createMillis;
'''
graph.run(cy)

 graphName      | nodeCount | relationshipCount | createMillis 
----------------|-----------|-------------------|--------------
 reaction-graph |       805 |              1180 |         1292 

In [22]:
# check connected components
# Run weakly-connected-components (WCC) on the 'reaction-graph' projection and
# aggregate the stream to one row per component: its id and its node count.

cy = '''
CALL gds.wcc.stream('reaction-graph')
YIELD nodeId, componentId
WITH componentId, count(*) AS componentSize
RETURN componentId, componentSize ORDER BY componentId
'''
# `.data()` materialises the result as a list of dicts (one per row), which
# pandas turns into a frame with columns `componentId` and `componentSize`.
data = graph.run(cy).data()
cc_sizes = pd.DataFrame(data)
cc_sizes

Unnamed: 0,componentId,componentSize
0,0,1
1,1,732
2,2,1
3,3,1
4,4,1
5,6,11
6,8,1
7,9,1
8,10,1
9,12,1


In [23]:
# Same WCC run, but kept at node level: one row per node with the node's
# `name` property and the id of the component it belongs to, ordered by
# component and then by name.
cy = '''
CALL gds.wcc.stream('reaction-graph')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId
ORDER BY componentId, name
'''
# `.data()` materialises the result as a list of dicts (one per row), which
# pandas turns into a frame with columns `name` and `componentId`.
data = graph.run(cy).data()
cc_nodes = pd.DataFrame(data)
cc_nodes

Unnamed: 0,name,componentId
0,elf18,0
1,&alpha;/&beta; hydroxylase,1
2,"12,13-EOT",1
3,12-OH-JA-Ile,1
4,13-HPOT,1
...,...,...
800,RPL7,284
801,UGT76,301
802,VDAC1,304
803,WD40,307


In [24]:
# Report every weakly connected component that is smaller than the largest
# one, one line per component: "<componentId>\t<size>\t<member names>".
# The size of the largest component does not change inside the loop, so it is
# computed once up front instead of on every iteration.
largest_component_size = cc_sizes['componentSize'].max()

for componentID, subdf in cc_nodes.groupby('componentId'):
    component_size = len(subdf)
    if component_size < largest_component_size:
        # str() keeps the join from raising if a node has no `name`
        # (None/NaN in the frame); string names pass through unchanged.
        member_names = ", ".join(map(str, subdf["name"]))
        print(f'{componentID}\t{component_size}\t{member_names}')

0	1	elf18
2	1	ch
3	1	6K1
4	1	6K2
6	11	CP, CP|CPIP1,2b, CP|CPIP2a, HSP, HSP90|RAR1|SGT1, RAR1, SGT1, rx00053, rx00201, rx00202, rx00203
8	1	NIa-Pro
9	1	NIb
10	1	P1
12	1	P3N-PIPO
14	1	oomycete
16	1	trichous-bacteria
17	8	ARF, AUX-signalling, MIR390, TAS3, rx00111, rx00112, rx00113, rx00121
21	1	ROS-production
30	1	X2
31	1	X3
34	12	&beta;-carotene isomerase, 9-cis-&beta;-carotene, 9-cis-10&prime;-apo-&beta;-carotenal, CCD, CL, CLA, CYP, all-trans-&beta;-carotene, rx00314, rx00315, rx00316, rx00317
57	1	BR1
75	1	CYP71B
103	1	HLS
105	1	HMGR
109	1	IEGT
131	1	MKK
188	1	NADPH
198	1	H2O
202	1	HO2.
204	1	O3
205	1	OH.
210	1	SL
213	1	UDP glucose
216	1	anthocyanin
246	1	tZRP
249	1	DAMP/HAMP
250	1	PostROS
251	1	PreROS
255	4	NDR, NDR1|RIN4, RIN4, rx00055
269	1	PPL
283	1	RPL17
284	1	RPL7
301	1	UGT76
304	1	VDAC1
307	1	WD40
310	1	bHLH


# END 