In [1]:
version = "v0.1.0"

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 6/6

Code to translate v2.7.4_PIS-model.xlsx to neo4j database. 

## Setup

In [3]:
from collections import defaultdict

In [4]:
import pandas as pd
import re
import numpy as np
import os

In [5]:
from py2neo import Graph, Node, Relationship

In [6]:
import helpers

In [7]:
from importlib import reload

Connect to graph via docker-compose link. See http://localhost:7474/browser/

In [8]:
graph = Graph(host="neo4j")

In [9]:
from pathlib import Path

base_path = Path("..")
parsed_path = base_path / "data" / "parsed"

## Add string lists for full text search

In [10]:
# str external links

cy = f'''
MATCH (m)
WHERE EXISTS(m.external_links)
SET m._external_links = trim(reduce(s='', el IN m.external_links | s + el + ' '))
'''
print(cy)
graph.run(cy)


MATCH (m)
WHERE EXISTS(m.external_links)
SET m._external_links = trim(reduce(s='', el IN m.external_links | s + el + ' '))



(No data)

In [17]:
# str identifiers
cy = '''
MATCH (m:Family)
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))
'''
print(cy)
graph.run(cy)


# str identifiers
cy = '''
MATCH (m:FunctionalCluster)
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))
'''
print(cy)
graph.run(cy)


MATCH (m:Family)
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))


MATCH (m:FunctionalCluster)
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))



(No data)

In [12]:
cy = f'''
MATCH (m)
WHERE EXISTS(m.synonyms)
SET m._synonyms = trim(reduce(s='', el IN m.synonyms | s + el + ' '))
'''
print(cy)
graph.run(cy)


MATCH (m)
WHERE EXISTS(m.synonyms)
SET m._synonyms = trim(reduce(s='', el IN m.synonyms | s + el + ' '))



(No data)

## Indexes

In [13]:
# metabolites

cy = '''
CREATE FULLTEXT INDEX  metabolites_all
FOR (m:Metabolite)
ON EACH [
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,    
    m._external_links
]
'''
graph.run(cy)

(No data)

In [14]:
# complexes 

cy = '''
CREATE FULLTEXT INDEX complex_all 
FOR (m:Complex) 
ON EACH [
     m.name, 
     m._synonyms, 
     m.description, 
     m.additional_information,    
     m._external_links
] 
'''
graph.run(cy)

(No data)

In [15]:
# foreign

cy = '''
CREATE FULLTEXT INDEX foreign_all 
FOR (m:Foreign) 
ON EACH [
    m.name, 
    m._synonyms,
    m._identifiers,
    m.description, 
    m.additional_information,    
    m._external_links
    ] 
'''

graph.run(cy)

(No data)

In [16]:
# process

cy = '''
CREATE FULLTEXT INDEX process_all 
FOR (m:Process) 
ON EACH [
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,        
    m._external_links
    ] 
'''
graph.run(cy)

(No data)

In [19]:
# identifiers

cy = '''
CREATE FULLTEXT INDEX family_identifiers 
FOR (m:Family) 
ON EACH [m._identifiers]
'''
graph.run(cy)

ClientError: [Schema.EquivalentSchemaRuleAlreadyExists] An equivalent index already exists, 'Index( id=33, name='family_identifiers', type='GENERAL FULLTEXT', schema=(:Family {_identifiers}), indexProvider='fulltext-1.0' )'.

In [20]:
# plant parts

cy = '''
CREATE FULLTEXT INDEX plant_all 
FOR (m:Plant) 
ON EACH [    
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,    
    m._external_links,
    m._identifiers
    ]
'''
graph.run(cy)

(No data)

In [21]:
# everything 
cy = '''
MATCH (n)
SET n :Node
'''
graph.run(cy)

cy = '''
CREATE FULLTEXT INDEX general_text 
FOR (m:Node)
ON EACH [    
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,
    m._external_links,
    m._identifiers
    ]
'''
graph.run(cy)

(No data)

## Data Science graph

In [28]:
cy = '''
CALL gds.graph.create.cypher(
    'reaction-graph1',
    'MATCH (n) WHERE NOT n:Family RETURN id(n) AS id',
    'MATCH (a)-[r]->(b) WHERE EXISTS(r.reaction_type) RETURN id(a) AS source, id(b) AS target'
)
YIELD graphName, nodeCount, relationshipCount, createMillis;
'''
graph.run(cy)

 graphName       | nodeCount | relationshipCount | createMillis 
-----------------|-----------|-------------------|--------------
 reaction-graph1 |       872 |              1176 |           46 

In [30]:
# check connected components

cy = '''
CALL gds.wcc.stream('reaction-graph1')
YIELD nodeId, componentId
WITH componentId, count(*) AS componentSize
RETURN componentId, componentSize ORDER BY componentId
'''
data = graph.run(cy).data()
cc_sizes = pd.DataFrame(data)
cc_sizes

Unnamed: 0,componentId,componentSize
0,0,726
1,3,13
2,49,1
3,59,1
4,63,1
5,65,1
6,66,1
7,71,1
8,74,1
9,77,1


In [32]:
cy = '''
CALL gds.wcc.stream('reaction-graph1')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId
ORDER BY componentId, name
'''
data = graph.run(cy).data()
cc_nodes = pd.DataFrame(data)
cc_nodes

Unnamed: 0,name,componentId
0,"12,13-EOT",0
1,12-OH-JA-Ile,0
2,13-HPOT,0
3,"AAO[AT1G04580,AT2G27150,AT3G43600,AT4G34890,AT...",0
4,ACC,0
...,...,...
867,rx00306,295
868,NDR1[AT3G20600],304
869,NDR1|RIN4,304
870,RIN4[AT3G25070],304


In [33]:
for componentID, subdf in cc_nodes.groupby('componentId'):
    print("compponentID")
    if subdf.shape[0] < cc_sizes['componentSize'].max():
        print(f'{componentID}\t{subdf.shape[0]}\t{", ".join(list(subdf["name"].values))}')

3	13	9-cis-&beta;-carotene, 9-cis-10&prime;-apo-&beta;-carotenal, CCD7[AT2G44990], CCD8[AT4G32810], CL, CLA, D27[OS11G0587000], MAX1[AT2G26170], all-trans-&beta;-carotene, rx00314, rx00315, rx00316, rx00317
49	1	NADPH
59	1	H2O
63	1	HO2.
65	1	O3
66	1	OH.
71	1	SL
74	1	UDP glucose
77	1	anthocyanin
107	1	tZRP
110	1	DAMP/HAMP
111	1	PostROS
112	1	PreROS
116	1	elf18
118	1	ch
119	1	6K1
120	1	6K2
122	8	CP, CPIP1,2b[AT1G10350], CPIP2a[AT3G08910], CP|CPIP1,2b, CP|CPIP2a, rx00201, rx00202, rx00203
124	1	NIa-Pro
125	1	NIb
126	1	P1
128	1	P3N-PIPO
129	16	CO[AT5G15840], CO|OBE1, OBE1[AT3G07780], OBE1|VPg, OBE1|WRKY11, OBE1|WRKY17, RH8[AT4G00660], RH8|VPg, VPg, WRKY11[AT4G31550], WRKY17[AT2G24570], rx00226, rx00227, rx00229, rx00230, rx00231
130	1	oomycete
132	1	trichous-bacteria
133	3	ARF2[AT5G62000], AUX-signalling, rx00121
137	1	ROS-production
143	10	GPAphid2[GPAphid2], GPAphid2|RANGAP, RANGAP[AT3G63130,AT5G19320], RANGAP|Rx1, RANGAP|Rx2, Rx1[Rx1], Rx2[Rx2], rx00057, rx00058, rx00059
171	7	ARF3[AT2G

# END 