In [None]:
# Version tag of this notebook; no other cell visible here reads it.
version = "v0.1.0"

In [None]:
# `display` and `HTML` belong to IPython's public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets elements of class `container` to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

# Import neo4j DB: 6/?

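Code to translate `v2.7.4_PIS-model.xlsx` into a Neo4j database. This part adds the concatenated search strings, the full-text indexes and the in-memory Data Science graph.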

## Setup

In [None]:
from collections import defaultdict

In [None]:
import pandas as pd
import re
import numpy as np
import os

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
import helpers

In [None]:
from importlib import reload

Connect to the graph database via the docker-compose link (host `neo4j`). The Neo4j browser is available at http://localhost:7474/browser/

In [None]:
# Open the py2neo connection used by every cell below. Only the host is given
# ("neo4j"); all other connection settings are left at py2neo's defaults.
graph = Graph(host="neo4j")

In [None]:
from pathlib import Path

# Paths are relative to the working directory: `..` is the base directory and
# the parsed data is expected under `../data/parsed`.
base_path = Path("..")
parsed_path = base_path.joinpath("data", "parsed")

## Add string lists for full text search

In [None]:
# str external links
# Flatten the `external_links` list of every node that has one into a single
# space-separated string (`_external_links`), the form listed in the
# full-text indexes.
# - `IS NOT NULL` replaces the deprecated `EXISTS(property)` predicate.
# - A plain (non-f) string is used: there is nothing to interpolate, and Cypher
#   itself uses `{}`, which an f-string would try to evaluate.
cy = '''
MATCH (m)
WHERE m.external_links IS NOT NULL
SET m._external_links = trim(reduce(s='', el IN m.external_links | s + el + ' '))
'''
print(cy)
graph.run(cy)

In [None]:
# str identifiers
# Build `_identifiers`: one space-separated string holding the ath/osa/sly/stu
# homologue lists of a node. COALESCE(..., '') substitutes an empty string
# whenever the reduced value of one list is null.
# The statement is identical for both labels, so it is written once and
# filled in per label instead of being copy-pasted.
identifiers_template = '''
MATCH (m:{label})
WITH m, COALESCE(trim(reduce(s='', el IN m.ath_homologues | s + el + ' ')), '') AS ath,  
        COALESCE(trim(reduce(s='', el IN m.osa_homologues | s + el + ' ')), '') AS osa, 
        COALESCE(trim(reduce(s='', el IN m.sly_homologues | s + el + ' ')), '') AS sly, 
        COALESCE(trim(reduce(s='', el IN m.stu_homologues | s + el + ' ')), '') AS stu
SET m._identifiers = trim(reduce(s='', el IN [ath, osa, sly, stu] | s + el + ' '))
'''

for label in ("Family", "FunctionalCluster"):
    # The template contains no other braces, so str.format is safe here.
    cy = identifiers_template.format(label=label)
    print(cy)
    graph.run(cy)

In [None]:
# str synonyms
# Flatten the `synonyms` list of every node that has one into a single
# space-separated string (`_synonyms`), the form listed in the full-text
# indexes.
# - `IS NOT NULL` replaces the deprecated `EXISTS(property)` predicate.
# - A plain (non-f) string is used: there is nothing to interpolate, and Cypher
#   itself uses `{}`, which an f-string would try to evaluate.
cy = '''
MATCH (m)
WHERE m.synonyms IS NOT NULL
SET m._synonyms = trim(reduce(s='', el IN m.synonyms | s + el + ' '))
'''
print(cy)
graph.run(cy)

## Indexes

In [None]:
# metabolites
# Full-text index over the text properties of Metabolite nodes.
# IF NOT EXISTS keeps the cell re-runnable: without it the statement fails as
# soon as the index has been created once.
# NOTE: an already existing index is left untouched, so drop `metabolites_all`
# first if the property list below is changed.
cy = '''
CREATE FULLTEXT INDEX metabolites_all IF NOT EXISTS
FOR (m:Metabolite)
ON EACH [
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,    
    m._external_links
]
'''
graph.run(cy)

In [None]:
# complexes
# Full-text index over the text properties of Complex nodes.
# IF NOT EXISTS keeps the cell re-runnable: without it the statement fails as
# soon as the index has been created once.
# NOTE: an already existing index is left untouched, so drop `complex_all`
# first if the property list below is changed.
cy = '''
CREATE FULLTEXT INDEX complex_all IF NOT EXISTS
FOR (m:Complex) 
ON EACH [
     m.name, 
     m._synonyms, 
     m.description, 
     m.additional_information,    
     m._external_links
] 
'''
graph.run(cy)

In [None]:
# foreign
# Full-text index over the text properties (including `_identifiers`) of
# Foreign nodes.
# IF NOT EXISTS keeps the cell re-runnable: without it the statement fails as
# soon as the index has been created once.
# NOTE: an already existing index is left untouched, so drop `foreign_all`
# first if the property list below is changed.
cy = '''
CREATE FULLTEXT INDEX foreign_all IF NOT EXISTS
FOR (m:Foreign) 
ON EACH [
    m.name, 
    m._synonyms,
    m._identifiers,
    m.description, 
    m.additional_information,    
    m._external_links
    ] 
'''

graph.run(cy)

In [None]:
# process
# Full-text index over the text properties of Process nodes.
# IF NOT EXISTS keeps the cell re-runnable: without it the statement fails as
# soon as the index has been created once.
# NOTE: an already existing index is left untouched, so drop `process_all`
# first if the property list below is changed.
cy = '''
CREATE FULLTEXT INDEX process_all IF NOT EXISTS
FOR (m:Process) 
ON EACH [
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,        
    m._external_links
    ] 
'''
graph.run(cy)

In [None]:
# identifiers
# Full-text index over the concatenated `_identifiers` string of Family nodes.
# IF NOT EXISTS keeps the cell re-runnable: without it the statement fails as
# soon as the index has been created once.
# NOTE: an already existing index is left untouched, so drop
# `family_identifiers` first if the indexed properties are changed.
cy = '''
CREATE FULLTEXT INDEX family_identifiers IF NOT EXISTS
FOR (m:Family) 
ON EACH [m._identifiers]
'''
graph.run(cy)

In [None]:
# plant parts
# Step 1: give every FunctionalCluster node the additional label `Plant`, so
# that the index created in step 2 covers these nodes.
# Plain `SET n:Plant` does the same as `apoc.create.addLabels` without needing
# the APOC plugin and without streaming every node back to the client; it is
# also how the `Node` label is applied elsewhere in this notebook.
cy = '''MATCH (n:FunctionalCluster) SET n:Plant'''
graph.run(cy)


# Step 2: full-text index over the text properties of all Plant nodes.
# IF NOT EXISTS keeps the cell re-runnable: without it the statement fails as
# soon as the index has been created once.
# NOTE: an already existing index is left untouched, so drop `plant_all`
# first if the property list below is changed.
cy = '''
CREATE FULLTEXT INDEX plant_all IF NOT EXISTS
FOR (m:Plant) 
ON EACH [    
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,    
    m._external_links,
    m._identifiers
    ]
'''
graph.run(cy)

In [None]:
# everything
# Step 1: add the shared label `Node` to every node in the database, so that a
# single index can span all of them.
cy = '''
MATCH (n)
SET n :Node
'''
graph.run(cy)

# Step 2: full-text index over the text properties of all `Node`-labelled
# nodes.
# IF NOT EXISTS keeps the cell re-runnable: without it the statement fails as
# soon as the index has been created once.
# NOTE: an already existing index is left untouched, so drop `general_text`
# first if the property list below is changed.
cy = '''
CREATE FULLTEXT INDEX general_text IF NOT EXISTS
FOR (m:Node)
ON EACH [    
    m.name, 
    m._synonyms, 
    m.description, 
    m.additional_information,
    m._external_links,
    m._identifiers
    ]
'''
graph.run(cy)

## Data Science graph

In [None]:
# Create the GDS graph 'reaction-graph1' through a Cypher projection:
#   nodes         - every node that does not carry the Family label
#   relationships - every relationship that has a `reaction_type` property
# `IS NOT NULL` replaces the deprecated `EXISTS(property)` predicate.
# NOTE(review): re-running this cell presumably fails while 'reaction-graph1'
# is still present in the GDS graph catalog - drop it first if needed.
cy = '''
CALL gds.graph.create.cypher(
    'reaction-graph1',
    'MATCH (n) WHERE NOT n:Family RETURN id(n) AS id',
    'MATCH (a)-[r]->(b) WHERE r.reaction_type IS NOT NULL RETURN id(a) AS source, id(b) AS target'
)
YIELD graphName, nodeCount, relationshipCount, createMillis;
'''
graph.run(cy)

In [None]:
# check connected components
# Stream the WCC result for 'reaction-graph1' and count the nodes in each
# component; the result is one row per componentId.
cy = '''
CALL gds.wcc.stream('reaction-graph1')
YIELD nodeId, componentId
WITH componentId, count(*) AS componentSize
RETURN componentId, componentSize ORDER BY componentId
'''
cc_sizes = pd.DataFrame(graph.run(cy).data())
cc_sizes

In [None]:
# Stream the WCC result for 'reaction-graph1' again, this time returning the
# `name` of each node next to its componentId (one row per node).
cy = '''
CALL gds.wcc.stream('reaction-graph1')
YIELD nodeId, componentId
RETURN gds.util.asNode(nodeId).name AS name, componentId
ORDER BY componentId, name
'''
cc_nodes = pd.DataFrame(graph.run(cy).data())
cc_nodes

In [None]:
# List every component that is smaller than the largest one, as
# tab-separated rows: component id, number of nodes, node names.
# The size threshold does not change inside the loop, so compute it once.
largest_component_size = cc_sizes['componentSize'].max()

# Header row, printed once (the original printed a misspelled literal
# "compponentID" for every single component).
print("componentID\tsize\tnames")
for componentID, subdf in cc_nodes.groupby('componentId'):
    if subdf.shape[0] < largest_component_size:
        # str() guards against nodes without a `name` (None/NaN), which would
        # make str.join raise a TypeError.
        names = ", ".join(str(name) for name in subdf["name"])
        print(f'{componentID}\t{subdf.shape[0]}\t{names}')

# END 