# Getting Started
We use `neo4j` community server edition and `apoc` library for processing graph data. <br/>
`apoc` is used to parallelize the query in `neo4j`, so that we can process large scale graph faster<br/>
<br/>
Before proceed, you need to ensure that you have `neo4j` (https://neo4j.com/download-center/#community) and `apoc` (https://neo4j.com/developer/neo4j-apoc/) installed.

If you are not familiar with `CYPHER` and `apoc` syntaxes, you can follow the tutorial in `https://neo4j.com/developer/cypher/` and `https://neo4j.com/blog/intro-user-defined-procedures-apoc/`

In [1]:
from neo4j import GraphDatabase
import pandas as pd

In [2]:
# Connect to neo4j
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "<PASSWORD>"))

In [3]:
# Delete all nodes and relations in neo4j
def remove_data(tx):
    tx.run("MATCH (a)-[r]-(b) DELETE a,r,b")
    tx.run("MATCH (a) DELETE a")
    
with driver.session() as session:
    session.write_transaction(remove_data)

In [4]:
# Generate CSV file for importing to neo4j
NEO4J_DATA_FOLDER = '/var/lib/neo4j/data' # Check https://neo4j.com/docs/operations-manual/current/configuration/file-locations/
entity_df = pd.read_csv('./data/opendialkg_entities.txt', sep='\t', header=None)
entity_df.drop_duplicates(inplace=True)
entity_df.columns = ['entity']
entity_df.to_csv(f'{NEO4J_DATA_FOLDER}/opendialkg_entities.csv',index=False)

In [5]:
%%time
# Add entity to neo4j and add unique constraint
def add_entity(tx):
    tx.run("""
        CALL apoc.periodic.iterate('
             load csv with headers from "file:///opendialkg_entities.csv" AS row return row ','
             CREATE (a:Node {value: row.entity})
        ',{batchSize:1000, iterateList:true, parallel:true})
    """)
    
def index_entity(tx):
    tx.run("""
        CREATE CONSTRAINT ON (n:Node) ASSERT n.value IS UNIQUE;
    """)

with driver.session() as session:
    session.write_transaction(add_entity)
    session.write_transaction(index_entity)

CPU times: user 3.68 ms, sys: 0 ns, total: 3.68 ms
Wall time: 2.28 s


In [21]:
# Prepare graph triplets
def preprocess_relation(relation):
    relation = relation.replace('.','').replace(',','').replace('/','').replace('(','').replace(')','').replace('\'','')
    relation = relation.replace('  ',' ').replace(' ','_').replace('-','_').replace('~','X_')
    return relation.lower()

triplet_df = pd.read_csv('./data/opendialkg_triples.txt', sep='\t')
triplet_df['relation'] = triplet_df['relation'].apply(lambda r: preprocess_relation(r))
triplet_df.drop_duplicates(inplace=True)
triplet_df.to_csv('opendialkg_triplet_preprocess.csv', index=False)

In [22]:
triplet_df.shape

(1174016, 3)

In [10]:
%%time
# Add relation to neo4j, this step might take some time
def add_relation(tx):
    tx.run("""
        CALL apoc.periodic.iterate('
            load csv with headers from "file:///opendialkg_triplet_preprocess.csv" AS row return row ','
            MATCH (a:Node),(b:Node) 
            WHERE a.value=row.source AND b.value=row.target
            CALL apoc.create.relationship(a, row.relation, {}, b) yield rel
            REMOVE rel.noOp
        ',{batchSize:100, iterateList:true, parallel:false})""")

with driver.session() as session:
    session.write_transaction(add_relation)

CPU times: user 270 ms, sys: 136 ms, total: 406 ms
Wall time: 2h 44min 10s


In [11]:
# Check the entity and relation in neo4j
def read_count(tx):
    for record in  tx.run("MATCH (a)-[r]->(b) RETURN COUNT(r)"):
        print(record)
        
with driver.session() as session:
    print(session.read_transaction(read_count))

<Record COUNT(r)=1186964>
None


In [3]:
%%time
# Add count to each node based on number of degree
def add_count(tx):
    tx.run("""
        MATCH (n:Node)
        SET n.count = SIZE((n)-[]-())
        RETURN 0
    """)

with driver.session() as session:
    session.write_transaction(add_count)

CPU times: user 1.33 s, sys: 68.9 ms, total: 1.4 s
Wall time: 2.32 s


In [14]:
driver.close()