In [1]:
from neo4j import GraphDatabase
import pandas as pd

In [2]:
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "CAiRE2020neo4j"))

In [3]:
def remove_data(tx):
    tx.run("MATCH (a)-[r]-(b) DELETE a,r,b")
    tx.run("MATCH (a) DELETE a")
    
with driver.session() as session:
    session.write_transaction(remove_data)

In [4]:
entity_df = pd.read_csv('../data/opendialkg_entities.txt', sep='\t', header=None)
entity_df.drop_duplicates(inplace=True)
entity_df.columns = ['entity']
entity_df.to_csv('opendialkg_entities.csv',index=False)

In [5]:
%%time
def add_entity(tx):
    tx.run("""
        CALL apoc.periodic.iterate('
             load csv with headers from "file:///opendialkg_entities.csv" AS row return row ','
             CREATE (a:Node {value: row.entity})
        ',{batchSize:1000, iterateList:true, parallel:true})
    """)
    
def index_entity(tx):
    tx.run("""
        CREATE CONSTRAINT ON (n:Node) ASSERT n.value IS UNIQUE;
    """)

with driver.session() as session:
    session.write_transaction(add_entity)
    session.write_transaction(index_entity)

CPU times: user 3.68 ms, sys: 0 ns, total: 3.68 ms
Wall time: 2.28 s


In [18]:
triplet_df = pd.read_csv('../data/opendialkg/data/opendialkg_triples.txt', sep='\t')

In [21]:
def preprocess_relation(relation):
    relation = relation.replace('.','').replace(',','').replace('/','').replace('(','').replace(')','').replace('\'','')
    relation = relation.replace('  ',' ').replace(' ','_').replace('-','_').replace('~','X_')
    return relation.lower()

triplet_df['relation'] = triplet_df['relation'].apply(lambda r: preprocess_relation(r))
triplet_df.drop_duplicates(inplace=True)
triplet_df.to_csv('opendialkg_triplet_preprocess.csv', index=False)

In [22]:
triplet_df.shape

(1174016, 3)

In [9]:
def remove_relation(tx):
    tx.run("MATCH (a)-[r]->(b) DELETE r")
    
with driver.session() as session:
    session.write_transaction(remove_relation)

In [10]:
%%time
def add_relation(tx):
    tx.run("""
        CALL apoc.periodic.iterate('
            load csv with headers from "file:///opendialkg_triplet_preprocess.csv" AS row return row ','
            MATCH (a:Node),(b:Node) 
            WHERE a.value=row.source AND b.value=row.target
            CALL apoc.create.relationship(a, row.relation, {}, b) yield rel
            REMOVE rel.noOp
        ',{batchSize:100, iterateList:true, parallel:false})""")

with driver.session() as session:
    session.write_transaction(add_relation)

CPU times: user 270 ms, sys: 136 ms, total: 406 ms
Wall time: 2h 44min 10s


In [11]:
def read_count(tx):
    for record in  tx.run("MATCH (a)-[r]->(b) RETURN COUNT(r)"):
        print(record)
        
with driver.session() as session:
    print(session.read_transaction(read_count))

<Record COUNT(r)=1186964>
None


In [3]:
%%time
def add_count(tx):
    tx.run("""
        MATCH (n:Node)
        SET n.count = SIZE((n)-[]-())
        RETURN 0
    """)

with driver.session() as session:
    session.write_transaction(add_count)

CPU times: user 1.33 s, sys: 68.9 ms, total: 1.4 s
Wall time: 2.32 s


In [14]:
driver.close()