# Preparing

In [1]:
import json
from neo4j import GraphDatabase
from tqdm.notebook import tqdm
import psycopg2

In [2]:
with open("./datasets/preprocessed2_aan.json") as f:
    aan_dataset = json.load(f)
    
len(aan_dataset)

15602

# Loading data into Neo4J

In [36]:
driver = GraphDatabase.driver("bolt://localhost:7687",
                              auth=("neo4j", "AAN"),
                              encrypted=False)

In [37]:
# loading nodes
for ref in tqdm(aan_dataset):
    with driver.session() as session:
        session.run("MERGE (a:Reference {id: $ref_id, title: $title}) ",
                    ref_id=ref['id'], title=ref['title'])

HBox(children=(FloatProgress(value=0.0, max=15602.0), HTML(value='')))




In [40]:
with driver.session() as session:
    session.run("CREATE INDEX ON :Reference(id)")

In [41]:
# loading edges
for paper in tqdm(aan_dataset):
    for ref_id in paper['references']:
        with driver.session() as session:
            session.run("MATCH (paper {id: $paper_id}) "
                        "MATCH (ref {id: $ref_id})"
                        "MERGE (paper)-[:CITES]->(ref)",
                        paper_id=paper['id'], ref_id=ref_id)

HBox(children=(FloatProgress(value=0.0, max=15602.0), HTML(value='')))




In [50]:
with driver.session() as session:
    print("Nb nodes :",list(session.run("MATCH (n) RETURN COUNT(n)").records()))
    print("Nb nodes :",list(session.run("MATCH ()-[e]->() RETURN COUNT(e)").records()))

Nb nodes : [<Record COUNT(n)=15602>]
Nb nodes : [<Record COUNT(e)=85240>]


In [None]:
with driver.session() as session:    
    session.run("CALL gds.graph.create('citations','Reference','CITES')")
    session.run("CALL gds.graph.create('undirected_citations','Reference',
                "{ CITES: { orientation: 'UNDIRECTED' }})")

In [54]:
###### run Personalized PageRank
with driver.session() as session:  
   results = session.run("MATCH (ref:Reference {id: 'E03-1062'}) "
                           "CALL gds.pageRank.stream('citations', { "
                           "maxIterations: 20, "
                           "dampingFactor: 0.85, "
                           "sourceNodes: [ref]}) "
                           "YIELD nodeId, score "
                           "RETURN gds.util.asNode(nodeId).id AS ref_id, score "
                           "ORDER BY score DESC, ref ASC LIMIT 100").records()

for result in results:
    print(result)

<Record ref_id='E03-1062' score=0.15000000000000002>
<Record ref_id='P97-1026' score=0.1275000050663948>
<Record ref_id='J86-3001' score=0.026544135308358818>
<Record ref_id='P86-1029' score=0.02430696557275951>
<Record ref_id='W94-0316' score=0.02430696557275951>
<Record ref_id='P85-1008' score=0.02430696557275951>
<Record ref_id='J87-3006' score=0.02376828426204156>
<Record ref_id='P86-1021' score=0.02256251559592783>
<Record ref_id='W96-0410' score=0.021675001177936793>
<Record ref_id='W94-0317' score=0.0026319643948227167>
<Record ref_id='W94-0311' score=0.0026319643948227167>
<Record ref_id='C88-2110' score=0.0011185848677996546>
<Record ref_id='E89-1018' score=0.0011185848677996546>
<Record ref_id='A88-1004' score=0.00047539857623632995>
<Record ref_id='P87-1028' score=0.00045162864989833906>
<Record ref_id='P82-1028' score=0.0003609270561355515>
<Record ref_id='C88-2149' score=0.0003169323841575533>
<Record ref_id='P87-1025' score=0.0003169323841575533>
<Record ref_id='P82-1029'

In [55]:
###### run Personalized PageRank
with driver.session() as session:  
   results = session.run("MATCH (ref:Reference {id: 'E03-1062'}) "
                           "CALL gds.pageRank.stream('undirected_citations', { "
                           "maxIterations: 20, "
                           "dampingFactor: 0.85, "
                           "sourceNodes: [ref]}) "
                           "YIELD nodeId, score "
                           "RETURN gds.util.asNode(nodeId).id AS ref_id, score "
                           "ORDER BY score DESC, ref ASC LIMIT 100").records()

for result in results:
    print(result)

<Record ref_id='E03-1062' score=0.15606312372897407>
<Record ref_id='P97-1026' score=0.14980056943570616>
<Record ref_id='J86-3001' score=0.013235124580296544>
<Record ref_id='W98-1419' score=0.011163989055626187>
<Record ref_id='P85-1008' score=0.010713084194526346>
<Record ref_id='W00-1416' score=0.010650625136315737>
<Record ref_id='W96-0410' score=0.01015261692036802>
<Record ref_id='P86-1029' score=0.009858328984580566>
<Record ref_id='W98-1403' score=0.009364908596370467>
<Record ref_id='P99-1006' score=0.008793623127573369>
<Record ref_id='P02-1003' score=0.008569904028968268>
<Record ref_id='W00-1423' score=0.008238842488623273>
<Record ref_id='W02-0111' score=0.007662615659364747>
<Record ref_id='P01-1009' score=0.007642532856522165>
<Record ref_id='W05-1609' score=0.00764171068372832>
<Record ref_id='C00-2148' score=0.0074911494743417975>
<Record ref_id='W94-0316' score=0.007366186394159513>
<Record ref_id='P01-1028' score=0.007053268029932269>
<Record ref_id='N01-1003' score

In [33]:
aan_dataset[0]

{'id': 'E03-1062',
 'author': ['Piwek,Paul'],
 'title': 'A Flexible Pragmatics-Driven Language Generator For Animated Agents',
 'venue': 'EACL',
 'year': 2003,
 'citations': [],
 'references': ['P97-1026'],
 'abstract': 'This paper describes the NECA MNLG;a fully implemented Multimodal Natu-ral Language Generation module. TheMNLG is deployed as part of the NECAsystem which generates dialogues between animated agents. The generation module supports the seamless integration of full grammar rules, templatesand canned text. The generator takes input which allows for the specification ofsyntactic, semantic and pragmatic con-straints on the output.'}

# Loading data into PostgreSQL

In [17]:
connection = psycopg2.connect(host="localhost", port=5432, database="aan", 
                              user="aan", password="aan")

cursor = connection.cursor()

In [18]:
cursor.execute("DROP TABLE IF EXISTS reference")

In [19]:
cursor.execute("""
    CREATE TABLE reference (
    ref_id VARCHAR(10) PRIMARY KEY, 
    title VARCHAR(255) NOT NULL,
    json_record JSON NOT NULL)
    """)

In [20]:
for ref in tqdm(aan_dataset):
    cursor.execute("""
        INSERT INTO reference (ref_id, title, json_record) 
        VALUES (%(ref_id)s, %(title)s, %(json_rec)s)
        """,
        {'ref_id': ref['id'], 
         'title': ref['title'].lower(), 
         'json_rec': json.dumps(ref)})

HBox(children=(FloatProgress(value=0.0, max=15602.0), HTML(value='')))




In [22]:
connection.commit()

In [26]:
cursor.close()
connection.close()