# Loading data into Neo4j

In [32]:
import json
import csv
import os
from neo4j import GraphDatabase
import psycopg2
from elasticsearch import Elasticsearch, helpers
from tqdm.notebook import tqdm

In [5]:
# Directory holding the DBLP-AMiner JSON chunk files. Keep the trailing slash:
# cells in this notebook concatenate this prefix directly with file names.
dblp_dataset_dir_path = "./datasets/dblp-aminer/"
# os.listdir returns entries in arbitrary order; sort them so every run
# processes the chunk files in the same order.
chunkfiles = sorted(f for f in os.listdir(dblp_dataset_dir_path) if 'chunk' in f)

In [15]:
# Number of records (papers)  :  2 945 030
# Number of edges (citations) : 29 882 572

# Loading data into Neo4j

In [52]:
# Export one single-column row per paper (its id) to nodes.csv.
# The output path is derived from dblp_dataset_dir_path instead of repeating
# the directory literal, so the two cannot drift apart.
nodes_csv_path = os.path.join(dblp_dataset_dir_path, 'nodes.csv')

# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate line endings emit blank rows.
with open(nodes_csv_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for filename in tqdm(chunkfiles):
        with open(os.path.join(dblp_dataset_dir_path, filename)) as file:
            # Each chunk file is a JSON array of paper records.
            writer.writerows([paper['id']] for paper in json.load(file))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [53]:
# Export one (citing paper id, referenced paper id) row per citation to edges.csv.
# The output path is derived from dblp_dataset_dir_path instead of repeating
# the directory literal, so the two cannot drift apart.
edges_csv_path = os.path.join(dblp_dataset_dir_path, 'edges.csv')

# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate line endings emit blank rows.
with open(edges_csv_path, 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for filename in tqdm(chunkfiles):
        with open(os.path.join(dblp_dataset_dir_path, filename)) as file:
            for paper in json.load(file):
                # Papers without a 'references' key contribute no edges.
                writer.writerows([paper['id'], ref_id]
                                 for ref_id in paper.get('references', []))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [47]:
# Bolt connection to the local Neo4j instance (unencrypted).
# The password can be supplied through the NEO4J_PASSWORD environment variable
# so it does not have to live in the notebook; the previous literal is kept as
# the fallback so existing local setups keep working unchanged.
driver = GraphDatabase.driver("bolt://localhost:7687",
                              auth=("neo4j", os.environ.get("NEO4J_PASSWORD", "DBLP")),
                              encrypted=False)

In [None]:
# fields : 'abstract', 'authors', 'id', 'references', 'citations',
#          'title', 'venue', 'year'

In [32]:
# Declare ref_id as a unique property of Reference nodes before loading data.
unique_ref_id_constraint = "CREATE CONSTRAINT ON (n:Reference) ASSERT n.ref_id IS UNIQUE"

with driver.session() as session:
    session.run(unique_ref_id_constraint)

In [35]:
# Load nodes: one Reference node per row of nodes.csv (column 0 is the paper id).
# MERGE makes the load idempotent with respect to already existing ref_ids.
# NOTE(review): 'file:///nodes.csv' is resolved by the Neo4j server, so the
# exported nodes.csv presumably has to be placed in its import directory — confirm.
load_nodes_query = (
    "USING PERIODIC COMMIT 1000 "
    "LOAD CSV FROM 'file:///nodes.csv' AS row "
    "MERGE (r:Reference {ref_id: row[0]})"
)

with driver.session() as session:
    session.run(load_nodes_query)

In [54]:
# Load edges: one CITES relationship per row of edges.csv
# (column 0 = citing paper id, column 1 = referenced paper id).
# Both endpoints are looked up with MATCH, so a row whose citing or referenced
# id has no Reference node creates nothing.
with driver.session() as session:
    session.run("USING PERIODIC COMMIT 1000 "
                "LOAD CSV FROM 'file:///edges.csv' AS row "
                "MATCH (paper:Reference {ref_id: row[0]}) "
                # The trailing space below was missing, which glued the two
                # clauses together as "...})MERGE (paper)..." in the query text.
                "MATCH (ref:Reference {ref_id: row[1]}) "
                "MERGE (paper)-[:CITES]->(ref)")

In [55]:
# Sanity check: total number of nodes and of relationships stored in the graph.
with driver.session() as session:
    print("Nb nodes :", list(session.run("MATCH (n) RETURN COUNT(n)").records()))
    # This query counts relationships, so label it as edges (it used to say "Nb nodes").
    print("Nb edges :", list(session.run("MATCH ()-[e]->() RETURN COUNT(e)").records()))

Nb nodes : [<Record COUNT(n)=2945030>]
Nb nodes : [<Record COUNT(e)=29882572>]


In [58]:
# Two named GDS graph projections over Reference nodes and CITES relationships:
# 'citations' as stored, and 'undirected_citations' with UNDIRECTED orientation.
graph_projection_queries = [
    "CALL gds.graph.create('citations','Reference','CITES')",
    "CALL gds.graph.create('undirected_citations','Reference', "
    "{ CITES: { orientation: 'UNDIRECTED' }})",
]

with driver.session() as session:
    for projection_query in graph_projection_queries:
        session.run(projection_query)

In [59]:
###### run Personalized PageRank
# Personalized PageRank on the directed 'citations' projection, seeded with the
# single paper whose ref_id is '40093513'; keeps the 100 best-scored papers.
with driver.session() as session:
    # Materialize the records while the session is still open: the result of
    # session.run is a lazy stream, and iterating it after the `with` block has
    # closed the session depends on driver-specific buffering.
    results = list(session.run(
        "MATCH (ref:Reference {ref_id: '40093513'}) "
        "CALL gds.pageRank.stream('citations', { "
        "maxIterations: 20, "
        "dampingFactor: 0.85, "
        "sourceNodes: [ref]}) "
        "YIELD nodeId, score "
        "RETURN gds.util.asNode(nodeId).ref_id AS ref_id, score "
        # Tie-break on the returned ref_id. The previous `ref ASC` sorted on the
        # single source node, which is the same for every row.
        "ORDER BY score DESC, ref_id ASC LIMIT 100").records())

for result in results:
    print(result)

<Record ref_id='40093513' score=0.15000000000000002>
<Record ref_id='2165612380' score=0.042570962326877536>
<Record ref_id='2122789628' score=0.04250041559268914>
<Record ref_id='2030458294' score=0.04250000063329935>
<Record ref_id='1513168562' score=0.011022120268567314>
<Record ref_id='1823507963' score=0.009368800681400516>
<Record ref_id='2010595692' score=0.009031250094994945>
<Record ref_id='2061310592' score=0.009031250094994916>
<Record ref_id='2141608913' score=0.009031250094994902>
<Record ref_id='2002649781' score=0.008633138489629275>
<Record ref_id='2119275489' score=0.007676562620326876>
<Record ref_id='2062261404' score=0.007597741042285985>
<Record ref_id='2049947953' score=0.0069055840850304325>
<Record ref_id='2047499522' score=0.006646211070258297>
<Record ref_id='1543852490' score=0.006525078346021473>
<Record ref_id='2067936680' score=0.005548500839884685>
<Record ref_id='2111869173' score=0.005347006299399047>
<Record ref_id='2029406036' score=0.0048588766598705

In [60]:
###### run Personalized PageRank
# Personalized PageRank on the 'undirected_citations' projection, seeded with the
# single paper whose ref_id is '40093513'; keeps the 100 best-scored papers.
with driver.session() as session:
    # Materialize the records while the session is still open: the result of
    # session.run is a lazy stream, and iterating it after the `with` block has
    # closed the session depends on driver-specific buffering.
    results = list(session.run(
        "MATCH (ref:Reference {ref_id: '40093513'}) "
        "CALL gds.pageRank.stream('undirected_citations', { "
        "maxIterations: 20, "
        "dampingFactor: 0.85, "
        "sourceNodes: [ref]}) "
        "YIELD nodeId, score "
        "RETURN gds.util.asNode(nodeId).ref_id AS ref_id, score "
        # Tie-break on the returned ref_id. The previous `ref ASC` sorted on the
        # single source node, which is the same for every row.
        "ORDER BY score DESC, ref_id ASC LIMIT 100").records())

for result in results:
    print(result)

<Record ref_id='40093513' score=0.15501319170677078>
<Record ref_id='2159697746' score=0.018129472379546978>
<Record ref_id='1091014733' score=0.017047913580848496>
<Record ref_id='2048728292' score=0.016415602537139584>
<Record ref_id='2138890364' score=0.01589933196177937>
<Record ref_id='2165612380' score=0.015529415605356435>
<Record ref_id='2122789628' score=0.01550537695934846>
<Record ref_id='1969798161' score=0.015221216932119044>
<Record ref_id='2030458294' score=0.015140069366480163>
<Record ref_id='1978580536' score=0.015103423830388082>
<Record ref_id='1975175514' score=0.0035378488122580942>
<Record ref_id='2088481969' score=0.0028428178649792827>
<Record ref_id='2127108020' score=0.002647092170605436>
<Record ref_id='2546689421' score=0.0025030112348074338>
<Record ref_id='2058316166' score=0.0024190935633589088>
<Record ref_id='2009004709' score=0.0016565631232025467>
<Record ref_id='1978519374' score=0.0016470429849386567>
<Record ref_id='2147880316' score=0.00155800782

# Loading data into PostgreSQL

In [3]:
# Connection to the local PostgreSQL database "dblp".
# The password can be supplied through the DBLP_PG_PASSWORD environment variable
# so it does not have to live in the notebook; the previous literal is kept as
# the fallback so existing local setups keep working unchanged.
connection = psycopg2.connect(host="localhost", port=5432, database="dblp",
                              user="dblp",
                              password=os.environ.get("DBLP_PG_PASSWORD", "dblp"))

# Single cursor used for the DDL and INSERT statements issued on this connection.
cursor = connection.cursor()

In [4]:
# Start from a clean slate: drop any previous version of the table.
cursor.execute("""DROP TABLE IF EXISTS reference""")

# One row per paper: its id (primary key), its title, and the full record
# serialized as JSON.
# NOTE(review): ref_id is capped at 10 characters and title at 511 — a longer
# value would make the INSERT fail; confirm these bounds hold for the dataset.
cursor.execute("""CREATE TABLE reference(
                ref_id VARCHAR (10) PRIMARY KEY,
                title VARCHAR (511) NOT NULL,
                json_record JSON NOT NULL);""")

In [5]:
# Fields of a paper record: 'abstract', 'authors', 'id', 'references',
#                           'citations', 'title', 'venue', 'year'

# Re-list the chunk files so this section does not rely on the variable
# computed earlier in the notebook.
chunkfiles = [f for f in os.listdir(dblp_dataset_dir_path) if 'chunk' in f]

# Insert every paper of every chunk file, one parameterized INSERT per paper.
# Nothing is committed here; the rows stay in the connection's open transaction.
for filename in tqdm(chunkfiles):
    with open(dblp_dataset_dir_path + filename) as file:
        for ref in json.load(file):
            # The title is stored lower-cased; the untouched record is kept
            # alongside it as a JSON string.
            cursor.execute("""INSERT INTO reference (ref_id, title, json_record) VALUES  
                              (%(ref_id)s, %(title)s, %(json_rec)s)""",
                           {'ref_id': ref['id'], 'title': ref['title'].lower(), 
                            'json_rec': json.dumps(ref)})

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [6]:
# Commit the pending transaction on the PostgreSQL connection.
connection.commit()

In [7]:
# Release the cursor first, then the connection it belongs to.
cursor.close()
connection.close()

# Loading data into Elasticsearch

In [25]:
# Client for the Elasticsearch node listening on localhost:9200.
es_hosts = [{'host': 'localhost', 'port': 9200}]
es = Elasticsearch(es_hosts)

In [30]:
# Remove any previous 'dblp_v1' index. ignore=[400, 404] asks the client not to
# raise on those HTTP statuses, so this also succeeds when the index is absent.
es.indices.delete(index='dblp_v1', ignore=[400, 404])

{'acknowledged': True}

In [31]:
# Body of the index-creation request: a single shard and no replicas.
settings = {"settings": dict(number_of_shards=1, number_of_replicas=0)}

# Disabled explicit (strict) mapping, kept for reference. It is NOT part of
# `settings` above, so the index is created without an explicit mapping.
#        "mappings": {
#            "paper": {
#                "dynamic": "strict",
#                "properties": {
#                    "id": {
#                        "type": "keyword"
#                    },
#                    "title": {
#                        "type": "text"
#                    },
#                    "authors": {
#                        "type": "text"
#                    },
#                    "year": {
#                        "type": "integer"
#                    },
#                    "venue": {
#                        "type": "text"
#                    },
#                    "abstract": {
#                        "type": "text"
#                    },
#                }
#            }
#        }
#    }

# Create the 'dblp_v1' index; HTTP 400/404 responses are ignored rather than raised.
es.indices.create(index='dblp_v1', body=settings, ignore=[400, 404])

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'dblp_v1'}

In [33]:
# Fields of a paper record: 'abstract', 'authors', 'id', 'references',
#                           'citations', 'title', 'venue', 'year'

def to_es_document(ref):
    """Flatten a paper record (in place) into the document indexed in Elasticsearch.

    - 'authors' becomes the author names joined with ' ; '
    - 'venue' becomes the venue's 'raw' string ('' when absent)
    - 'year' defaults to 0 when absent
    - 'references' and 'citations' are dropped

    Records lacking 'authors' or 'venue' are tolerated (empty string) instead
    of raising KeyError.

    Returns the same dict that was passed in.
    """
    authors = ref.get('authors') or []
    venue = ref.get('venue') or {}
    ref['authors'] = ' ; '.join(a['name'] for a in authors)
    ref['venue'] = venue.get('raw', '')
    ref['year'] = ref.pop('year', 0)
    ref.pop('references', None)
    ref.pop('citations', None)
    return ref


# Re-list the chunk files so this section does not rely on the variable
# computed earlier in the notebook.
chunkfiles = [f for f in os.listdir(dblp_dataset_dir_path) if 'chunk' in f]

for filename in tqdm(chunkfiles):
    # Read the whole chunk, then release the file handle before indexing.
    with open(dblp_dataset_dir_path + filename) as file:
        refs = json.load(file)

    # helpers.bulk accepts any iterable of documents and splits it into bulk
    # requests itself, so no manual batching is needed here.
    helpers.bulk(es, (to_es_document(ref) for ref in refs),
                 index='dblp_v1', doc_type='paper')

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [35]:
# Refresh the index so the documents indexed above are visible to the count,
# then display the document count of 'dblp_v1' as JSON.
es.indices.refresh('dblp_v1')
es.cat.count('dblp_v1', params={"format": "json"})

[{'epoch': '1586268360', 'timestamp': '14:06:00', 'count': '2945030'}]