In [20]:
from neo4j import GraphDatabase

import pandas as pd

In [21]:
class Neo4jConnection:
    
    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)
        
    def close(self):
        if self.__driver is not None:
            self.__driver.close()
        
    def query(self, query, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try: 
            session = self.__driver.session(database=db) if db is not None else self.__driver.session() 
            response = list(session.run(query))
        except Exception as e:
            print("Query failed:", e)
        finally: 
            if session is not None:
                session.close()
        return response

In [22]:
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "neo4j://localhost:7687"
AUTH = ("neo4j", "12345678")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

conn = Neo4jConnection(uri=URI, user=AUTH[0], pwd=AUTH[1])

<h3>Query 1</h3>

Find/define the research communities

In [23]:
research_communities = [['database', ['data management', 'indexing', 'data modeling', 'big data', 'data processing', 'data storage', 'data querying']],
            ['artificial_intelligence', ['machine learning', 'neural networks', 'deep learning', 'natural language processing', 'computer vision', 'reinforcement learning', 'expert systems', 'knowledge representation', 'genetic algorithms', 'bayesian networks']],
            ['cybersecurity', ['cybersecurity', 'network security', 'information security', 'vulnerability', 'penetration testing', 'threat detection', 'malware analysis', 'security policies', 'risk management', 'cybercrime']]
]

In [24]:
query =''' UNWIND [
                   ['database', ['data management', 'indexing', 'data modeling', 'big data', 'data processing', 'data storage', 'data querying']],
                   ['artificial_intelligence', ['machine learning', 'neural networks', 'deep learning', 'natural language processing', 'computer vision', 'reinforcement learning', 'expert systems', 'knowledge representation', 'genetic algorithms', 'bayesian networks']],
                   ['cybersecurity', ['cybersecurity', 'network security', 'information security', 'vulnerability', 'penetration testing', 'threat detection', 'malware analysis', 'security policies', 'risk management', 'cybercrime']]
                  ] AS rsrch_com
    MERGE (c:Community {name: rsrch_com[0]})
    WITH c, rsrch_com
    UNWIND rsrch_com[1] as keyword
    MERGE (key:Keyword {word: keyword})
    MERGE (key)-[:DEFINES]->(c)
'''

res = conn.query(query=query)

<h3>Query 2</h3>

Find the conferences and journals related to the database community
(i.e., are specific to the field of databases). Assume that if 90% of the papers published
in a conference/journal contain one of the keywords of the database community we
consider that conference/journal as related to that community.

In [25]:
query =''' MATCH (n:Conference|Journal)-[:OF|PUBLISHED_IN]-(:Edition|Volume)-[:PRESENTED_IN|BELONGS_TO]-(p:Paper)-[:RELATED_TO]-(:Keyword)-[:DEFINES]-(c:Community)
WHERE c.name = "database"
WITH n, c, count(distinct p) as num_papers_com

MATCH (n)-[:OF|PUBLISHED_IN]-(:Edition|Volume)-[:PRESENTED_IN|BELONGS_TO]-(p1:Paper)
WITH n, c, num_papers_com, count(p1) AS total_num_papers
WHERE 100*toFloat(num_papers_com)/total_num_papers >= 90
WITH n, c, num_papers_com, total_num_papers

MERGE (n)-[:LINKED_TO]->(c)
RETURN n.id, n.name, num_papers_com, total_num_papers, 100*num_papers_com/total_num_papers
'''
res = conn.query(query=query)
df = pd.DataFrame(res, columns=["ID", "Name", "Number of Papers per Community", "Total Number of Papers", "Percentage"])
df

Unnamed: 0,ID,Name,Number of Papers per Community,Total Number of Papers,Percentage
0,59943403,GTE Laboratories Incorporated,12,12,100
1,42806937,ANSI X2H2,1,1,100
2,18459043,Research Report / G / IBM / Cambridge Scientif...,1,1,100
3,32986085,Technical Report,1,1,100
4,90081263,"ETH Zurich, Department of Computer Science / T...",1,1,100


<h3>Query 3</h3>

Next, we want to identify the top papers of these conferences/journals. We need to find the papers with the highest page rank provided the number of citations from the papers of the same community (papers in the conferences/journals of the database community). As a result we would obtain (highlight), say, the top-100 papers of the conferences of the database community.

In [28]:
query =''' MATCH (c:Community)<-[r:LINKED_TO]-(:Conference|Journal)<-[:OF|PUBLISHED_IN]-(:Edition|Volume)<-[:PRESENTED_IN|BELONGS_TO]-(p1:Paper)<-[:CITES]-(p2:Paper)-[:PRESENTED_IN|BELONGS_TO]->(:Edition|Volume)-[:OF|PUBLISHED_IN]->(n:Conference|Journal)
           WHERE c.name="database"
           WITH c, n, p1
           MATCH (c)--(n)
           WITH c, n, p1, count(*) AS cites
           ORDER BY cites DESC LIMIT 100
           MERGE (p1)-[:TOP_100]->(c)
           RETURN c.name, p1.id, p1.title, cites
        '''
res = conn.query(query=query)
df = pd.DataFrame(res, columns=["Community", "Paper ID", "Paper Title", "Number of Citations"])
df

Unnamed: 0,Community,Paper ID,Paper Title,Number of Citations
0,database,4,DARWIN: On the Incremental Migration of Legacy...,4
1,database,10,MetaObject Protocol Concepts for a RISC Object...,4
2,database,9,A 'RISC' Object Model for Object System Intero...,3
3,database,14,Experiments with Dispatching in a Distributed ...,3
4,database,5,"Integrating Heterogeneous, Autonomous, Distrib...",2
5,database,3,An Evaluation of Object-Oriented DBMS Developm...,2
6,database,7,Integrating Object-Oriented Applications and M...,2
7,database,8,Towards a Transaction Management System for DOM.,2
8,database,11,Object Data Language Facilities for Multimedia...,2
9,database,3,An Evaluation of Object-Oriented DBMS Developm...,1


<h3>Query 4</h3>

Finally, an author of any of these top-100 papers is automatically considered a potential good match to review database papers. In addition, we want to identify gurus, i.e., very reputated authors that would be able to review for top conferences. We identify gurus as those authors that are authors of, at least, two papers among the top-100 identified.

In [29]:
query =''' MATCH (c:Community)-[r:TOP_100]-(p:Paper)<-[:WROTE]-(a:Author)
           WHERE c.name = "database"
           WITH a, c, count(r) as numPapers
           WHERE numPapers > 1
           MERGE (a)-[:GURU_OF]->(c)
           RETURN a.name, numPapers
            '''
res = conn.query(query=query)
df = pd.DataFrame(res, columns=["Author", "Num Papers"])
df

Unnamed: 0,Author,Community,Num Papers
0,Frank Manola,database,8
1,Farshad Nayeri,database,2
2,Mark F. Hornick,database,2
3,Alejandro P. Buchmann,database,2
