## Python Neo4j Queries

**Part 1** - playground for translating ETH search input to Neo4j queries, mainly experimental

**Part 2** - Neo4j commands to construct the expertise score (author expert on topic)

## Part 1

In [None]:
import pandas as pd
import numpy as np
import nltk
import pickle
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
def prepro(query):
    '''
    Turn a free-text query into the stemmed/lemmatised tokens used for matching
    against data in Neo4j.

    The query is coerced to ``str`` and tokenised with gensim's
    ``simple_preprocess`` (``deacc=True``), English stop words are dropped, and
    every remaining token is Porter-stemmed and then passed through the WordNet
    lemmatiser.

    Parameters
    ----------
    query : object
        The search input; anything accepted by ``str()``.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    '''
    # A set gives constant-time stop-word checks instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))
    lemmatiser = WordNetLemmatizer()
    word_stemmer = PorterStemmer()
    tokens = gensim.utils.simple_preprocess(str(query), deacc=True)
    # Stem first, then lemmatise the stem (same order as before).
    return [
        lemmatiser.lemmatize(word_stemmer.stem(word))
        for word in tokens
        if word not in stop_words
    ]

In [None]:
# Quick manual check: display the tokens prepro produces for a two-word query.
prepro('machine learning')

In [None]:
def normalise(query):
    '''
    Tokenise ``query`` without stemming or stop-word removal.

    The value is converted with ``str()`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``; the resulting token list is
    returned unchanged.
    '''
    text = str(query)
    return gensim.utils.simple_preprocess(text, deacc=True)

In [None]:
# Load the tabular export that the author/department/organisation lookups are built from.
all_info = pd.read_csv('graph_data_final.csv')
print(all_info.columns)
# Last expression of the cell: shows the first rows as a sanity check.
all_info.head()

In [None]:
def create_list(query_type):
    '''
    Build the tokenised lookup list for one column of ``all_info``.

    Every distinct value of ``all_info[query_type]`` is run through
    ``normalise``; the token lists are returned in the order ``unique()``
    yields the values.
    '''
    distinct_values = all_info[query_type].unique()
    return [normalise(value) for value in distinct_values]

In [None]:
# Tokenised lookup lists: one token list per distinct value of the named column.
# author_name stays index-aligned with all_info['author'].unique().
author_name = create_list('author')
department = create_list('department') 
# NOTE(review): the [1:] slice assumes the missing value (NaN) is the first distinct
# organisation in the CSV — confirm; it also shifts indices by one relative to
# all_info['organisation'].unique().
organisation = create_list('organisation')[1:] #First is nan

In [None]:
# True only if some author normalises to exactly the single token 'andreas'
# (list equality against each token list, not a substring/partial match).
['andreas'] in author_name

### Query Matching - Still need to do partial Matching

In [None]:
import re
def _cypher_string(value):
    '''Render ``value`` as a single-quoted Cypher string literal, escaping backslashes and quotes.'''
    escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
    return "'" + escaped + "'"


def _author_where_clause(names):
    '''Build the ``WHERE`` line selecting the matched author(s) by ``p.name``.'''
    if len(names) == 1:
        return 'WHERE p.name = ' + _cypher_string(names[0])
    # Several distinct spellings share the same token set: match any of them
    # instead of gluing them into one name that cannot exist in the graph.
    return 'WHERE p.name IN [' + ', '.join(_cypher_string(n) for n in names) + ']'


def query_match(query):
    '''
    Print the Neo4j (Cypher) queries that correspond to a search string.

    The query is tokenised with ``normalise`` and compared, as a set of tokens,
    against the pre-built ``author_name`` and ``organisation`` lookup lists
    (exact token-set equality; partial matching is not implemented).

    * Author match: prints queries for the author's department, organisation,
      10 most recent publications, 10 most common collaborators and expertise
      areas.
    * Organisation match (only checked when no author matched): prints the
      organisation query.
    * No match: prints ``not an author`` / ``not an organisation``.

    Parameters
    ----------
    query : object
        The search input; anything accepted by ``normalise``.

    Returns
    -------
    None
        All output is written to stdout.
    '''
    query_tokens = set(normalise(query))
    # author_name is index-aligned with the distinct author values, so the
    # index of a matching token list recovers the original spelling.
    author_labels = all_info['author'].unique()
    matching_author = [author_labels[i] for i, v in enumerate(author_name) if query_tokens == set(v)]
    matching_org = [v for v in organisation if query_tokens == set(v)]

    if not matching_author and not matching_org:
        # TODO: fall back to partial matching on individual tokens.
        print('not an author')
        print('not an organisation')
    elif matching_author:
        where_clause = _author_where_clause(matching_author)
        print('--------Neo4J query: Author--------')
        print('--------Show Department--------')
        print('MATCH (p:Person) - [r:WORKS_IN] - (d)')
        print(where_clause)
        print('Return d')
        print('--------Show Organisation--------')
        print('MATCH (p:Person) - [r:BELONGS_TO] - (d)')
        print(where_clause)
        print('Return d')
        print('--------Show 10 most recent publications--------')
        print('MATCH (p:Person) - [r:PUBLISHED] - (d)')
        print(where_clause)
        print('Return d')
        # Without an ordering, LIMIT returns 10 arbitrary publications; the
        # Part 2 weighting query reads `date` from Publication nodes.
        print('ORDER BY d.date DESC')
        print('Limit 10')
        print('--------Show 10 most common collaborators--------')
        print('MATCH (p:Person)-[r1:PUBLISHED]-(pub:Publication)-[r2:PUBLISHED]-(c:Person)')
        print(where_clause)
        print('WITH c, COUNT(pub) as cp')
        print('RETURN c, cp')
        print('ORDER BY cp DESCENDING')
        print('LIMIT 10')
        print('--------Show Expertise Areas--------')
        print('MATCH (t:Topic) - [r:EXPERT_ON] - (p:Person)')
        print(where_clause)
        # EXPERT_ON relationships carry `score` (set in Part 2), not `weight`.
        print('RETURN r.score, t')
        print('ORDER BY r.score DESC')
    elif matching_org:
        print('--------Neo4J query: Organisation--------')
        print('MATCH (d:Department) - [:WORKS_IN] - (p:Person)')
        # NOTE(review): this prints the Python repr of the matched token lists and
        # filters on the person's name rather than on an organisation — the
        # organisation node label/property is not visible here, so the line is
        # left unchanged; confirm the schema and rewrite.
        print('WHERE p.name = ', matching_org)
        # ORDER BY is a sub-clause of RETURN/WITH and must come after it.
        print('RETURN d')
        print('ORDER by d.date DESC')

In [None]:
# Example queries
# Prints the author-centred Cypher queries when this name matches an entry of the
# author lookup list; otherwise falls through to the organisation / no-match output.
query = ('Mohsen Ghaffari')
query_match(query)

## Part 2

### Neo4J: Building the Expert score function

<br>
<br>

            // Add weights between authors and publications.
            // The weight decreases linearly with the publication's age in whole years,
            // measured back from 2020-01-01 and scaled by the 1930-01-01..2020-01-01 span.
            MATCH (p:Publication) - [r:PUBLISHED] - (a:Person)
            SET r.weight = 1-toFloat(duration.between(p.date, date("2020-01-01")).years)/toFloat(duration.between(date("1930-01-01"),date("2020-01-01")).years)
            
            // Add word count property to Topic nodes
            // (number of Word nodes connected to the topic via IS_IN).
            MATCH (w:Word) -[r:IS_IN] -(t:Topic)
            WITH t, count(w) as cnt
            SET t.word_count = cnt
            
            // Add publication count property to Topic nodes
            // (number of Publication nodes connected to the topic via IS_ABOUT).
            MATCH (p:Publication) -[r:IS_ABOUT] -(t:Topic)
            WITH t, count(p) as cnt
            SET t.pub_count = cnt
            
            // Create EXPERT_ON relationship between authors and topics
            // Note: To avoid memory issues, use WHERE for selection on subsets of departments
            // Ex: WHERE d.name IN ['Information Technology and Electrical Engineering', 'Mathematics']
            // Engineering choice: Exponential when computing alpha
            // alpha is 1 for an author with at least 10 publications on the topic,
            // and (cnt/10)^alpha_exp below that.
            WITH 0.75 AS alpha_exp
            MATCH (t:Topic)-[r1:IS_ABOUT]-(p:Publication)-[r2:PUBLISHED]-(a:Person)-[r3:WORKS_IN]-(d:Department)
            WITH t, a, toFloat(count(p)) as cnt, alpha_exp
            WITH t, a,
               CASE WHEN cnt>=10.0
                  THEN 1
                  ELSE (cnt/10.0)^(alpha_exp)
               END AS alpha
            // Store alpha on the relationship: the score statement reads r4.alpha,
            // which would otherwise be null and make every score null.
            CREATE (a) - [r4:EXPERT_ON {alpha: alpha}] -> (t)
            
            // Create score property on EXPERT_ON relationship
            // Again, select on department to avoid memory issues
            // Engineering choice: Exponentials for word count and publication count
            WITH 63 AS min_pub_count, 9 AS min_word_count
            MATCH (t:Topic)-[r1:IS_ABOUT]-(p:Publication)-[r2:PUBLISHED]-(a:Person)-[r3:WORKS_IN]-(d:Department)
            // Aggregate first, carrying the constants as explicit grouping keys: newer
            // Neo4j versions reject an aggregate mixed with a variable that is not a
            // grouping key ("implicit grouping expressions").
            WITH t, a, min_word_count, min_pub_count, AVG(r1.weight*r2.weight) as avg_weight
            WITH t, a, avg_weight*(min_word_count/toFloat(t.word_count))^(0.75)*(min_pub_count/toFloat(t.pub_count))^(1) as s
            MATCH (t)-[r4:EXPERT_ON]-(a)
            SET r4.score = s*r4.alpha
            
            

### Expert Query Example

<br>
<br>

            // Find experts on Climate Change
            // Find all topics that contain all search terms
            // Aggregate expert scores, by summing over all scores between topic & author
            // and weighting by the sum of the word probabilities in topic
            // Engineering choice: Sum or average to aggregate word probabilities
            // The search terms are given in their stemmed form.
            WITH ['climat', 'chang']
            as words
            MATCH (w:Word)-[r1:IS_IN]-(t:Topic)
            WHERE w.name in words
            WITH t, size(words) as inputCnt, count(DISTINCT w) as cnt, SUM(r1.weight) as s
            // Keep only topics in which every search term occurs.
            WHERE cnt = inputCnt
            WITH  t, s
            MATCH (t:Topic)-[r3:EXPERT_ON] - (p:Person)-[r2:WORKS_IN]-(d:Department)
            WITH p, SUM(r3.score*s) as s2, d
            RETURN p.name, s2, d.name
            ORDER BY s2 DESC