In [225]:
import pandas as pd
import numpy as np
import sys
from py2neo import Graph

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from collections import Counter

In [265]:
# Set up authentication parameters
# Connect to authenticated graph database
g = Graph("bolt://localhost:7687", user = "neo4j", password = "Swathiravi22")

### Book Recommendation System

In [282]:
def collaborative_filtering(graph_1, uid, neighbourhood_size, num_recos):

    query = """
            MATCH (u1:User {user_id:$uid})-[:REVIEWS]->(Book1)
            WITH u1, collect(id(Book1)) AS u1Books
            MATCH (u2:User)-[:REVIEWS]->(Book2) WHERE u1 <> u2
            WITH u1 as us1, u2 as us2, (gds.alpha.similarity.jaccard(u1Books, collect(id(Book2))))*100 AS similarity

            ORDER BY similarity DESC, us2.user_id
            WITH us1, COLLECT(us2)[0..$k] as neighbours
            WHERE size(neighbours) = $k
            UNWIND neighbours as neighbour
            WITH us1, neighbour

            MATCH (neighbour)-[:REVIEWS]->(b:Book)
            WHERE not (us1)-[:REVIEWS]->(b:Book)
            WITH us1,b,COUNT(DISTINCT neighbour) as cnt
            ORDER BY us1.user_id, cnt DESC
            RETURN us1.user_id as user, COLLECT(b.title)[0..$n] as recos
           """

    recos = {}

    for row in graph_1.run(query, uid = uid, k=neighbourhood_size, n=num_recos):

        recos[row[0]] = row[1]

    return recos

In [301]:
uid = '0ef32090550901ead25cb0ea21c4d36b'
recommendations = collaborative_filtering(g,uid,20,10)

print("Books recommendations for", uid, ":\n")
for num in range(len(recommendations[uid])):
    print(recommendations[uid][num])

Books recommendations for 0ef32090550901ead25cb0ea21c4d36b :

Letters to a Young Poet
Eugene Onegin
Hate That Cat (Jack, #2)
Some Ether
Neon Vernacular: New and Selected Poems
Imperial
New Selected Poems and Translations
Just Saying
Book of Haikus
Flowers of Evil: A Selection


### Link Prediction

In [154]:
def sample(df):
    copy = df.copy()
    zero = Counter(copy.label.values)[0]
    un = Counter(copy.label.values)[1]
    n = zero - un
    if (n<0):
        n = int(zero/3)
    print(zero , un, n)
    copy = copy.drop(copy[copy.label == 0].sample(n=n, random_state=1).index)
    return copy.sample(frac=1)

In [155]:
train_existing_links = g.run("""
MATCH (user:User)-[:SHARE_BOOK_EARLY]->(other:User)
RETURN id(user) AS node1, id(other) AS node2, 1 AS label
""").to_data_frame()

In [156]:
train_missing_links = g.run("""
MATCH (user:User)
WHERE (user)-[:SHARE_BOOK_EARLY]-()
WITH user
LIMIT 25
MATCH (user)-[:SHARE_BOOK_EARLY*2]-(other)
WHERE not((user)-[:SHARE_BOOK_EARLY]-(other))
RETURN id(user) AS node1, id(other) AS node2, 0 AS label
""").to_data_frame()
train_missing_links = train_missing_links.drop_duplicates()

In [157]:
training_df = train_missing_links.append(train_existing_links, ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

In [158]:
training_df = sample(training_df)

27578 189172 9192


In [159]:
test_existing_links = g.run("""
MATCH (user:User)-[:SHARE_BOOK_LATE]->(other:User)
RETURN id(user) AS node1, id(other) AS node2, 1 AS label
""").to_data_frame()

In [160]:
test_missing_links = g.run("""
MATCH (user:User)
WHERE (user)-[:SHARE_BOOK_LATE]-()
WITH user
LIMIT 25
MATCH (user)-[:SHARE_BOOK_LATE*2]-(other)
WHERE not((user)-[:SHARE_BOOK_LATE]-(other))
RETURN id(user) AS node1, id(other) AS node2, 0 AS label
""").to_data_frame()
test_missing_links = test_missing_links.drop_duplicates()

In [161]:
test_df = test_missing_links.append(test_existing_links, ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

In [162]:
test_df = sample(test_df)

8192 75173 2730


In [163]:
def apply_graphy_features(data, rel_type):
    query = """
    UNWIND $pairs AS pair
    MATCH (u1) WHERE id(u1) = pair.node1
    MATCH (u2) WHERE id(u2) = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(u1,u2,{relationshipQuery:"SHARE_BOOK_LATE"}) AS CN,
           gds.alpha.linkprediction.adamicAdar(u1, u2, {relationshipQuery:"SHARE_BOOK_LATE"}) AS aa,
           gds.alpha.linkprediction.totalNeighbors(u1, u2, {relationshipQuery:"SHARE_BOOK_LATE"}) AS tn
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]
    features = g.run(query, {"pairs": pairs, "relType": rel_type}).to_data_frame()
    return pd.merge(data, features, on = ["node1", "node2"])

In [164]:
training_df = apply_graphy_features(training_df, "SHARE_BOOK_EARLY")

In [165]:
test_df = apply_graphy_features(test_df, "SHARE_BOOK_LATE")

In [175]:
training_df.head()

Unnamed: 0,node1,node2,label,CN,aa,tn
0,3085,8515,0,0.0,0.0,0.0
1,8841,13581,1,7.0,1.534605,73.0
2,6611,7466,1,13.0,2.376036,17.0
3,5163,13209,1,8.0,1.337278,62.0
4,5207,11371,1,8.0,1.337278,55.0


In [176]:
test_df.head()

Unnamed: 0,node1,node2,label,CN,aa,tn
0,10160,12755,1,167.0,29.20634,259.0
1,12580,13684,1,20.0,5.428605,25.0
2,3109,11031,0,2.0,0.341023,446.0
3,5692,9567,1,30.0,5.704548,218.0
4,9886,11196,1,136.0,23.417943,266.0


In [166]:
classifier = RandomForestClassifier(n_estimators=30, max_depth=10, random_state=0)

In [300]:
columns = ["CN"]

X = training_df[columns]
y = training_df["label"]
classifier.fit(X, y)

predictions = classifier.predict(test_df[columns])
y_test = test_df["label"]

display("Accuracy", accuracy_score(y_test, predictions))
display("Precision", precision_score(y_test, predictions))
display("Recall", recall_score(y_test, predictions))

sorted(list(zip(columns, classifier.feature_importances_)), key = lambda x: x[1]*-1)

'Accuracy'

0.6525454207230111

'Precision'

0.905957300275482

'Recall'

0.6999587617894723

[('CN', 1.0)]

In [173]:
columns = ["aa"]

X = training_df[columns]
y = training_df["label"]
classifier.fit(X, y)

predictions = classifier.predict(test_df[columns])
y_test = test_df["label"]

display("Accuracy", accuracy_score(y_test, predictions))
display("Precision", precision_score(y_test, predictions))
display("Recall", recall_score(y_test, predictions))

sorted(list(zip(columns, classifier.feature_importances_)), key = lambda x: x[1]*-1)

'Accuracy'

0.6196192720282756

'Precision'

0.9023234788897929

'Recall'

0.6638420709563274

[('aa', 1.0)]

In [174]:
columns = ["tn"]

X = training_df[columns]
y = training_df["label"]
classifier.fit(X, y)

predictions = classifier.predict(test_df[columns])
y_test = test_df["label"]

display("Accuracy", accuracy_score(y_test, predictions))
display("Precision", precision_score(y_test, predictions))
display("Recall", recall_score(y_test, predictions))

sorted(list(zip(columns, classifier.feature_importances_)), key = lambda x: x[1]*-1)

'Accuracy'

0.9135611086996962

'Precision'

0.9552268692181388

'Recall'

0.9518976228167028

[('tn', 1.0)]