### Install Neo4j packages

In [0]:
%%capture
%pip install neo4j
%pip install graphdatascience

### Create Neo4j Driver and set configuration variables

In [0]:
from neo4j import GraphDatabase

# Connection details for the Neo4j instance. The username and password are
# read from the "neo4j" secret scope instead of being written in the notebook.
Neo4jConnectionInfo = dict(
    URI="neo4j+s://62d77af8.databases.neo4j.io",
    Username=dbutils.secrets.get(scope="neo4j", key="username"),
    Password=dbutils.secrets.get(scope="neo4j", key="password"),
)

# Settings shared by the DataFrame writers: the Spark save mode and the name
# of the helper column used to split relationship tables into partitions.
LoadSettings = dict(
    writeMode="Overwrite",
    partitionColName="partitionCode",
)

# Driver used for running Cypher statements directly against the database.
driver = GraphDatabase.driver(
    Neo4jConnectionInfo["URI"],
    auth=(Neo4jConnectionInfo["Username"], Neo4jConnectionInfo["Password"]),
)


Failed to write data to connection ResolvedIPv4Address(('104.199.72.226', 7687)) (ResolvedIPv4Address(('104.199.72.226', 7687)))
Failed to write data to connection IPv4Address(('62d77af8.databases.[REDACTED].io', 7687)) (ResolvedIPv4Address(('104.199.72.226', 7687)))


### Create constraints in the Neo4j database

In [0]:

# helper function
def run(driver, query, params=None):
    """Execute a Cypher query in a new session and return its records as a list.

    Args:
        driver: object whose ``session()`` returns a context-managed session.
        query: Cypher text to execute.
        params: optional query parameters; when ``None`` the query is run
            without a parameters argument.

    Returns:
        A list holding every record produced by ``session.run``. The list is
        built inside the ``with`` block, i.e. before the session is closed.
    """
    # Only forward the parameters when the caller actually supplied them.
    call_args = (query,) if params is None else (query, params)
    with driver.session() as session:
        return list(session.run(*call_args))

# Uniqueness constraints on the key property of each node label that the load
# cells below merge on. Every statement uses IF NOT EXISTS, so this cell can be
# re-run against a database that already has the constraints.
run(driver,'CREATE CONSTRAINT userId_unique IF NOT EXISTS FOR (user:User) REQUIRE user.userId  IS UNIQUE')

run(driver,'CREATE CONSTRAINT news_id_unique IF NOT EXISTS FOR (news:News) REQUIRE news.newsId IS UNIQUE')

run(driver,'CREATE CONSTRAINT wiki_id_unique IF NOT EXISTS FOR (entity:WikiEntity) REQUIRE entity.wikidataId IS UNIQUE')

run(driver,'CREATE CONSTRAINT category_subject_unique IF NOT EXISTS FOR (category:Category) REQUIRE category.subject IS UNIQUE')

# Last expression of the cell: its (empty) record list is what the cell displays.
run(driver,'CREATE CONSTRAINT sub_category_subject_unique IF NOT EXISTS FOR (subcategory:Subcategory) REQUIRE subcategory.subject IS UNIQUE')

[]

Run this in Workspace to delete all nodes and relationships in the db

CALL apoc.periodic.iterate('MATCH (n) RETURN n', 'DETACH DELETE n', {batchSize: 500})

### Define utility functions for writing DataFrames to Neo4j

In [0]:

from pyspark.sql.functions import col, substring, concat_ws

def writeTableToNeo(tableName, cypherQuery):
    """Write every row of a Spark table to Neo4j through a Cypher query.

    Args:
        tableName: name of the table loaded with ``spark.read.table``.
        cypherQuery: Cypher handed to the connector's ``query`` option (the
            queries in this notebook refer to the current row as ``event``).

    The connection details come from ``Neo4jConnectionInfo`` and the save mode
    from ``LoadSettings["writeMode"]``. Nothing is returned.
    """
    connector_options = (
        ("url", Neo4jConnectionInfo["URI"]),
        ("authentication.basic.username", Neo4jConnectionInfo["Username"]),
        ("authentication.basic.password", Neo4jConnectionInfo["Password"]),
        ("query", cypherQuery),
    )
    writer = spark.read.table(tableName).write.format("org.neo4j.spark.DataSource")
    for option_name, option_value in connector_options:
        writer = writer.option(option_name, option_value)
    writer.mode(LoadSettings["writeMode"]).save()
    
def writeTableToNeoSinglePartition(tableName, cypherQuery):
    """Write a Spark table to Neo4j after collapsing it into one partition.

    Same as the plain table writer except that the table is first
    ``repartition(1)``-ed, so the connector receives a single partition.

    Args:
        tableName: name of the table loaded with ``spark.read.table``.
        cypherQuery: Cypher handed to the connector's ``query`` option.
    """
    single_partition_df = spark.read.table(tableName).repartition(1)
    connector_options = (
        ("url", Neo4jConnectionInfo["URI"]),
        ("authentication.basic.username", Neo4jConnectionInfo["Username"]),
        ("authentication.basic.password", Neo4jConnectionInfo["Password"]),
        ("query", cypherQuery),
    )
    writer = single_partition_df.write.format("org.neo4j.spark.DataSource")
    for option_name, option_value in connector_options:
        writer = writer.option(option_name, option_value)
    writer.mode(LoadSettings["writeMode"]).save()
    
def writeDfToNeo(df, cypherQuery):
    """Write an already-prepared DataFrame to Neo4j through a Cypher query.

    Unlike the table writers, the caller controls the partitioning of ``df``
    and the connector's ``batch.size`` option is set to 25000.

    Args:
        df: DataFrame whose rows are sent to Neo4j.
        cypherQuery: Cypher handed to the connector's ``query`` option.
    """
    connector_options = (
        ("url", Neo4jConnectionInfo["URI"]),
        ("authentication.basic.username", Neo4jConnectionInfo["Username"]),
        ("authentication.basic.password", Neo4jConnectionInfo["Password"]),
        ("batch.size", 25000),
        ("query", cypherQuery),
    )
    writer = df.write.format("org.neo4j.spark.DataSource")
    for option_name, option_value in connector_options:
        writer = writer.option(option_name, option_value)
    writer.mode(LoadSettings["writeMode"]).save()
    
# tableSize = 10 means a 10x10 table
def getPartitionsAndBatches(tableSize):
    """Arrange a tableSize x tableSize grid of partition codes into batches.

    Batch ``i`` contains the code ``"<(i + j) % tableSize>-<j>"`` for every
    ``j`` in ``range(tableSize)``. Within one batch each first number and each
    second number therefore appears exactly once.

    Args:
        tableSize: side length of the grid.

    Returns:
        A list of ``tableSize`` batches, each a list of ``tableSize`` strings.
    """
    return [
        [f"{(row + column) % tableSize}-{column}" for column in range(tableSize)]
        for row in range(tableSize)
    ]


# Batch layout used when relationship tables are written to Neo4j.
batches = getPartitionsAndBatches(10)

def filter_by_partition_code(df, partitionColName, partitionSet):
    """Keep only the rows whose partition code belongs to ``partitionSet``.

    Args:
        df: DataFrame that has a column named ``partitionColName``.
        partitionColName: name of the column holding the partition code.
        partitionSet: collection of codes to keep.

    Returns:
        The filtered DataFrame.
    """
    code_in_batch = col(partitionColName).isin(partitionSet)
    return df.filter(code_in_batch)

def writeRelTableToNeo(tableName, startNodeCol, endNodeCol, cypherQuery):
    """Write a relationship table to Neo4j one batch of partition codes at a time.

    Every row gets a partition code ``"<last char of start id>-<last char of
    end id>"``. The table is then written batch by batch (see ``batches``):
    each batch is filtered out, repartitioned by that code and written with
    ``writeDfToNeo``. Within a batch no two codes share a start or an end
    character; presumably this keeps parallel writers from touching the same
    nodes at once (TODO confirm).

    NOTE(review): rows whose code is not in any batch (for example ids that do
    not end in a digit) are filtered out of every batch and never written.

    Args:
        tableName: name of the table loaded with ``spark.read.table``.
        startNodeCol: column holding the start-node id.
        endNodeCol: column holding the end-node id.
        cypherQuery: Cypher handed to the connector's ``query`` option.
    """
    partitionColName = LoadSettings['partitionColName']
    relDf = spark.read.table(tableName)
    codedDf = relDf.withColumn(
        partitionColName,
        concat_ws(
            '-',
            substring(relDf[startNodeCol], -1, 1),
            substring(relDf[endNodeCol], -1, 1),
        ),
    )

    # Batches are written strictly one after another. Each batch DataFrame is
    # used once and not kept around afterwards.
    for index, partitionSet in enumerate(batches):
        batchDf = filter_by_partition_code(codedDf, partitionColName, partitionSet)
        # One Spark partition per code in the batch (10 for the 10x10 layout),
        # derived from the batch itself instead of a separate hard-coded 10.
        parallelRelDf = batchDf.repartition(len(partitionSet), partitionColName)
        print(f'Num partitions ({index}): {parallelRelDf.rdd.getNumPartitions()}')

        writeDfToNeo(parallelRelDf, cypherQuery)


### Process tables and write Nodes to Neo4j

In [0]:
# Node loads. Each node is MERGEd on its uniquely-constrained key only and the
# remaining properties are applied with SET. Merging on the full property map
# would fail when this cell is re-run after any non-key property changed (the
# pattern no longer matches, and creating a second node violates the
# uniqueness constraint), and it would also fail on rows with a null property.
writeTableToNeo("users", "MERGE (user:User {userId: event.userId})")
writeTableToNeo("news", """
    MERGE (news:News {newsId: event.newsId})
    SET news.category = event.category,
        news.subcategory = event.subcategory,
        news.title = event.title,
        news.url = event.url
    WITH event, news
    WHERE event.abstract IS NOT null
    SET news.abstract = event.abstract""")
# MATCH (not MERGE): only News nodes written above receive an approxTime.
writeTableToNeo("news_approx_time", """
    MATCH (news:News {newsId: event.newsId})
    SET news.approxTime = event.approxTime
    """)
writeTableToNeo("categories", "MERGE (category:Category {subject: event.category})")
writeTableToNeo("entities", """
    MERGE (entity:WikiEntity {wikidataId: event.WikidataId})
    SET entity.wikiLabel = event.Label,
        entity.wikiType = event.Type,
        entity.url = 'https://www.wikidata.org/wiki/' + event.WikidataId""")
# The embedding arrives as a ';'-separated string and is stored as a float list.
writeTableToNeo("entity_embedding", """
    MATCH(entity:WikiEntity {wikidataId: event.wikiEntityId})
    SET entity.wikiEncoding = toFloatList(split(event.entityEmbedding, ';'))""")


### Process tables and write Relationships to Neo4j

In [0]:
# Subcategory nodes are keyed by "<category>-<subcategory>" and linked to their
# Category. The Category is MATCHed, so it must already have been written by
# the node-loading cell. Written through the single-partition writer.
writeTableToNeoSinglePartition("sub_category_of_category", """
    MERGE(subcategory:Subcategory {subject: event.category + '-' + event.subcategory})
    WITH subcategory, event
    MATCH(category:Category {subject: event.category})
    MERGE(subcategory)-[r:SUBCATEGORY_OF]->(category)
    """)

# Link each News node to its Subcategory; both ends are MATCHed, so this relies
# on the Subcategory nodes created by the previous call.
writeTableToNeoSinglePartition("news_belongs_to_sub_category", """
    MATCH(subcategory:Subcategory {subject: event.category + '-' + event.subcategory})
    MATCH(news:News {newsId: event.newsId})
    MERGE(news)-[r:BELONGS_TO_SUBCATEGORY]->(subcategory)
    """)

# The remaining relationship tables go through writeRelTableToNeo, which
# writes them in batches partitioned by the given start/end id columns.
# News -> WikiEntity for entities found in the title; confidence is stored as a float.
writeRelTableToNeo("news_title_about", "newsId", "WikidataId", """
    MATCH(news:News {newsId: event.newsId})
    MATCH(entity:WikiEntity {wikidataId: event.WikidataId})
    MERGE(news)-[r:TITLE_ABOUT{ confidence: toFloat(event.Confidence)}]->(entity)
    """)

# Same shape as above, for entities found in the abstract.
writeRelTableToNeo("news_abstract_about", "newsId", "WikidataId", """
    MATCH(news:News {newsId: event.newsId})
    MATCH(entity:WikiEntity {wikidataId: event.WikidataId})
    MERGE(news)-[r:ABSTRACT_ABOUT{ confidence: toFloat(event.Confidence)}]->(entity)
    """)

# User -> News clicks. The relationship is merged on splitSet, impressionId and
# impressionTime together, so distinct impressions give distinct relationships.
writeRelTableToNeo("clicks", "userId", "newsId", """
    MATCH(user:User {userId: event.userId})
    MATCH(news:News {newsId: event.newsId})
    MERGE(user)-[r:CLICKED {
      splitSet: event.splitSet,
      impressionId: event.impressionId,
      impressionTime: event.time
    }]->(news)
    """)

# Historic clicks carry only splitSet. Unlike the other write queries this one
# ends with RETURN count(r).
writeRelTableToNeo("historic_clicks", "userId", "newsId", """
    MATCH(user:User {userId: event.userId})
    MATCH(news:News {newsId: event.newsId})
    MERGE(user)-[r:HISTORICALLY_CLICKED {splitSet: event.splitSet}]->(news)
    RETURN count(r)
    """)

# Disabled: loading of DID_NOT_CLICK relationships (kept for reference).
# writeRelTableToNeo("did_not_clicks", "userId", "newsId", """
#     MATCH(user:User {userId: event.userId})
#     MATCH(news:News {newsId: event.newsId})
#     MERGE(user)-[r:DID_NOT_CLICK {
#       splitSet: event.splitSet,
#       impressionId: event.impressionId,
#       impressionTime: event.time
#     }]->(news)
#     """)

Num partitions (0): 10
Num partitions (1): 10
Num partitions (2): 10
Num partitions (3): 10
Num partitions (4): 10
Num partitions (5): 10
Num partitions (6): 10
Num partitions (7): 10
Num partitions (8): 10
Num partitions (9): 10
Num partitions (0): 10
Num partitions (1): 10
Num partitions (2): 10
Num partitions (3): 10
Num partitions (4): 10
Num partitions (5): 10
Num partitions (6): 10
Num partitions (7): 10
Num partitions (8): 10
Num partitions (9): 10
Num partitions (0): 10
Num partitions (1): 10
Num partitions (2): 10
Num partitions (3): 10
Num partitions (4): 10
Num partitions (5): 10
Num partitions (6): 10
Num partitions (7): 10
Num partitions (8): 10
Num partitions (9): 10
Num partitions (0): 10
Num partitions (1): 10
Num partitions (2): 10
Num partitions (3): 10
Num partitions (4): 10
Num partitions (5): 10
Num partitions (6): 10
Num partitions (7): 10
Num partitions (8): 10
Num partitions (9): 10


### Initialize Neo4j Graph Data Science (GDS)

In [0]:
from graphdatascience import GraphDataScience

# GDS client that reuses the notebook's Neo4j connection details.
gds = GraphDataScience(
    Neo4jConnectionInfo["URI"],
    auth=(Neo4jConnectionInfo["Username"], Neo4jConnectionInfo["Password"]),
    aura_ds=True,
)


### Basic Cypher Queries for Collaborative Filtering (CF) - not using GDS
From here we could try just using Cypher to accomplish basic Collaborative Filtering. For example, take the below user and the news they clicked. You can see a mixed interest between a number of categories.

In [0]:
# Example user whose clicks and recommendations are inspected in the queries below.
USER_ID = "U18465"

In [0]:
# List the news the example user reached through CLICKED relationships,
# ordered by relationship type and then newest impression first.
gds.run_cypher('''
    MATCH (u1:User {userId: $userId})
           -[r1:CLICKED]->(n1:News)
    RETURN n1.newsId AS newsId,
           n1.title AS title,
           n1.abstract AS abstract,
           n1.category AS category,
           n1.subcategory As subcategory,
           r1.impressionTime AS impressionTime,
           type(r1) AS clickType
    ORDER BY clickType, impressionTime DESC
    ''', params={'userId': USER_ID})

Unnamed: 0,newsId,title,abstract,category,subcategory,impressionTime,clickType
0,N34130,The Kardashians Face Backlash Over 'Insensitiv...,Kardashian's Face Backlash Over Family Food Fight,tv,tv-celebrity,2019-11-15T13:22:55.000000000,CLICKED
1,N31958,Opinion: Colin Kaepernick is about to get what...,The end may be near for the 3-year-old saga of...,sports,football_nfl,2019-11-15T13:22:55.000000000,CLICKED
2,N48740,I've been writing about tiny homes for a year ...,I stayed in a tiny house for three days to see...,lifestyle,voices,2019-11-15T12:11:01.000000000,CLICKED
3,N60724,I'm A Queer Woman Dating A Trans Man & No On...,I'm a queer woman partnered with a transgender...,lifestyle,voices,2019-11-15T12:11:01.000000000,CLICKED
4,N44621,Koalas face threat from historic brush fires,Hundreds of vulnerable koalas are feared dead ...,weather,weathertopstories,2019-11-15T12:11:01.000000000,CLICKED
5,N10083,"Debunking the week's tabloid stories: Nov. 15,...",A round-up of the week's tabloid stories that ...,entertainment,celebrity,2019-11-15T12:00:04.000000000,CLICKED
6,N6916,THEN AND NOW: What all your favorite '90s star...,These heartthrobs and fan favorites made the 1...,entertainment,celebrity,2019-11-15T10:26:28.000000000,CLICKED
7,N19990,Cows swept away by Hurricane Dorian found aliv...,A trio of wild cows believed to have been swep...,lifestyle,lifestylebuzz,2019-11-15T10:26:28.000000000,CLICKED
8,N5940,Meghan Markle and Hillary Clinton Secretly Spe...,Meghan Markle Invites Hillary Clinton to Her F...,lifestyle,lifestyleroyals,2019-11-15T10:08:57.000000000,CLICKED
9,N58748,Father-daughter first dance medley had every g...,How awesome is this first dance medley? They g...,video,viral,2019-11-15T08:34:38.000000000,CLICKED


Assuming we can measure the similarity of user interests via commonly clicked news articles, we can do a three-hop query to find potential recommendations for user U18465 based on the activity of users that clicked on the same news as U18465.

With the below query we can get an aggregate count of the nodes we would need to traverse over to get the recommendations. 

In [0]:
# Count what a three-hop traversal touches: the user's clicked news (n1), the
# other users who clicked that news (u2), and the news those users clicked (n2).
gds.run_cypher('''
    MATCH (u1:User {userId: $userId})
           -[r1:CLICKED]->(n1:News)
           <-[r2:CLICKED]-(u2:User)
           -[r3:CLICKED]->(n2:News)
    RETURN u1.userId AS userId,
           count(DISTINCT n1) AS clickedNews,
           count(DISTINCT u2) AS likeUsers,
           count(DISTINCT n2) AS potentialRecommendations
    ''', params={'userId': USER_ID})

Unnamed: 0,userId,clickedNews,likeUsers,potentialRecommendations
0,U18465,41,19559,5907


### Scaling CF with GDS: FastRP Node Embeddings and K-Nearest-Neighbor (KNN)
With GDS we can use FastRP node embeddings to reduce the dimensionality of the problem then use an unsupervised machine learning technique called K-Nearest Neighbor (KNN) to identify and draw recommendations between users with similar/close embeddings. Because FastRP embeddings are based off the graph structure, users with similar embeddings should also be relatively connected in the graph via having shared sets of viewed news articles.

#### Graph Projection
We will start with a graph projection leveraging the User and News nodes. We will also include both historic and recent impression clicks, but we will give less weight to historic clicks so as to favor more recent user activity. Lastly, we will use an UNDIRECTED orientation so FastRP can traverse the graph bidirectionally.

In [0]:
# Drop the 'embedding-projection' graph so the projection cell that follows can
# create a graph with that name again when the notebook is re-run.
gds.graph.drop('embedding-projection')

graphName                                             embedding-projection
database                                                             [REDACTED]
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                           159295
relationshipCount                                                  4720584
configuration            {'relationshipProjection': {'HISTORICALLY_CLIC...
density                                                           0.000186
creationTime                           2024-03-08T19:07:26.785337562+00:00
modificationTime                       2024-03-08T19:07:32.746542453+00:00
schema                   {'graphProperties': {}, 'nodes': {'User': {'em...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'User': {'em...
Name: 0, dtype: obje

In [0]:
# Project User and News nodes with both click relationship types, undirected.
# The 'weight' property is mapped from 'confidence', which the load queries in
# this notebook never set on CLICKED / HISTORICALLY_CLICKED relationships, so
# each relationship falls back to its defaultValue: 1.0 for recent clicks and
# 0.1 for historic ones.
g0, res = gds.graph.project(
    'embedding-projection',
    ['User', 'News'],
    {
        rel_type: {
            'orientation': 'UNDIRECTED',
            'properties': {
                'weight': {'property': 'confidence', 'defaultValue': default_weight}
            },
        }
        for rel_type, default_weight in (
            ('CLICKED', 1.0),
            ('HISTORICALLY_CLICKED', 0.1),
        )
    },
)
res

nodeProjection            {'User': {'label': 'User', 'properties': {}}, ...
relationshipProjection    {'HISTORICALLY_CLICKED': {'aggregation': 'DEFA...
graphName                                              embedding-projection
nodeCount                                                            159295
relationshipCount                                                   4720584
projectMillis                                                          1440
Name: 0, dtype: object

### FastRP
When running FastRP we will make sure to include the relationship weight property. While we should be able to do this all in one projection, depending on the GDS version you are using, I find it easiest to write the embeddings back to the database and create a separate projection just for KNN.

In [0]:
# Compute weighted FastRP embeddings on the in-memory projection (fixed seed) ...
gds.fastRP.mutate(
    g0,
    mutateProperty='embedding',
    embeddingDimension=256,
    randomSeed=7474,
    relationshipWeightProperty='weight',
);
# ... then write the 'embedding' property of the User nodes back to the database.
gds.graph.nodeProperties.write(
    g0,
    ["embedding"],
    ["User"],
);

If you are curious, this is what the embeddings look like: they are just vectors of floating point numbers. In this case they are 256 numbers long, as specified in the embeddingDimension parameter above.

In [0]:
# Peek at ten User nodes and the embedding property stored on them.
gds.run_cypher('MATCH(n:User) RETURN n.userId, n.embedding LIMIT 10')

Unnamed: 0,n.userId,n.embedding
0,U13740,"[0.11379876732826233, 0.10179446637630463, 0.0..."
1,U91836,"[-0.02173590287566185, -0.04518648982048035, 0..."
2,U59196,"[0.16520173847675323, -0.041554875671863556, -..."
3,U73700,"[-0.030124668031930923, 0.027448158711194992, ..."
4,U34670,"[0.1975572109222412, 0.016372354701161385, -0...."
5,U8125,"[0.029716797173023224, -0.19273772835731506, 0..."
6,U19739,"[0.07212769985198975, -0.0009595770388841629, ..."
7,U8355,"[0.0269266739487648, -0.08955472707748413, 0.0..."
8,U46596,"[0.03828085586428642, -0.0747443288564682, 0.1..."
9,U79199,"[0.023930972442030907, -0.030633574351668358, ..."


### K-Nearest-Neighbors (KNN)
We can now run KNN to estimate similarity (a.k.a. SIMILAR_USER) relationships between User nodes and write them back to the graph.

In [0]:
# Drop the 'cf-projection' graph so the next cell can project it again on a re-run.
gds.graph.drop('cf-projection')

In [0]:
# Project only the User nodes, carrying the 'embedding' property written back
# by the FastRP step; '*' selects every relationship type.
g1, res = gds.graph.project(
    'cf-projection',
    {'User': {'properties': ['embedding']}},
    '*',
)
res

nodeProjection            {'User': {'label': 'User', 'properties': {'emb...
relationshipProjection    {'__ALL__': {'aggregation': 'DEFAULT', 'orient...
graphName                                                     cf-projection
nodeCount                                                             94057
relationshipCount                                                         0
projectMillis                                                           999
Name: 0, dtype: object

In [0]:
# Run KNN over the User embeddings and write the result to the database as
# SIMILAR_USER relationships with a 'score' property.
# NOTE(review): no randomSeed is passed here (the FastRP call sets one), so
# results may differ between runs - confirm whether that is intended.
knn_stats_df = gds.knn.write(
    g1,
    nodeProperties=['embedding'],
    writeRelationshipType='SIMILAR_USER',
    writeProperty='score',
    sampleRate=1.0,
    maxIterations=1000,
);

KNN relationships are only written when a positive similarity is found between node pairs which, in this case, is based on cosine similarity between the `nodeProperties` values of each node. Here we are using the FastRP embedding we calculated over the click/historic click relationships as that node property.

### Collaborative Filtering with SIMILAR_USER Relationships
Now we can structure a Collaborative filtering query for user U18465 but with

1. more refined results,
2. using fewer traversal steps, and
3. with a score from KNN that allows us to rank order the results based on aggregate similarity

In [0]:
# Recommend news for the example user: collect the ids of the news the user
# already clicked, follow SIMILAR_USER to similar users and their CLICKED
# news, drop anything already clicked, and rank each remaining article by the
# summed similarity score of the similar users that lead to it.
gds.run_cypher( '''
    MATCH(u:User {userId: $userId})-[:CLICKED]->(n:News)
    WITH collect(id(n)) AS clickedNewsIds

    //get similar News according to KNN and exclude previously clicked news
    MATCH (u:User {userId: $userId})-[s:SIMILAR_USER]->(similar:User)-[:CLICKED]->(news:News)
    WHERE NOT id(news) IN clickedNewsIds

    //aggregate and return ranked results
    RETURN DISTINCT news.newsId as newsId,
        news.title AS title,
        news.category AS category,
        news.subcategory As subcategory,
        sum(s.score) AS totalScore ORDER BY totalScore DESC
    ''', params={'userId': USER_ID})

Unnamed: 0,newsId,title,category,subcategory,totalScore
0,N287,Three school workers charged in death of speci...,news,newscrime,4.327445
1,N26262,Celebrity plastic surgery transformations,entertainment,entertainment-celebrity,4.309714
2,N23446,Prince Harry and Meghan Markle just shared a n...,lifestyle,lifestyleroyals,3.481054
3,N62360,The son of a Chinese billionaire has been bann...,news,newsworld,3.474919
4,N33619,College gymnast dies following training accide...,news,newsus,3.459343
...,...,...,...,...,...
240,N42397,Hilaria Baldwin Shares Emotional Video Reveali...,tv,tv-celebrity,0.859922
241,N27581,Amazon's $1.5 million political gambit backfir...,finance,finance-companies,0.859922
242,N57090,Kevin Hart Makes First Official Appearance at ...,entertainment,awards,0.859922
243,N1019,Meghan Markle Smudged Makeup on a Fan's Coat &...,lifestyle,lifestyleroyals,0.859922


We cut back results significantly: 244 recommended articles in the output above, compared to the previous ~6K (5,907) potential recommendations with pure Cypher.

Drop any previous tables we created.

In [0]:
%sql
-- Remove the output tables of earlier runs; IF EXISTS makes this safe on a first run.
DROP TABLE IF EXISTS news_with_embeddings;
DROP TABLE IF EXISTS users_with_embeddings;
DROP TABLE IF EXISTS similar_users;

### Pull enhanced data from Neo4j and write to a Table

In [0]:

# Read the nodes labelled User back from Neo4j through the Spark connector
# (presumably one column per node property, including the embedding - TODO confirm).
df = (spark.read.format("org.neo4j.spark.DataSource")
        .option("url", Neo4jConnectionInfo["URI"])
        .option("authentication.basic.username", Neo4jConnectionInfo["Username"])
        .option("authentication.basic.password", Neo4jConnectionInfo["Password"])
        .option("labels", ":User")
        .load())

# Save as a table; no save mode is given, so this relies on the table having
# been dropped by the %sql cell earlier in this section.
df.write.saveAsTable("users_with_embeddings") 

# df = (spark.read.format("org.neo4j.spark.DataSource")
#         .option("url", Neo4jConnectionInfo["URI"])
#         .option("authentication.basic.username", Neo4jConnectionInfo["Username"])
#         .option("authentication.basic.password", Neo4jConnectionInfo["Password"])
#         .option("labels", ":News")
#         .load())

# df.write.saveAsTable("news_with_embeddings")    

In [0]:

# Read every SIMILAR_USER relationship as (userId, similarUserId, score) rows.
similarUserDf = (spark.read.format("org.neo4j.spark.DataSource")
        .option("url", Neo4jConnectionInfo["URI"])
        .option("authentication.basic.username", Neo4jConnectionInfo["Username"])
        .option("authentication.basic.password", Neo4jConnectionInfo["Password"])
        .option("query", """
                MATCH (user:User)-[s:SIMILAR_USER]->(similar)
                RETURN user.userId as userId, similar.userId as similarUserId, s.score as score
            """)
        .load())

# Print the first rows as a quick check before persisting.
similarUserDf.show()

# Save as a table; no save mode is given, so this relies on the table having
# been dropped by the %sql cell earlier in this section.
similarUserDf.write.saveAsTable("similar_users")


+------+-------------+------------------+
|userId|similarUserId|             score|
+------+-------------+------------------+
|U13740|       U16783|0.8351926803588867|
|U13740|       U82601|0.8353410959243774|
|U13740|       U45589|0.8467587828636169|
|U13740|       U63767| 0.833566427230835|
|U13740|       U25337|0.8391966819763184|
|U13740|       U78897|0.8361396789550781|
|U13740|       U78421|0.8339868783950806|
|U13740|       U47684|0.8360647559165955|
|U13740|       U62989|0.8382823467254639|
|U13740|       U66970|0.8337756991386414|
|U91836|       U52540|0.7402944564819336|
|U91836|       U83199|0.7376580834388733|
|U91836|       U18659|0.7591962814331055|
|U91836|       U44963| 0.738801896572113|
|U91836|       U91901|0.7295254468917847|
|U91836|       U56351|0.7339047193527222|
|U91836|       U54683|0.7382723093032837|
|U91836|       U63843|  0.73367840051651|
|U91836|       U62107|0.7278967499732971|
|U91836|       U59132|0.7274794578552246|
+------+-------------+------------