In [None]:
!pip3 install --upgrade pip
!pip install neo4j
!pip install graphdatascience
!pip install tiktoken
!pip install openai
!pip install pinecone-client
!pip install summarytools
!pip install scikit-learn

In [138]:
import pandas as pd
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience
from neo4j.exceptions import ServiceUnavailable
import logging
import re
import pyarrow as pa
import numpy as np
import struct
import sys
import time
import openai
import tiktoken
import json
import pinecone
import summarytools
from sklearn.metrics.pairwise import cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt, retry_if_not_exception_type
from sklearn.model_selection import train_test_split

In [139]:
MODEL_GPT35 = "gpt-3.5-turbo"
# Replace with your API key
openai.api_key = "<your key>"
encoder = tiktoken.encoding_for_model(MODEL_GPT35)

EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'
PINECONE_API_KEY = "<your key>"
PINECON_ENV = "<your env>"
DEMO_INDEX = "movie-index"

In [140]:
# Replace with the actual URI, username and password
AURA_CONNECTION_URI = "<your aura uri>"
AURA_USERNAME = "<your neo4j username>"
AURA_PASSWORD = "<your neo4j password>"

In [179]:
# This function attempts to get an embedding for a given input text or tokens.
# If there's an invalid request error, it won't retry.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6), retry=retry_if_not_exception_type(openai.InvalidRequestError))
def get_embedding(text_or_tokens, model=EMBEDDING_MODEL):
    return openai.Embedding.create(input=text_or_tokens, model=model)["data"][0]["embedding"]

# Get the embedding from the given ids
def fetch_embeddings(ids, index):
    # Fetch the embeddings
    try:
        results = index.fetch(ids=ids, namespace="movie-namespace")
    except Exception as error:
        print("Error fetching embeddings:", error)
        return {}

    return results

# Get all the embedding for the vector database
def fetch_all_embeddings(df, batch_size=100):
    # Initialize Pinecone
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECON_ENV)
    # Use the existing index
    index = pinecone.Index(DEMO_INDEX)

    # Convert the DataFrame's index to a list of strings
    string_index_list = [str(i) for i in df.index.tolist()]

    # Prepare a list to hold the embeddings
    embedding_list = []

    # Fetch embeddings in batches
    for i in range(0, len(string_index_list), batch_size):
        batch_ids = string_index_list[i:i+batch_size]
        batch_results = fetch_embeddings(batch_ids, index)

        # Extract embeddings from the results
        for id in batch_ids:
            if str(id) in batch_results['vectors']:
                embedding_list.append(batch_results['vectors'][str(id)]['values'])
            else:
                print(f"Embedding not found for id: {id}")
                embedding_list.append(None)  # Or append a default value

    return embedding_list

# This function adds embeddings to all nodes in the graph.
# It fetches each node and its properties, creates an embedding based on this data, and stores the embedding in the node.
def write_embeddings_to_nodes(tx):
    results = tx.run("MATCH (n) RETURN n, id(n) as id")
    nodes_to_update = []

    for record in results:
        node = record["n"]

        if node:
          id = record["id"]
          properties = dict(node)
          data = {
                "label": list(node.labels)[0], # only one label will be included, will be either Person or Movie in this case
                "properties": properties
            }
          try:
                embedding = get_embedding(json.dumps(data))
          except openai.InvalidRequestError as e:
                print(e)

          nodes_to_update.append({'id': id, 'embedding': embedding})
        else:
            print(f"Node {node.element_id} does not have 'labels' or 'properties'")

    tx.run("UNWIND $nodes as node MATCH (n) WHERE id(n) = node.id SET n.embedding = node.embedding", nodes=nodes_to_update)


# This function finds nodes that have embeddings similar to an input text's embedding.
# It calculates the cosine similarity between the input's embedding and every node's embedding, then returns the top 10 nodes with the highest similarity.
def find_similar_nodes(tx, input_text):
    """Find nodes similar to the input text."""
    try:
      input_embedding = get_embedding(input_text)
    except openai.InvalidRequestError as e:
            print(e)
    results = tx.run("MATCH (n) RETURN id(n) AS id, n.embedding AS embedding, n")
    similarities = []
    for node in results:
        embedding = node["embedding"]
        if embedding is None:
            continue  # Skip nodes with no embedding

        embedding = np.array(embedding).reshape(1, -1)
        similarity = cosine_similarity([input_embedding], embedding)
        similarities.append((node["id"], similarity, node["n"]))
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:10]

Prepare the graph dataset using Neo4j Aura DS

In [153]:
#Client instantiation
gds = GraphDataScience(
    AURA_CONNECTION_URI,
    auth=(AURA_USERNAME, AURA_PASSWORD),
    aura_ds=True
)

gds.set_database("neo4j")

In [None]:
# Make movie titles unique
gds.run_cypher("""
    CREATE CONSTRAINT FOR (movie:Movie) REQUIRE movie.title IS UNIQUE""")

# Make person names unique
gds.run_cypher("""
    CREATE CONSTRAINT FOR (person:Person) REQUIRE person.name IS UNIQUE
""")

In [None]:
gds.run_cypher("""
    SHOW CONSTRAINTS""")

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex
0,4,constraint_42a79aaf,UNIQUENESS,NODE,[Movie],[title],constraint_42a79aaf
1,6,constraint_a831e4ce,UNIQUENESS,NODE,[Person],[name],constraint_a831e4ce


In [168]:
# Load movie nodes
gds.run_cypher("""
    LOAD CSV
      WITH HEADERS
      FROM 'https://data.neo4j.com/intro/movies/movies.csv' AS row
    MERGE (m:Movie {title: row.title})
      ON CREATE SET m.released = toInteger(row.released), m.tagline = row.tagline
    RETURN count(*)
""")

Unnamed: 0,count(*)
0,38


In [169]:
# Load person nodes
gds.run_cypher("""
    LOAD CSV
      WITH HEADERS
      FROM 'https://data.neo4j.com/intro/movies/people.csv' AS row
    MERGE (p:Person {name: row.name})
      ON CREATE SET p.born = toInteger(row.born)
    RETURN count(*)
""")

Unnamed: 0,count(*)
0,102


In [170]:
# Build edges between nodes
gds.run_cypher("""
    LOAD CSV
      WITH HEADERS
      FROM 'https://data.neo4j.com/intro/movies/actors.csv' AS row
      FIELDTERMINATOR ','
    MATCH (p:Person {name: row.person})
    MATCH (m:Movie {title: row.movie})
    MERGE (p)-[actedIn:ACTED_IN]->(m)
      ON CREATE SET actedIn.roles = split(row.roles, ';')
    RETURN count(*)
""")

Unnamed: 0,count(*)
0,172


In [154]:
# Neo4j database client
driver = GraphDatabase.driver(AURA_CONNECTION_URI, auth=(AURA_USERNAME, AURA_PASSWORD))

In [180]:
start_time = time.time()
# Generate embedding using openAI
nodes_to_update = []
with driver.session() as session:
    session.execute_write(write_embeddings_to_nodes)

driver.close()

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Execution time: 38.089141607284546 seconds


Load node ids and embeddings

In [187]:
def get_all_nodes(tx):
    results = tx.run("MATCH (n) RETURN n, id(n) as id")
    nodes = []
    for record in results:
      node = record['n']
      id = record['id']
      item = { "id": id, "node": node}
      nodes.append(item)
    return nodes
nodes = []
with driver.session() as session:
  nodes = session.execute_read(get_all_nodes)


In [None]:
for node in nodes:
  properties = dict(node['node'])
  print("node ID:", node['id'], " Embedding:", properties.get('embedding'))

When you have a large number of nodes and corresponding embeddings, you might need to use ANN (Approximate Nearest Neighbors) methods to find the most similar node for each node, and then create corresponding relationships in Neo4j. In this case, you can use Pinecone, the ANN tool mentioned earlier, or other ANN tools to assist you.

First, add the node embeddings to Pinecone's index:

In [190]:
import pinecone

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECON_ENV)

# Check if the index already exists
if DEMO_INDEX not in pinecone.list_indexes():
    pinecone.create_index(DEMO_INDEX, dimension=1536)

index = pinecone.Index(DEMO_INDEX)
try:
    for node in nodes:
      properties = dict(node['node'])
      embedding = properties.get('embedding')

      index.upsert(
          vectors=[{'id': str(node['id']),
                    'values': embedding
                    }
          ],
          namespace="movie-namespace"
      )
except Exception as error:
    print(error)
    print("Node id: ", str(node['id']), "-> embedding: ", embedding)
    print("Length of the embedding: ", len(embedding))

In [None]:
fetch_embeddings(["108"], index)

In [207]:
similarities = {}

start_time = time.time()

for node in nodes:
    properties = dict(node['node'])
    node_id = node['id']
    embedding = properties.get('embedding')
    query_response = index.query(
        namespace="movie-namespace",
        top_k=10,
        include_values=True,
        include_metadata=True,
        vector=embedding
    )
    if query_response:
      for match in query_response['matches']:
          similar_node_id = int(match['id'])
          similarity = match['score']
          if str(similar_node_id) != str(node_id) and similarity > 0.95:  # Ignore self-similarity
              similarities[(node_id, similar_node_id)] = similarity

end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Execution time: 8.88858962059021 seconds


In [None]:
similarities

In [208]:
len(similarities)

66

In [210]:
start_time = time.time()
for (node_id_1, node_id_2), similarity in similarities.items():
  node1 = min(node_id_1, node_id_2)
  node2 = max(node_id_1, node_id_2) # prevent duplicate un-directional relations

  result = gds.run_cypher(
      f"""MATCH (a)
          MATCH (b)
          WHERE id(a) = {node1} AND id(b) = {node2}
          MERGE (a)-[r:CONNECT]->(b)
            ON CREATE SET r.value = {similarity}
          """)
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Execution time: 7.5071563720703125 seconds


In [211]:
with driver.session() as session:
    input_text = "Keanu Reeves"
    similar_nodes = session.execute_read(lambda tx: find_similar_nodes(tx, input_text))

for node_id, similarity, node in similar_nodes:
    print(f"Node ID: {node_id}, Similarity: {similarity}, Node:{dict(node)}")


Node ID: 139, Similarity: [[0.82594762]], Node:{'born': 1964, 'name': 'Keanu Reeves', 'embedding': [0.004789399914443493, 0.007225189823657274, -0.010676713660359383, -0.02829591929912567, -0.02840110845863819, 0.0072646355256438255, 0.015568015165627003, 0.006225891411304474, -0.03379205986857414, -0.018355531617999077, 0.012280849739909172, 0.02579767443239689, 0.013017174787819386, -0.018802586942911148, 0.003458098042756319, 0.015239299274981022, 0.010045577771961689, -0.025994904339313507, 0.01614655740559101, -0.018355531617999077, 0.01155109889805317, -0.013148661702871323, 0.00340221612714231, -0.01446352805942297, -0.0007630332838743925, 0.028585189953446388, 0.010446611791849136, -0.028006648644804955, 0.005233167205005884, -0.04181274399161339, 0.007672244217246771, -0.031241219490766525, -0.01980188488960266, -0.03610622510313988, -0.03347649425268173, 0.01538393460214138, -0.012326870113611221, 0.00047170824836939573, 0.025942308828234673, -0.0015055218245834112, 0.0034055

In [167]:
# Delete all data
gds.run_cypher("""
    MATCH (n)-[r]-(m)
    DETACH DELETE n, r, m
""")

In [None]:
# DROP CONSTRAINT (In case you want to clean up the database)
gds.run_cypher("""
    DROP CONSTRAINT constraint_42a79aaf
    """)

gds.run_cypher("""
    DROP CONSTRAINT constraint_a831e4ce
""")

In [None]:
# Test
def get_all_pure_nodes(tx):
    results = tx.run("MATCH (n) RETURN n, id(n) as id")
    nodes = []
    for record in results:
      node = record['n']
      id = record["id"]
      properties = dict(node)
      data = {
                "label": list(node.labels)[0],
                'properties': properties
            }
      print(id)
      #print(properties)
      nodes.append(node)
    return nodes
nodes = []
with driver.session() as session:
  nodes = session.execute_read(get_all_pure_nodes)
