In [1]:
import os

import openai
from neo4j import GraphDatabase
from openai.embeddings_utils import get_embedding

"""
LoadEmbedding: call the OpenAI embedding API to generate an embedding for a text
property of nodes in Neo4j, and store it in the graph.
Version: 1.1
"""
# Secrets and connection details are read from the environment so that real
# credentials never have to be typed into (and saved with) the notebook.
# The fallbacks are the original placeholders and must be overridden before use.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "YOUR-OPENAI-KEY")
EMBEDDING_MODEL = "text-embedding-ada-002"
NEO4J_URL = os.environ.get("NEO4J_URI", "neo4j+s://<hostname>:7687")
NEO4J_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "<NEO4J_PASSWORD>")
NEO4J_DATABASE = os.environ.get("NEO4J_DATABASE", "neo4j")

def LoadEmbedding(label, property):
    """Create an Embedding node for one text property of every matching node.

    For each `label` node that has a HAS_PARENT relationship to a Section,
    the text "<section title> >> <property value>" is sent to the OpenAI
    embedding API using EMBEDDING_MODEL. The returned vector is stored on a new
    Embedding node (key = property name, value = vector) that is linked from
    the source node with a HAS_EMBEDDING relationship.

    Args:
        label: Node label to process, e.g. "Chunk". It is interpolated directly
            into the Cypher text, so it must be a trusted identifier.
        property: Name of the node property holding the text to embed. It is
            interpolated into the Cypher text in the same way as `label`, and
            is also stored as the `key` of each Embedding node.

    Returns:
        The number of nodes for which an embedding was created.

    Note:
        Embedding nodes are written with CREATE, so calling this function again
        for the same label/property adds another set of Embedding nodes.
    """
    openai.api_key = OPENAI_KEY

    # get chunks in document, together with their section titles
    fetch_query = (
        f"MATCH (ch:{label}) -[:HAS_PARENT]-> (s:Section) "
        f"RETURN id(ch) AS id, s.title + ' >> ' + ch.{property} AS text"
    )
    # key property of Embedding node differentiates different embeddings
    store_query = (
        "CREATE (e:Embedding) SET e.key=$key, e.value=$embedding"
        " WITH e MATCH (n) WHERE id(n) = $id CREATE (n) -[:HAS_EMBEDDING]-> (e)"
    )

    count = 0
    skipped = 0
    # Both the driver and the session are context managers: leaving the blocks
    # closes the session and then the driver's connections, even on error.
    with GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
        with driver.session(database=NEO4J_DATABASE) as session:
            # Read every row up front so the read result is fully fetched before
            # the write queries below are issued on the same session.
            rows = [(record["id"], record["text"]) for record in session.run(fetch_query)]

            for node_id, text in rows:
                # The concatenated text is null when the section title or the
                # property is missing; there is nothing to embed in that case.
                if text is None:
                    skipped += 1
                    continue

                # TO DO: add text length threshold to skip short text

                # For better performance, text can be batched
                embedding = get_embedding(text, EMBEDDING_MODEL)

                session.run(store_query, key=property, embedding=embedding, id=node_id)
                count += 1

    print("Processed " + str(count) + " " + label + " nodes for property @" + property + ".")
    if skipped:
        print("Skipped " + str(skipped) + " " + label + " nodes with no text.")
    return count




In [2]:

# Embed the `sentences` text of every Chunk node that has a parent Section.
LoadEmbedding("Chunk", "sentences")

# Embed the `name` of every Table node that has a parent Section. As the last
# expression in the cell, its return value (the number of nodes processed) is
# what the notebook displays as the cell result.
LoadEmbedding("Table", "name")



Processed 118 Chunk nodes for property @sentences.
Processed 8 Table nodes for property @name.


8