In [1]:
import json
from langchain_community.vectorstores import Neo4jVector
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os

In [2]:
# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# Neo4j connection settings; each name is None when its variable is unset.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")

In [4]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Embedding model configuration. These stay module-level names because
# other cells may read them.
model_name = "BAAI/bge-small-en"
model_kwargs = dict(device="cpu")                 # run the model on the CPU
encode_kwargs = dict(normalize_embeddings=True)   # forwarded to the encoder

# Shared embedding function used by the ingestion cell.
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
def _chunk_chapters(monograph, text_splitter):
    """Split each chapter's content into chunks and build matching metadata.

    Args:
        monograph: Parsed JSON mapping. Chapters are read from its
            "chapters" key; each chapter is a mapping from which the
            "chapter", "title" and "content" keys are read.
        text_splitter: Object exposing ``split_text(text)`` that returns the
            chunks for one chapter.

    Returns:
        A ``(texts, metadatas)`` pair of equal-length lists. ``metadatas[i]``
        holds the chapter, title and the position of ``texts[i]`` within its
        chapter (``chunk_seq_id``, restarting at 0 for every chapter).
    """
    texts, metadatas = [], []
    for chapter in monograph.get("chapters", []):
        content = chapter.get("content", "")
        if not content:
            # Chapters with missing or empty content contribute no chunks.
            continue
        for seq_id, chunk in enumerate(text_splitter.split_text(content)):
            texts.append(chunk)
            metadatas.append(
                {
                    "chapter": chapter.get("chapter"),
                    "title": chapter.get("title"),
                    "chunk_seq_id": seq_id,
                }
            )
    return texts, metadatas


def main(source_path="cleaned.json", index_name="monograph_chunks",
         chunk_size=1000, chunk_overlap=150):
    """Chunk the monograph JSON and ingest it into a Neo4j vector index.

    Relies on the module-level ``NEO4J_URI``, ``NEO4J_USERNAME``,
    ``NEO4J_PASSWORD``, ``NEO4J_DATABASE`` and ``hf_embeddings`` defined in
    earlier cells. Progress and failures are reported with ``print``; the
    function never raises for the handled cases and always returns ``None``.

    Args:
        source_path: Path of the JSON file to read.
        index_name: Name passed to Neo4jVector for the vector index.
        chunk_size: ``chunk_size`` given to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` given to RecursiveCharacterTextSplitter.
    """
    # URI, username and password are all handed to Neo4jVector below, so fail
    # fast with a message naming whichever of them is unset.
    required = {
        "NEO4J_URI": NEO4J_URI,
        "NEO4J_USERNAME": NEO4J_USERNAME,
        "NEO4J_PASSWORD": NEO4J_PASSWORD,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        print(f"Missing required Neo4j settings: {', '.join(missing)}")
        return

    try:
        with open(source_path, "r", encoding="utf-8") as f:
            monograph = json.load(f)
    except FileNotFoundError:
        print(f"'{source_path}' not found")
        return
    except json.JSONDecodeError as e:
        # A malformed file is reported the same way as a missing one.
        print(f"'{source_path}' is not valid JSON: {e}")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts, metadatas = _chunk_chapters(monograph, text_splitter)

    print(f"Prepared {len(texts)} text chunks for vectorization")
    if not texts:
        # Nothing to embed or store; avoid opening a database connection.
        print("No text chunks to ingest; skipping Neo4j ingestion")
        return

    print("Ingesting data and creating vector embeddings")
    try:
        Neo4jVector.from_texts(
            texts=texts,
            embedding=hf_embeddings,
            metadatas=metadatas,
            url=NEO4J_URI,
            username=NEO4J_USERNAME,
            password=NEO4J_PASSWORD,
            database=NEO4J_DATABASE,
            index_name=index_name,                 # name for the vector index
            node_label="Chunk",                    # label for the nodes that store the text and embeddings
            text_node_property="text",             # property name for the text
            embedding_node_property="embeddings",  # property name for the vector embedding
            create_id_index=True,
        )
    except Exception as e:
        # Deliberate best-effort: report the failure instead of raising a
        # traceback in the notebook.
        print(f"an error occured: {e}")
        return

    print("data ingestion and vectorization done")
    print(f"a vector index named '{index_name}' has been created")


if __name__ == "__main__":
    main()

Prepared 682 text chunks for vectorization
Ingesting data and creating vector embeddings


  return forward_call(*args, **kwargs)


data ingestion and vectorization done
a vector index named 'monograph_chunks has been created
