In [2]:
import os
import re
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

## Load and chunk text

In [10]:
# %pip install pypdf  # needed by PyPDFLoader

In [5]:
import os
import json
import glob
from typing import List
from multiprocessing import Pool
from tqdm import tqdm
from langchain.document_loaders import CSVLoader, PyPDFLoader, Docx2txtLoader
from langchain.docstore.document import Document

In [6]:
# Registry of supported file types: each key is a file extension (leading dot
# included) and each value is the loader class used to read files of that type.
loaders_mapping = dict(
    [
        (".csv", CSVLoader),
        (".docx", Docx2txtLoader),
        (".pdf", PyPDFLoader),
    ]
)

def load_document(file_path: str) -> List[Document]:
    """
    Load a single file with the loader registered for its extension.

    The extension is looked up case-insensitively in ``loaders_mapping``; the
    matching loader class is instantiated with ``file_path`` and the result of
    its ``load()`` call is returned.

    Args:
        file_path: Path of the file to load.

    Returns:
        The value returned by the loader's ``load()`` — a list of ``Document``
        objects (``load_documents`` concatenates these lists with ``extend``).

    Raises:
        ValueError: If the extension has no entry in ``loaders_mapping``
            (this includes files with no extension at all).
    """
    # splitext only inspects the final path component, so a dot elsewhere in
    # the path (e.g. "./sample/file" or "data.v2/report") is not mistaken for
    # an extension. Lower-casing lets "REPORT.PDF" match the ".pdf" entry.
    ext = os.path.splitext(file_path)[1].lower()

    loader_class = loaders_mapping.get(ext)
    if loader_class is None:
        raise ValueError(f"Unsupported file extension '{ext}'")

    # Instantiate the loader for this file and return what it loads.
    return loader_class(file_path).load()

def load_documents(source_dir: str) -> List[Document]:
    """
    Load every supported file found under ``source_dir``.

    The directory is searched recursively for each extension registered in
    ``loaders_mapping``. The matching files are loaded in parallel worker
    processes with ``load_document`` and the per-file results are flattened
    into one list.

    Args:
        source_dir: Root directory to search.

    Returns:
        All loaded documents, grouped by file in sorted-path order. An empty
        list if no supported file is found.
    """
    all_files = []
    for ext in loaders_mapping:
        # "**" combined with recursive=True also matches files in sub-directories.
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
        )
    # glob order depends on the filesystem; sort so repeated runs agree.
    all_files.sort()

    results = []
    if not all_files:
        # Nothing to load: do not start any worker processes.
        return results

    # Never start more workers than there are files to hand out.
    n_workers = min(len(all_files), os.cpu_count() or 1)

    # NOTE(review): with the "spawn" start method, worker processes may be unable
    # to import functions defined in a notebook cell — confirm on the target platform.
    with Pool(processes=n_workers) as pool:
        with tqdm(total=len(all_files), desc='Loading new documents', ncols=80) as pbar:
            # imap yields results in input order (imap_unordered does not), which
            # keeps the output order reproducible. Each result is the list
            # returned by load_document for one file.
            for docs in pool.imap(load_document, all_files):
                results.extend(docs)
                pbar.update()

    return results

In [11]:
# Load every supported file found under ./sample (searched recursively).
loaded_documents = load_documents(source_dir="./sample")

# Report how many documents came back; one input file may yield several.
print(f"Length of loaded documents: {len(loaded_documents)}")

Loading new documents: 100%|██████████████████████| 1/1 [00:24<00:00, 24.20s/it]

Length of loaded documents: 88





In [14]:
from langchain.text_splitter import SpacyTextSplitter

# Break the loaded documents into overlapping chunks for embedding:
# chunk_size=512 with chunk_overlap=64 (units as defined by the splitter).
text_splitter = SpacyTextSplitter(chunk_size=512, chunk_overlap=64)
documents = text_splitter.split_documents(loaded_documents)

# Display the number of chunks produced.
len(documents)



830

## Neo4j vector index

Reference documentation: https://python.langchain.com/docs/integrations/vectorstores/neo4jvector

Related example notebook: https://github.com/sauravjoshi23/towards-agi/blob/main/retrieval%20augmented%20generation/integrated-qa-neo4j-langchain.ipynb

In [None]:
# %pip install langchain openai tiktoken neo4j python-dotenv transformers

In [None]:
import os
import re
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.document_loaders import WikipediaLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [32]:
# Build the Neo4j vector store from the chunked documents, embedding them with
# OpenAIEmbeddings. Connection settings are read from the environment; a
# missing variable raises KeyError on this cell.
neo4j_vector = Neo4jVector.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"],
)

In [33]:
# Ask the Neo4j store for the three chunks that best match the question.
query = "What is the name of the CEO of Procter and Gamble?"
neo4j_docs_with_score = neo4j_vector.similarity_search_with_score(query, k=3)

# Print each hit framed by an 80-character rule: score first, then chunk text.
for doc, score in neo4j_docs_with_score:
    divider = "-" * 80
    print(divider)
    print("Score: ", score)
    print(doc.page_content)
    print(divider)

--------------------------------------------------------------------------------
Score:  0.949164628982544
The Procter & Gamble Company • 73
Company Leadership
Jon R. Moeller
Chairman of the Board, President and Chief Executive Officer
Shailesh G. Jejurikar
Chief Operating Officer
Gary Coombe
Chief Executive Officer – Grooming
Executive Sponsor – Corporate Wellbeing
Jennifer Davis
Chief Executive Officer –  
Health CareMa.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.9390952587127686
20        The Procter & Gamble Company
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.9373219013214111
72 • The Procter & Gamble Company
Board of Directors
B. Marc Allen
Chief Strategy Officer and Senior Vice President of 
Strategy and Corporate Development a

## FAISS vector index

In [28]:
# %pip install faiss-gpu

In [24]:
from langchain.vectorstores import FAISS

In [29]:
# Build a FAISS index over the same chunks, embedded with OpenAIEmbeddings.
FAISS_vector = FAISS.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
)

In [34]:
# Run the same question against the FAISS index and keep the top three hits.
query = "What is the name of the CEO of Procter and Gamble?"
FAISS_docs_with_score = FAISS_vector.similarity_search_with_score(query, k=3)

# NOTE(review): LangChain's FAISS store may report a distance here (lower =
# closer) rather than a similarity — confirm before comparing these numbers
# with the Neo4j scores.
for doc, score in FAISS_docs_with_score:
    divider = "-" * 80
    print(divider)
    print("Score: ", score)
    print(doc.page_content)
    print(divider)

--------------------------------------------------------------------------------
Score:  0.949164628982544
The Procter & Gamble Company • 73
Company Leadership
Jon R. Moeller
Chairman of the Board, President and Chief Executive Officer
Shailesh G. Jejurikar
Chief Operating Officer
Gary Coombe
Chief Executive Officer – Grooming
Executive Sponsor – Corporate Wellbeing
Jennifer Davis
Chief Executive Officer –  
Health CareMa.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.9390952587127686
20        The Procter & Gamble Company
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.9373219013214111
72 • The Procter & Gamble Company
Board of Directors
B. Marc Allen
Chief Strategy Officer and Senior Vice President of 
Strategy and Corporate Development a

## Elasticsearch embeddings

In [1]:
from langchain_community.embeddings.elasticsearch import ElasticsearchEmbeddings

In [None]:
# embed_documents() takes a list of strings, so pass each chunk's text rather
# than the Document objects themselves.
# NOTE(review): `embeddings` is not defined in any cell shown here — presumably
# an ElasticsearchEmbeddings instance (imported in the previous cell) has to be
# created first; confirm how it should be configured.
document_embeddings = embeddings.embed_documents(
    [doc.page_content for doc in documents]
)

