In [9]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

In [2]:
# Directory containing PDF files
pdf_directory = "./documents"

In [3]:
# Initialize an empty list to hold all documents
documents = []

In [5]:
# Load each PDF file in the directory
for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        pdf_loader = PyPDFLoader(file_path=os.path.join(pdf_directory, filename))
        documents.extend(pdf_loader.load())

In [6]:

# Split the loaded documents into smaller chunks for better embedding
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

In [11]:
# Create embeddings for the document chunks
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


  from tqdm.autonotebook import tqdm, trange


In [12]:
# Store the document embeddings in Chroma vectorstore
vectorstore = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_vectorstore")

# Persist the vectorstore for future retrieval
vectorstore.persist()


  vectorstore.persist()
