# Storage with Vector Databases

## Chroma DB

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

def process_pdf_to_chroma(pdf_path, persist_directory="./chroma_db", chunk_size=500, chunk_overlap=50):
    """
    Load a PDF, split its text into overlapping chunks, and store the chunks in ChromaDB.

    The text of all pages is joined into a single string before splitting, so a
    chunk may span a page boundary and per-page metadata from the loader is not
    carried over to the stored documents.

    Parameters:
        pdf_path (str): Path to the PDF file.
        persist_directory (str): Directory to store ChromaDB.
        chunk_size (int): Maximum size of each chunk, measured in characters.
        chunk_overlap (int): Number of characters shared between neighbouring
            chunks to retain context.

    Returns:
        Chroma: Vector store built from the chunk documents.

    Raises:
        ValueError: If no text chunks could be produced from the PDF
            (for example, a scanned PDF without a text layer).
    """
    # Load the PDF (one Document per page)
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    # Combine extracted text from all pages
    full_text = "\n".join(page.page_content for page in pages)

    # Perform document-based chunking using RecursiveCharacterTextSplitter;
    # length_function=len means sizes are counted in characters, not tokens.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_text(full_text)

    # Fail early with a clear message instead of handing an empty list to the
    # vector store, which would otherwise fail with a less helpful error.
    if not chunks:
        raise ValueError(f"No text could be extracted from {pdf_path!r}; nothing to store.")

    # Convert text chunks into LangChain Document objects
    documents = [Document(page_content=chunk) for chunk in chunks]

    # Initialize HuggingFace embeddings model
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Embed the documents and build the ChromaDB vector store
    chroma_store = Chroma.from_documents(documents, embedding_model, persist_directory=persist_directory)

    print(f"Stored {len(documents)} chunks in ChromaDB.")
    return chroma_store

# Example usage: index the sample PDF into ChromaDB.
pdf_path = "test.pdf"
chroma_store = process_pdf_to_chroma(pdf_path)

# Example query: retrieve only the single closest chunk.
query = "Limitations of alignment?"
results = chroma_store.similarity_search(query, k=1)

# Display search results, numbered from 1
for rank, hit in enumerate(results, start=1):
    print(f"Result {rank}:\n{hit.page_content}\n")


Stored 65 chunks in ChromaDB.
Result 1:
tuning methods that can be applied on the fly. The audience
of alignment determines whether the procedure is bidirec-
tional or unidirectional, as well as the source and format of
the KSBs.
The discussion in this paper does not detail choosing the
right scope of alignment, but such a process is clearly nec-
essary. It may be a human-centered activity that aims for a
reflective equilibrium in which the scope has a competence,
transience, and audience with fairly small internal conflict



## FAISS

In [8]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

def process_pdf_to_faiss(pdf_path, chunk_size=500, chunk_overlap=50):
    """
    Load a PDF, split its text into overlapping chunks, and index the chunk embeddings in FAISS.

    The text of all pages is joined into a single string before splitting, so a
    chunk may span a page boundary. Embeddings are added to the index in chunk
    order, so row ``i`` of the index corresponds to ``chunks[i]``.

    Parameters:
        pdf_path (str): Path to the PDF file.
        chunk_size (int): Maximum size of each chunk, measured in characters.
        chunk_overlap (int): Number of characters shared between neighbouring
            chunks to retain context.

    Returns:
        tuple: ``(faiss_index, chunks, model)`` where ``faiss_index`` is the
        ``faiss.IndexFlatL2`` holding the chunk embeddings, ``chunks`` is the
        list of chunk strings, and ``model`` is the ``SentenceTransformer``
        used to embed them (reuse it to embed queries).

    Raises:
        ValueError: If no text chunks could be produced from the PDF
            (for example, a scanned PDF without a text layer).
    """
    # Load the PDF (one Document per page)
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()

    # Combine extracted text from all pages
    full_text = "\n".join(page.page_content for page in pages)

    # Perform document-based chunking using RecursiveCharacterTextSplitter;
    # length_function=len means sizes are counted in characters, not tokens.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_text(full_text)

    # Without chunks there is no embedding matrix to read a dimension from;
    # fail early with a clear message instead of an IndexError further down.
    if not chunks:
        raise ValueError(f"No text could be extracted from {pdf_path!r}; nothing to index.")

    # Initialize Sentence Transformer model and embed every chunk
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # FAISS expects a C-contiguous float32 matrix; normalise the encoder output
    # explicitly (this is a no-op copy-wise when it already has that layout).
    embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)

    # Initialize FAISS index sized to the embedding dimension and add all vectors
    embedding_size = embeddings.shape[1]
    faiss_index = faiss.IndexFlatL2(embedding_size)
    faiss_index.add(embeddings)

    print(f"Stored {len(chunks)} chunks in FAISS.")
    return faiss_index, chunks, model

# Example usage: build the FAISS index from the sample PDF.
pdf_path = "test.pdf"
faiss_index, chunks, model = process_pdf_to_faiss(pdf_path)

# Example query: embed the question with the same model used for the chunks,
# then look up the single nearest chunk.
query = "Limitations of alignment?"
query_embedding = model.encode([query])
_, indices = faiss_index.search(np.array(query_embedding), k=1)

# Display search results; each returned index is a position in `chunks`
for rank, chunk_pos in enumerate(indices[0], start=1):
    print(f"Result {rank}:\n{chunks[chunk_pos]}\n")

Stored 65 chunks in FAISS.
Result 1:
tuning methods that can be applied on the fly. The audience
of alignment determines whether the procedure is bidirec-
tional or unidirectional, as well as the source and format of
the KSBs.
The discussion in this paper does not detail choosing the
right scope of alignment, but such a process is clearly nec-
essary. It may be a human-centered activity that aims for a
reflective equilibrium in which the scope has a competence,
transience, and audience with fairly small internal conflict

