In [13]:
import os
import warnings

from langchain.text_splitter import NLTKTextSplitter
from langchain_community.document_loaders import PyPDFLoader
warnings.filterwarnings('ignore')

In [6]:
def read_pdf(file_path):
    """
    Read a PDF from disk and return its parsed contents.

    Args:
        file_path (str): Location of the PDF file to open.

    Returns:
        list: The Document objects produced by `PyPDFLoader(file_path).load()`.
    """
    # Build the loader and parse in a single expression; nothing else needs the loader.
    return PyPDFLoader(file_path).load()

In [7]:
def chunk(docs, chunk_size=1000, chunk_overlap=50):
    """
    Split documents into smaller chunks using NLTK-based text splitting.

    Each input document is split independently with `NLTKTextSplitter`, and every
    resulting chunk carries a copy of the metadata of the document it came from
    (for example the page number and source path set by the PDF loader), so that
    downstream retrieval can still point back to the original location.

    Args:
        docs (list): Document objects to split. Items without a `page_content`
            attribute are skipped.
        chunk_size (int): The maximum size of each text chunk, measured with `len`
            (default is 1000).
        chunk_overlap (int): The number of overlapping characters between
            consecutive chunks (default is 50).

    Returns:
        list: Document objects representing the text chunks, in input order.
    """
    # Keep only objects that actually expose text content
    valid_docs = [doc for doc in docs if hasattr(doc, "page_content")]
    texts = [doc.page_content for doc in valid_docs]
    # One metadata dict per text; tolerate documents with missing/None metadata
    metadatas = [dict(getattr(doc, "metadata", None) or {}) for doc in valid_docs]

    # Initialize the text splitter with the specified chunk size and overlap
    text_splitter = NLTKTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    # Passing `metadatas` attaches each source document's metadata to its chunks;
    # splitting text-by-text without it would silently drop page/source info.
    chunks = text_splitter.create_documents(texts, metadatas=metadatas)

    # Log the number of generated chunks for debugging
    print(f"Split into {len(chunks)} chunks")
    return chunks

In [8]:
# Location of the source PDF. The default is the original author's local path;
# set the RAG_PDF_PATH environment variable to run the notebook on another machine.
file_path = os.environ.get(
    "RAG_PDF_PATH",
    "/home/marwan/marwan/Tasks/Corporatica/rag_task/streamlit/files/How Our Brain Works.pdf",
)
# Parse the PDF into Document objects, then split them into retrieval-sized chunks
docs = read_pdf(file_path)
chunks = chunk(docs)

Split into 735 chunks


In [22]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.storage import LocalFileStore

def create_vector_index_and_embedding_model(
    chunks,
    embed_model_id='intfloat/e5-small-v2',
    cache_dir="./cache/",
    device="cpu",
):
    """
    Create a cache-backed Hugging Face embedder and a FAISS index over the chunks.

    Args:
        chunks (list): Document objects (e.g. the output of a LangChain text
            splitter) to embed and index. Must not be empty.
        embed_model_id (str): Hugging Face model ID used for embeddings. It is
            also used as the cache namespace, so different models do not share
            cached vectors (default is 'intfloat/e5-small-v2').
        cache_dir (str): Directory handed to `LocalFileStore` for the embedding
            cache (default is "./cache/").
        device (str): Device name passed to the embedding model (default is "cpu").

    Returns:
        tuple: Returns a tuple containing:
            - embeddings_model: The underlying (uncached) Hugging Face embedding model.
            - vector_index: The FAISS vector store built from `chunks`.
            - chunks: The same `chunks` object that was passed in, unchanged.

    Raises:
        ValueError: If `chunks` is empty; there would be nothing to index.
    """
    # Fail early with a clear message, before paying the cost of loading the model
    if not chunks:
        raise ValueError("chunks must contain at least one document to index")

    # Set up local storage for cached embeddings
    store = LocalFileStore(cache_dir)

    # NOTE(review): trust_remote_code=True lets code shipped with the model
    # repository run locally; confirm it is actually required for this model.
    model_kwargs = {"device": device, "trust_remote_code": True}

    # Create the Hugging Face embeddings model using the specified model ID and arguments
    embeddings_model = HuggingFaceEmbeddings(model_name=embed_model_id, model_kwargs=model_kwargs)

    # Wrap the model with caching so already-embedded chunks are not recomputed
    embedder = CacheBackedEmbeddings.from_bytes_store(embeddings_model, store, namespace=embed_model_id)

    # Embed the chunks (through the cache) and build the FAISS index from them
    vector_index = FAISS.from_documents(chunks, embedder)

    # Return the raw embeddings model, the vector index, and the input chunks
    return embeddings_model, vector_index, chunks

# Retrieve text and embeddings from FAISS
def retrieve_text_and_embeddings(vector_index, texts):
    """
    Pair each entry of `texts` with the vector stored at the same position in the index.

    Args:
        vector_index (FAISS): Vector store whose `.index` supports `reconstruct_n`.
        texts (list): The chunks that were indexed (this notebook passes Document
            objects). Assumes they are in the same order in which their vectors
            were added to the index.

    Returns:
        list of tuples: `(texts[i], vector_i)` for every position `i` in `texts`.
    """
    # Read the first len(texts) stored vectors back out of the raw FAISS index
    vectors = vector_index.index.reconstruct_n(0, len(texts))

    # Walk the texts and attach the vector found at the matching position
    return [(item, vectors[position]) for position, item in enumerate(texts)]

# Build the embedding model and the FAISS index over the PDF chunks
embeddings_model, vector_index, texts = create_vector_index_and_embedding_model(chunks)

# Pair every chunk with the vector stored for it in the index
text_and_embeddings = retrieve_text_and_embeddings(vector_index, texts)

# Preview only the first pair (and the first five vector components) to keep
# the cell output short; the slice is a no-op when the list is empty.
for text, embedding in text_and_embeddings[:1]:
    print(f"Text:\n {text.page_content}")
    print(f"Embedding:\n {embedding[:5]}...")

Text:
 xiiiIntroduction
The attributes of the human brain that produce human 
intelligence are the focus of this book.

This book is about the real you.

Not the re ﬂ ected image you see in the mirror, the 
shape of your face and body and the coloration of your skin, hair, and eyes.

That’s a machine.

A set of moveable connected bones driven by muscles and covered with skin and hair.

You control the machine that you see in the mirror.

The real you is contained within your brain.

A t  t h i s  v e r y  m o m e n t ,  y o u r  b r a i n  i s  b u i l d i n g  n e u r a l  
representations of every object you currently perceive and 
a neural representation of the space around you.

These 
perceptions of objects and space, contained in different areas of your brain, are brought together to build your virtual world.

That neural virtual world is your reality.

The most important object in your virtual world is you.
Embedding
: [-0.06646905  0.03610453  0.00271671  0.01035635 -0.00708345