# RAG Pipelines and Data Ingestion to Vector DB Pipelines

In [1]:
import os
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader , PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def process_all_pdf(pdf_directory):
    """Load every PDF under ``pdf_directory`` (searched recursively) into documents.

    Each file is read with ``PyPDFLoader``. Every document the loader returns
    is tagged in its metadata with ``source_file`` (the PDF's file name) and
    ``file_type`` (``'pdf'``). A file that fails to load is reported and
    skipped so that one bad PDF does not abort the whole run.

    Args:
        pdf_directory: Directory (str or path-like) to search for ``*.pdf`` files.

    Returns:
        A flat list containing the documents of all successfully loaded PDFs,
        in sorted path order.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Sort so the load order (and therefore the document order) is identical
    # on every run; glob order depends on the filesystem.
    pdf_files = sorted(pdf_dir.glob('**/*.pdf'))

    print(f"found {len(pdf_files)} PDF files to Process")

    # A distinct loop name keeps the `pdf_files` list intact after the loop
    # instead of rebinding it to the last path.
    for pdf_path in pdf_files:
        print(f"\npreprocessing : {pdf_path.name}")
        try:
            loader = PyPDFLoader(str(pdf_path))
            documents = loader.load()

            for doc in documents:
                doc.metadata['source_file'] = pdf_path.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f" Loaded {len(documents)} pages")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f" Error : {e}")

    print(f"\n Total documents loaded : {len(all_documents)}")
    return all_documents

# Load every PDF found under ./data (searched recursively). Despite the name,
# `all_pdf_files` holds the loaded documents returned by process_all_pdf, not file paths.
all_pdf_files = process_all_pdf('./data')

found 2 PDF files to Process

preprocessing : Hands On Machine Learning with Scikit Learn and TensorFlow.pdf
 Loaded 564 pages

preprocessing : LLM_Book.pdf


Object 616 0 not defined.
Object 620 0 not defined.
Object 631 0 not defined.
Object 635 0 not defined.
Object 635 0 not defined.
Overwriting cache for 0 635
Object 635 0 not defined.
Overwriting cache for 0 635
Object 647 0 not defined.
Object 650 0 not defined.
Object 653 0 not defined.
Object 650 0 not defined.
Overwriting cache for 0 650
Object 653 0 not defined.
Overwriting cache for 0 653
Object 650 0 not defined.
Overwriting cache for 0 650
Object 653 0 not defined.
Overwriting cache for 0 653
Object 650 0 not defined.
Overwriting cache for 0 650
Object 653 0 not defined.
Overwriting cache for 0 653
Object 650 0 not defined.
Overwriting cache for 0 650
Object 653 0 not defined.
Overwriting cache for 0 653
Object 780 0 not defined.
Object 783 0 not defined.
Object 783 0 not defined.
Overwriting cache for 0 783
Object 791 0 not defined.
Object 794 0 not defined.
Object 794 0 not defined.
Overwriting cache for 0 794
Object 803 0 not defined.
Object 806 0 not defined.
Object 810 0 n

 Loaded 593 pages

 Total documents loaded : 1157


In [3]:
# Display the loaded documents; note this renders the entire list as the cell output.
all_pdf_files

[Document(metadata={'producer': 'Antenna House PDF Output Library 6.2.609 (Linux64)', 'creator': 'AH CSS Formatter V6.2 MR4 for Linux64 : 6.2.6.18551 (2014/09/24 15:00JST)', 'creationdate': '2017-03-10T21:55:34+00:00', 'author': 'Aurélien Géron', 'moddate': '2017-05-16T09:54:54+08:00', 'title': 'Hands-On Machine Learning with Scikit-Learn and TensorFlow', 'trapped': '/False', 'source': 'data\\Books\\Hands On Machine Learning with Scikit Learn and TensorFlow.pdf', 'total_pages': 564, 'page': 0, 'page_label': 'Cover', 'source_file': 'Hands On Machine Learning with Scikit Learn and TensorFlow.pdf', 'file_type': 'pdf'}, page_content='Aurélien Géron\nHands-On  \nMachine Learning  \nwith Scikit-Learn  \n& TensorFlow  \nCONCEPTS, TOOLS, AND TECHNIQUES  \nTO BUILD INTELLIGENT SYSTEMS\n\x00D\x00o\x00w\x00n\x00l\x00o\x00a\x00d\x00 \x00f\x00r\x00o\x00m\x00 \x00f\x00i\x00n\x00e\x00l\x00y\x00b\x00o\x00o\x00k\x00 \x00w\x00w\x00w\x00.\x00f\x00i\x00n\x00e\x00l\x00y\x00b\x00o\x00o\x00k\x00.\x00c\x00o\x

In [4]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Break documents into smaller chunks with a RecursiveCharacterTextSplitter.

    The splitter measures length with ``len`` and tries the separators
    ``"\\n\\n"``, ``"\\n"``, ``" "`` and ``""`` in that order. A summary line is
    printed, followed by a preview of the first chunk when there is one.

    Args:
        documents: Documents to split.
        chunk_size: Value forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"split : {len(documents)} documents into {len(chunked)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunked:
        return chunked

    first_chunk = chunked[0]
    print("\nExample Chunk :")
    print(f"content : {first_chunk.page_content[:200]} ...")
    print(f"Metadata : {first_chunk.metadata}")
    return chunked

In [5]:
# Split the loaded documents using split_documents' default chunk_size and
# chunk_overlap, then display the resulting chunks as the cell output.
chunks = split_documents(all_pdf_files)
chunks

split : 1157 documents into 2728 chunks

Example Chunk :
content : Aurélien Géron
Hands-On  
Machine Learning  
with Scikit-Learn  
& TensorFlow  
CONCEPTS, TOOLS, AND TECHNIQUES  
TO BUILD INTELLIGENT SYSTEMS
 D o w n l o a d   f r o m   f i n e l y b o o k   w w w  ...
Metadata : {'producer': 'Antenna House PDF Output Library 6.2.609 (Linux64)', 'creator': 'AH CSS Formatter V6.2 MR4 for Linux64 : 6.2.6.18551 (2014/09/24 15:00JST)', 'creationdate': '2017-03-10T21:55:34+00:00', 'author': 'Aurélien Géron', 'moddate': '2017-05-16T09:54:54+08:00', 'title': 'Hands-On Machine Learning with Scikit-Learn and TensorFlow', 'trapped': '/False', 'source': 'data\\Books\\Hands On Machine Learning with Scikit Learn and TensorFlow.pdf', 'total_pages': 564, 'page': 0, 'page_label': 'Cover', 'source_file': 'Hands On Machine Learning with Scikit Learn and TensorFlow.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Antenna House PDF Output Library 6.2.609 (Linux64)', 'creator': 'AH CSS Formatter V6.2 MR4 for Linux64 : 6.2.6.18551 (2014/09/24 15:00JST)', 'creationdate': '2017-03-10T21:55:34+00:00', 'author': 'Aurélien Géron', 'moddate': '2017-05-16T09:54:54+08:00', 'title': 'Hands-On Machine Learning with Scikit-Learn and TensorFlow', 'trapped': '/False', 'source': 'data\\Books\\Hands On Machine Learning with Scikit Learn and TensorFlow.pdf', 'total_pages': 564, 'page': 0, 'page_label': 'Cover', 'source_file': 'Hands On Machine Learning with Scikit Learn and TensorFlow.pdf', 'file_type': 'pdf'}, page_content='Aurélien Géron\nHands-On  \nMachine Learning  \nwith Scikit-Learn  \n& TensorFlow  \nCONCEPTS, TOOLS, AND TECHNIQUES  \nTO BUILD INTELLIGENT SYSTEMS\n\x00D\x00o\x00w\x00n\x00l\x00o\x00a\x00d\x00 \x00f\x00r\x00o\x00m\x00 \x00f\x00i\x00n\x00e\x00l\x00y\x00b\x00o\x00o\x00k\x00 \x00w\x00w\x00w\x00.\x00f\x00i\x00n\x00e\x00l\x00y\x00b\x00o\x00o\x00k\x00.\x00c\x00o\x

### Embeddings and Vector DB

In [6]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import uuid
from chromadb.config import Settings
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
class EmbeddingManager:
    """Wraps a SentenceTransformer model for turning texts into embedding vectors."""

    def __init__(self, model_name : str = "all-MiniLM-L6-v2"):
        """Remember the model name and load the model immediately.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Any error raised while loading the model is re-raised.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the SentenceTransformer and report its embedding dimension.

        Raises:
            Exception: Any loading error is printed and then re-raised.
        """
        try:
            print(f"Loading embedding model : {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully. Embedded dimensions : {self.model.get_sentence_embedding_dimension()}")
        except Exception as e :
            print(f"Error loading model {self.model_name} : {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray :
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            The array returned by the model's ``encode`` call.

        Raises:
            ValueError: If no model is loaded.
        """
        # Compare against None explicitly: an object's truthiness may be driven
        # by `__len__`, so `not self.model` is not a reliable "unloaded" check.
        if self.model is None:
            raise ValueError("Model not loaded!")

        print(f" Generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        # Printed after encoding has finished, hence the past tense.
        print(f"Generated embeddings with shape: {embeddings.shape}")

        return embeddings

    def get_embedding_dimensions(self) -> int :
        """Return the model's sentence-embedding dimension.

        Raises:
            ValueError: If no model is loaded.
        """
        if self.model is None:
            raise ValueError("Model not Loaded")

        return self.model.get_sentence_embedding_dimension()

In [None]:
# Create the manager with its default model name; the constructor loads the
# model right away. The bare name on the last line displays the instance.
embeddingManager = EmbeddingManager()
embeddingManager

Loading embedding model : all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
