##RAG Pipeline - Data ingestion to vectorDB pipeline    

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from pathlib import Path

# Robust import for different LangChain layouts (langchain_core => langchain v1+)
# Do NOT import directly from langchain_core at top-level; use try/except
try:
    from langchain_core.text_splitter import RecursiveCharacterTextSplitter
    _langchain_text_splitter_source = "langchain_core"
except Exception:
    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        _langchain_text_splitter_source = "langchain"
    except Exception:
        # Fallback: provide a very small, local splitter compatible with our usage
        _langchain_text_splitter_source = "fallback"

        class RecursiveCharacterTextSplitter:
            def __init__(self, chunk_size=1000, chunk_overlap=200, length_function=len, separators=None):
                self.chunk_size = chunk_size
                self.chunk_overlap = chunk_overlap

            def split_documents(self, documents):
                out = []
                for doc in documents:
                    text = getattr(doc, 'page_content', str(doc))
                    start = 0
                    text_len = len(text)
                    if text_len == 0:
                        continue
                    while start < text_len:
                        end = min(start + self.chunk_size, text_len)
                        chunk = text[start:end]
                        # create a simple object with the attributes we expect
                        new_doc = type('Doc', (), {})()
                        new_doc.page_content = chunk
                        new_doc.metadata = getattr(doc, 'metadata', {}).copy() if hasattr(doc, 'metadata') else {}
                        out.append(new_doc)
                        if end == text_len:
                            break
                        start = end - self.chunk_overlap
                        if start < 0:
                            start = 0
                return out

print("Text splitter import source:", _langchain_text_splitter_source)
if _langchain_text_splitter_source == "fallback":
    print("Warning: using fallback text splitter. To use the official splitter, install langchain-core:")
    print("%pip install 'langchain-core>=1.0.0'")


ModuleNotFoundError: No module named 'langchain_core.text_splitter'

In [8]:
### Read all the pdf's inside the directory
def process_all_pdfs(pdf_directory):
    """Process all PDF files in a directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)
    
    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f" ✓ Loaded {len(documents)} pages")

        except Exception as e:  
            print(f" ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Process all PDFs in the data directory
all_pdf_documents = process_all_pdfs("../data")

Found 3 PDF files to process

Processing: Ai Easy Solutions 2019 Pattern.pdf
 ✓ Loaded 75 pages

Processing: LOI_Bhoi Kalpesh Sanjay.pdf
 ✓ Loaded 1 pages

Processing: zerotomastery.io+-+Complete+Ethical+Hacking+Bootcamp+Zero+to+Mastery+Guide.pdf
 ✓ Loaded 3 pages

Total documents loaded: 79
 ✓ Loaded 75 pages

Processing: LOI_Bhoi Kalpesh Sanjay.pdf
 ✓ Loaded 1 pages

Processing: zerotomastery.io+-+Complete+Ethical+Hacking+Bootcamp+Zero+to+Mastery+Guide.pdf
 ✓ Loaded 3 pages

Total documents loaded: 79


In [9]:
all_pdf_documents

[Document(metadata={'producer': 'Adobe Scan for Android 22.05.19-regular', 'creator': 'Adobe Scan for Android 22.05.19-regular', 'creationdate': '', 'source': '..\\data\\pdf\\Ai Easy Solutions 2019 Pattern.pdf', 'total_pages': 75, 'page': 0, 'page_label': '1', 'source_file': 'Ai Easy Solutions 2019 Pattern.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'Adobe Scan for Android 22.05.19-regular', 'creator': 'Adobe Scan for Android 22.05.19-regular', 'creationdate': '', 'source': '..\\data\\pdf\\Ai Easy Solutions 2019 Pattern.pdf', 'total_pages': 75, 'page': 1, 'page_label': '2', 'source_file': 'Ai Easy Solutions 2019 Pattern.pdf', 'file_type': 'pdf'}, page_content=''),
 Document(metadata={'producer': 'Adobe Scan for Android 22.05.19-regular', 'creator': 'Adobe Scan for Android 22.05.19-regular', 'creationdate': '', 'source': '..\\data\\pdf\\Ai Easy Solutions 2019 Pattern.pdf', 'total_pages': 75, 'page': 2, 'page_label': '3', 'source_file': 'Ai Easy Solutions

In [10]:
## text splitting get into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(split_docs)} chunks")
    return split_docs

# Split the loaded PDF documents
    if split_docs:
        print(f"\nexample chunk:")
        print(f"content, {split_docs[0].page_content[:200]}... ")
        print(f"metadata: {split_docs[0].metadata}")
    return split_docs

In [11]:
chunks= split_documents(all_pdf_documents)
chunks

NameError: name 'RecursiveCharacterTextSplitter' is not defined