### Data Ingestion

In [1]:
from langchain_core.documents import Document

In [None]:
# Hand-build a single Document: the descriptive fields go into a metadata
# dict, the text itself goes into page_content.
doc_metadata = {
    "source": "example.txt",
    "pages": 1,
    "author": "Krishanu Das",
    "date_created": "2025-01-01",
}
doc = Document(
    metadata=doc_metadata,
    page_content="This is the main text content I am using to create rag",
)
# Bare last expression so the notebook displays the Document's repr.
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Krishanu Das', 'date_created': '2025-01-01'}, page_content='This is the main text content I am using to create rag')

In [4]:
# Prepare the folder that the sample text files will be written into.
import os

text_files_dir = '../data/text_files'
# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(text_files_dir, exist_ok=True)

In [9]:
# Two short sample passages used as input for the loaders below.
python_intro_text = "Python is a language that feels like it was designed on a calm Sunday morning. You write a few lines, and suddenly things just… work. Lists stretch and shrink like elastic, dictionaries quietly hold secrets in key-value pairs, and functions politely accept just about anything you hand them. One moment you’re looping through nested data structures like an explorer navigating a digital jungle, and the next you’re importing libraries that turn your tiny script into a data-crunching powerhouse. Whether it's automating a boring task or spinning up a quick API, Python is the friendly companion that never complains about indentation—as long as you respect the tabs and spaces."
rag_intro_text = "RAG is like giving an AI a backpack full of books and saying, “Don’t just guess—go look it up.” Instead of hallucinating confidently like it owns the universe, the model pauses, fetches chunks of real information from a vector store, and then crafts an answer that actually makes sense. Embeddings act like the secret GPS coordinates pointing to the most relevant documents, while retrievers quietly drag them back from the depths of your knowledge base. Suddenly, your chatbot knows company policies better than HR and answers tech queries faster than the dev team on caffeine. With RAG, AI becomes less of a storyteller and more of a reliable assistant—still creative, but with receipts."

# Maps each destination path to the text that should be stored there.
sample_texts = {
    "../data/text_files/python_intro.txt": python_intro_text,
    "../data/text_files/rag_intro.txt": rag_intro_text,
}

# Write each passage out as UTF-8 (the passages contain non-ASCII punctuation).
# The destination folder is expected to exist already.
for target_path in sample_texts:
    with open(target_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(sample_texts[target_path])

print("Sample text files created")

Sample text files created


In [None]:
# Read one text file through TextLoader; load() gives back a list of Documents.
from langchain_community.document_loaders import TextLoader

python_intro_path = "../data/text_files/python_intro.txt"
loader = TextLoader(python_intro_path, encoding="utf-8")
document = loader.load()
# Bare last expression so the notebook displays the loaded result.
document

[Document(metadata={'source': '../data/text_files/python_intro.txt'}, page_content="Python is a language that feels like it was designed on a calm Sunday morning. You write a few lines, and suddenly things just… work. Lists stretch and shrink like elastic, dictionaries quietly hold secrets in key-value pairs, and functions politely accept just about anything you hand them. One moment you’re looping through nested data structures like an explorer navigating a digital jungle, and the next you’re importing libraries that turn your tiny script into a data-crunching powerhouse. Whether it's automating a boring task or spinning up a quick API, Python is the friendly companion that never complains about indentation—as long as you respect the tabs and spaces.")]

In [None]:
# Load all files matching '**/*.txt' under the text folder in one call.
# TextLoader is the per-file loader (imported in an earlier cell).
from langchain_community.document_loaders import DirectoryLoader

# Keyword arguments forwarded to each TextLoader instance.
text_loader_options = {'encoding': 'utf-8'}

dir_loader = DirectoryLoader(
    "../data/text_files",
    glob='**/*.txt',
    loader_cls=TextLoader,
    loader_kwargs=text_loader_options,
    show_progress=False,
)
documents = dir_loader.load()
# Bare last expression so the notebook displays the loaded result.
documents

['RAG is like giving an AI a backpack full of books and saying, “Don’t just guess—go look it up.” Instead of hallucinating confidently like it owns the universe, the model pauses, fetches chunks of real information from a vector store, and then crafts an answer that actually makes sense. Embeddings act like the secret GPS coordinates pointing to the most relevant documents, while retrievers quietly drag them back from the depths of your knowledge base. Suddenly, your chatbot knows company policies better than HR and answers tech queries faster than the dev team on caffeine. With RAG, AI becomes less of a storyteller and more of a reliable assistant—still creative, but with receipts.',
 "Python is a language that feels like it was designed on a calm Sunday morning. You write a few lines, and suddenly things just… work. Lists stretch and shrink like elastic, dictionaries quietly hold secrets in key-value pairs, and functions politely accept just about anything you hand them. One moment y

In [12]:
# Load all files matching '**/*.pdf' under the PDF folder, parsing each
# one with PyMuPDFLoader.
# NOTE(review): this rebinds `dir_loader` and `documents` from the text-file cell.
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader

pdf_loader_settings = dict(
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)
dir_loader = DirectoryLoader("../data/pdf", **pdf_loader_settings)
documents = dir_loader.load()
# Bare last expression so the notebook displays the loaded result.
documents

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-22T09:44:08+00:00', 'source': '../data/pdf/Assigntment 2.pdf', 'file_path': '../data/pdf/Assigntment 2.pdf', 'total_pages': 3, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-07-22T09:44:08+00:00', 'trapped': '', 'modDate': 'D:20250722094408Z', 'creationDate': 'D:20250722094408Z', 'page': 0}, page_content='Indian Institute of Technology Jodhpur\nFundamentals of Distributed Systems\nAssignment – 2\nTotal Marks:\n20\nSubmission Deadline:\n27 July 2025'),
 Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-07-22T09:44:08+00:00', 'source': '../data/pdf/Assigntment 2.pdf', 'file_path': '../data/pdf/Assigntment 2.pdf', 'total_pages': 3, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-07-22T09:44:08+00:00', 'trapped': '', 'modDate': 'D:20