In [4]:
import os
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

print("‚úÖ Environment loaded")
print(f"Current directory: {os.getcwd()}")
print(f"Sample data directory exists: {Path('sample_data').exists()}")

‚úÖ Environment loaded
Current directory: f:\sourab\rag_practice
Sample data directory exists: True


In [5]:
from langchain_core.documents import Document
doc = Document(
    page_content="This is the actual content of the document. It contains the text we want to process.",
    metadata={
        "source": "example.pdf",
        "page": 1,
        "author": "John Doe",
        "date": "2025-01-15"
    }
)
# Inspect the document
print("üìÑ Document Structure:")
print(f"\nType: {type(doc)}")
print(f"\nContent (first 100 chars): {doc.page_content[:100]}")
print(f"\nMeta data: {doc.metadata}")
print(f"\nSource: {doc.metadata['source']}")
print(f"\nPage Number: {doc.metadata['page']}")

üìÑ Document Structure:

Type: <class 'langchain_core.documents.base.Document'>

Content (first 100 chars): This is the actual content of the document. It contains the text we want to process.

Meta data: {'source': 'example.pdf', 'page': 1, 'author': 'John Doe', 'date': '2025-01-15'}

Source: example.pdf

Page Number: 1


In [6]:
from langchain_community.document_loaders import PyPDFLoader
pdf_path = "data\\attention.pdf"

if Path(pdf_path).exists():
    print(f"Loading PDF: {pdf_path}")
    print("‚è≥ This may take a moment...\n")

    loader = PyPDFLoader(pdf_path)
    document = loader.load()

    print(f"‚úÖ Loaded {len(document)} pages\n")
    print("First page: \n")
    print(f"   Content (first 200 chars):  {doc.page_content[:200]}...")

    print(f"\nüìÑ Last Page (page {len(document)}):")
    print(f"   Content (first 200 chars): {document[-1].page_content[:200]}...")
    
else:
    print(f"‚ùå PDF not found: {pdf_path}")
    print("   Make sure the file exists in the project root")


Loading PDF: data\attention.pdf
‚è≥ This may take a moment...

‚úÖ Loaded 15 pages

First page: 

   Content (first 200 chars):  This is the actual content of the document. It contains the text we want to process....

üìÑ Last Page (page 15):
   Content (first 200 chars): Input-Input Layer5
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
The
Law
will
never
be
perfect
,
but
its
application
sh...


In [7]:
if Path(pdf_path).exists():
    loader = PyPDFLoader(pdf_path)
    
    print("üîÑ Lazy loading pages (memory efficient):")
    
    # Process first 3 pages only
    for i, page in enumerate(loader.lazy_load()):
        if i >= 3:  # Only process first 3 pages for demo
            break
        
        print(f"\nPage {i+1}:")
        print(f"  Length: {len(page.page_content)} characters")
        print(f"  Preview: {page.page_content[:100]}...")
    
    print("\nüí° Tip: Use lazy_load() for PDFs > 100 pages to save memory")

üîÑ Lazy loading pages (memory efficient):

Page 1:
  Length: 2859 characters
  Preview: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and...

Page 2:
  Length: 4257 characters
  Preview: 1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural...

Page 3:
  Length: 1826 characters
  Preview: Figure 1: The Transformer - model architecture.
The Transformer follows this overall architecture us...

üí° Tip: Use lazy_load() for PDFs > 100 pages to save memory


In [8]:
pdf_directory = 'data'
if Path(pdf_path).exists():
    print(f"üìÇ Loading PDFs from: {pdf_directory}/\n")
    all_docs = []
    pdf_files = list(Path(pdf_directory).glob("*.pdf"))

    for pdf_file in pdf_files:
        print(f"   - {pdf_file.name}")
        loader = PyPDFLoader(str(pdf_file))
        print("="*30)

        doc = loader.load()
        print(doc)
        print("="*30)
        all_docs.extend(doc)
        print(all_docs)
        print("="*30)
        print(doc[0].metadata)
        print("="*30)

        print(f"    ‚úÖ Loaded {len(doc)} pages\n")

    print(f"\nüìä Total: {len(all_docs)} pages from {len(pdf_files)} PDFs")

    sources = set(doc.metadata['source'] for doc in all_docs)
    print("\nSource: ")
    for source in sources:
        print(f"   -{Path(source).name}")
else:
    print(f"‚ùå Directory not found: {pdf_directory}")


üìÇ Loading PDFs from: data/

   - attention.pdf
[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani‚àó\nGoogle Brain\navaswani@google.com\nNoam Shazeer‚àó\nGoogle Brain\nnoam@google.com\nNiki Parmar‚àó\nGoogle Research\nnikip@google.com\nJakob Uszkoreit‚àó\nGoogle Research\nusz@google.com\nLlion Jones‚àó\nGoogle Research\nllion@google.com\nAidan N. Gomez‚àó ‚Ä†\nUniversity o

In [9]:
from langchain_community.document_loaders import CSVLoader
csv_path = "sample_data/products.csv"
if Path(csv_path).exists():
    print(f"Loading CSV: {csv_path}\n")

    loader = CSVLoader(
        file_path=csv_path,
        source_column="product_name"
    )
    docs = loader.load()
    print(f"‚úÖ Loaded {len(docs)} products\n")

    for i, doc in enumerate(docs[:3],1):
        print("="*70)
        print(f"Product: {i}")
        print("="*70)
        print(doc.page_content)
        print(f"\nSource: {doc.metadata['source']}")
        print(f"Row: {doc.metadata.get('row','N/A')}")
        print()
    print(f"... and {len(docs) - 3} more products")
else:
    print(f"‚ùå CSV not found: {csv_path}")

Loading CSV: sample_data/products.csv

‚úÖ Loaded 15 products

Product: 1
product_id: 1
product_name: Laptop Pro 15
category: Electronics
description: High-performance laptop with 15-inch display, Intel i7 processor, 16GB RAM, and 512GB SSD. Perfect for professional work and gaming.
price: 1299.99
stock: 45

Source: Laptop Pro 15
Row: 0

Product: 2
product_id: 2
product_name: Wireless Mouse
category: Accessories
description: Ergonomic wireless mouse with 6 programmable buttons, 2400 DPI optical sensor, and long battery life. Compatible with Windows and Mac.
price: 29.99
stock: 150

Source: Wireless Mouse
Row: 1

Product: 3
product_id: 3
product_name: USB-C Hub
category: Accessories
description: 7-in-1 USB-C hub with HDMI, USB 3.0 ports, SD card reader, and USB-C power delivery. Ideal for laptops and tablets.
price: 49.99
stock: 80

Source: USB-C Hub
Row: 2

... and 12 more products


In [10]:
if Path(csv_path).exists():
    loader = CSVLoader(
        file_path=csv_path,
        csv_args={
            'delimiter':',',
            'quotechar':'"',
            'fieldnames':None
        },
        source_column="product_id"
    )

    docs = loader.load()
    print("üìä CSV with custom configuration:\n")
    print(f"First document source: {docs[0].metadata['source']}")
    print(f"Content preview:\n{docs[0].page_content[:200]}...")

üìä CSV with custom configuration:

First document source: 1
Content preview:
product_id: 1
product_name: Laptop Pro 15
category: Electronics
description: High-performance laptop with 15-inch display, Intel i7 processor, 16GB RAM, and 512GB SSD. Perfect for professional work an...


In [11]:
from langchain_community.document_loaders import JSONLoader

json_path = 'sample_data\\api_response.json'

if Path(json_path).exists():
    print(f"Loading JSON: {json_path}\n")
    loader = JSONLoader(
        file_path=json_path,
        jq_schema=".articles[]",
        text_content=False
    )
    docs = loader.load()
    print(f"Loaded {len(docs)} articles\n")
    print("First Article:")
    print(f"Content: \n{docs[0].page_content}\n")
    print(f"Metadata: {docs[0].metadata}")
else:
    print(f"‚ùå JSON not found: {json_path}")

Loading JSON: sample_data\api_response.json

Loaded 5 articles

First Article:
Content: 
{"id": "article_001", "title": "Introduction to Retrieval-Augmented Generation (RAG)", "author": "Dr. Sarah Chen", "published_date": "2025-01-10", "category": "Machine Learning", "tags": ["RAG", "LLM", "NLP", "AI"], "summary": "Retrieval-Augmented Generation (RAG) is a powerful technique that combines information retrieval with large language models to generate more accurate and contextual responses.", "content": "RAG systems work by first retrieving relevant documents from a knowledge base, then using those documents as context for a language model to generate responses. This approach significantly reduces hallucinations and provides more factual, grounded outputs. The architecture typically consists of three main components: a document store, an embedding model for semantic search, and a language model for generation.", "reading_time": "5 minutes", "views": 15420, "likes": 892}

Metadata: {'sourc

In [12]:
from langchain_community.document_loaders import UnstructuredHTMLLoader
html_path = 'sample_data\\blog_post.html'

if Path(html_path).exists():
    print(f"Loading HTML: {html_path}\n")
    file_url = f"file://{Path(html_path).absolute()}"
    loader = UnstructuredHTMLLoader(html_path)
    documents = loader.load()

    print(f"‚úÖ Loaded {len(documents)} document(s)\n")
    print(documents[:3])
    doc = documents[0]
    print(f"\nContent length: {len(doc.page_content)} characters")
    print(f"\n First 500 words: \n{doc.page_content[:500]}...")
    print(f"\n Metadata: {doc.metadata}")
else:
    print(f"‚ùå HTML not found: {html_path}")



Loading HTML: sample_data\blog_post.html

‚úÖ Loaded 1 document(s)

[Document(metadata={'source': 'sample_data\\blog_post.html'}, page_content="Building Intelligent Applications with RAG\n\nBy Dr. Amanda Foster | January 15, 2025 | 12 min read\n\nIntroduction\n\nIn the rapidly evolving landscape of artificial intelligence, Retrieval-Augmented Generation (RAG) has emerged as a game-changing approach for building intelligent applications. Unlike traditional chatbots that rely solely on the knowledge embedded in their training data, RAG systems combine the power of information retrieval with language generation to produce more accurate, contextual, and up-to-date responses.\n\nThis article will guide you through the fundamentals of RAG, explore the LangChain framework, and demonstrate how to build production-ready RAG applications. Whether you're a seasoned ML engineer or just starting your AI journey, you'll find practical insights and actionable advice.\n\nWhat is Retrieval-Augmented Ge

In [13]:
from langchain_community.document_loaders import WebBaseLoader
url = [
    "https://python.langchain.com/docs/introduction/",
    "https://python.langchain.com/docs/expression_language/"
]
loader = WebBaseLoader(url)
docs = loader.load()

print(f"Loaded {len(docs)} pages")
for doc in docs:
    print(f"   -{doc.metadata['source']}")


USER_AGENT environment variable not set, consider setting it to identify your requests.


Loaded 2 pages
   -https://python.langchain.com/docs/introduction/
   -https://python.langchain.com/docs/expression_language/


In [14]:
print("="*80)
print("üìÑ LOADED DOCUMENTS CONTENT")
print("="*80)
for i, doc in enumerate(docs, 1):
    print("="*80)
    print(f"Page{i}: {doc.metadata['source']}")
    print("="*80)

    print(f"\nüìù Content Preview (first 1000 chars):")
    print(doc.page_content[:1200])
    print(f"\n... [Total length: {len(doc.page_content)} characters]")

    for key, value in doc.metadata.items():
        print(f"{key}: {value}")
    print("\n")

print("\n" + "="*80)
print("üìñ FULL CONTENT OF PAGE 1")
print("="*80)
print(docs[0].page_content)

üìÑ LOADED DOCUMENTS CONTENT
Page1: https://python.langchain.com/docs/introduction/

üìù Content Preview (first 1000 chars):
LangChain overview - Docs by LangChainSkip to main contentDocs by LangChain home pageLangChain + LangGraphSearch...‚åòKAsk AIGitHubTry LangSmithTry LangSmithSearch...NavigationLangChain overviewLangChainLangGraphDeep AgentsIntegrationsLearnReferenceContributePythonOverviewGet startedInstallQuickstartChangelogPhilosophyCore componentsAgentsModelsMessagesToolsShort-term memoryStreamingStructured outputMiddlewareOverviewBuilt-in middlewareCustom middlewareAdvanced usageGuardrailsRuntimeContext engineeringModel Context Protocol (MCP)Human-in-the-loopMulti-agentRetrievalLong-term memoryAgent developmentLangSmith StudioTestAgent Chat UIDeploy with LangSmithDeploymentObservabilityOn this page Create an agent Core benefitsLangChain overviewCopy pageLangChain is an open source framework with a pre-built agent architecture and integrations for any model or tool ‚Äî so yo

In [15]:
from langchain_community.document_loaders import TextLoader
txt_path = 'sample_data\\notes.txt'

if Path(txt_path).exists():
    print(f"Loading text file: {txt_path}\n")

    loader = TextLoader(txt_path, encoding="utf-8")
    documents = loader.load()
    print(f"‚úÖ Loaded {len(documents)} document\n")

    doc = documents[0]
    print(f"üìÑ Content length: {len(doc.page_content)} characters")
    print(f"\nüìù First 300 characters:\n{doc.page_content[:300]}...")
    print(f"\nüîç Metadata: {doc.metadata}")
else:
    print(f"‚ùå Text file not found: {txt_path}")

Loading text file: sample_data\notes.txt

‚úÖ Loaded 1 document

üìÑ Content length: 8567 characters

üìù First 300 characters:
LANGCHAIN STUDY NOTES - RAG IMPLEMENTATION

Date: January 15, 2025
Topic: Retrieval-Augmented Generation with LangChain 1.0+


CORE CONCEPTS
-------------

1. Document Object Structure
   - page_content: The actual text content
   - metadata: Dictionary wit...

üîç Metadata: {'source': 'sample_data\\notes.txt'}


In [16]:

readme_path = 'sample_data\\README.md'

if Path(readme_path).exists():
    try:
        from langchain_community.document_loaders import UnstructuredMarkdownLoader
        print(f"Loading markdown: {readme_path}\n")
        loader = UnstructuredMarkdownLoader(readme_path)
        docs = loader.load()

        print(f"‚úÖ Loaded {len(docs)} document(s)")
        print(f"\nFirst 200 chars:\n{docs[0].page_content[:200]}...")

    except ImportError:
        print("‚ö†Ô∏è UnstructuredMarkdownLoader requires additional dependencies")
        print("   Install with: pip install unstructured")
        print("\n   For now, using TextLoader:")

        loader = TextLoader(readme_path)
        docs = loader.load()
        print(f"   ‚úÖ Loaded with TextLoader: {len(docs[0].page_content)} chars")

else:
    print(f"‚ÑπÔ∏è No README.md found in current directory")

Loading markdown: sample_data\README.md

‚úÖ Loaded 1 document(s)

First 200 chars:
üìö LangChain Notebooks - Complete RAG Course

A comprehensive, hands-on course for learning Retrieval-Augmented Generation (RAG) with LangChain 1.0.5+ - November 2025.

Perfect for mixed-level classes ...


In [17]:
from langchain_community.document_loaders import DirectoryLoader
data_dir = 'sample_data'

if Path(data_dir).exists():
    print(f"üìÇ Loading all text files from: {data_dir}/\n")
    loader = DirectoryLoader(
        data_dir,
        glob="*.txt",
        loader_cls=TextLoader,
        show_progress=True
    )
    documents = loader.load()
    print(f"\n‚úÖ Loaded {len(documents)} text file(s)\n")

    for doc in documents:
        print(f"   -{doc.metadata['source']} ({len(doc.page_content)} chars)")
else:
    print(f"‚ùå Directory not found: {data_dir}")

üìÇ Loading all text files from: sample_data/



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<?, ?it/s]


‚úÖ Loaded 1 text file(s)

   -sample_data\notes.txt (8639 chars)





In [22]:
def load_all_documents(directory: str):
    all_docs = []
    directory_path = Path(directory)

    if not directory_path.exists():
        print(f"‚ùå Directory not found: {directory}")
        return []
    
    print(f"üìÇ Loading from: {directory}\n")
    
    pdf_files = list(directory_path.glob("*.pdf"))
    for pdf in pdf_files:
        loader = PyPDFLoader(str(pdf))
        doc = loader.load()
        all_docs.extend(doc)
        print(f"  ‚úÖ PDF: {pdf.name} ({len(doc)} pages)")

    txt_files = list(directory_path.glob("*.txt"))
    for txt in txt_files:
        loader = TextLoader(str(txt))
        doc = loader.load()
        all_docs.extend(doc)
        print(f"  ‚úÖ TXT: {txt.name}")

    json_files = list(directory_path.glob("*.json"))
    for json in json_files:
        loader = JSONLoader(
            str(json),
            jq_schema='.',
            text_content=False
        )
        doc = loader.load()
        all_docs.extend(doc)
        print(f"  ‚úÖ JSON: {json.name}")
    
    csv_files = list(directory_path.glob("*.csv"))
    for csv in csv_files:
        loader = CSVLoader(str(csv))
        doc = loader.load()
        all_docs.extend(doc)
        print(f"  ‚úÖ CSV: {csv.name} ({len(docs)} rows)")
    return all_docs
    
data_dir = 'sample_data'
if Path(data_dir).exists():
    all_docs = load_all_documents('sample_data')

    print(f"\nüìà Summary:")
    sources = [doc.metadata['source'] for doc in all_docs]
    print(f"   Files loaded: {len(set(sources))}")
    print(f"   Total documents: {len(all_docs)}")


üìÇ Loading from: sample_data

  ‚úÖ TXT: notes.txt
  ‚úÖ JSON: api_response.json
  ‚úÖ CSV: products.csv (1 rows)

üìà Summary:
   Files loaded: 3
   Total documents: 17
