In [2]:
# Standard library imports
import os
from pathlib import Path

# Environment variable management - for secure API key handling
from dotenv import load_dotenv

# LangChain Document Loaders - for loading PDF documents
from langchain_community.document_loaders import PyPDFLoader

# LangChain Text Splitters - for breaking documents into manageable chunks
from langchain_text_splitters import RecursiveCharacterTextSplitter

# OpenAI Integration - for embeddings and LLM
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

# Vector Store - FAISS for efficient similarity search
from langchain_community.vectorstores import FAISS

# LangChain Core Components
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

print("‚úì All imports successful!")
print("‚úì Compatible with LangChain 1.0+")

‚úì All imports successful!
‚úì Compatible with LangChain 1.0+


In [3]:
# Load environment variables from .env file
load_dotenv()

# Verify API key is loaded
if not os.getenv("OPENAI_API_KEY"):
    print("‚ö†Ô∏è  WARNING: OPENAI_API_KEY not found!")
    print("Please set it in .env file or uncomment the line below:")
    # os.environ["OPENAI_API_KEY"] = "your_api_key_here"
else:
    print("‚úì OpenAI API Key loaded successfully!")
    print(f"‚úì Key starts with: {os.getenv('OPENAI_API_KEY')[:8]}...")

‚úì OpenAI API Key loaded successfully!
‚úì Key starts with: sk-proj-...


In [6]:
# ===== CONFIGURATION: Update this path to your PDF file =====
pdf_path = "../resources/pdfs/attention.pdf"  # Change this to your PDF file path
# =============================================================

# Check if file exists
if not os.path.exists(pdf_path):
    print(f"‚ö†Ô∏è  ERROR: File '{pdf_path}' not found!")
    print("Please update the pdf_path variable with your PDF file location.")
else:
    # Initialize the PDF loader
    loader = PyPDFLoader(pdf_path)
    
    # Load all pages from the PDF
    # Each page becomes a separate Document object
    documents = loader.load()
    
    # Display information about loaded documents
    print(f"‚úì Loaded {len(documents)} pages from '{pdf_path}'")
    print(f"\n--- First Document Preview ---")
    print(f"Content (first 500 chars): {documents[0].page_content[:500]}...")
    print(f"\nMetadata: {documents[0].metadata}")
    print(f"\nTotal characters across all pages: {sum(len(doc.page_content) for doc in documents):,}")

‚úì Loaded 15 pages from '../resources/pdfs/attention.pdf'

--- First Document Preview ---
Content (first 500 chars): Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani‚àó
Google Brain
avaswani@google.com
Noam Shazeer‚àó
Google Brain
noam@google.com
Niki Parmar‚àó
Google Research
nikip@google.com
Jakob Uszkoreit‚àó
Google Research
usz@google.com
Llion Jones‚àó
Google Research
llion@google.com
Aidan N. Gomez‚àó ‚Ä†
University of Toronto
aidan@cs.toronto.edu
≈Åukasz ...

Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../

In [8]:
# Example: Loading multiple PDFs from a directory

pdf_directory = "../resources/pdfs"  # Directory containing your PDFs
all_documents = []

if os.path.exists(pdf_directory):
    pdf_files = list(Path(pdf_directory).glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")
    
    for pdf_file in pdf_files:
        loader = PyPDFLoader(str(pdf_file))
        docs = loader.load()
        all_documents.extend(docs)
        print(f"  ‚úì Loaded {len(docs)} pages from {pdf_file.name}")
    
    print(f"\nTotal pages loaded: {len(all_documents)}")
    documents = all_documents  # Use this for the rest of the pipeline

Found 3 PDF files
  ‚úì Loaded 15 pages from attention.pdf
  ‚úì Loaded 19 pages from rag.pdf
  ‚úì Loaded 21 pages from ragsurvey.pdf

Total pages loaded: 55


In [9]:
# Initialize the text splitter with recommended settings
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,        # Maximum characters per chunk (roughly 200-250 tokens)
    chunk_overlap=128,      # Characters overlap between chunks (maintains context)
    length_function=len,    # Function to measure chunk length
    separators=["\n\n", "\n", " ", ""]  # Try to split on paragraphs first, then lines, etc.
)

# Split the documents into chunks
# This creates smaller, manageable pieces while preserving semantic meaning
chunks = text_splitter.split_documents(documents)

# Display splitting results
print(f"‚úì Split {len(documents)} documents into {len(chunks)} chunks")
print(f"\nAverage chunk size: {sum(len(chunk.page_content) for chunk in chunks) / len(chunks):.0f} characters")

# Preview a few chunks
print(f"\n--- Chunk Examples ---")
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1} (length: {len(chunk.page_content)} chars):")
    print(f"{chunk.page_content[:200]}...")
    print(f"Metadata: {chunk.metadata}")

‚úì Split 55 documents into 267 chunks

Average chunk size: 898 characters

--- Chunk Examples ---

Chunk 1 (length: 986 chars):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../resources/pdfs/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}

Chunk 2 (length: 944 chars):
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being mo

In [10]:
# Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",  # Latest, cost-effective embedding model
    # Alternative: "text-embedding-3-large" for better quality
)

# Test the embeddings with a sample text
sample_text = "This is a test sentence to demonstrate embeddings."
sample_embedding = embeddings.embed_query(sample_text)

print(f"‚úì Embeddings model initialized: text-embedding-3-small")
print(f"‚úì Embedding dimension: {len(sample_embedding)}")
print(f"‚úì Sample embedding (first 10 values): {sample_embedding[:10]}")
print(f"\n‚ÑπÔ∏è  Each chunk will be converted to a {len(sample_embedding)}-dimensional vector for similarity search")

‚úì Embeddings model initialized: text-embedding-3-small
‚úì Embedding dimension: 1536
‚úì Sample embedding (first 10 values): [0.020370882004499435, -0.0031641265377402306, -0.0005454652709886432, 0.0045827641151845455, -0.015004359185695648, -0.034060992300510406, 0.0176328606903553, 0.01959054544568062, 0.0013125392142683268, 0.00596546521410346]

‚ÑπÔ∏è  Each chunk will be converted to a 1536-dimensional vector for similarity search


In [14]:
# Create FAISS vector store from document chunks
# This step converts each chunk to an embedding and stores it
print(f"Creating FAISS index from {len(chunks)} chunks...")
print("This may take a minute depending on the number of chunks...")

vectorstore = FAISS.from_documents(
    documents=chunks,      # Our split document chunks
    embedding=embeddings   # OpenAI embedding model
)

print(f"‚úì FAISS vector store created successfully!")
print(f"‚úì Indexed {len(chunks)} document chunks")

# Save the vector store to disk for later use
# This allows you to reload the index without re-processing documents
vectorstore_path = "../vectorstores/faiss_index"
vectorstore.save_local(vectorstore_path)
print(f"‚úì Vector store saved to '{vectorstore_path}'")
print(f"\n‚ÑπÔ∏è  You can reload this index later using: FAISS.load_local('{vectorstore_path}', embeddings)")

Creating FAISS index from 267 chunks...
This may take a minute depending on the number of chunks...
‚úì FAISS vector store created successfully!
‚úì Indexed 267 document chunks
‚úì Vector store saved to '../vectorstores/faiss_index'

‚ÑπÔ∏è  You can reload this index later using: FAISS.load_local('../vectorstores/faiss_index', embeddings)


In [15]:
# Example: load an existing vector store instead of creating a new one
vectorstore_path = "../vectorstores/faiss_index"
vectorstore = FAISS.load_local(
    vectorstore_path, 
    embeddings,
    allow_dangerous_deserialization=True  # Required for loading pickled data
)
print(f"‚úì Loaded existing vector store from '{vectorstore_path}'")

‚úì Loaded existing vector store from '../vectorstores/faiss_index'


In [16]:
# Create a retriever from the vector store
retriever = vectorstore.as_retriever(
    search_type="similarity",    # Use cosine similarity for search
    search_kwargs={"k": 4}        # Retrieve top 4 most relevant chunks
)

print("‚úì Retriever configured successfully")
print(f"  - Search type: similarity")
print(f"  - Number of documents to retrieve (k): 4")

# Test the retriever with a sample query
# Note: In LangChain 1.0+, use .invoke() instead of .get_relevant_documents()
test_query = "What is the main topic of this document?"
retrieved_docs = retriever.invoke(test_query)  # LangChain 1.0+ method

print(f"\n--- Retriever Test ---")
print(f"Query: '{test_query}'")
print(f"Retrieved {len(retrieved_docs)} documents:")

for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument {i+1}:")
    print(f"  Content preview: {doc.page_content[:150]}...")
    print(f"  Metadata: {doc.metadata}")

‚úì Retriever configured successfully
  - Search type: similarity
  - Number of documents to retrieve (k): 4

--- Retriever Test ---
Query: 'What is the main topic of this document?'
Retrieved 4 documents:

Document 1:
  Content preview: Feeding all relevant documents directly into LLMs can lead
to information overload, diluting the focus on key details with
irrelevant content.To mitig...
  Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-03-28T00:54:45+00:00', 'author': '', 'keywords': '', 'moddate': '2024-03-28T00:54:45+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../resources/pdfs/ragsurvey.pdf', 'total_pages': 21, 'page': 3, 'page_label': '4'}

Document 2:
  Content preview: Table I.
B. Indexing Optimization
In the Indexing phase, documents will be processed, seg-
mented, and transformed into Embeddings t

In [17]:
# Initialize the ChatOpenAI model
llm = ChatOpenAI(
      model="gpt-5-nano",  # Choose your model
      # Alternative options:
      # model="gpt-4o",           # Faster GPT-4 performance, good 
      # balance,
      # model="gpt-3.5-turbo",    # Faster and cheaper option

      temperature=0,         # 0 = deterministic, factual responses (recommended for Q&A)
      max_tokens=2000,       # Maximum length of response
  )

print("‚úì LLM configured successfully")
print(f"  - Model: gpt-5-nano")
print(f"  - Temperature: 0 (deterministic)")
print(f"  - Max tokens: 2000")

# Test the LLM with a simple query
test_response = llm.invoke("Say 'Hello, I am ready to answer questions!'")
print(f"\nLLM Test Response: {test_response.content}")

#   üìù Explanation of Parameters:

#   Model Selection:

#   # Option 1: Best quality (slower, more expensive)
#   llm = ChatOpenAI(model="gpt-5-nano")

#   # Option 2: Fast GPT-4 performance (balanced)
#   llm = ChatOpenAI(model="gpt-4o")

#   # Option 3: Fast and cheap (good for testing)
#   llm = ChatOpenAI(model="gpt-3.5-turbo")

#   Temperature:

#   temperature=0    # Deterministic, focused (best for factual Q&A)
#   temperature=0.7  # More creative, varied responses
#   temperature=1.0  # Most creative, less predictable

#   Max Tokens:

#   max_tokens=2000  # Controls maximum response length

‚úì LLM configured successfully
  - Model: gpt-5-nano
  - Temperature: 0 (deterministic)
  - Max tokens: 2000

LLM Test Response: Hello, I am ready to answer questions!


In [21]:
# Define the prompt template for the RAG system
# This tells the LLM how to use the retrieved context
system_prompt = (
    "You are a helpful assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer based on the context, say that you don't know. "
    "Keep the answer concise and accurate.\n\n"
    "Context: {context}\n\n"
    "Question: {question}"
)

# Create the prompt template
prompt = ChatPromptTemplate.from_template(system_prompt)

# Helper function to format documents
def format_docs(docs):
    """Format retrieved documents into a single string."""
    return "\n\n".join(doc.page_content for doc in docs)

# Build the RAG chain using LangChain 1.0+ LCEL (LangChain Expression Language)
# This uses the pipe operator (|) to chain components together
rag_chain = (
    {
        "context": retriever | format_docs,  # Retrieve docs and format them
        "question": RunnablePassthrough()      # Pass through the question
    }
    | prompt           # Format with prompt template
    | llm              # Generate answer with LLM
    | StrOutputParser() # Parse output to string
)

print("‚úì RAG chain created successfully using LangChain 1.0+ LCEL!")
print("\nRAG Pipeline Flow:")
print("  1. User provides a query")
print("  2. Retriever finds top 4 relevant chunks")
print("  3. Chunks are formatted as context")
print("  4. Context + question are formatted with prompt template")
print("  5. LLM generates answer based on context")
print("  6. Answer is parsed and returned to user")

‚úì RAG chain created successfully using LangChain 1.0+ LCEL!

RAG Pipeline Flow:
  1. User provides a query
  2. Retriever finds top 4 relevant chunks
  3. Chunks are formatted as context
  4. Context + question are formatted with prompt template
  5. LLM generates answer based on context
  6. Answer is parsed and returned to user


In [20]:
# Example Query 1: General question about the document
query1 = "What is the main topic or subject of this document?"

print(f"Query: {query1}")
print("\nProcessing...\n")

# With LangChain 1.0+, we invoke the chain with the question directly
answer = rag_chain.invoke(query1)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

# To see which documents were retrieved, we can call the retriever separately
print("\nSOURCE DOCUMENTS USED:")
print("=" * 80)
retrieved_docs = retriever.invoke(query1)
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument {i+1}:")
    print(f"  Source: {doc.metadata}")
    print(f"  Content: {doc.page_content[:200]}...")
    print("-" * 80)

Query: What is the main topic or subject of this document?

Processing...

ANSWER:
The document discusses Retrieval-Augmented Generation (RAG) systems, focusing on indexing optimization (chunking strategies, post-retrieval processing) and modular RAG architectures, including the use of knowledge graphs for indexing.


SOURCE DOCUMENTS USED:

Document 1:
  Source: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-03-28T00:54:45+00:00', 'author': '', 'keywords': '', 'moddate': '2024-03-28T00:54:45+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../resources/pdfs/ragsurvey.pdf', 'total_pages': 21, 'page': 7, 'page_label': '8'}
  Content: Table I.
B. Indexing Optimization
In the Indexing phase, documents will be processed, seg-
mented, and transformed into Embeddings to be stored in a
vector database. The quality of index cons

In [22]:
# Example Query 2: Specific information extraction
query2 = "Can you summarize the key points from this document?"

print(f"Query: {query2}")
print("\nProcessing...\n")

answer = rag_chain.invoke(query2)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

Query: Can you summarize the key points from this document?

Processing...

ANSWER:
Here are the key points from the document you provided:

- Purpose
  - Focuses on optimizing indexing for retrieval-augmented generation (RAG): how documents are processed, stored as embeddings, and retrieved to support accurate generation.

- Indexing and chunking strategy
  - Documents are chunked into fixed-size tokens (e.g., 100‚Äì512) to create embeddings for a vector store.
  - Trade-offs:
    - Larger chunks: capture more context but introduce more noise and higher cost.
    - Smaller chunks: less noise but may miss context.
  - Challenges with fixed chunks: truncation within sentences; no perfect balance between semantic completeness and context length.
  - Techniques to improve context handling:
    - Recursive splits and sliding windows to enable layered, multi-pass retrieval.
    - Small2Big approach: use small sentences for retrieval but provide adjacent sentences as larger context to the LL

In [25]:
# Example Query 3
custom_query = "What specific details are mentioned about multi head attention from the document?"

print(f"Query: {custom_query}")
print("\nProcessing...\n")

answer = rag_chain.invoke(custom_query)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

Query: What specific details are mentioned about multi head attention from the document?

Processing...

ANSWER:
- MultiHead(Q, K, V) = Concat(head1, ..., headh) WO
- headi = Attention(QWQi, KWKi, VWVi)
- Projection matrices per head: WQi ‚àà R^{d_model √ó d_k}, WKi ‚àà R^{d_model √ó d_k}, WVi ‚àà R^{d_model √ó d_v}
- Output projection: WO ‚àà R^{h d_v √ó d_model}
- Number of heads: h = 8
- Per-head dimensions: d_k = d_v = d_model / h = 64
- Because each head has reduced dimensionality, total computational cost is similar to single-head attention with full dimensionality
- The attention used is Scaled Dot-Product Attention, with Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V
- In practice, Q, K, V are packed into matrices and processed in parallel across heads.

