In [3]:
import os
from pathlib import Path
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.vectorstores import FAISS

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [2]:
load_dotenv()
def check_env(var_name):
    if os.getenv(var_name):
        print(f"✅ {var_name} found")
    else:
        print("❌ {var_name} not found — please add it to your .env")
check_env("AZURE_OPENAI_API_KEY")
check_env("AZURE_OPENAI_ENDPOINT")
check_env("AZURE_OPENAI_DEPLOYMENT_NAME")
check_env("AZURE_OPENAI_API_VERSION")
check_env("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")

✅ AZURE_OPENAI_API_KEY found
✅ AZURE_OPENAI_ENDPOINT found
✅ AZURE_OPENAI_DEPLOYMENT_NAME found
✅ AZURE_OPENAI_API_VERSION found
✅ AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME found


In [8]:
path = r'data\attention.pdf'
if not os.path.exists(path):
    print("ERROR: File '{path}' not found!")
    print("Please update with correct PDF path")
else:
    loader = PyPDFLoader(path)
    documents = loader.load()

    print(f"PDF Loaded with {len(documents)} pages from '{path}'")
    print("\n--- First Document Preview ---")
    print(f"Content: {documents[0].page_content[:500]}...")
    print(f"Metadata: {documents[0].metadata}")
    print(f"Total charactors accross all pages: {sum(len(doc.page_content) for doc in documents)}")


PDF Loaded with 15 pages from 'data\attention.pdf'

--- First Document Preview ---
Content: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.edu
Łukasz ...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\attention.pdf', 'total_pages': 15, 'pag

In [10]:
#Loading multiple datasets
pdf_directory = './data'
all_documents = []

if os.path.exists(pdf_directory):
    pdf_files = list(Path(pdf_directory).glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")

    for pdf_file in pdf_files:
        loader = PyPDFLoader(pdf_file)
        doc = loader.load()
        all_documents.extend(doc)
        print(f"Doc loaded with {len(doc)} for {pdf_file.name}")
    print(f"\nTotal pages loaded: {len(all_documents)}")
    documents = all_documents

Found 2 PDF files
Doc loaded with 15 for attention.pdf
Doc loaded with 21 for ragsurvey.pdf

Total pages loaded: 36


In [13]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)
chunks = text_splitter.split_documents(documents)
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
print(f"Average chunk size: {sum(len(chunk.page_content)for chunk in chunks) }")

print(f"\n--- Chunks Example---")
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1} (length: {len(chunk.page_content)} chars)")
    print(f"{chunk.page_content[:200]}...")
    print(f"Metadata: {chunk.metadata}")

Split 36 documents into 180 chunks.
Average chunk size: 164183

--- Chunks Example---

Chunk 1 (length: 986 chars)
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}

Chunk 2 (length: 944 chars)
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more pa...
Metadata: {'produc

In [14]:
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    model=os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
)
sample_text = "This is the text sentance to demonstrate embeddings"
sample_embedding = embeddings.embed_query(sample_text)

print("✓ Embeddings model initialized: text-embedding-ada-002")
print(f"✓ Embedding dimension: {len(sample_embedding)}")
print(f"✓ Sample embedding (first 10 values): {sample_embedding[:10]}")
print(f"\nℹ️  Each chunk will be converted to a {len(sample_embedding)}-dimensional vector for similarity search")

✓ Embeddings model initialized: text-embedding-ada-002
✓ Embedding dimension: 1536
✓ Sample embedding (first 10 values): [-0.018449613824486732, 0.002806746633723378, 0.0020374529995024204, -0.010561833158135414, 0.0018392505589872599, 0.015305251814424992, -0.0036852199118584394, 0.004199202172458172, -0.02843363769352436, -0.026498645544052124]

ℹ️  Each chunk will be converted to a 1536-dimensional vector for similarity search


In [15]:
print(f"Creating FAISS index from {len(chunks)} chunks")
vectore_store = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings
)
print(f"FAISS vector stored successfully!")
print(f"Indexed {len(chunks)} documents chunks")

vectorstore_path = "./faiss_index"
vectore_store.save_local(vectorstore_path)
print(f"✓ Vector store saved to '{vectorstore_path}'")
print(f"\nℹ️  You can reload this index later using: FAISS.load_local('{vectorstore_path}', embeddings)")

Creating FAISS index from 180 chunks
FAISS vector stored successfully!
Indexed 180 documents chunks
✓ Vector store saved to './faiss_index'

ℹ️  You can reload this index later using: FAISS.load_local('./faiss_index', embeddings)


In [16]:
vectorstore_path = './faiss_index'
vectore_store = FAISS.load_local(
    vectorstore_path,
    embeddings,
    allow_dangerous_deserialization=True
)
print(f"✓ Loaded existing vector store from '{vectorstore_path}'")

✓ Loaded existing vector store from './faiss_index'


In [18]:
retriever = vectore_store.as_retriever(
    search_type = "similarity",
    search_kwargs={"k":4}
)
print("✓ Retriever configured successfully")
print(f"  - Search type: similarity")
print(f"  - Number of documents to retrieve (k): 4")

test_query = "What is the main topic of this documents?"
retrieved_docs = retriever.invoke(test_query)

print(f"\n--- Retriever Test ---")
print(f"Query: '{test_query}'")
print(f"Retrieved {len(retrieved_docs)} documents:")

for i, doc in enumerate(retrieved_docs):
    print(f"\nDocuments: {i+1}")
    print(f"  Content preview: {doc.page_content[:150]}...")
    print(f"  Metadata: {doc.metadata}")

✓ Retriever configured successfully
  - Search type: similarity
  - Number of documents to retrieve (k): 4

--- Retriever Test ---
Query: 'What is the main topic of this documents?'
Retrieved 4 documents:

Documents: 1
  Content preview: Table I.
B. Indexing Optimization
In the Indexing phase, documents will be processed, seg-
mented, and transformed into Embeddings to be stored in a
v...
  Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-03-28T00:54:45+00:00', 'author': '', 'keywords': '', 'moddate': '2024-03-28T00:54:45+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\ragsurvey.pdf', 'total_pages': 21, 'page': 7, 'page_label': '8'}

Documents: 2
  Content preview: caused by block extraction issues.
Knowledge Graph index . Utilize KG in constructing the
hierarchical structure of documents contributes to ma

In [None]:
llm = AzureChatOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    temperature=0.0,
    max_tokens=2000,
)
print("✓ LLM configured successfully")
print(f"  - Model: gpt-4-o")
print(f"  - Temperature: 0 (deterministic)")
print(f"  - Max tokens: 2000")


✓ LLM configured successfully
  - Model: gpt-4-o
  - Temperature: 0 (deterministic)
  - Max tokens: 2000


In [22]:
system_prompt = (
    "You are a helpful assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer based on the context, say that you don't know. "
    "Keep the answer concise and accurate.\n\n"
    "Context: {context}\n\n"
    "Question: {question}"   
)
prompt = ChatPromptTemplate.from_template(system_prompt)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {
        "context":retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)
print("✓ RAG chain created successfully using LangChain 1.0+ LCEL!")
print("\nRAG Pipeline Flow:")
print("  1. User provides a query")
print("  2. Retriever finds top 4 relevant chunks")
print("  3. Chunks are formatted as context")
print("  4. Context + question are formatted with prompt template")
print("  5. LLM generates answer based on context")
print("  6. Answer is parsed and returned to user")

✓ RAG chain created successfully using LangChain 1.0+ LCEL!

RAG Pipeline Flow:
  1. User provides a query
  2. Retriever finds top 4 relevant chunks
  3. Chunks are formatted as context
  4. Context + question are formatted with prompt template
  5. LLM generates answer based on context
  6. Answer is parsed and returned to user


In [23]:
query1 = "What is the main topic or subject of this documents"
print(f"Query: {query1}\n")
print(f"\nProcessing...\n")

answer = rag_chain.invoke(query1)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)
retrieved_docs = retriever.invoke(query1)
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument: {i+1}")
    print(f"   Source: {doc.metadata}")
    print(f"   Content: {doc.page_content[:200]}")
    print("-"*80)

Query: What is the main topic or subject of this documents


Processing...

ANSWER:
The main topic of the document is optimization techniques for indexing and retrieval in information systems, focusing on strategies like chunking, recursive splits, sliding windows, and the use of Knowledge Graphs (KG) to enhance semantic completeness, context length, and retrieval accuracy. It also discusses methods for improving the efficiency of retrieval-augmented generation (RAG) systems.


Document: 1
   Source: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-03-28T00:54:45+00:00', 'author': '', 'keywords': '', 'moddate': '2024-03-28T00:54:45+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data\\ragsurvey.pdf', 'total_pages': 21, 'page': 7, 'page_label': '8'}
   Content: Table I.
B. Indexing Optimization
In the Indexing phase, docum

In [24]:
query2 = "Can you summarize the key points from this document?"

print(f"Query: {query2}")
print("\nProcessing...\n")

answer = rag_chain.invoke(query2)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

Query: Can you summarize the key points from this document?

Processing...

ANSWER:
The document discusses the current state and future directions of the Retrieval-Augmented Generation (RAG) framework. Key points include:

1. **Assessment and Evaluation**: The paper summarizes assessment methods for RAG across 26 tasks and nearly 50 datasets, detailing evaluation objectives, metrics, benchmarks, and tools.

2. **Core Components**: It explores the three main components of RAG:
   - **Retrieval**: Optimization methods like indexing, query, and embedding improvements.
   - **Generation**: Post-retrieval processes and fine-tuning of large language models (LLMs).
   - **Augmentation**: Analysis of augmentation processes, including metadata enrichment and content generation.

3. **Enhancements**: 
   - Use of semi-structured (e.g., PDFs) and structured data (e.g., Knowledge Graphs) for improvement.
   - Trends in using LLM-generated content for retrieval and enhancement.

4. **Challenges and