In [1]:
import os

# For macOS users to avoid the OpenMP error, specifically for FAISS.
# NOTE(review): presumably this has to be set before FAISS is imported below — keep it above those imports.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
from pathlib import Path
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably the API key for the Gemini clients created in later
# cells is expected to come from here — confirm against your .env file.
load_dotenv()

True

In [3]:
pdf_path = "../resources/pdfs/attention.pdf"  # Change this to your PDF file path

# Fail fast when the PDF is missing. Later cells need `documents`, so merely
# printing a warning here would only postpone the failure to a confusing
# NameError in the next cell (and break Restart & Run All silently).
if not os.path.exists(pdf_path):
    raise FileNotFoundError(
        f"File '{pdf_path}' not found! "
        "Please update the pdf_path variable with your PDF file location."
    )

# Initialize the PDF loader and load all pages from the PDF.
# Each page becomes a separate Document object.
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# The preview below indexes documents[0]; stop with a clear message if the
# loader produced nothing instead of raising a bare IndexError.
if not documents:
    raise ValueError(f"No pages could be loaded from '{pdf_path}'.")

# Display information about loaded documents
first_page = documents[0]
total_chars = sum(len(doc.page_content) for doc in documents)

print(f"✓ Loaded {len(documents)} pages from '{pdf_path}'")
print("\n--- First Document Preview ---")
print(f"Content (first 500 chars): {first_page.page_content[:500]}...")
print(f"\nMetadata: {first_page.metadata}")
print(f"\nTotal characters across all pages: {total_chars:,}")

✓ Loaded 15 pages from '../resources/pdfs/attention.pdf'

--- First Document Preview ---
Content (first 500 chars): Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.edu
Łukasz ...

Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../resources/pdfs/at

In [4]:
# Configure the splitter. Separators are listed from coarsest to finest:
# paragraphs first, then lines, then spaces, then individual characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1024,       # maximum characters per chunk
    chunk_overlap=128,     # characters shared by neighbouring chunks (keeps context)
    length_function=len,   # chunk length is measured in characters
)

# Break the page-level documents into smaller chunks.
chunks = text_splitter.split_documents(documents)

# Report how the split went.
print(f"✓ Split {len(documents)} documents into {len(chunks)} chunks")
chunk_lengths = [len(piece.page_content) for piece in chunks]
average_chunk_length = sum(chunk_lengths) / len(chunks)
print(f"\nAverage chunk size: {average_chunk_length:.0f} characters")

# Show the beginning of the first three chunks together with their metadata.
print("\n--- Chunk Examples ---")
for i, chunk in enumerate(chunks[:3]):
    chunk_text = chunk.page_content
    print(f"\nChunk {i+1} (length: {len(chunk_text)} chars):")
    print(chunk_text[:200] + "...")
    print(f"Metadata: {chunk.metadata}")

✓ Split 15 documents into 49 chunks

Average chunk size: 873 characters

--- Chunk Examples ---

Chunk 1 (length: 986 chars):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../resources/pdfs/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}

Chunk 2 (length: 944 chars):
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more 

In [5]:
# Embedding client used to turn text chunks into vectors (Google Gemini).
# NOTE(review): presumably authenticates with an API key taken from the
# environment loaded earlier via load_dotenv() — confirm.
embedding_model_name = "gemini-embedding-001"
gemini_embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)

In [6]:
# Create FAISS vector store from document chunks.
# This step converts each chunk to an embedding and stores it.
print(f"Creating FAISS index from {len(chunks)} chunks...")
print("This may take a minute depending on the number of chunks...")

vectorstore = FAISS.from_documents(
    documents=chunks,             # Our split document chunks
    embedding=gemini_embeddings,  # Gemini embedding model
)

print("✓ FAISS vector store created successfully!")
print(f"✓ Indexed {len(chunks)} document chunks")

# Save the vector store to disk for later use.
# This allows you to reload the index without re-processing documents.
vectorstore_path = "../vectorstores/faiss_index"
vectorstore.save_local(vectorstore_path)
print(f"✓ Vector store saved to '{vectorstore_path}'")

# The reload hint must be copy-paste runnable:
#   * it names `gemini_embeddings`, the embedding object this notebook defines
#     (a bare `embeddings` does not exist here), and
#   * it passes allow_dangerous_deserialization=True, which LangChain's
#     FAISS.load_local requires because the saved index includes a pickle file.
# SECURITY: only reload index folders you created yourself; unpickling an
# untrusted file can execute arbitrary code.
print(
    "\nℹ️  You can reload this index later using: "
    f"FAISS.load_local('{vectorstore_path}', gemini_embeddings, "
    "allow_dangerous_deserialization=True)"
)

Creating FAISS index from 49 chunks...
This may take a minute depending on the number of chunks...
✓ FAISS vector store created successfully!
✓ Indexed 49 document chunks
✓ Vector store saved to '../vectorstores/faiss_index'

ℹ️  You can reload this index later using: FAISS.load_local('../vectorstores/faiss_index', embeddings)


In [7]:
# Wrap the vector store in a retriever so it can be queried with plain text.
# NOTE(review): "similarity" ranks by whatever distance metric the FAISS store
# was built with; LangChain's FAISS wrapper is documented to default to
# Euclidean (L2) distance rather than cosine — confirm if the metric matters.
retriever_search_kwargs = {"k": 4}  # return the top 4 most relevant chunks
retriever = vectorstore.as_retriever(
    search_kwargs=retriever_search_kwargs,
    search_type="similarity",
)

In [None]:
# Initialize the Google Gemini chat model.
# ChatGoogleGenerativeAI is imported in the notebook's top import cell, so all
# dependencies are declared in one place and this cell only configures the model.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.0,          # 0.0 requests the least random sampling
    max_output_tokens=2000,   # upper bound on the length of each response
)