In [1]:
# Standard library imports
import os
import sys
from pathlib import Path

# LangChain Document Loaders
from langchain_community.document_loaders import PyPDFLoader

# LangChain Text Splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Ollama Integration
from langchain_ollama import OllamaEmbeddings, ChatOllama

# ChromaDB Vector Store
from langchain_chroma import Chroma

# LangChain Core Components
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

print("‚úì All imports successful!")
print("‚úì Ready for local offline RAG!")
print(f"\nPython version: {sys.version}")

‚úì All imports successful!
‚úì Ready for local offline RAG!

Python version: 3.13.0 (main, Oct 16 2024, 08:05:40) [Clang 18.1.8 ]


In [2]:
# Check Ollama is running and list available models
!ollama list

NAME                       ID              SIZE      MODIFIED      
nomic-embed-text:latest    0a109f422b47    274 MB    7 months ago     
deepseek-r1:latest         0a8c26691023    4.7 GB    9 months ago     
llama3.2:latest            a80c4f17acd5    2.0 GB    14 months ago    


In [14]:
# Test Ollama connection with a simple query
print("Testing Ollama connection...\n")

try:
    test_llm = ChatOllama(model="deepseek-r1", temperature=0)
    response = test_llm.invoke("Say 'Hello! I am running locally on your machine!'")
    
    print("‚úì Ollama is working!")
    print(f"Response: {response.content}")
    
except Exception as e:
    print(f"‚úó Error connecting to Ollama: {e}")
    print("\nMake sure Ollama is running. Try: ollama serve")

Testing Ollama connection...

‚úì Ollama is working!
Response: <think>
Okay, so the user wants me to say "Hello! I am running locally on your machine!" in response to their greeting. Let me make sure I understand what they're asking for.

First, they said, "Say 'Hello! I am running locally on your machine!'". That seems straightforward‚Äîthey want a specific response. But wait, are they testing my capabilities or trying to see if I can follow instructions accurately?

I should consider the context here. If someone is greeting me and wants me to respond with that exact phrase, maybe they're checking how I handle user interactions. It could also be part of a larger system setup where they need confirmation from an AI.

Now, thinking about my response options. The initial reply was "Hello! How can I assist you today?" which seems appropriate for general interaction. But the user specifically wants to say another phrase. Maybe they want me to repeat that exact message back to them or use i

In [4]:
# ===== CONFIGURATION: Update this path to your PDF file =====
pdf_path = "../resources/pdfs/attention.pdf"  # Change this to your PDF file path
# =============================================================

# Check if file exists
if not os.path.exists(pdf_path):
    print(f"‚ö†Ô∏è  ERROR: File '{pdf_path}' not found!")
    print("Please update the pdf_path variable with your PDF file location.")
else:
    # Load the PDF
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    
    # Display information
    print(f"‚úì Loaded {len(documents)} pages from '{pdf_path}'")
    print(f"\n--- First Page Preview ---")
    print(f"Content (first 300 chars): {documents[0].page_content[:300]}...")
    print(f"\nMetadata: {documents[0].metadata}")
    print(f"\nTotal characters: {sum(len(doc.page_content) for doc in documents):,}")

‚úì Loaded 15 pages from '../resources/pdfs/attention.pdf'

--- First Page Preview ---
Content (first 300 chars): Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani‚àó
Google Brain
avaswani@google.com
Noam Shazeer‚àó
Google Brain
noam@google.com
Niki Par...

Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '../resources/pdfs/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}

Total characters: 39,587


In [5]:
# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,        # Characters per chunk
    chunk_overlap=128,      # Overlap to maintain context
    length_function=len,
    separators=["\n\n", "\n", " ", ""]  # Split on paragraphs, then lines, etc.
)

# Split documents
chunks = text_splitter.split_documents(documents)

# Display results
avg_chunk_size = sum(len(chunk.page_content) for chunk in chunks) / len(chunks)

print(f"‚úì Split {len(documents)} documents into {len(chunks)} chunks")
print(f"\nAverage chunk size: {avg_chunk_size:.0f} characters")

# Preview chunks
print(f"\n--- Chunk Examples ---")
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1} (length: {len(chunk.page_content)} chars):")
    print(f"{chunk.page_content[:200]}...")

‚úì Split 15 documents into 49 chunks

Average chunk size: 873 characters

--- Chunk Examples ---

Chunk 1 (length: 986 chars):
Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
...

Chunk 2 (length: 944 chars):
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more pa...

Chunk 3 (length: 986 chars):
‚àóEqual contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started
the effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Tra...


In [6]:
# Initialize Ollama Embeddings with nomic-embed-text
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    # base_url="http://localhost:11434"  # Default Ollama URL
)

# Test embeddings
print("Testing nomic-embed-text embeddings...\n")
sample_text = "This is a test sentence for embeddings."
sample_embedding = embeddings.embed_query(sample_text)

print(f"‚úì Embeddings model: nomic-embed-text")
print(f"‚úì Embedding dimension: {len(sample_embedding)}")
print(f"‚úì Sample embedding (first 10 values): {sample_embedding[:10]}")
print(f"\n‚ÑπÔ∏è  Each chunk will be converted to a {len(sample_embedding)}-dimensional vector")
print(f"‚ÑπÔ∏è  All processing happens locally on your machine!")

Testing nomic-embed-text embeddings...

‚úì Embeddings model: nomic-embed-text
‚úì Embedding dimension: 768
‚úì Sample embedding (first 10 values): [0.032470614, 0.060767617, -0.1661029, -0.08210531, 0.04330724, -0.02606512, 0.051619876, -0.015222877, -0.0082509415, -0.028311769]

‚ÑπÔ∏è  Each chunk will be converted to a 768-dimensional vector
‚ÑπÔ∏è  All processing happens locally on your machine!


In [9]:
# Create ChromaDB vector store
print(f"Creating ChromaDB vector store from {len(chunks)} chunks...")
print("This may take a minute...\n")

# Set persistent directory
persist_directory = "../vectorstores/chroma_db"

# Create vector store
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=persist_directory,
    collection_name="local_rag_collection"
)

print(f"‚úì ChromaDB vector store created successfully!")
print(f"‚úì Indexed {len(chunks)} document chunks")
print(f"‚úì Stored at: {persist_directory}")
print(f"\n‚ÑπÔ∏è  Vector store persisted to disk - you can reload it later!")

Creating ChromaDB vector store from 49 chunks...
This may take a minute...

‚úì ChromaDB vector store created successfully!
‚úì Indexed 49 document chunks
‚úì Stored at: ../vectorstores/chroma_db

‚ÑπÔ∏è  Vector store persisted to disk - you can reload it later!


In [10]:
# Create retriever
retriever = vectorstore.as_retriever(
    search_type="similarity",    # Use cosine similarity
    search_kwargs={"k": 4}        # Retrieve top 4 most relevant chunks
)

print("‚úì Retriever configured successfully")
print(f"  - Search type: similarity")
print(f"  - Number of documents to retrieve (k): 4")

# Test retrieval
test_query = "What is the main topic of this document?"
print(f"\n--- Retriever Test ---")
print(f"Query: '{test_query}'")

retrieved_docs = retriever.invoke(test_query)

print(f"\nRetrieved {len(retrieved_docs)} documents:")
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument {i+1}:")
    print(f"  Content preview: {doc.page_content[:150]}...")
    print(f"  Source: Page {doc.metadata.get('page', 'N/A')}")

‚úì Retriever configured successfully
  - Search type: similarity
  - Number of documents to retrieve (k): 4

--- Retriever Test ---
Query: 'What is the main topic of this document?'

Retrieved 4 documents:

Document 1:
  Content preview: (section 5.4), learning rates and beam size on the Section 22 development set, all other parameters
remained unchanged from the English-to-German base...
  Source: Page 8

Document 2:
  Content preview: Input-Input Layer5
The
Law
will
never
be
perfect
,
but
its
application
should
be
just
-
this
is
what
we
are
missing
,
in
my
opinion
.
<EOS>
<pad>
The
...
  Source: Page 13

Document 3:
  Content preview: Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations
for different layer types. n is the sequence length, d...
  Source: Page 5

Document 4:
  Content preview: Attention Visualizations
Input-Input Layer5
It
is
in
this
spirit
that
a
majority
of
American
governments
have
passed
new
laws
since
2009
making
the
re.

In [13]:
# Initialize Ollama LLM
llm = ChatOllama(
    model="deepseek-r1",
    temperature=0,          # Deterministic responses (0 = focused, 1 = creative)
    # num_predict=2000,     # Max tokens to generate
    # top_k=40,             # Top-k sampling
    # top_p=0.9,            # Top-p (nucleus) sampling
)

print("‚úì LLM configured successfully")
print(f"  - Model: deepseek-r1:latest  (local)")
print(f"  - Temperature: 0 (deterministic)")

# Test LLM
test_response = llm.invoke("Say 'Hello! I am DeepSeek running locally!'")
print(f"\nLLM Test Response: {test_response.content}")

‚úì LLM configured successfully
  - Model: deepseek-r1:latest  (local)
  - Temperature: 0 (deterministic)

LLM Test Response: <think>

</think>

Hello! I am DeepSeek running locally. How can I assist you today?


In [15]:
# Define prompt template
system_prompt = (
    "You are a helpful assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer based on the context, say that you don't know. "
    "Keep the answer concise and accurate.\n\n"
    "Context: {context}\n\n"
    "Question: {question}"
)

prompt = ChatPromptTemplate.from_template(system_prompt)

# Helper function to format documents
def format_docs(docs):
    """Format retrieved documents into a single string."""
    return "\n\n".join(doc.page_content for doc in docs)

# Build RAG chain using LCEL
rag_chain = (
    {
        "context": retriever | format_docs,  # Retrieve and format docs
        "question": RunnablePassthrough()      # Pass through the question
    }
    | prompt           # Format with prompt template
    | llm              # Generate answer with local LLM
    | StrOutputParser() # Parse output to string
)

print("‚úì RAG chain created successfully using LCEL!")
print("\nRAG Pipeline Flow:")
print("  1. User provides a query")
print("  2. Retriever finds top 4 relevant chunks (local ChromaDB)")
print("  3. Chunks are formatted as context")
print("  4. Context + question formatted with prompt template")
print("  5. Local LLM (gemma3:1b) generates answer")
print("  6. Answer parsed and returned")
print("\nüîí Everything runs locally on your machine!")

‚úì RAG chain created successfully using LCEL!

RAG Pipeline Flow:
  1. User provides a query
  2. Retriever finds top 4 relevant chunks (local ChromaDB)
  3. Chunks are formatted as context
  4. Context + question formatted with prompt template
  5. Local LLM (gemma3:1b) generates answer
  6. Answer parsed and returned

üîí Everything runs locally on your machine!


In [16]:
# Example Query 1: General question
query1 = "What is the main topic or contribution of this document?"

print(f"Query: {query1}")
print("\nProcessing locally...\n")

answer = rag_chain.invoke(query1)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

# Show source documents
print("\nSOURCE DOCUMENTS USED:")
print("=" * 80)
retrieved_docs = retriever.invoke(query1)
for i, doc in enumerate(retrieved_docs):
    print(f"\nDocument {i+1}:")
    print(f"  Page: {doc.metadata.get('page', 'N/A')}")
    print(f"  Content: {doc.page_content[:200]}...")
    print("-" * 80)

Query: What is the main topic or contribution of this document?

Processing locally...

ANSWER:
<think>
Okay, so I need to figure out the main topic or contribution of this document based on the context provided. Let me start by reading through the context carefully.

The context seems to be discussing some technical aspects related to machine translation models, specifically focusing on attention mechanisms and learning rates. There are mentions of layers in a neural network model, like Layer 5 and Layer 6, which suggests that this is part of a larger architecture, possibly an encoder-decoder model used for translating English to German.

The sections mentioned include section 5.4 and section 3.2, which probably detail different parts of the model's architecture. The context talks about attention layers in the encoder and decoder, mentions learning rates and beam size as parameters that were adjusted during training but remained unchanged otherwise. There are also figures showing atte

In [17]:
# Example Query 2: Specific information extraction
query2 = "Can you summarize the key technical contributions or innovations mentioned?"

print(f"Query: {query2}")
print("\nProcessing locally...\n")

answer = rag_chain.invoke(query2)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

Query: Can you summarize the key technical contributions or innovations mentioned?

Processing locally...

ANSWER:
<think>
Okay, so I need to summarize the key technical contributions or innovations from the given context about the Transformer architecture. Let me read through the context carefully and pick out the important points.

First, the context mentions that Jakob proposed replacing RNNs with self-attention and started evaluating this idea. That's a big contribution because it introduced the Transformer model, which uses attention mechanisms instead of recurrence. Then, Ashish, along with Illia, designed and implemented the first Transformer models. They were crucial in every aspect, so their work was foundational.

Noam proposed several key components: scaled dot-product attention, multi-head attention, and a parameter-free position representation. These are core innovations because they improved how the model handles attention and positional information without adding paramet

In [18]:
# Example Query 3: Your custom question
custom_query = "What specific details are mentioned about the methodology or approach?"

print(f"Query: {custom_query}")
print("\nProcessing locally...\n")

answer = rag_chain.invoke(custom_query)

print("=" * 80)
print("ANSWER:")
print("=" * 80)
print(answer)
print("\n" + "=" * 80)

Query: What specific details are mentioned about the methodology or approach?

Processing locally...

ANSWER:
<think>
Okay, so I need to figure out what specific details are mentioned about the methodology or approach in the given context. Let me read through the context carefully.

The first part talks about a Law not being perfect but its application should be just‚Äîthis is an opinionated statement from someone. Then there's some mention of Figure 4 with attention heads in layer 5, which seems to relate to how the model processes information. It mentions two attention heads involved in anaphora resolution and shows full attentions for head 5, as well as isolated attentions from the word 'its' for heads 5 and 6.

Further down, there are references to sections like section 3.2 about Multi-Head Attention, which is a method used in Transformers. The context also discusses self-attention mechanisms, their applications in tasks like reading comprehension, summarization, entailment, and se

In [19]:
# Interactive Q&A
def ask_question(question):
    """Ask a question to the RAG system."""
    print(f"\n{'='*80}")
    print(f"Question: {question}")
    print(f"{'='*80}")
    
    answer = rag_chain.invoke(question)
    
    print(f"\nAnswer: {answer}")
    print(f"{'='*80}\n")
    
    return answer

# Try it out!
# Change the question below to ask anything about your document
my_question = "What are the main findings or results?"
ask_question(my_question)


Question: What are the main findings or results?

Answer: <think>
Okay, so I need to figure out what the main findings or results are from the given context about their research on the Transformer architecture. Let me start by reading through the context carefully.

First, there's mention of experiments conducted at Google Brain and Google Research during the NIPS 2017 conference. They looked into variations of the Transformer model, specifically focusing on different configurations like varying the number of layers (N), embedding dimensions (d), hidden dimensions (dff), attention heads (h), key dimension (dk), value dimension (dv), dropout rate (Pdrop), label smoothing (œµls), training perplexity (train PPL), and BLEU scores during inference.

Looking at Table 3, there are several configurations labeled from A to E. Each configuration has different values for d, dff, h, dk, dv, Pdrop, œµls, and the resulting train PPL and BLEU scores. For example, Configuration A uses a smaller model

"<think>\nOkay, so I need to figure out what the main findings or results are from the given context about their research on the Transformer architecture. Let me start by reading through the context carefully.\n\nFirst, there's mention of experiments conducted at Google Brain and Google Research during the NIPS 2017 conference. They looked into variations of the Transformer model, specifically focusing on different configurations like varying the number of layers (N), embedding dimensions (d), hidden dimensions (dff), attention heads (h), key dimension (dk), value dimension (dv), dropout rate (Pdrop), label smoothing (œµls), training perplexity (train PPL), and BLEU scores during inference.\n\nLooking at Table 3, there are several configurations labeled from A to E. Each configuration has different values for d, dff, h, dk, dv, Pdrop, œµls, and the resulting train PPL and BLEU scores. For example, Configuration A uses a smaller model with fewer layers (N=6) but lower embedding dimensio