In [1]:
pip install -U langchain langgraph langchain-chroma langchain-ollama langchain-community pypdf

Collecting langchain
  Downloading langchain-1.0.8-py3-none-any.whl.metadata (4.9 kB)
Collecting langgraph
  Downloading langgraph-1.0.3-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-ollama
  Using cached langchain_ollama-1.0.0-py3-none-any.whl.metadata (2.1 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<2.0.0,>=1.0.6 (from langchain)
  Downloading langchain_core-1.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Downloading pydantic-2.12.4-py3-none-any.whl.metadata (89 kB)
Collecting langgraph-checkpoint<4.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-3.0.1-py3-none-any.whl.metadata (4.7 kB)
Collecting langgraph-prebuilt<1.1.0,>=1.0.2 (from langgraph)
 

In [2]:


import os
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [4]:
# ============================================================================
# STEP 1: DOCUMENTS AND DOCUMENT LOADERS
# ============================================================================
# Load PDF - works with both online URLs and local file paths.
# The location can be overridden without editing the notebook by setting the
# PDF_PATH environment variable; when it is unset, the original default path
# is used so existing runs behave exactly as before.
pdf_url = os.environ.get(
    "PDF_PATH",
    "/Users/matthewbeattie/Desktop/Books/Docker up and running/4. Working with Docker Images _ Docker_ Up & Running, 3rd Edition.pdf",
)
loader = PyPDFLoader(pdf_url)
# `documents` holds the loader's result; later cells index it and report its
# length as the number of pages.
documents = loader.load()

In [5]:
# Report how many page-level documents were loaded.
# (A bare `documents[0]` used to precede this line; because it was not the
# last expression in the cell it displayed nothing, so it has been removed.)
print(f"\n✓ Loaded {len(documents)} pages from PDF")


✓ Loaded 59 pages from PDF


In [6]:
# Inspect the first loaded document: its text length, its metadata dict and
# the first 200 characters of its content.
sample_doc = documents[0]
sample_text = sample_doc.page_content
print("\nSample Document Structure:")
print(
    f"- Content length: {len(sample_text)} characters\n"
    f"- Metadata: {sample_doc.metadata}\n"
    f"- Content preview: {sample_text[:200]}..."
)


Sample Document Structure:
- Content length: 2028 characters
- Metadata: {'producer': 'Skia/PDF m114', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', 'creationdate': '2023-06-22T20:52:42+00:00', 'moddate': '2023-06-22T20:52:42+00:00', 'source': '/Users/matthewbeattie/Desktop/Books/Docker up and running/4. Working with Docker Images _ Docker_ Up & Running, 3rd Edition.pdf', 'total_pages': 59, 'page': 0, 'page_label': '1'}
- Content preview: Chapter 4. Working with Docker Images
Every Linux container is based on an image. Images are the underlying definition of what
gets reconstituted into a running container, much like a virtual disk bec...


In [7]:
# ============================================================================
# STEP 2: TEXT SPLITTING
# ============================================================================
# Single source of truth for the splitter settings: the values printed below
# and the values passed to the splitter can no longer drift apart.
CHUNK_SIZE = 1024     # maximum chunk length, measured with len() (characters)
CHUNK_OVERLAP = 100   # characters shared between consecutive chunks

print("\n2.1 Configuring Text Splitter...")
print(f"- Chunk size: {CHUNK_SIZE} characters (as specified)")
print(f"- Overlap: {CHUNK_OVERLAP} characters ({CHUNK_OVERLAP / CHUNK_SIZE:.0%} overlap)")
print("- Method: Recursive character splitting")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    add_start_index=True,  # Preserves character index as metadata
)

print("\n2.2 Splitting documents into chunks...")
chunks = text_splitter.split_documents(documents)

print(f"\n✓ Split {len(documents)} pages into {len(chunks)} chunks")

# Size statistics. Guarded because a PDF without extractable text yields no
# chunks, which would otherwise raise ZeroDivisionError / ValueError here.
chunk_sizes = [len(chunk.page_content) for chunk in chunks]
print("\nChunk Analysis:")
if chunk_sizes:
    print(f"- Average chunk size: {sum(chunk_sizes) / len(chunk_sizes):.0f} characters")
    print(f"- Largest chunk: {max(chunk_sizes)} characters")
    print(f"- Smallest chunk: {min(chunk_sizes)} characters")
else:
    print("- No chunks were produced (the documents contain no text to split)")


2.1 Configuring Text Splitter...
- Chunk size: 1024 characters (as specified)
- Overlap: 100 characters (10% overlap)
- Method: Recursive character splitting

2.2 Splitting documents into chunks...

✓ Split 59 pages into 135 chunks

Chunk Analysis:
- Average chunk size: 802 characters
- Largest chunk: 1024 characters
- Smallest chunk: 102 characters


In [11]:
# ============================================================================
# STEP 3: EMBEDDINGS
# ============================================================================
# Embedding client that talks to the Ollama server at the given base URL and
# uses the "nomic-embed-text" model.
ollama_embedding_config = {
    "model": "nomic-embed-text",
    "base_url": "http://localhost:11434",
}
embeddings = OllamaEmbeddings(**ollama_embedding_config)




In [12]:
# Sanity check: embed a short string and display the length of the vector.
probe_vector = embeddings.embed_query("Hello world")
len(probe_vector)

768

In [13]:
# ============================================================================
# STEP 4: VECTOR STORES
# ============================================================================
print("\n4.1 Creating Chroma Vector Store...")
print("- Collection name: pdf_collection")
print("- Storage: Local persistent directory")
print("- Embedding function: nomic-embed-text via Ollama")

vector_store = Chroma(
    collection_name="pdf_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

# The collection is persisted in ./chroma_db, so it survives kernel restarts.
# Without explicit ids every run of this cell would append another copy of all
# chunks, and searches would start returning duplicates. Deterministic ids
# (source / page / start index / position) make a re-run overwrite the same
# entries instead; the position suffix keeps ids unique within one batch even
# if a metadata field is missing.
chunk_ids = [
    f"{chunk.metadata.get('source', '')}:{chunk.metadata.get('page', '')}:"
    f"{chunk.metadata.get('start_index', '')}:{position}"
    for position, chunk in enumerate(chunks)
]

vector_store.add_documents(documents=chunks, ids=chunk_ids)

print(f"✓ Added {len(chunks)} document chunks to vector store")


4.1 Creating Chroma Vector Store...
- Collection name: pdf_collection
- Storage: Local persistent directory
- Embedding function: nomic-embed-text via Ollama
✓ Added 135 document chunks to vector store


In [14]:
# ============================================================================
# STEP 5: QUERYING THE VECTOR STORE
# ============================================================================

print("\n5.1 Basic Similarity Search")
print("Finding information about docker")

# `query` is reused by the scored search in a later cell.
query = "What is the Anatomy of a Dockerfile"
results = vector_store.similarity_search(query, k=5)

print(f"\nQuery: '{query}'")
print(f"Retrieved {len(results)} most similar chunks:")

# One entry per hit: a 300-character preview plus the page it came from.
for rank, hit in enumerate(results, start=1):
    preview = hit.page_content[:300]
    page = hit.metadata.get('page', 'unknown')
    print(f"\n--- Result {rank} ---\nContent: {preview}...\nSource: Page {page}")


5.1 Basic Similarity Search
Finding information about docker

Query: 'What is the Anatomy of a Dockerfile'
Retrieved 5 most similar chunks:

--- Result 1 ---
Content: with the Dockerfile. This file describes all the steps that are required to create an image
and is usually contained within the root directory of the source code repository for your
application.
1...
Source: Page 0

--- Result 2 ---
Content: image management, Docker relies heavily on this storage backend, which communicates
with the underlying Linux filesystem to build and manage the multiple layers that com-
bine into a single usable image. The primary storage backends that are supported include
the following:
Overlay2
B-Tree File Syst...
Source: Page 0

--- Result 3 ---
Content: A typical Dockerfile might look something like the one shown here, which creates a con-
tainer for a Node.js-based application:
FROM node:18.13.0
ARG email="anna@example.com"
LABEL "maintainer"=$email
LABEL "rating"="Five Stars" "class"="First 

In [15]:
print("\n5.2 Similarity Search with Scores")
print("Same search but with distance scores (lower = more similar)...")

# The score returned alongside each document is a distance, not a similarity
# or a confidence: results come back best-first with the score increasing,
# so a smaller number means a closer match. The labels below say so
# explicitly to avoid reading a larger value as "more confident".
results_with_scores = vector_store.similarity_search_with_score(query, k=5)

for i, (doc, score) in enumerate(results_with_scores, 1):
    print(f"\n--- Result {i} (Distance: {score:.4f}, lower is closer) ---")
    print(f"Content: {doc.page_content[:200]}...")
    print(f"Source: Page {doc.metadata.get('page', 'unknown')}")


5.2 Similarity Search with Scores
Same search but with similarity scores to see confidence levels...

--- Result 1 (Similarity Score: 0.4560) ---
Content: with the Dockerfile. This file describes all the steps that are required to create an image
and is usually contained within the root directory of the source code repository for your
application.
1...
Source: Page 0

--- Result 2 (Similarity Score: 0.5663) ---
Content: image management, Docker relies heavily on this storage backend, which communicates
with the underlying Linux filesystem to build and manage the multiple layers that com-
bine into a single usable ima...
Source: Page 0

--- Result 3 (Similarity Score: 0.5907) ---
Content: A typical Dockerfile might look something like the one shown here, which creates a con-
tainer for a Node.js-based application:
FROM node:18.13.0
ARG email="anna@example.com"
LABEL "maintainer"=$email...
Source: Page 1

--- Result 4 (Similarity Score: 0.6160) ---
Content: Chapter 4. Working with Docke

In [16]:
print("\n5.3 Metadata Filtering")
print("Using metadata filters to search specific parts of the document...")

# Bound unconditionally: the page-filter cell that follows tests
# `if page_numbers:` and would raise NameError if this name only existed when
# `chunks` is non-empty.
page_numbers = set()

# First, let's see what metadata is available
print("\nAvailable metadata in our chunks:")
if chunks:
    sample_metadata = chunks[0].metadata
    print(f"Sample metadata: {sample_metadata}")

    # Collect the distinct page numbers found in the first 10 chunks only;
    # this is a sample for the filtering examples, not the full page list.
    page_numbers = {
        chunk.metadata['page']
        for chunk in chunks[:10]
        if 'page' in chunk.metadata
    }
    print(f"Available page numbers (sample): {sorted(page_numbers)[:5]}...")


5.3 Metadata Filtering
Using metadata filters to search specific parts of the document...

Available metadata in our chunks:
Sample metadata: {'producer': 'Skia/PDF m114', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', 'creationdate': '2023-06-22T20:52:42+00:00', 'moddate': '2023-06-22T20:52:42+00:00', 'source': '/Users/matthewbeattie/Desktop/Books/Docker up and running/4. Working with Docker Images _ Docker_ Up & Running, 3rd Edition.pdf', 'total_pages': 59, 'page': 0, 'page_label': '1', 'start_index': 0}
Available page numbers (sample): [0, 1, 2, 3, 4]...


In [17]:
print("\n5.3.1 Filter by Specific Page")
if page_numbers:
    # Restrict the search to the lowest page number collected earlier.
    target_page = min(page_numbers)
    page_results = vector_store.similarity_search(
        "methodology approach", k=10, filter={"page": target_page}
    )
    print(f"Searching only in Page {target_page}:")
    for rank, hit in enumerate(page_results, start=1):
        snippet = hit.page_content[:150]
        print(f"  Result {rank}: Page {hit.metadata.get('page')} - {snippet}...")


5.3.1 Filter by Specific Page
Searching only in Page 0:
  Result 1: Page 0 - Chapter 4. Working with Docker Images
Every Linux container is based on an image. Images are the underlying definition of what
gets reconstituted into...
  Result 2: Page 0 - with the Dockerfile. This file describes all the steps that are required to create an image
and is usually contained within the root directory of the ...
  Result 3: Page 0 - image management, Docker relies heavily on this storage backend, which communicates
with the underlying Linux filesystem to build and manage the multi...


In [18]:
print("\n5.3.3 Multiple Metadata Filters")
# Complex filtering with multiple conditions combined with $and.
# The comment and the printed description previously said "page >= 0", which
# did not match the `$lte: 10` condition actually applied; the text now
# describes the filter that is really sent to the vector store.
complex_results = vector_store.similarity_search(
    "research findings",
    k=2,
    filter={
        "$and": [
            {"page": {"$lte": 10}},  # Page number is 10 or lower
            {"source": {"$ne": ""}}  # Source is not the empty string
        ]
    } # type: ignore
)

print("Using complex filter (page <= 10 AND has source):")
for i, doc in enumerate(complex_results, 1):
    print(f"  Result {i}: Page {doc.metadata.get('page')} - {doc.page_content[:150]}...")


5.3.3 Multiple Metadata Filters
Using complex filter (page >= 0 AND has source):
  Result 1: Page 6 - age. In many cases, you will simply see a . at the end of a build command, since a single
period represents the current directory. This build context ...
  Result 2: Page 0 - Chapter 4. Working with Docker Images
Every Linux container is based on an image. Images are the underlying definition of what
gets reconstituted into...


In [19]:
# ============================================================================
# STEP 6: RETRIEVERS
# ============================================================================
print("\n6. Creating Retriever...")

# Wrap the vector store as a retriever that performs a plain similarity
# search and returns 4 results per query.
retriever_options = {
    "search_type": "similarity",
    "search_kwargs": {"k": 4},
}
similarity_retriever = vector_store.as_retriever(**retriever_options)


6. Creating Retriever...


In [20]:
# ============================================================================
# STEP 7: RAG FOUNDATION
# ============================================================================
# Run the question through the retriever; the returned chunks are the context
# that the following cell previews.
final_query = "What are the main parts of building a docker image?"
context_docs = similarity_retriever.invoke(final_query)

print(
    f"\nQuery: '{final_query}'\n"
    f"✓ Retrieved {len(context_docs)} relevant document chunks"
)


Query: 'What are the main parts of building a docker image?'
✓ Retrieved 4 relevant document chunks


In [21]:
# Preview the retrieved context: only the first two chunks, 250 characters
# each, to keep the output short.
print("\nContext that would be sent to LLM:")
preview_docs = context_docs[:2]
for position, context_doc in enumerate(preview_docs, start=1):
    excerpt = context_doc.page_content[:250]
    print(f"\nChunk {position}: {excerpt}...")


Context that would be sent to LLM:

Chunk 1: with the Dockerfile. This file describes all the steps that are required to create an image
and is usually contained within the root directory of the source code repository for your
application.
1...

Chunk 2: Chapter 4. Working with Docker Images
Every Linux container is based on an image. Images are the underlying definition of what
gets reconstituted into a running container, much like a virtual disk becomes a VM when
you start it up. Docker or Open Con...
