# RAG Application with AWS Bedrock & ChromaDB (Cloud)
## Phase 1: Setup & Configuration
This notebook covers the setup of dependencies, configuration of credentials, and initialization of AWS Bedrock and ChromaDB Cloud clients.

In [1]:
# Step 1: Install Dependencies
# Using %pip ensures packages are installed in the current Jupyter kernel
%pip install boto3 chromadb langchain langchain-community langchain-aws langchain-text-splitters python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Step 2: Configuration & Variables
import os
from dotenv import load_dotenv
import chromadb
from chromadb.config import Settings

# Load environment variables from .env file
load_dotenv()

# --- AWS Configuration ---
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION", "us-west-2")

# Validate required environment variables
if not AWS_ACCESS_KEY_ID or not AWS_SECRET_ACCESS_KEY:
    raise ValueError(
        "Missing required AWS credentials. Please ensure AWS_ACCESS_KEY_ID and "
        "AWS_SECRET_ACCESS_KEY are set in your .env file or environment variables."
    )

# --- Bedrock Model Configuration ---
# Using a stable Claude 3 Sonnet ID which is widely available in us-west-2
BEDROCK_MODEL_ID = "meta.llama3-8b-instruct-v1:0"

# --- ChromaDB Cloud Configuration ---
# Sign up at https://trychroma.com to get your API Token
CHROMA_API_KEY = ""
CHROMA_TENANT = ""  # Usually 'default_tenant' for most users
CHROMA_DATABASE = "dev-demo" # Usually 'default_database'
CHROMA_COLLECTION_NAME = "rag_collection"

# Apply Environment Variables for Boto3 (only if values are not None)
if AWS_ACCESS_KEY_ID:
    os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
if AWS_SECRET_ACCESS_KEY:
    os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY
if AWS_REGION:
    os.environ["AWS_DEFAULT_REGION"] = AWS_REGION

print("Configuration Loaded.")

Configuration Loaded.


In [3]:
# Step 3: Initialize Clients
import boto3
import chromadb

print("1. Initializing Boto3 Session...")
try:
    session = boto3.Session(
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        region_name=AWS_REGION
    )
    bedrock_client = session.client("bedrock-runtime")
    print("   ‚úÖ Bedrock Client Initialized successfully.")
except Exception as e:
    print(f"   ‚ùå Error initializing Bedrock: {e}")

print("\n2. Initializing ChromaDB Cloud Client...")
try:
    # Initialize CloudClient specifically for Chroma Cloud
    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    
    # Get or create the collection
    collection = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)
    print(f"   ‚úÖ Connected to Chroma Cloud. Collection '{CHROMA_COLLECTION_NAME}' ready.")
    print(f"   ‚ÑπÔ∏è Current Collection Count: {collection.count()}")
except Exception as e:
    print(f"   ‚ùå Error initializing ChromaDB Cloud: {e}")

1. Initializing Boto3 Session...
   ‚úÖ Bedrock Client Initialized successfully.

2. Initializing ChromaDB Cloud Client...
   ‚úÖ Connected to Chroma Cloud. Collection 'rag_collection' ready.
   ‚ÑπÔ∏è Current Collection Count: 8228


## Phase 2: Data Ingestion & Chunking
We will read text files from the `files/` directory, chunk them using LangChain's `RecursiveCharacterTextSplitter`, save the chunks to `files/chunked/`, and verify the output.

In [4]:
# Step 4: Setup Directories
import os

SOURCE_DIR = "Richmond_Policies_Cleaned"
CHUNKED_DIR = os.path.join(SOURCE_DIR, "chunked")

# Create chunked directory if it doesn't exist
if not os.path.exists(CHUNKED_DIR):
    os.makedirs(CHUNKED_DIR)
    print(f"‚úÖ Created directory: {CHUNKED_DIR}")
else:
    print(f"‚ÑπÔ∏è Directory exists: {CHUNKED_DIR}")

# List source files (excluding directory or hidden files)
source_files = [f for f in os.listdir(SOURCE_DIR) if os.path.isfile(os.path.join(SOURCE_DIR, f)) and not f.startswith('.')]
print(f"Found {len(source_files)} files in {SOURCE_DIR}: {source_files[:5]} ...")

‚ÑπÔ∏è Directory exists: Richmond_Policies_Cleaned/chunked
Found 95 files in Richmond_Policies_Cleaned: ['jury_duty_and_subpoenas_policy.txt', 'endowment_spending_policy.txt', 'policy_on_pregnancy_childbirth_lactation_and_related_conditions_faculty_and_staff1.txt', 'password_policy.txt', 'policy_on_provision_of_financial_resources_to_students.txt'] ...


In [5]:
# Step 5: Load, Chunk, and Save Files
try:
    # Try modern import first
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    # Fallback to legacy import
    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
    except ImportError:
        # Last resort
        from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize Splitter (prioritize sentence boundaries)
text_splitter = RecursiveCharacterTextSplitter(
    separators=[". ", "? ", "! ", "\n", " ", ""],
    chunk_size=1000,      # Characters (~200 tokens)
    chunk_overlap=100,    # Overlap to maintain context
    length_function=len,
    is_separator_regex=False
)

total_chunks_processed = 0

print("Starting chunking process...\n")

for file_name in source_files:
    file_path = os.path.join(SOURCE_DIR, file_name)
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
            
        # Create Chunks
        chunks = text_splitter.split_text(text)
        
        # Save each chunk with metadata in filename
        # Format: ch{index}-{original_name}-{metadata}.txt
        base_name = os.path.splitext(file_name)[0]
        
        for i, chunk_content in enumerate(chunks):
            # Metadata example: length of chunk
            metadata_str = f"len{len(chunk_content)}"
            chunk_filename = f"ch{i+1}-{base_name}-{metadata_str}.txt"
            chunk_path = os.path.join(CHUNKED_DIR, chunk_filename)
            
            with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
                chunk_file.write(chunk_content)
                
        print(f"‚úÖ {file_name}: Created {len(chunks)} chunks.")
        total_chunks_processed += len(chunks)
        
    except Exception as e:
        print(f"‚ùå Error processing {file_name}: {e}")

print(f"\nüéâ Total Chunks Created: {total_chunks_processed}")

  if not hasattr(np, "object"):


Starting chunking process...

‚úÖ jury_duty_and_subpoenas_policy.txt: Created 3 chunks.
‚úÖ endowment_spending_policy.txt: Created 4 chunks.
‚úÖ policy_on_pregnancy_childbirth_lactation_and_related_conditions_faculty_and_staff1.txt: Created 9 chunks.
‚úÖ password_policy.txt: Created 11 chunks.
‚úÖ policy_on_provision_of_financial_resources_to_students.txt: Created 13 chunks.
‚úÖ course_level_policy.txt: Created 4 chunks.
‚úÖ bereavement_leave_policy.txt: Created 5 chunks.
‚úÖ policy_for_events_with_alcohol_on_campus.txt: Created 22 chunks.
‚úÖ alcohol_and_drug_policy.txt: Created 53 chunks.
‚úÖ policy_on_space_allocation_and_facilities_resources.txt: Created 34 chunks.
‚úÖ multiple_donor_gifts_policy.txt: Created 7 chunks.
‚úÖ general_data_privacy_regulation_notice.txt: Created 34 chunks.
‚úÖ policy_for_employment_of_out_of_state_residents.txt: Created 13 chunks.
‚úÖ non-retaliation_policy.txt: Created 4 chunks.
‚úÖ office_assignment_policy.txt: Created 13 chunks.
‚úÖ official_universi

In [6]:
# Step 6: Verify a Sample Chunk
# Check one of the generated files to ensure content is correct
if os.listdir(CHUNKED_DIR):
    sample_chunk = os.listdir(CHUNKED_DIR)[0]
    sample_path = os.path.join(CHUNKED_DIR, sample_chunk)
    
    print(f"--- Content of {sample_chunk} ---")
    with open(sample_path, 'r', encoding='utf-8') as f:
        print(f.read()[:500]) # Print first 500 chars
    print("\n--- End of Sample ---")
else:
    print("No chunks found to verify.")

--- Content of ch2-lock_and_key_management_policy-len394.txt ---
they are magnetically swiped, which then allows access. one card ‚Äì the official university of richmond identification card. this card permits access to their
housing and many university services and facilities. operator key ‚Äì any key that provides access to a limited number of locks within one building. outside contractors ‚Äì companies hired by the university of richmond to provide a service.

--- End of Sample ---


## Phase 3: Embeddings & Vector Store
We will now read the chunked files we just created, generate embeddings (handled automatically by Chroma's default embedding function), and upsert them into the ChromaDB Cloud collection.

> **Note:** We are using ChromaDB's default embedding model (`all-MiniLM-L6-v2`) which is built into the client. No extra API calls to Bedrock are needed for *embedding* in this setup, saving costs.

In [7]:
# Step 7: Prepare Data for Embedding
import uuid
import re

chunked_files = [f for f in os.listdir(CHUNKED_DIR) if f.endswith('.txt')]

documents = []
metadatas = []
ids = []

print(f"Found {len(chunked_files)} chunk files to process.")

for file_name in chunked_files:
    file_path = os.path.join(CHUNKED_DIR, file_name)
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            
        # Parse Metadata from Filename
        # Format: ch{index}-{original_name}-{len}.txt
        # Example: ch1-academic_policy-len495.txt
        try:
            name_no_ext = os.path.splitext(file_name)[0]
            parts = name_no_ext.split('-')
            
            # 1. Chunk Part (first item, e.g., 'ch1')
            chunk_part = int(parts[0].replace('ch', ''))
            
            # 2. Size (last item, e.g., 'len495')
            size = int(parts[-1].replace('len', ''))
            
            # 3. File Name (everything in between)
            original_filename = "-".join(parts[1:-1])
            
            meta = {
                "source": file_name,
                "file_name": original_filename,
                "chunk_part": chunk_part,
                "size": size
            }
        except Exception as e:
            # Fallback if naming convention doesn't match
            print(f"‚ö†Ô∏è Metadata parse warning for {file_name}: {e}")
            meta = {"source": file_name}

        # Add to lists
        documents.append(content)
        metadatas.append(meta)
        ids.append(str(uuid.uuid4()))
        
    except Exception as e:
        print(f"Warning: Could not read {file_name}: {e}")

print(f"Prepared {len(documents)} documents for embedding.")

Found 2486 chunk files to process.
Prepared 2486 documents for embedding.


In [8]:
# Step 8: Add to ChromaDB (Embed & Upsert)
# Batch size limit for Chroma is usually 1000 (we hit 1914!), so we must batch.
print("Upserting documents to ChromaDB Collection in batches...")

BATCH_SIZE = 900  # Safe batch size
total_docs = len(documents)

try:
    for i in range(0, total_docs, BATCH_SIZE):
        batch_docs = documents[i : i + BATCH_SIZE]
        batch_metas = metadatas[i : i + BATCH_SIZE]
        batch_ids = ids[i : i + BATCH_SIZE]
        
        collection.add(
            documents=batch_docs,
            metadatas=batch_metas,
            ids=batch_ids
        )
        print(f"   ‚úÖ Processed batch {i} to {min(i+BATCH_SIZE, total_docs)}")
        
    print(f"\nüéâ Successfully added all {total_docs} documents to ChromaDB!")
    print(f"Final Collection Count: {collection.count()}")
    
except Exception as e:
    print(f"‚ùå Error adding to ChromaDB: {e}")

Upserting documents to ChromaDB Collection in batches...
   ‚úÖ Processed batch 0 to 900
   ‚úÖ Processed batch 900 to 1800
   ‚úÖ Processed batch 1800 to 2486

üéâ Successfully added all 2486 documents to ChromaDB!
Final Collection Count: 10714


In [9]:
# Step 9: Verify Embedding with a Test Query
# We will perform a simple similarity search (no LLM yet) to see if we get relevant chunks.

query_text = "What is the alcohol policy?"  # Replace with a relevant question for your data

print(f"Querying ChromaDB for: '{query_text}'...\n")

results = collection.query(
    query_texts=[query_text],
    n_results=3 # Get top 3 matches
)

if results['documents']:
    for i, doc in enumerate(results['documents'][0]):
        meta = results['metadatas'][0][i]
        print(f"[Result {i+1}]")
        print(f"   File: {meta.get('file_name', 'Unknown')}")
        print(f"   Part: {meta.get('chunk_part', '?')}")
        print(f"   Size: {meta.get('size', '?')}")
        print(f"   Snippet: {doc[:100]}...\n")
else:
    print("No results found. Check if documents were added correctly.")

Querying ChromaDB for: 'What is the alcohol policy?'...

[Result 1]
   File: alcohol_and_drug_policy
   Part: 7
   Size: 973
   Snippet: . refusal by an employee to comply with the applicable requirements may
be grounds for immediate dis...

[Result 2]
   File: alcohol_and_drug_policy
   Part: 7
   Size: 973
   Snippet: . refusal by an employee to comply with the applicable requirements may
be grounds for immediate dis...

[Result 3]
   File: alcohol_and_drug_policy
   Part: 7
   Size: 973
   Snippet: . refusal by an employee to comply with the applicable requirements may
be grounds for immediate dis...



## Phase 4: Retrieval & Generation
We implement the custom retrieval logical (with distince threshold filtering) and connect it to AWS Bedrock for the final answer generation.

In [10]:
# Step 10: Custom Retrieval Function
def retrieve_documents(query, n_results=5, threshold=1.5, filter_by=None):
    """
    Retrieve relevant documents with distance threshold filtering.
    
    Args:
        query: The search query string
        n_results: Max results to return initially
        threshold: Max distance (lower = more strict match). 
                   For Cosine distance: 0 is identical, 1 is orthogonal, 2 is opposite.
                   Typical good matches are < 1.0 depending on embedding model.
        filter_by: Metadata filter dict (optional)
    
    Returns:
        List of dicts: {text, source, distance}
    """
    # Query Chroma
    results = collection.query(
        query_texts=[query],
        n_results=n_results,
        where=filter_by,
        include=["documents", "metadatas", "distances"]
    )
    
    docs = []
    
    # Check if we got results
    if results['documents'] and results['documents'][0]:
        # Iterate through the first query's results
        for text, meta, dist in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0]
        ):
            # Filter by threshold
            if dist <= threshold:
                docs.append({
                    "text": text,
                    "source": meta.get("source", "unknown"),
                    "distance": dist
                })
                
    print(f"‚úÖ Retrieved {len(docs)} documents (Threshold: {threshold})")
    return docs

In [11]:
# VALID_CATEGORIES removed - using direct query without category routing

In [12]:
# Step 11: RAG Generation Function (Bedrock) - Single Step Query
from langchain_aws import ChatBedrock

# Fix for newer LangChain versions (v0.1+)
try:
    from langchain_core.prompts import PromptTemplate
    from langchain_core.runnables import RunnablePassthrough
    from langchain_core.output_parsers import StrOutputParser
except ImportError:
    # Fallback for older versions
    from langchain.prompts import PromptTemplate
    from langchain.schema.runnable import RunnablePassthrough
    from langchain.schema.output_parser import StrOutputParser

# Initialize LLM
llm = ChatBedrock(
    model_id=BEDROCK_MODEL_ID,
    client=bedrock_client,
    model_kwargs={"max_tokens": 1000, "temperature": 0.1}
)

def generate_answer(query):
    # Direct query without category routing - single step
    print(f"üîç Querying: {query}")
    
    # Retrieve documents directly (no metadata filter)
    relevant_docs = retrieve_documents(query, n_results=5, threshold=1.2, filter_by=None)
    print(f"üîç Retrieved {len(relevant_docs)} documents")
    
    if not relevant_docs:
        return "I could not find any relevant information to answer your question."
    
    # Format Context
    context_text = "\n\n".join([f"[Source: {d['source']}]\n{d['text']}" for d in relevant_docs])
    
    # Construct Prompt
    prompt_template = """
    Human: You are a concise and direct assistant. Use the following pieces of context to answer the question at the end.
    
    Rules for answering:
    1. Be extremely concise.
    2. Do NOT use bullet points or numbered lists. 
    3. Provide a single, direct paragraph.
    4. If you don't know the answer, just say that you don't know.

    Context:
    {context}

    Question: {question}

    Assistant:"""
    
    prompt = PromptTemplate(
        template=prompt_template, 
        input_variables=["context", "question"]
    )
    
    # Invoke LLM
    final_prompt = prompt.format(context=context_text, question=query)
    response = llm.invoke(final_prompt)
    
    return response.content

In [13]:
# Step 12: Final Test
query = "What is the policy regarding drug usage?"

print(f"‚ùì Question: {query}\n")

answer = generate_answer(query)

print("üí° Answer:")
print(answer)

‚ùì Question: What is the policy regarding drug usage?

üîç Querying: What is the policy regarding drug usage?
‚úÖ Retrieved 5 documents (Threshold: 1.2)
üîç Retrieved 5 documents
üí° Answer:
The university policy prohibits the unauthorized manufacture, distribution, and possession of controlled substances, including cocaine, ecstasy, and LSD, which are punishable by severe penalties. Additionally, students and employees who violate state and federal laws may be referred for criminal prosecution and are subject to disciplinary action, with sanctions ranging from substance education to permanent separation.
