In [2]:
! pip install -U -r requirements.txt

Collecting langchain-nomic (from -r requirements.txt (line 1))
  Downloading langchain_nomic-0.1.4-py3-none-any.whl.metadata (1.6 kB)
Collecting langchain_community (from -r requirements.txt (line 2))
  Using cached langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting tiktoken (from -r requirements.txt (line 3))
  Downloading tiktoken-0.9.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting langchainhub (from -r requirements.txt (line 4))
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting chromadb (from -r requirements.txt (line 5))
  Downloading chromadb-1.0.12-cp39-abi3-win_amd64.whl.metadata (7.0 kB)
Collecting langchain (from -r requirements.txt (line 6))
  Downloading langchain-0.3.25-py3-none-any.whl.metadata (7.8 kB)
Collecting langgraph (from -r requirements.txt (line 7))
  Downloading langgraph-0.4.7-py3-none-any.whl.metadata (6.8 kB)
Collecting tavily-python (from -r requirements.txt (line 8))
  Downloading tavily_python

In [2]:
import os
from dotenv import load_dotenv

# Read .env.local into the process environment before looking anything up.
load_dotenv(dotenv_path=".env.local")

# LANGCHAIN_* settings; each name is None when the variable is not set.
LANGCHAIN_TRACING_V2 = os.environ.get("LANGCHAIN_TRACING_V2")
LANGCHAIN_ENDPOINT = os.environ.get("LANGCHAIN_ENDPOINT")
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY")
LANGCHAIN_PROJECT = os.environ.get("LANGCHAIN_PROJECT")

# API keys for the external services; also None when unset.
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY")
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")

In [3]:
from utils.config import CONFIG

# Pull the settings this notebook uses out of the shared CONFIG object.
CHUNK_SIZE, CHUNK_OVERLAP, LOCAL_LLM = (
    CONFIG["CHUNK_SIZE"],
    CONFIG["CHUNK_OVERLAP"],
    CONFIG["LOCAL_LLM"],
)

# Retrieve Documents

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.docstore.document import Document
import requests

# Pages to scrape; each entry is passed to scrape_with_firecrawl in the loop below.
urls = [
    "https://python.langchain.com/docs/introduction/",
    "https://www.ai-jason.com/learning-ai/how-to-reduce-llm-cost",
    "https://www.ai-jason.com/learning-ai/ai-agent-vision-tutorial",
    "https://www.ai-jason.com/learning-ai/ai-research-agent",
]

def scrape_with_firecrawl(url, request_timeout=60):
    """Scrape one page through the Firecrawl v1 scrape endpoint.

    Args:
        url: Address of the page to scrape.
        request_timeout: Seconds the HTTP client waits for a response before
            giving up. The default is deliberately larger than the
            "timeout": 30000 value sent in the request body (presumably
            milliseconds - confirm against the Firecrawl API docs).

    Returns:
        A Document whose page_content is the stripped markdown and whose
        metadata is {"url": url}, or None when the response status is not 200
        or no markdown content comes back.

    Raises:
        Any exception raised by requests.post (including a timeout) or by
        response.json() propagates to the caller.
    """
    endpoint = "https://api.firecrawl.dev/v1/scrape"
    headers = {
        "Authorization": f"Bearer {FIRECRAWL_API_KEY}",
        "Content-Type": "application/json"
    }
    body = {
        "url": url,
        # "mode": "scrape", # deprecated
        "formats": ["markdown"], # Request markdown directly
        "onlyMainContent": True,
        "removeBase64Images": True,
        "blockAds": True,
        "proxy": "basic",
        "timeout": 30000
    }
    # Without a client-side timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(endpoint, headers=headers, json=body, timeout=request_timeout)

    if response.status_code == 403:
        print(f"[❌ BLOCKED] {url} - This site is not supported by Firecrawl.")
        return None
    if response.status_code != 200:
        print(f"[❌ FAIL] {url} - Status: {response.status_code} - {response.text}")
        return None

    result = response.json()
    # `or` covers both a missing key and a key that is present but null;
    # a plain .get(key, default) would hand back None in the second case.
    data = result.get("data") or {}
    content = (data.get("markdown") or "").strip()

    if not content:
        print(f"[⚠️] No content extracted from {url}")
        return None

    return Document(page_content=content, metadata={"url": url})

# Scrape every URL, keeping only results that actually carry page content.
docs = []
for url in urls:
    try:
        doc = scrape_with_firecrawl(url)
        has_content = bool(doc and doc.page_content)
        if has_content:
            docs.append(doc)
            print(f"[✅] Scraped: {url}")
        content_length = len(doc.page_content) if doc else 0
        print(f"[ℹ️] Content length: {content_length}")
    except Exception as exc:
        # One failing URL is reported and skipped so the rest are still scraped.
        print(f"[❌] Failed: {url} - {exc}")

# flatten docs (only needed when the first entry is itself a list of documents)
try:
    docs_are_nested = bool(docs) and isinstance(docs[0], list)
    if not docs_are_nested:
        docs_list = docs
        print(f"[ℹ️] Docs already flat, count: {len(docs_list)}")
    else:
        docs_list = [entry for group in docs for entry in group]
        print(f"[ℹ️] Flattened docs count: {len(docs_list)}")
except Exception as exc:
    print(f"[❌] Error flattening docs list: {exc}")
    docs_list = docs  # fallback

# Start from an empty list so the metadata-filtering loop that follows always
# sees a defined docs_split, and never a stale one left over from a previous
# run of this cell, when splitting fails.
docs_split = []
try:
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    docs_split = text_splitter.split_documents(docs_list)
    print(f"[ℹ️] Split into {len(docs_split)} document chunks")
except Exception as e:
    print(f"[❌] Error during document splitting: {e}")
# filter out complex metadata, ensure proper doc format
# Only values of these simple types are kept; everything else (lists, dicts,
# None, ...) is dropped from the metadata.
_SIMPLE_METADATA_TYPES = (str, int, float, bool)

filtered_docs = []
for i, doc in enumerate(docs_split):
    try:
        # ensure doc is an instance of Document & has a 'metadata' attribute
        if isinstance(doc, Document) and hasattr(doc, 'metadata'):
            # Keep the simple values. A negated check here would do the
            # opposite: discard the "url" string and keep only complex values.
            clean_metadata = {
                k: v
                for k, v in doc.metadata.items()
                if isinstance(v, _SIMPLE_METADATA_TYPES)
            }
            filtered_doc = Document(
                page_content=doc.page_content,
                metadata=clean_metadata
            )
            filtered_docs.append(filtered_doc)
        else:
            print(f"[⚠️] Skipping doc at index {i}: Invalid type or missing metadata")
    except Exception as e:
        print(f"[❌] Error processing doc at index {i}: {e}")

print(f"[ℹ️] Filtered docs count: {len(filtered_docs)}")

# Index the filtered chunks in Chroma and expose the store as a retriever.
try:
    embedding_model = GPT4AllEmbeddings()
    vectorstore = Chroma.from_documents(
        documents=filtered_docs,
        collection_name="rag-chroma",
        embedding=embedding_model,
    )
    retriever = vectorstore.as_retriever()
    print("[✅] Vectorstore created and retriever initialized")
except Exception as exc:
    print(f"[❌] Error creating vectorstore or retriever: {exc}")

[✅] Scraped: https://python.langchain.com/docs/introduction/
[ℹ️] Content length: 10885
[✅] Scraped: https://www.ai-jason.com/learning-ai/how-to-reduce-llm-cost
[ℹ️] Content length: 7064
[✅] Scraped: https://www.ai-jason.com/learning-ai/ai-agent-vision-tutorial
[ℹ️] Content length: 11674
[✅] Scraped: https://www.ai-jason.com/learning-ai/ai-research-agent
[ℹ️] Content length: 5937
[ℹ️] Docs already flat, count: 4
[ℹ️] Split into 48 document chunks
[ℹ️] Filtered docs count: 48
[✅] Vectorstore created and retriever initialized


# Grade Documents -> Retrieval Grader

In [7]:
from langchain.prompts import PromptTemplate
# from langchain_community.chat_models import ChatOllama
from langchain_ollama import OllamaLLM
from langchain_core.output_parsers import JsonOutputParser

# JSON-mode model used by the retrieval grader.
# OllamaLLM limits generation length through `num_predict`; it has no
# `max_tokens` field, so a `max_tokens=512` argument never capped the output.
llm = OllamaLLM(
    model=LOCAL_LLM,
    format="json",
    temperature=0.1,
    num_predict=512,
    streaming=True,  # NOTE(review): does not appear to be an OllamaLLM field either - confirm it has any effect
    verbose=True,
)

# Grader prompt: a system turn with the grading instructions, then a user turn
# carrying {document} and {question}. The model is asked for a JSON object
# with a single 'score' key whose value is 'YES' or 'NO'.
# NOTE(review): the <|start_header_id|>/<|eot_id|> markers look like the Llama 3
# chat format - confirm they match the model named by LOCAL_LLM.
# NOTE(review): the template text has typos ("dos", "premable", "explaination");
# it is sent to the model verbatim, so it is left unchanged here.
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing relevance of a retrieved document to a user question. If the document contains keywords related to the user question, grade it as relevant. It dos not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'YES' or 'NO' score to indicate whether the document is relevant to the question. \n 
    Provide the binary score as a JSON with a single key 'score' and no premable or explaination.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Here is the retrieved document: \n\n {document}\n\n
    Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question", "document"],
)

# Compose the grader: prompt -> LLM -> JSON parser.
retrieval_grader = prompt | llm | JsonOutputParser()

# Try it on the second document (index 1) retrieved for a sample question.
question = "how to save LLM costs?"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
grader_inputs = {"question": question, "document": doc_txt}
print(retrieval_grader.invoke(grader_inputs))

{'score': 'YES'}


# Generate Answer

In [8]:
from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# Answer-generation prompt: the user turn carries {question} and {context}.
# input_variables must name the placeholders the template actually uses; the
# template has no {document} placeholder, so "context" is declared instead.
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise and to the point <|eot_id|><|start_header_id|>user<|end_header_id|>
    Question: {question} \n
    Context: {context} \n
    Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question", "context"],
)

# Plain-text model used for answer generation (no JSON mode here).
# OllamaLLM limits generation length through `num_predict`; it has no
# `max_tokens` field, so a `max_tokens=512` argument never capped the output.
llm = OllamaLLM(
    model=LOCAL_LLM,
    temperature=0.1,
    num_predict=512,
    streaming=True,  # NOTE(review): does not appear to be an OllamaLLM field either - confirm it has any effect
    verbose=True
)

# post-processing
def format_docs(docs):
    """Join the page_content of every document into one string, separated by blank lines."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# chain
rag_chain = prompt | llm | StrOutputParser()

question = "how to save LLM costs?"
docs = retriever.invoke(question)
# Pass the joined page text rather than the raw Document list: a list placed
# into the template is rendered with its repr (metadata included) instead of
# clean context text.
generation = rag_chain.invoke({
    "context": format_docs(docs),
    "question": question,
})
print(generation)

To save LLM costs, consider changing the model used for specific tasks and reserving more expensive models for complex questions, optimizing agent memory by managing conversation history stored in memory, and using observability platforms like L Smith to monitor and log costs. These strategies can help reduce costs by up to 78% or more while maintaining performance and user experience. Continuously optimizing your LLM usage is also crucial for maximizing efficiency and profitability.


# Web Search via Tavily -> check accuracy