# RAG Pipeline (LangChain + LangGraph + Groq + Open-Source Embeddings)

This notebook provides a structured, end-to-end RAG pipeline with clear sections and reusable functions.
Everything is written in English and avoids emojis.

## 1. Environment and Dependencies
Install required packages if you are running this notebook in a fresh environment.

In [None]:
# You may comment this cell if dependencies are already installed
!pip install -q langchain langchain-community langchain-groq sentence-transformers faiss-cpu langgraph ragas pypdf spacy rank-bm25 transformers accelerate pinecone-client pinecone-text python-dotenv

In [None]:
# Download the spaCy English model (run once)
# The chunking cell loads this model by name with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

## 2. Configuration
Load environment variables and validate API keys (Groq and Pinecone).

In [None]:
import os
from dotenv import load_dotenv

def _require_env(var_name: str) -> str:
    """Return the value of environment variable `var_name`.

    Raises:
        ValueError: if the variable is unset or empty.
    """
    value = os.getenv(var_name)
    if value:
        return value
    raise ValueError(f"Please set {var_name} in your environment or .env file")


# Load variables from a .env file (if present) before reading the keys.
load_dotenv()

# Both keys are mandatory: Groq serves the LLM, Pinecone hosts the vector index.
GROQ_API_KEY = _require_env("GROQ_API_KEY")
PINECONE_API_KEY = _require_env("PINECONE_API_KEY")

# Make the Groq key visible through the process environment as well.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
print("Configuration loaded")

## 3. Document Loading and Cleaning
Load a PDF and normalize its text content.

In [None]:
import re
from langchain_community.document_loaders import PyMuPDFLoader

pdf_path = "Deepseek-r1.pdf"  # Place your PDF in the project root or adjust the path
# `loader` and `docs` stay at module level; `docs` is consumed by the cleaning step below.
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()
# The printout treats every loaded document as one page of the PDF.
print(f"Loaded {len(docs)} pages")

def clean_text(text: str) -> str:
    """Flatten raw page text into a single normalized line.

    Steps, in order: join words that were hyphenated across a line break,
    turn the remaining newlines into spaces, collapse every run of
    whitespace into one space, and trim both ends.

    Args:
        text: Raw text of one page.

    Returns:
        The normalized text (possibly an empty string).
    """
    dehyphenated = text.replace("-\n", "")
    single_line = dehyphenated.replace("\n", " ")
    collapsed = re.sub(r"\s+", " ", single_line)
    return collapsed.strip()

# Keep only the normalized text of each loaded page (one string per page).
cleaned_docs = [clean_text(page.page_content) for page in docs]
print("Text cleaning complete")

## 4. Chunking
Use both fixed-size chunking and semantic chunking (with spaCy sentences).

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import spacy

# spaCy pipeline used by semantic_chunk() below to split text into sentences.
nlp = spacy.load("en_core_web_sm")

# Fixed chunking
# In the cells visible here, fixed_chunks is only counted and printed; the rest of
# the pipeline indexes the semantic chunks instead.
fixed_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
fixed_chunks = fixed_splitter.create_documents(cleaned_docs)

# Semantic chunking by sentences with token cap
def semantic_chunk(text: str, max_tokens: int = 120):
    """Group consecutive spaCy sentences of `text` into chunks capped at `max_tokens` words.

    Sentences come from the module-level `nlp` pipeline. "Tokens" here are
    whitespace-separated words, not model tokens. A chunk is closed before
    adding a sentence that would push it past the cap, so a chunk exceeds
    `max_tokens` only when a single sentence is longer than the cap by itself
    (sentences are never split).

    Args:
        text: Text to chunk.
        max_tokens: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of chunk strings; sentences inside a chunk are joined by a space.
    """
    chunks, current, current_len = [], [], 0
    for sent in nlp(text).sents:
        sent_len = len(sent.text.split())
        # Flush before overflowing rather than after, so the cap is honored.
        if current and current_len + sent_len > max_tokens:
            chunks.append(" ".join(current))
            current, current_len = [], 0
        current.append(sent.text)
        # Running count avoids re-joining and re-splitting the chunk per sentence.
        current_len += sent_len
    if current:
        chunks.append(" ".join(current))
    return chunks

# Chunk every cleaned page on its own and pool the pieces into one flat list.
semantic_chunks = [
    piece
    for page_text in cleaned_docs
    for piece in semantic_chunk(page_text)
]

print(f"Fixed chunks: {len(fixed_chunks)}")
print(f"Semantic chunks: {len(semantic_chunks)}")

## 5. Embeddings, Vector Store (Pinecone), BM25, and Reranker
Initialize the embedding model and the Pinecone index, upload the chunk vectors, build the BM25 corpus, and load the reranker model.

In [None]:
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from rank_bm25 import BM25Okapi
import numpy as np

import time

# Fail early with a clear message: an empty corpus cannot be indexed and would
# otherwise surface later as a less helpful error while building the indexes.
if not semantic_chunks:
    raise ValueError("No semantic chunks to index; check that the PDF contains extractable text")

print("Loading embedding model...")
embed_model = SentenceTransformer("BAAI/bge-large-en-v1.5")
embedding_dim = embed_model.get_sentence_embedding_dimension()
print(f"Embedding model dimension: {embedding_dim}")

# Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "rag-semantic-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # A newly created index is not usable immediately; wait until Pinecone
    # reports it as ready before upserting into it.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)
index = pc.Index(index_name)

# Upsert vectors
# Encode all chunks in one batched call instead of one model call per chunk.
vectors, batch_size = [], 100
embeddings = embed_model.encode(semantic_chunks, show_progress_bar=False)
for i, (text, emb) in enumerate(zip(semantic_chunks, embeddings)):
    # The vector id is the chunk's position in semantic_chunks; hybrid_search maps
    # ids back to that list, so the 500-character metadata copy is informational only.
    vectors.append({"id": str(i), "values": emb.tolist(), "metadata": {"text": text[:500]}})

for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i+batch_size]
    index.upsert(vectors=batch)
    print(f"Uploaded {min(i+batch_size, len(vectors))}/{len(vectors)} vectors")
print(f"All vectors uploaded, total: {len(vectors)}")

# BM25
# Whitespace tokenization; hybrid_search tokenizes queries the same way.
tokenized_corpus = [doc.split() for doc in semantic_chunks]
bm25 = BM25Okapi(tokenized_corpus)
print("BM25 ready")

# Reranker
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

print("Loading reranker...")
# Model and tokenizer are loaded from the same checkpoint name and are used together in rerank().
reranker_model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-base")
reranker_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
# The model is only used for scoring; rerank() additionally runs it under torch.no_grad().
reranker_model.eval()
print("Reranker ready")

## 6. Retrieval Utilities
Vector search (Pinecone), hybrid search (BM25 + vector), and reranking.

In [None]:
def vector_search_pinecone(query: str, top_k: int = 10):
    """Embed `query` and fetch its nearest neighbours from the Pinecone index.

    Args:
        query: Query text to embed with the module-level `embed_model`.
        top_k: Number of matches requested from the index.

    Returns:
        The "matches" entry of the query response (metadata included), or an
        empty list when the response has no such entry.
    """
    query_vector = embed_model.encode(query, show_progress_bar=False).tolist()
    response = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    matches = response.get("matches", [])
    return matches

def hybrid_search(query: str, alpha: float = 0.5, top_k: int = 10):
    """Rank the semantic chunks by a weighted mix of BM25 and vector similarity.

    Both score vectors cover every chunk and are min-max normalized before
    mixing; chunks that Pinecone did not return keep a raw vector score of 0.

    Args:
        query: Query text; tokenized on whitespace for BM25.
        alpha: Weight of the BM25 score; the vector score gets ``1 - alpha``.
        top_k: Number of results to return. Pinecone is asked for ``top_k * 2``.

    Returns:
        A list of ``(chunk_index, hybrid_score, chunk_text)`` tuples, best
        first, or an empty list when the query has no tokens.
    """
    def _min_max(raw):
        # Scale to [0, 1]; a (near-)constant vector maps to a neutral 0.5.
        lowest, highest = np.min(raw), np.max(raw)
        spread = highest - lowest
        if spread > 1e-9:
            return (raw - lowest) / spread
        return np.ones_like(raw) * 0.5

    terms = query.split()
    if not terms:
        return []

    lexical = _min_max(bm25.get_scores(terms))

    dense_raw = np.zeros(len(semantic_chunks))
    for match in vector_search_pinecone(query, top_k=top_k * 2):
        position = int(match["id"])
        # Ignore ids that do not map onto the current chunk list.
        if 0 <= position < len(semantic_chunks):
            dense_raw[position] = match.get("score", 0.0)
    dense = _min_max(dense_raw)

    combined = alpha * lexical + (1 - alpha) * dense
    ranking = np.argsort(combined)[::-1][:top_k]
    return [(i, float(combined[i]), semantic_chunks[i]) for i in ranking]

def rerank(query: str, candidates, top_k: int = 5):
    """Re-score candidates with the cross-encoder reranker and keep the best ones.

    Args:
        query: Query text paired with every candidate.
        candidates: Sequence of tuples whose element at index 2 is the chunk
            text (the shape produced by hybrid_search).
        top_k: Maximum number of results to keep.

    Returns:
        A list of ``(reranker_score, candidate)`` pairs sorted by score,
        highest first; an empty list when there are no candidates.
    """
    if not candidates:
        return []
    query_doc_pairs = [[query, candidate[2]] for candidate in candidates]
    encoded = reranker_tokenizer(
        query_doc_pairs,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    with torch.no_grad():
        logits = reranker_model(**encoded).logits.squeeze()
    # squeeze() reduces a single-candidate batch to a 0-d tensor; restore one axis
    # so tolist() always yields a list.
    if logits.dim() == 0:
        logits = logits.unsqueeze(0)
    ranked = sorted(zip(logits.tolist(), candidates), key=lambda pair: pair[0], reverse=True)
    return ranked[:top_k]

## 7. High-level Retrieval and LLM
Define a retrieve() function and initialize the Groq LLM, then provide rag_answer().

In [None]:
from langchain_groq import ChatGroq

# Shared chat model used by rag_answer() and by the LangGraph generate node.
# NOTE(review): no API key is passed here; presumably the client reads GROQ_API_KEY
# from the environment set in the configuration cell - confirm.
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.1, max_tokens=1000)
print("Groq LLM ready")

def retrieve(query: str, top_k: int = 5, alpha: float = 0.5):
    """Return the texts of the best chunks for `query`.

    Hybrid search always proposes 20 candidates; the reranker then narrows
    them down to `top_k`.

    Args:
        query: Query text.
        top_k: Number of chunk texts to return after reranking.
        alpha: BM25 weight forwarded to hybrid_search.

    Returns:
        A list of chunk strings in reranked order, or an empty list when
        hybrid search yields nothing.
    """
    candidates = hybrid_search(query, alpha=alpha, top_k=20)
    if candidates:
        reranked = rerank(query, candidates, top_k=top_k)
        # Each entry is (reranker_score, candidate); the chunk text is candidate[2].
        return [entry[1][2] for entry in reranked]
    return []

def rag_answer(query: str, top_k: int = 5) -> str:
    """Answer `query` with the LLM, grounded on retrieved chunks.

    Args:
        query: User question.
        top_k: Number of chunks to retrieve and place in the prompt.

    Returns:
        The LLM reply text, or a fixed apology string when retrieval returns
        nothing (the LLM is not called in that case).
    """
    passages = retrieve(query, top_k=top_k)
    if not passages:
        return "Sorry, no relevant information found."

    # Number the passages from 1 so the prompt reads "[Document 1]: ...".
    numbered = [f"[Document {number}]: {passage}" for number, passage in enumerate(passages, start=1)]
    context_block = "\n\n".join(numbered)
    prompt = f"""Answer the question based on the following context. If the context does not contain relevant information, please state so.

Context:
{context_block}

Question: {query}

Answer:"""
    reply = llm.invoke(prompt)
    if hasattr(reply, 'content'):
        return reply.content
    return str(reply)

## 8. LangGraph: Minimal Pipeline (retrieve -> generate)
We use LangGraph to orchestrate retrieval followed by generation.

In [None]:
from typing import TypedDict, List, Optional
from langgraph.graph import StateGraph, END

class AgentState(TypedDict, total=False):
    """State dictionary passed between the LangGraph nodes.

    total=False makes every key optional, so the nodes read keys with .get().
    """
    question: str  # user question; read by the retrieve and generate nodes
    chat_history: List[str]  # earlier turns; the generate node uses the last three entries
    top_k: int  # number of chunks to retrieve; the retrieve node defaults to 5
    documents: List[str]  # chunk texts written by the retrieve node
    answer: str  # LLM reply written by the generate node

def build_graph():
    """Build and compile the two-step LangGraph pipeline: retrieve -> generate.

    Each node returns only the state key it produces instead of mutating the
    incoming state in place.

    Returns:
        The compiled graph; invoke it with a dict holding at least "question".
    """
    graph = StateGraph(AgentState)

    def retrieve_node(state: AgentState) -> AgentState:
        """Retrieve chunk texts for the question and publish them as "documents"."""
        question = state.get("question", "")
        top_k = state.get("top_k", 5)
        return {"documents": retrieve(question, top_k=top_k, alpha=0.5)}

    def generate_node(state: AgentState) -> AgentState:
        """Prompt the LLM with the documents and recent history; publish "answer"."""
        question = state.get("question", "")
        docs = state.get("documents", []) or []
        hist = state.get("chat_history", []) or []
        context_text = "\n\n".join([f"[Document {i+1}]: {t}" for i, t in enumerate(docs)]) if docs else ""
        # hist[-3:] keeps the last three turns in chronological order, so the label
        # must say "most recent last" (it previously claimed the opposite).
        history_text = ("\n\nConversation history (most recent last):\n" + "\n".join(hist[-3:])) if hist else ""
        prompt = f"""Answer the question using only the information from the provided context and conversation history. If the context does not contain relevant information, say that you do not know.

Context:
{context_text}{history_text}

Question: {question}

Answer:"""
        resp = llm.invoke(prompt)
        return {"answer": resp.content if hasattr(resp, "content") else str(resp)}

    graph.add_node("retrieve", retrieve_node)
    graph.add_node("generate", generate_node)
    graph.set_entry_point("retrieve")
    graph.add_edge("retrieve", "generate")
    graph.add_edge("generate", END)
    return graph.compile()

# Compile once at module level; run_graph() reuses this compiled graph on every call.
graph = build_graph()
print("LangGraph pipeline ready")

def run_graph(question: str, chat_history: Optional[List[str]] = None, top_k: int = 5) -> str:
    """Run the compiled LangGraph pipeline for one question.

    Args:
        question: User question.
        chat_history: Earlier turns as strings; None or an empty list means no history.
        top_k: Number of chunks the retrieve node should fetch.

    Returns:
        The "answer" entry of the final state, or an empty string if it is absent.
    """
    initial_state = dict(
        question=question,
        chat_history=chat_history or [],
        top_k=top_k,
    )
    final_state = graph.invoke(initial_state)
    return final_state.get("answer", "")

## 9. Examples
Single-turn and multi-turn examples using both plain RAG and LangGraph pipeline.

In [None]:
# Single-turn: the same question through plain RAG and through the LangGraph pipeline.
q = "What is the main contribution of DeepSeek-R1?"
print("RAG answer:")
print(rag_answer(q))

# The leading "\n" prints a blank line before the heading; the literal must stay
# on one physical line, otherwise the cell does not parse.
print("\nLangGraph answer:")
print(run_graph(q))

# Multi-turn: each finished turn is appended to `history` and passed to the next call.
history = []
q1 = "What is DeepSeek-R1?"
a1 = run_graph(q1, history)
history.append(f"User: {q1}\nAssistant: {a1}")

q2 = "How does it compare to other models?"
a2 = run_graph(q2, history)
history.append(f"User: {q2}\nAssistant: {a2}")

print("\nMulti-turn results:")
# Answers are truncated to 300 characters to keep the output short.
print("Turn 1 Answer:", a1[:300])
print("Turn 2 Answer:", a2[:300])

## 10. Evaluation Utilities
Simple LLM-judge metrics (faithfulness and relevance) plus an embedding-based similarity score between the question and the retrieved contexts, for quick assessment.

In [None]:
from langchain_groq import ChatGroq as _Groq
import re as _re
import numpy as _np

class GroqJudge:
    """Callable wrapper around a Groq chat model used as the evaluation judge.

    The underlying model is created with temperature 0.0 to keep scoring as
    stable as possible. Calling the instance with a prompt returns the reply text.
    """

    def __init__(self, model: str = "llama-3.3-70b-versatile"):
        """Create the judge's chat model for the given Groq model name."""
        self.llm = _Groq(model=model, temperature=0.0)

    def __call__(self, prompt: str) -> str:
        """Send `prompt` to the model and return the reply as a string."""
        reply = self.llm.invoke(prompt)
        if not hasattr(reply, 'content'):
            return str(reply)
        return reply.content

# Module-level judge instance shared by evaluate_faithfulness() and evaluate_relevance().
judge_llm = GroqJudge()
print("Evaluation LLM ready")

def _clamp01(x: float) -> float:
    """Clip `x` into the closed interval [0, 1]."""
    return max(0.0, min(1.0, x))

# First decimal number in a judge reply. Accepts "1", "0.85", ".5" and "1.0";
# the previous pattern could not match a bare "1", so perfect scores became 0.0.
_SCORE_PATTERN = _re.compile(r"\d+(?:\.\d+)?|\.\d+")

def _parse_score(score_text: str) -> float:
    """Extract the first number from a judge reply and clamp it to [0, 1].

    Returns 0.0 when the reply is empty or contains no number.
    """
    m = _SCORE_PATTERN.search(score_text or "")
    return _clamp01(float(m.group())) if m else 0.0

def evaluate_faithfulness(answer: str, contexts: list[str]) -> float:
    """Ask the judge LLM how faithful `answer` is to `contexts`.

    Args:
        answer: Generated answer to assess.
        contexts: Retrieved passages; shown to the judge as "[Document i]: ...".

    Returns:
        A score in [0, 1]; 0.0 when no number can be parsed from the reply.
    """
    contexts_text = "\n\n".join([f"[Document {i+1}]: {c}" for i, c in enumerate(contexts)])
    prompt = f"""You are an evaluator. Evaluate how faithful the answer is to the context (whether the answer is based on the context without fabrication).

Context:
{contexts_text}

Answer:
{answer}

Output only a number between 0 and 1."""
    return _parse_score(judge_llm(prompt))

def evaluate_relevance(question: str, answer: str) -> float:
    """Ask the judge LLM how well `answer` responds to `question`.

    Args:
        question: Original user question.
        answer: Generated answer to assess.

    Returns:
        A score in [0, 1]; 0.0 when no number can be parsed from the reply.
    """
    prompt = f"""Evaluate how well the answer responds to the question.

Question:
{question}

Answer:
{answer}

Output only a number between 0 and 1."""
    return _parse_score(judge_llm(prompt))

def _cosine(a: _np.ndarray, b: _np.ndarray) -> float:
    """Cosine similarity of two vectors; 0.0 when either vector has zero norm."""
    norm_a = _np.linalg.norm(a)
    norm_b = _np.linalg.norm(b)
    if norm_a != 0 and norm_b != 0:
        return float(_np.dot(a, b) / (norm_a * norm_b))
    return 0.0

def evaluate_context_recall(question: str, contexts: list[str]) -> float:
    """Score how close the best retrieved context is to the question.

    Despite the name, this is not a recall metric: it is the highest cosine
    similarity between the question embedding and any context embedding.

    Args:
        question: User question.
        contexts: Retrieved passages.

    Returns:
        The maximum question-context cosine similarity, or 0.0 when
        `contexts` is empty.
    """
    if not contexts:
        return 0.0
    question_vec = embed_model.encode(question, show_progress_bar=False)
    context_vecs = embed_model.encode(contexts, show_progress_bar=False)
    best = max((_cosine(question_vec, vec) for vec in context_vecs), default=0.0)
    return float(best)

def evaluate_rag(question: str, answer: str, contexts: list[str]) -> dict:
    """Run the three evaluation metrics for one question/answer/contexts triple.

    Progress messages are printed before and after the metrics run.

    Args:
        question: User question.
        answer: Generated answer.
        contexts: Retrieved passages the answer should be grounded on.

    Returns:
        A dict with the keys "faithfulness", "relevance" and "context_recall".
    """
    print("Starting RAG evaluation...")
    faithfulness_score = evaluate_faithfulness(answer, contexts)
    relevance_score = evaluate_relevance(question, answer)
    context_recall_score = evaluate_context_recall(question, contexts)
    print("Done")
    return {
        "faithfulness": faithfulness_score,
        "relevance": relevance_score,
        "context_recall": context_recall_score,
    }

# Example evaluation
# rag_answer() runs its own retrieval internally; retrieve() is called again here with
# the same defaults to obtain the contexts that the metrics are scored against.
_q = "What is DeepSeek-R1?"
_ans = rag_answer(_q)
_ctx = retrieve(_q)
eval_results = evaluate_rag(_q, _ans, _ctx)
# Bare last expression so the notebook displays the result dict.
eval_results