# üìò RAG Pipeline Notebook (LangChain + LangGraph + Groq + Open-Source Embeddings)

This notebook contains the full pipeline structure. Fill in sections as needed.


## 1. Install Dependencies

In [73]:
!pip install langchain langchain-community langchain-groq sentence-transformers faiss-cpu langgraph ragas pypdf spacy rank-bm25 transformers accelerate pinecone-client pinecone-text



In [74]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------- ------------------ 6.8/12.8 MB 42.0 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 42.2 MB/s  0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## 2. Load PDF

In [84]:
from langchain_community.document_loaders import PyMuPDFLoader
pdf_path = "Deepseek-r1.pdf"
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()
len(docs)

22

## 3. Cleaning Functions

In [85]:
import re

def clean_text(text):
    text = re.sub(r"-\n", "", text)  # fix hyphen-newlines
    text = re.sub(r"\n", " ", text)  # flatten newlines
    text = re.sub(r"\s+", " ", text)
    return text.strip()

cleaned_docs = []
for d in docs:
    cleaned_docs.append(clean_text(d.page_content))

## 4. Chunking

In [86]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import spacy

# load spacy for semantic chunking
nlp = spacy.load("en_core_web_sm")

# ---------------------------
# 1. Fixed Chunking
# ---------------------------
fixed_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

fixed_chunks = fixed_splitter.create_documents(cleaned_docs)

# ---------------------------
# 2. Semantic Chunking
# ---------------------------
def semantic_chunk(text, max_tokens=120):
    doc = nlp(text)
    chunks = []
    current = []

    for sent in doc.sents:
        current.append(sent.text)
        if len(" ".join(current).split()) > max_tokens:
            chunks.append(" ".join(current))
            current = []
    if current:
        chunks.append(" ".join(current))
    return chunks

semantic_chunks = []
for d in cleaned_docs:
    semantic_chunks.extend(semantic_chunk(d))

print("Fixed chunks:", len(fixed_chunks))
print("Semantic chunks:", len(semantic_chunks))

Fixed chunks: 136
Semantic chunks: 75


## 5.1. Initialize Embedding Model


In [87]:
from sentence_transformers import SentenceTransformer
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Initialize embedding model
embed_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

Loading embedding model...
Embedding model loaded, dimension: 1024


## 5.2. Pinecone Vector Database

In [88]:
from rank_bm25 import BM25Okapi
import numpy as np
from pinecone import Pinecone, ServerlessSpec
import os

# Get Pinecone API key from environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("Please set PINECONE_API_KEY environment variable or configure it in .env file")

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "rag-semantic-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embed_model.get_sentence_embedding_dimension(),
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)

In [89]:
vectors = []
batch_size = 100  # Process in batches to avoid uploading too many at once

for i, text in enumerate(semantic_chunks):
    emb = embed_model.encode(text, show_progress_bar=False).tolist()
    vectors.append({
        "id": str(i),
        "values": emb,
        "metadata": {"text": text[:500]}  # Limit metadata length
    })

# Batch upload
for i in range(0, len(vectors), batch_size):
    batch = vectors[i:i + batch_size]
    index.upsert(vectors=batch)
    print(f"Uploaded {min(i + batch_size, len(vectors))}/{len(vectors)} vectors")

print(f"All vectors uploaded, total: {len(vectors)}")

‚úÖ Uploaded 75/75 vectors
‚úÖ All vectors uploaded, total: 75


## 5.3. Reranker Ê®°Âûã


In [94]:
# Initialize BM25
tokenized_corpus = [doc.split() for doc in semantic_chunks]
bm25 = BM25Okapi(tokenized_corpus)

def vector_search_pinecone(query, top_k=10):
    """
    Perform vector search using Pinecone.
    
    Args:
        query: Query text
        top_k: Number of top results to return
    
    Returns:
        list: List of matching results from Pinecone
    """
    q_emb = embed_model.encode(query, show_progress_bar=False).tolist()
    res = index.query(
        vector=q_emb,
        top_k=top_k,
        include_metadata=True
    )
    return res.get("matches", [])

def hybrid_search(query, alpha=0.5, top_k=10):
    """
    Hybrid search combining BM25 and vector search.
    
    Args:
        query: Query text
        alpha: BM25 weight (0-1), alpha=1 means only BM25, alpha=0 means only vector search
        top_k: Number of top results to return
    
    Returns:
        list: [(index, hybrid_score, text), ...]
    """
    # BM25 scores
    query_tokens = query.split()
    if not query_tokens:
        return []
    
    bm25_scores = bm25.get_scores(query_tokens)
    
    # Normalize BM25 scores
    if np.max(bm25_scores) - np.min(bm25_scores) > 1e-9:
        bm25_norm = (bm25_scores - np.min(bm25_scores)) / (np.max(bm25_scores) - np.min(bm25_scores))
    else:
        bm25_norm = np.ones_like(bm25_scores) * 0.5

    # Pinecone vector search
    vector_results = vector_search_pinecone(query, top_k=top_k * 2)  # Get more candidates
    
    vector_scores = np.zeros(len(semantic_chunks))
    for m in vector_results:
        idx = int(m["id"])
        if 0 <= idx < len(semantic_chunks):
            vector_scores[idx] = m.get("score", 0.0)

    # Normalize vector scores
    if np.max(vector_scores) - np.min(vector_scores) > 1e-9:
        vector_norm = (vector_scores - np.min(vector_scores)) / (np.max(vector_scores) - np.min(vector_scores))
    else:
        vector_norm = np.ones_like(vector_scores) * 0.5

    # Hybrid score
    hybrid = alpha * bm25_norm + (1 - alpha) * vector_norm

    # Top K results
    best_idx = np.argsort(hybrid)[::-1][:top_k]

    return [(i, float(hybrid[i]), semantic_chunks[i]) for i in best_idx]

In [95]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

print("Loading Reranker model...")
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    "BAAI/bge-reranker-base"
)
reranker_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-base")
reranker_model.eval()  # Set to evaluation mode
print("Reranker model loaded")

def rerank(query, candidates, top_k=5):
    """
    Rerank candidate results using Reranker model.
    
    Args:
        query: Query text
        candidates: list of (index, hybrid_score, text)
        top_k: Number of top results to return
    
    Returns:
        list: [(rerank_score, (index, hybrid_score, text)), ...]
    """
    if not candidates:
        return []
    
    pairs = [[query, c[2]] for c in candidates]
    inputs = reranker_tokenizer(
        pairs,
        padding=True,
        truncation=True,
        max_length=512,  # Limit max length
        return_tensors="pt"
    )
    
    with torch.no_grad():
        scores = reranker_model(**inputs).logits.squeeze()
    
    # Handle single result case
    if scores.dim() == 0:
        scores = scores.unsqueeze(0)
    
    scored = list(zip(scores.tolist(), candidates))
    scored.sort(key=lambda x: x[0], reverse=True)

    return scored[:top_k]

Loading Reranker model...
Reranker model loaded


## 6. Groq LLM (RAG Generation)

In [97]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, AIMessage
import os
from typing import List, Tuple, Optional
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get Groq API key from environment variables
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# If not found, prompt user to set it
if not GROQ_API_KEY:
    print("GROQ_API_KEY not found in environment variables.")
    print("Please do one of the following:")
    print("1. Create a .env file in the project root with: GROQ_API_KEY=your_key_here")
    print("2. Or set it directly in this cell (temporary, for development only):")
    print("   GROQ_API_KEY = 'your_groq_api_key_here'")
    print("\nTo get your Groq API key, visit: https://console.groq.com/")
    raise ValueError("Please set GROQ_API_KEY environment variable or configure it in .env file")

os.environ["GROQ_API_KEY"] = GROQ_API_KEY

# Initialize LLM
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0.1,  # Lower temperature for more stable output
    max_tokens=1000
)
print("Groq LLM initialized")

def retrieve(query, top_k=5, alpha=0.5):
    """
    Retrieve relevant documents.
    
    Args:
        query: Query text
        top_k: Number of top documents to return
        alpha: BM25 weight for hybrid search
    
    Returns:
        list: List of retrieved document texts
    """
    # Hybrid search
    hybrid_results = hybrid_search(query, alpha=alpha, top_k=20)
    
    if not hybrid_results:
        print("No relevant documents found")
        return []
    
    # Rerank
    reranked_results = rerank(query, hybrid_results, top_k=top_k)
    
    # Extract texts
    retrieved_texts = [r[1][2] for r in reranked_results]
    return retrieved_texts
    
def rag_answer(query, top_k=5):
    """
    Generate answer using RAG.
    
    Args:
        query: Query text
        top_k: Number of documents to retrieve
    
    Returns:
        str: Generated answer
    """
    # Retrieve relevant documents
    ctx = retrieve(query, top_k=top_k)
    
    if not ctx:
        return "Sorry, no relevant information found."
    
    # Build prompt
    context_text = "\n\n".join([f"[Document {i+1}]: {text}" for i, text in enumerate(ctx)])
    
    prompt = f"""Answer the question based on the following context. If the context does not contain relevant information, please state so.

Context:
{context_text}

Question: {query}

Answer:"""
    
    # Call LLM
    response = llm.invoke(prompt)
    
    # Extract text content
    if hasattr(response, 'content'):
        return response.content
    else:
        return str(response)

# Test
test_query = "What is the main contribution of DeepSeek-R1?"
print(f"\nTest query: {test_query}")
answer = rag_answer(test_query)
print(f"\nAnswer:\n{answer}")

Groq LLM initialized

Test query: What is the main contribution of DeepSeek-R1?

Answer:
The context does not contain a direct statement about the main contribution of DeepSeek-R1. However, based on the information provided, it can be inferred that DeepSeek-R1 is an improvement over DeepSeek-R1-Zero, with a focus on making the reasoning processes more readable and aligning with human preferences. The main contribution of DeepSeek-R1 seems to be its ability to produce clear and coherent Chains of Thought (CoT) while demonstrating strong general capabilities, achieved through a pipeline that incorporates reinforcement learning with human-friendly cold-start data.


## 6.1. Multi-turn Conversation Example


In [93]:
# Example: Multi-turn conversation
print("\n" + "="*60)
print("üí¨ Multi-turn Conversation Example")
print("="*60)

# Create a new conversation
conv = ConversationManager(top_k=5, alpha=0.5)

# Turn 1: Initial question
print("\n[Turn 1]")
q1 = "What is DeepSeek-R1?"
print(f"User: {q1}")
a1 = conv.chat(q1)
print(f"Assistant: {a1}")

# Turn 2: Follow-up question
print("\n[Turn 2]")
q2 = "How does it compare to other models?"
print(f"User: {q2}")
a2 = conv.chat(q2)
print(f"Assistant: {a2}")

# Turn 3: Another follow-up
print("\n[Turn 3]")
q3 = "What are its main features?"
print(f"User: {q3}")
a3 = conv.chat(q3)
print(f"Assistant: {a3}")

# Display full conversation history
conv.display_history()



üí¨ Multi-turn Conversation Example


NameError: name 'ConversationManager' is not defined

## 6.2. Interactive Conversation Loop


In [None]:
def interactive_chat():
    """
    Interactive chat loop for multi-turn conversation.
    Type 'quit', 'exit', or 'reset' to end or reset conversation.
    """
    conv = ConversationManager(top_k=5, alpha=0.5)
    
    print("\n" + "="*60)
    print("üí¨ Interactive RAG Chat")
    print("="*60)
    print("Type your questions below. Commands:")
    print("  - 'quit' or 'exit': End conversation")
    print("  - 'reset': Clear conversation history")
    print("  - 'history': Show conversation history")
    print("="*60 + "\n")
    
    while True:
        try:
            # Get user input
            user_input = input("You: ").strip()
            
            if not user_input:
                continue
            
            # Handle commands
            if user_input.lower() in ['quit', 'exit']:
                print("\nüëã Goodbye!")
                break
            elif user_input.lower() == 'reset':
                conv.reset()
                print("‚úÖ Conversation history cleared\n")
                continue
            elif user_input.lower() == 'history':
                conv.display_history()
                continue
            
            # Process query
            print("ü§î Thinking...")
            answer = conv.chat(user_input)
            print(f"\nAssistant: {answer}\n")
            
        except KeyboardInterrupt:
            print("\n\nüëã Goodbye!")
            break
        except Exception as e:
            print(f"\n‚ùå Error: {e}\n")

# Uncomment the line below to start interactive chat
# interactive_chat()


## 6.3. Programmatic Conversation API


In [None]:
# Programmatic conversation example
# Useful for integrating RAG into applications

def create_conversation_session():
    """
    Create a new conversation session.
    
    Returns:
        ConversationManager: New conversation manager instance
    """
    return ConversationManager(top_k=5, alpha=0.5)

# Example usage in an application
def example_application():
    """
    Example of how to use ConversationManager in an application.
    """
    # Create session
    session = create_conversation_session()
    
    # Simulate user interactions
    queries = [
        "What is DeepSeek-R1?",
        "What are its key capabilities?",
        "How does it perform on reasoning tasks?"
    ]
    
    print("\n" + "="*60)
    print("üì± Application Example")
    print("="*60)
    
    for i, query in enumerate(queries, 1):
        print(f"\n[Request {i}]")
        print(f"User Query: {query}")
        
        # Get answer
        answer = session.chat(query)
        
        print(f"Response: {answer[:200]}...")  # Truncate for display
    
    # Get conversation summary
    print("\n" + "="*60)
    print("üìä Conversation Summary")
    print("="*60)
    print(f"Total turns: {len(session.get_history())}")
    session.display_history()

# Run example
example_application()


In [None]:
from langchain_groq import ChatGroq
import numpy as np
import re

# --------------------------------
# 1. Groq Judge LLM Wrapper
# --------------------------------
class GroqJudge:
    """
    Groq LLM wrapper for evaluation.
    """
    def __init__(self, model="llama-3.3-70b-versatile"):
        self.llm = ChatGroq(model=model, temperature=0.0)  # Deterministic output for evaluation

    def __call__(self, prompt):
        response = self.llm.invoke(prompt)
        if hasattr(response, 'content'):
            return response.content
        return str(response)


judge_llm = GroqJudge()
print("‚úÖ Groq Judge initialized")


# --------------------------------
# 2. Faithfulness Metric (LLM-Judge)
# --------------------------------
def evaluate_faithfulness(answer, contexts):
    """
    Evaluate how faithful the answer is to the context.
    
    Args:
        answer: Generated answer (string)
        contexts: List of context texts
    
    Returns:
        float: Faithfulness score (0-1)
    """
    # Ensure answer is a string
    if hasattr(answer, 'content'):
        answer = answer.content
    answer = str(answer)
    
    # Format contexts
    contexts_text = "\n\n".join([f"[Document {i+1}]: {ctx}" for i, ctx in enumerate(contexts)])
    
    prompt = f"""You are an evaluator. Evaluate how faithful the answer is to the context (whether the answer is based on the context without fabricating information).

Context:
{contexts_text}

Answer:
{answer}

Give a score between 0 and 1, where:
- 1.0: Answer is completely based on context, no fabricated information
- 0.5: Answer is partially based on context, but contains some fabrication or errors
- 0.0: Answer is unrelated to context or contains significant errors

Output only the number (a decimal between 0 and 1)."""
    
    score_text = judge_llm(prompt)
    # Extract number
    score_match = re.search(r'0?\.\d+|1\.0|0', score_text)
    if score_match:
        score = float(score_match.group())
        return max(0.0, min(1.0, score))  # Clamp to 0-1
    return 0.0


# --------------------------------
# 3. Relevance Metric (LLM-Judge)
# --------------------------------
def evaluate_relevance(question, answer):
    """
    Evaluate how well the answer responds to the question.
    
    Args:
        question: Question text
        answer: Answer text
    
    Returns:
        float: Relevance score (0-1)
    """
    # Ensure answer is a string
    if hasattr(answer, 'content'):
        answer = answer.content
    answer = str(answer)
    
    prompt = f"""Evaluate how well the answer responds to the question.

Question:
{question}

Answer:
{answer}

Give a score between 0 and 1, where:
- 1.0: Answer completely addresses the question, accurate and complete
- 0.5: Answer partially addresses the question, but incomplete or inaccurate
- 0.0: Answer does not address the question or is completely irrelevant

Output only the number (a decimal between 0 and 1)."""
    
    score_text = judge_llm(prompt)
    # Extract number
    score_match = re.search(r'0?\.\d+|1\.0|0', score_text)
    if score_match:
        score = float(score_match.group())
        return max(0.0, min(1.0, score))
    return 0.0


# --------------------------------
# 4. Context Recall (Embedding Similarity)
# --------------------------------
def cosine_similarity(a, b):
    """
    Calculate cosine similarity between two vectors.
    
    Args:
        a: First vector
        b: Second vector
    
    Returns:
        float: Cosine similarity score
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(a, b) / (norm_a * norm_b)

def evaluate_context_recall(question, contexts, embed_model):
    """
    Evaluate the relevance of retrieved contexts to the question (recall metric).
    
    Args:
        question: Question text
        contexts: List of context texts
        embed_model: Embedding model
    
    Returns:
        float: Maximum similarity score (0-1)
    """
    if not contexts:
        return 0.0
    
    q_emb = embed_model.encode(question, show_progress_bar=False)
    ctx_embs = embed_model.encode(contexts, show_progress_bar=False)
    
    sims = [cosine_similarity(q_emb, c) for c in ctx_embs]
    return float(max(sims)) if sims else 0.0


# --------------------------------
# 5. Combine all metrics into one evaluator
# --------------------------------
def evaluate_rag(query, answer, contexts, embed_model):
    """
    Comprehensive evaluation of RAG system performance.
    
    Args:
        query: Query text
        answer: Generated answer
        contexts: List of retrieved context texts
        embed_model: Embedding model
    
    Returns:
        dict: Dictionary containing all evaluation metrics
    """
    print(f"\nüìä Starting RAG system evaluation...")
    print(f"Query: {query}")
    print(f"Retrieved {len(contexts)} contexts")
    
    results = {
        "faithfulness": evaluate_faithfulness(answer, contexts),
        "relevance": evaluate_relevance(query, answer),
        "context_recall": evaluate_context_recall(query, contexts, embed_model)
    }
    
    print(f"\n‚úÖ Evaluation completed:")
    print(f"  - Faithfulness: {results['faithfulness']:.3f}")
    print(f"  - Relevance: {results['relevance']:.3f}")
    print(f"  - Context Recall: {results['context_recall']:.3f}")
    
    return results


# Test evaluation
print("\n" + "="*60)
print("üß™ Testing evaluation functionality")
print("="*60)

test_query = "What is DeepSeek-R1?"
print(f"\nQuery: {test_query}")

# Generate answer
answer = rag_answer(test_query)
print(f"\nGenerated answer:\n{answer}")

# Retrieve contexts
contexts = retrieve(test_query)
print(f"\nNumber of retrieved contexts: {len(contexts)}")

# Evaluate
evaluation_results = evaluate_rag(test_query, answer, contexts, embed_model)
print(f"\nEvaluation results: {evaluation_results}")

{'faithfulness': 0.9, 'relevance': 0.9, 'context_recall': 0.7536224}