# RAG Pipeline Optimization Benchmark

**Comprehensive testing of ALL RAG components to maximize LLM Judge score**

## What We're Testing:

### 1. Embedding Models
- BAAI/bge-large-en-v1.5 (Current - 1024 dim)
- intfloat/multilingual-e5-large (1024 dim, multi-language)

### 2. Retrieval Strategies
- Top-K: 3, 5, 10 documents (only k=3 and k=5 are configured in this notebook; k=10 is not yet benchmarked)
- MMR (Maximal Marginal Relevance)
- Reranking with cross-encoder

### 3. LLM Models
- Llama-4-Maverick-17B, DeepSeek-R1, GPT-5-mini (Claude-Sonnet-4.5 was planned but is not available in the Azure deployment, so it is not configured)

### 4. Prompting Strategies
- Baseline, Citation-focused, Few-shot (a Step-by-step prompt was planned but is not defined in this notebook)

In [46]:
# !pip install openai pinecone-client sentence-transformers rank-bm25 python-dotenv pandas matplotlib seaborn jiwer

In [47]:
import os
import json
import time
import re
from typing import Dict, List, Tuple
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from openai import AzureOpenAI
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer, CrossEncoder
from jiwer import wer, cer
import numpy as np

# Pull API keys / endpoints from a local .env file into the process environment.
load_dotenv()

# Plot defaults shared by every figure produced in this notebook.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (16, 10)})

print("‚úÖ Libraries loaded")

‚úÖ Libraries loaded


In [48]:
# Auto-detect project root
def _has_project_layout(folder: Path) -> bool:
    """Return True when `folder` contains both a `data` and a `docs` entry."""
    return (folder / "data").exists() and (folder / "docs").exists()


_start_dir = Path.cwd()

# Probe order: the working directory, its parent, then every ancestor of the
# working directory except the filesystem root (where path == path.parent).
_search_order = [_start_dir, _start_dir.parent]
_search_order.extend(
    ancestor for ancestor in (_start_dir, *_start_dir.parents) if ancestor != ancestor.parent
)

# Fall back to the working directory when no candidate has the expected layout.
PROJECT_ROOT = next(
    (folder for folder in _search_order if _has_project_layout(folder)),
    _start_dir,
)

DATA_DIR = PROJECT_ROOT / "data"
DOCS_DIR = PROJECT_ROOT / "docs"
OUTPUT_DIR = PROJECT_ROOT / "output"

print(f"‚úÖ Project root: {PROJECT_ROOT}")

‚úÖ Project root: /Users/ismatsamadov/SOCAR_Hackathon


In [49]:
# Load test cases using dynamic paths
def _read_json(path):
    """Decode a UTF-8 JSON file and return the parsed object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Both files live under the docs directory resolved earlier in the notebook.
questions = _read_json(DOCS_DIR / "sample_questions.json")
expected_answers = _read_json(DOCS_DIR / "sample_answers.json")

print(f"‚úÖ Loaded {len(questions)} test questions")

‚úÖ Loaded 5 test questions


In [50]:
# Connect to Pinecone
# The index name comes from the environment and defaults to "hackathon".
_pinecone_index_name = os.getenv("PINECONE_INDEX_NAME", "hackathon")
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(_pinecone_index_name)

# Report the index size as a quick connectivity check.
stats = index.describe_index_stats()
_vector_total = stats["total_vector_count"]
print(f"‚úÖ Vector DB connected: {_vector_total} vectors")

‚úÖ Vector DB connected: 1300 vectors


In [51]:
# Short benchmark key -> sentence-transformers model identifier.
EMBEDDING_MODELS = {
    "bge-large-en": "BAAI/bge-large-en-v1.5",
    "multilingual-e5-large": "intfloat/multilingual-e5-large"
}


def _load_embedder(key, model_name):
    """Announce and instantiate one SentenceTransformer model."""
    print(f"Loading {key}...")
    return SentenceTransformer(model_name)


# Instantiate every model once up front so later cells can look them up by key.
embedding_cache = {
    key: _load_embedder(key, model_name)
    for key, model_name in EMBEDDING_MODELS.items()
}

print(f"‚úÖ Loaded {len(embedding_cache)} embedding models")

Loading bge-large-en...
Loading multilingual-e5-large...
‚úÖ Loaded 2 embedding models


In [52]:
def retrieve_vanilla(query: str, embed_model, top_k: int = 3):
    """Plain top-k vector search against the module-level `index`.

    Args:
        query: Text to embed and search for.
        embed_model: Object whose `encode(query)` result exposes `.tolist()`.
        top_k: Number of matches requested from the index.

    Returns:
        One dict per match with keys `pdf_name`, `page_number`, `content`
        and `score`; missing metadata fields fall back to defaults.
    """
    def _as_document(hit):
        # Flatten one match into the document shape used by the rest of the notebook.
        meta = hit["metadata"]
        return {
            "pdf_name": meta.get("pdf_name", "unknown.pdf"),
            "page_number": meta.get("page_number", 0),
            "content": meta.get("text", ""),
            "score": hit.get("score", 0.0)
        }

    vector = embed_model.encode(query).tolist()
    response = index.query(vector=vector, top_k=top_k, include_metadata=True)
    return [_as_document(hit) for hit in response["matches"]]

def retrieve_with_mmr(query: str, embed_model, top_k: int = 3, lambda_param: float = 0.5, fetch_k: int = 20):
    """Retrieve documents with Maximal Marginal Relevance (MMR) for diversity.

    Fetches `fetch_k` candidates with `retrieve_vanilla`, then greedily picks
    `top_k` of them, scoring each remaining candidate as
    `lambda_param * cos(query, doc) - (1 - lambda_param) * max cos(doc, picked)`.

    Args:
        query: Text to search for.
        embed_model: Object whose `encode` accepts a string or a list of
            strings and returns the matching embedding(s).
        top_k: Number of documents to return.
        lambda_param: Trade-off between relevance (1.0) and diversity (0.0).
        fetch_k: Size of the candidate pool requested from the index.

    Returns:
        The selected candidate dicts, in selection order. When the pool has at
        most `top_k` entries it is returned unchanged.
    """
    candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)
    if len(candidates) <= top_k:
        return candidates[:top_k]
    if top_k <= 0:
        return []

    def _unit(vectors):
        # L2-normalise along the last axis; zero vectors stay zero instead of
        # producing NaN through a division by zero.
        vectors = np.asarray(vectors, dtype=float)
        norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
        return vectors / np.where(norms == 0.0, 1.0, norms)

    query_unit = _unit(embed_model.encode(query))
    candidate_units = _unit(embed_model.encode([doc["content"] for doc in candidates]))

    # With unit vectors a dot product is the cosine similarity, so both the
    # relevance vector and the pairwise matrix are computed once.
    relevance = candidate_units @ query_unit
    pairwise = candidate_units @ candidate_units.T

    # Selection is tracked by position, not by dict equality: duplicate chunks
    # compare equal, so an equality lookup would mask the wrong candidate and
    # allow the same one to be picked repeatedly.
    available = np.ones(len(candidates), dtype=bool)
    picked = []
    max_sim_to_picked = None  # no redundancy penalty until something is picked

    for _ in range(min(top_k, len(candidates))):
        penalty = 0.0 if max_sim_to_picked is None else max_sim_to_picked
        scores = lambda_param * relevance - (1 - lambda_param) * penalty
        scores = np.where(available, scores, -np.inf)

        best_idx = int(np.argmax(scores))
        picked.append(best_idx)
        available[best_idx] = False

        if max_sim_to_picked is None:
            max_sim_to_picked = pairwise[best_idx].copy()
        else:
            max_sim_to_picked = np.maximum(max_sim_to_picked, pairwise[best_idx])

    return [candidates[i] for i in picked]

# Cross-encoder instances keyed by model name, so a model is loaded once
# instead of once per query.
_RERANKER_CACHE = {}


def _get_reranker(model_name: str):
    """Return the cached CrossEncoder for `model_name`, loading it on first use."""
    if model_name not in _RERANKER_CACHE:
        _RERANKER_CACHE[model_name] = CrossEncoder(model_name)
    return _RERANKER_CACHE[model_name]


def retrieve_with_reranking(query: str, embed_model, top_k: int = 3, fetch_k: int = 20,
                            reranker_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
    """Two-stage retrieval: vector search, then cross-encoder reranking.

    Args:
        query: Text to search for.
        embed_model: Embedding model passed through to `retrieve_vanilla`.
        top_k: Number of documents to return after reranking.
        fetch_k: Size of the candidate pool requested from the index.
        reranker_name: Cross-encoder model identifier used to score
            (query, document) pairs.

    Returns:
        The `top_k` candidates with the highest cross-encoder score, best
        first; ties keep their vector-search order. When the pool has at most
        `top_k` entries it is returned unchanged, without reranking.
    """
    candidates = retrieve_vanilla(query, embed_model, top_k=fetch_k)
    if len(candidates) <= top_k:
        return candidates[:top_k]

    # NOTE(review): the default model name suggests MS MARCO training data,
    # while the benchmark questions are Azerbaijani - confirm it is suitable.
    reranker = _get_reranker(reranker_name)
    scores = reranker.predict([[query, doc["content"]] for doc in candidates])

    # Stable sort with reverse=True keeps the original order among equal scores.
    order = sorted(range(len(candidates)), key=lambda i: scores[i], reverse=True)
    return [candidates[i] for i in order[:top_k]]

def _strategy(func, **params):
    """Bundle a retrieval function with the keyword arguments it is called with."""
    return {"func": func, "params": params}


# Strategy name -> retrieval function plus its fixed parameters.
RETRIEVAL_STRATEGIES = {
    "vanilla_k3": _strategy(retrieve_vanilla, top_k=3),
    "vanilla_k5": _strategy(retrieve_vanilla, top_k=5),
    "mmr_balanced": _strategy(retrieve_with_mmr, top_k=3, lambda_param=0.5),
    "reranked_k3": _strategy(retrieve_with_reranking, top_k=3, fetch_k=20),
}

print(f"‚úÖ Configured {len(RETRIEVAL_STRATEGIES)} retrieval strategies")

‚úÖ Configured 4 retrieval strategies


In [53]:
# Connection settings come from the environment; only the API version has a default.
_azure_settings = {
    "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
    "api_version": os.getenv("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
}
azure_client = AzureOpenAI(**_azure_settings)

# Benchmark display name -> Azure deployment name.
# Note: Claude-Sonnet-4.5 removed - not available in Azure deployment
LLM_MODELS = dict([
    ("Llama-4-Maverick", "Llama-4-Maverick-17B-128E-Instruct-FP8"),
    ("DeepSeek-R1", "DeepSeek-R1"),
    ("GPT-5-mini", "gpt-5-mini"),
])

# Prompt templates; each one is filled with `{context}` and `{query}` via str.format.
_BASELINE_PROMPT = """Cavab verin:
{context}

Sual: {query}"""

_CITATION_PROMPT = """M…ônb…ô g√∂st…ôrin:
{context}

Sual: {query}
H…ôr faktƒ± PDF v…ô s…ôhif…ô n√∂mr…ôsi il…ô g√∂st…ôrin."""

_FEW_SHOT_PROMPT = """N√ºmun…ô: "Pal√ßƒ±q vulkanlarƒ±nƒ±n t…ôsir radiusu 10 km-dir (PDF: doc.pdf, S…ôhif…ô: 5)"

{context}

Sual: {query}"""

PROMPTING_STRATEGIES = {
    "baseline": _BASELINE_PROMPT,
    "citation_focused": _CITATION_PROMPT,
    "few_shot": _FEW_SHOT_PROMPT,
}

print(f"‚úÖ Configured {len(LLM_MODELS)} LLM models and {len(PROMPTING_STRATEGIES)} prompts")

‚úÖ Configured 3 LLM models and 3 prompts


In [54]:
def generate_answer(llm_model: str, query: str, documents: List[Dict], prompt_strategy: str = "baseline") -> Tuple[str, float]:
    """Generate an answer for `query` from the retrieved `documents`.

    Args:
        llm_model: Key into `LLM_MODELS` selecting the Azure deployment.
        query: The user question.
        documents: Retrieved documents; each needs `pdf_name`, `page_number`
            and `content`.
        prompt_strategy: Key into `PROMPTING_STRATEGIES`.

    Returns:
        `(answer, elapsed_seconds)`. The answer is always a string: an empty
        string when the model returns no content, and a string starting with
        `"ERROR: "` (with `0.0` elapsed) when the request fails.

    Raises:
        KeyError: If `prompt_strategy` is not a known prompt template.
    """
    context = "\n\n".join(
        f"S…ôn…ôd {i} ({doc['pdf_name']}, S…ôhif…ô {doc['page_number']}):\n{doc['content']}"
        for i, doc in enumerate(documents, 1)
    )
    prompt = PROMPTING_STRATEGIES[prompt_strategy].format(context=context, query=query)

    try:
        deployment = LLM_MODELS[llm_model]
        request = {
            "model": deployment,
            "messages": [{"role": "user", "content": prompt}],
        }

        # GPT-5 models use max_completion_tokens, others use max_tokens.
        # Azure's reasoning-model docs also list a custom temperature as
        # unsupported, so it is only sent to the other deployments.
        if deployment.startswith("gpt-5"):
            request["max_completion_tokens"] = 1000
        else:
            request["temperature"] = 0.2
            request["max_tokens"] = 1000

        # perf_counter is monotonic, so the latency cannot be skewed by clock adjustments.
        start_time = time.perf_counter()
        response = azure_client.chat.completions.create(**request)
        elapsed = time.perf_counter() - start_time

        # `content` can be None (e.g. filtered or empty completions); callers
        # rely on getting a string back.
        answer = response.choices[0].message.content or ""
        return answer, elapsed

    except Exception as e:
        # Best-effort by design: a failed request is reported as an ERROR
        # string so a benchmark run can continue with the next question.
        return f"ERROR: {str(e)}", 0.0

print("‚úÖ Generation function ready")

‚úÖ Generation function ready


In [55]:
def evaluate_rag(expected: str, generated: str, documents: List[Dict]) -> Dict:
    """Score a generated answer with three heuristics and a weighted total.

    No LLM is called here: `LLM_Judge_Score` is a proxy computed as
    `0.35 * Accuracy + 0.35 * Citation_Score + 0.30 * Completeness`.

    Args:
        expected: Reference answer; empty or whitespace-only means no reference.
        generated: Model answer; `None` is treated as an empty answer.
        documents: Retrieved documents, each with a `pdf_name`.

    Returns:
        Dict with `Accuracy` (100 minus word error rate, floored at 0),
        `Citation_Score` (share of retrieved PDF names, without the `.pdf`
        suffix, that appear in the answer), `Completeness` (word count
        relative to 30 words, capped at 100) and `LLM_Judge_Score`.
    """
    def normalize(text):
        return (text or "").lower().strip()

    generated = generated or ""
    reference = normalize(expected)
    hypothesis = normalize(generated)

    # Accuracy. The guard uses the normalized text so a whitespace-only
    # reference is never passed to `wer` as an empty string; an empty answer
    # scores 0 without calling `wer` at all.
    if reference and hypothesis:
        wer_score = wer(reference, hypothesis) * 100
        accuracy = max(0, 100 - wer_score)
    else:
        accuracy = 0

    # Citation quality. An empty stem is skipped because `"" in generated` is
    # always true and would count as a citation.
    pdf_names = [doc["pdf_name"].replace(".pdf", "") for doc in documents]
    cited_pdfs = sum(1 for pdf in pdf_names if pdf and pdf in generated)
    citation_score = (cited_pdfs / len(pdf_names)) * 100 if pdf_names else 0

    # Completeness: 30 words or more counts as fully complete.
    word_count = len(generated.split())
    completeness = min(100, (word_count / 30) * 100)

    # Overall LLM Judge Score
    llm_judge_score = round(accuracy * 0.35 + citation_score * 0.35 + completeness * 0.30, 2)

    return {
        "Accuracy": round(accuracy, 2),
        "Citation_Score": round(citation_score, 2),
        "Completeness": round(completeness, 2),
        "LLM_Judge_Score": llm_judge_score
    }

print("‚úÖ Evaluation functions ready")

‚úÖ Evaluation functions ready


## Run Comprehensive Benchmark

In [56]:
# Select configurations to test
def _config(embedding="bge-large-en", retrieval="vanilla_k3", llm="Llama-4-Maverick", prompt="baseline"):
    """Build one (embedding, retrieval, llm, prompt) tuple; the defaults are the baseline setup."""
    return (embedding, retrieval, llm, prompt)


# Each entry changes exactly one component relative to the baseline.
CONFIGS_TO_TEST = [
    _config(),
    _config(prompt="citation_focused"),
    _config(prompt="few_shot"),
    _config(retrieval="vanilla_k5"),
    _config(retrieval="mmr_balanced"),
    _config(retrieval="reranked_k3"),
    _config(embedding="multilingual-e5-large"),
]

print(f"Testing {len(CONFIGS_TO_TEST)} configurations on {len(questions)} questions")

Testing 7 configurations on 5 questions


In [57]:
# One row per (configuration, question) pair that produced an answer.
results = []

for config_idx, (embed_key, retrieval_key, llm_key, prompt_key) in enumerate(CONFIGS_TO_TEST, 1):
    config_name = f"{embed_key}_{retrieval_key}_{llm_key}_{prompt_key}"
    print("********************************")
    print(f"Config {config_idx}/{len(CONFIGS_TO_TEST)}: {config_name}")

    # NOTE(review): every embedding model queries the same Pinecone `index`.
    # Presumably the index was built with a single embedding model - confirm
    # that query vectors from the other model are comparable to it.
    embed_model = embedding_cache[embed_key]
    retrieval_func = RETRIEVAL_STRATEGIES[retrieval_key]["func"]
    retrieval_params = RETRIEVAL_STRATEGIES[retrieval_key]["params"]

    for example_key, messages in questions.items():
        # The question is the last user turn; skip conversations without one
        # instead of failing the whole run with an IndexError.
        user_turns = [m for m in messages if m["role"] == "user"]
        if not user_turns:
            print(f"  {example_key}: no user message, skipped")
            continue
        query = user_turns[-1]["content"]

        print(f"  {example_key}: {query[:40]}...")

        documents = retrieval_func(query, embed_model, **retrieval_params)
        answer, response_time = generate_answer(llm_key, query, documents, prompt_key)

        # A model may return no content; score it as an empty answer rather
        # than crashing on `None.startswith`.
        if answer is None:
            answer = ""

        # Failed generations are reported and left out of `results`, so a
        # configuration's averages only cover the questions that succeeded.
        if answer.startswith("ERROR"):
            print(f"    ‚ùå {answer}")
            continue

        expected = expected_answers.get(example_key, {}).get("Answer", "")
        metrics = evaluate_rag(expected, answer, documents)

        results.append({
            "Config": config_name,
            "Embedding": embed_key,
            "Retrieval": retrieval_key,
            "LLM": llm_key,
            "Prompt": prompt_key,
            "Question": example_key,
            "Response_Time": round(response_time, 2),
            **metrics
        })

        print(f"    ‚úÖ Score: {metrics['LLM_Judge_Score']:.1f}% ({response_time:.2f}s)")

print("********************************")
print("‚úÖ Benchmark complete!")

********************************
Config 1/7: bge-large-en_vanilla_k3_Llama-4-Maverick_baseline
  Example1: Daha az quyu il…ô daha √ßox hasilat …ôld…ô e...
    ‚úÖ Score: 43.2% (1.86s)
  Example2: Q…ôrbi Ab≈üeron yataƒüƒ±nda suvurma t…ôdbirl…ô...
    ‚úÖ Score: 41.7% (2.32s)
  Example3: Pirallahƒ± strukturunda 1253 n√∂mr…ôli quyu...
    ‚úÖ Score: 30.0% (2.23s)
  Example4: Bakƒ± arxipelaqƒ± (BA) v…ô A≈üaƒüƒ± K√ºr √ß√∂k…ôkl...
    ‚úÖ Score: 53.3% (3.55s)
  Example5: Bu zonada hansƒ± prosesl…ôr ba≈ü verir?...
    ‚úÖ Score: 30.0% (1.45s)
********************************
Config 2/7: bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused
  Example1: Daha az quyu il…ô daha √ßox hasilat …ôld…ô e...
    ‚úÖ Score: 65.0% (3.50s)
  Example2: Q…ôrbi Ab≈üeron yataƒüƒ±nda suvurma t…ôdbirl…ô...
    ‚úÖ Score: 41.7% (3.54s)
  Example3: Pirallahƒ± strukturunda 1253 n√∂mr…ôli quyu...
    ‚úÖ Score: 65.0% (3.78s)
  Example4: Bakƒ± arxipelaqƒ± (BA) v…ô A≈üaƒüƒ± K√ºr √ß√∂k…ôkl...
    ‚úÖ Score: 53.3% 

In [58]:
df = pd.DataFrame(results)

# Average every metric per configuration and rank by the overall score.
_metric_columns = [
    "LLM_Judge_Score",
    "Accuracy",
    "Citation_Score",
    "Completeness",
    "Response_Time",
]
config_summary = (
    df.groupby("Config")[_metric_columns]
    .mean()
    .round(2)
    .sort_values("LLM_Judge_Score", ascending=False)
)

_rule = "=" * 100
print("\n" + _rule)
print("üìä CONFIGURATION RANKINGS")
print(_rule)
print(config_summary.to_string())
print(_rule)


üìä CONFIGURATION RANKINGS
                                                            LLM_Judge_Score  Accuracy  Citation_Score  Completeness  Response_Time
Config                                                                                                                            
bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused             55.67      0.00           73.33         100.0           3.61
bge-large-en_vanilla_k3_Llama-4-Maverick_few_shot                     45.70      4.86           40.00         100.0           2.17
bge-large-en_vanilla_k3_Llama-4-Maverick_baseline                     39.65      7.57           20.00         100.0           2.28
bge-large-en_reranked_k3_Llama-4-Maverick_baseline                    37.31      7.57           13.33         100.0           3.02
bge-large-en_vanilla_k5_Llama-4-Maverick_baseline                     35.60      0.00           16.00         100.0           3.38
bge-large-en_mmr_balanced_Llama-4-Maverick_baseline   

In [59]:
# The summary is sorted best-first, so the top row is the winning configuration.
best_config = config_summary.iloc[0]

_banner = "=" * 100
_report_lines = [
    "\n" + _banner,
    "üèÜ OPTIMAL RAG CONFIGURATION",
    _banner,
    f"Best Configuration: {best_config.name}",
    f"LLM Judge Score: {best_config['LLM_Judge_Score']:.2f}%",
    f"Accuracy: {best_config['Accuracy']:.2f}%",
    f"Citation Quality: {best_config['Citation_Score']:.2f}%",
    f"Response Time: {best_config['Response_Time']:.2f}s",
    _banner,
]
print("\n".join(_report_lines))


üèÜ OPTIMAL RAG CONFIGURATION
Best Configuration: bge-large-en_vanilla_k3_Llama-4-Maverick_citation_focused
LLM Judge Score: 55.67%
Accuracy: 0.00%
Citation Quality: 73.33%
Response Time: 3.61s


In [60]:
# Save results using dynamic path
output_dir = OUTPUT_DIR / "rag_optimization_benchmark"
output_dir.mkdir(parents=True, exist_ok=True)

# (frame, file name, extra to_csv options): the per-question table drops its
# positional index, the summary keeps its "Config" index as the first column.
_exports = (
    (df, "detailed_results.csv", {"index": False}),
    (config_summary, "summary.csv", {}),
)
for _frame, _filename, _extra in _exports:
    _frame.to_csv(output_dir / _filename, encoding="utf-8", **_extra)

print("\n‚úÖ Results saved to output/rag_optimization_benchmark/")


‚úÖ Results saved to output/rag_optimization_benchmark/
