# LLM Benchmarking for SOCAR Hackathon RAG Chatbot

Testing different LLM models for the `/llm` endpoint to find the best performer.

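## Evaluation Criteria (LLM Judge Metrics)
- **Accuracy**: Is the answer correct?
- **Relevance**: Are retrieved citations relevant?
- **Completeness**: Does it fully answer the question?
- **Citation Quality**: Proper sources with page numbers?
- **Response Time**: Speed of generation

> Note: the cells below do not call an LLM judge. They approximate these criteria with automatic proxies: WER-based similarity to the reference answer (`Similarity`), the share of retrieved PDF names mentioned in the answer (`Citation_Score`), and answer length, where 50 or more words counts as complete (`Completeness`).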

## Available LLM Models:
1. **Llama-4-Maverick-17B** (Open-source)
2. **DeepSeek-R1** (Open-source reasoning)
3. **GPT-4.1, GPT-5, GPT-5-mini**
4. **Claude Sonnet 4.5**

In [45]:
# Install required packages
# !pip install openai pinecone-client sentence-transformers python-dotenv pandas matplotlib seaborn jiwer

In [46]:
import os
import json
import time
from typing import Dict, List, Tuple
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from openai import AzureOpenAI
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from jiwer import wer, cer
from pathlib import Path

# Load settings (API keys, endpoints) via python-dotenv before any client is built.
load_dotenv()

# Plot defaults shared by every figure in this notebook.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (14, 8)})

print("‚úÖ Libraries loaded")

‚úÖ Libraries loaded


In [47]:
# Auto-detect project root.
#
# The project root is the nearest directory, starting at the current working
# directory and moving up through every ancestor (filesystem root included),
# that contains both a "data" and a "docs" entry. A single scan replaces the
# previous cwd / parent / walk-up checks, which overlapped and never tested
# the filesystem root. If no directory matches, fall back to the cwd.
_cwd = Path.cwd()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "data").exists() and (candidate / "docs").exists()
    ),
    _cwd,
)

# Standard sub-directories, all derived from the detected root.
DATA_DIR = PROJECT_ROOT / "data"
DOCS_DIR = PROJECT_ROOT / "docs"
OUTPUT_DIR = PROJECT_ROOT / "output"

print(f"‚úÖ Project root: {PROJECT_ROOT}")
print(f"‚úÖ Docs directory: {DOCS_DIR}")

‚úÖ Project root: /Users/ismatsamadov/SOCAR_Hackathon
‚úÖ Docs directory: /Users/ismatsamadov/SOCAR_Hackathon/docs


In [48]:
# Read the benchmark inputs from the docs directory (paths are derived from DOCS_DIR).
# `questions` is iterated later as {example_key: [chat messages]};
# `expected_answers` is looked up by the same example_key.
questions = json.loads(
    (DOCS_DIR / "sample_questions.json").read_text(encoding="utf-8")
)
expected_answers = json.loads(
    (DOCS_DIR / "sample_answers.json").read_text(encoding="utf-8")
)

print(f"Loaded {len(questions)} test cases")

Loaded 5 test cases


In [49]:
# Vector database: connect to Pinecone and open the index named by
# PINECONE_INDEX_NAME (defaults to "hackathon" when the variable is unset).
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
index = pc.Index(os.environ.get("PINECONE_INDEX_NAME", "hackathon"))

# Embedding model used to encode queries before searching the index.
# NOTE(review): presumably the same model that produced the indexed vectors — confirm.
embed_model = SentenceTransformer("BAAI/bge-large-en-v1.5")

print("‚úÖ Vector DB connected", "‚úÖ Embedding model loaded", sep="\n")

‚úÖ Vector DB connected
‚úÖ Embedding model loaded


In [50]:
def retrieve_documents(query: str, top_k: int = 3) -> List[Dict]:
    """Return the `top_k` closest matches for `query` from the vector index.

    The query is embedded with `embed_model` and sent to `index.query` with
    metadata enabled. Each match becomes a plain dict with the keys
    "pdf_name", "page_number", "content" and "score"; metadata fields that
    are absent fall back to "unknown.pdf", 0 and "" respectively, and a
    missing score falls back to 0.0.
    """
    response = index.query(
        vector=embed_model.encode(query).tolist(),
        top_k=top_k,
        include_metadata=True,
    )

    def to_record(hit) -> Dict:
        # Flatten one match into the record shape the rest of the notebook uses.
        meta = hit["metadata"]
        return {
            "pdf_name": meta.get("pdf_name", "unknown.pdf"),
            "page_number": meta.get("page_number", 0),
            "content": meta.get("text", ""),
            "score": hit.get("score", 0.0),
        }

    return [to_record(hit) for hit in response["matches"]]

print("‚úÖ Retrieval function ready")

‚úÖ Retrieval function ready


In [57]:
# Azure OpenAI client; key, endpoint and API version all come from the environment.
# The API version falls back to "2024-08-01-preview" when it is not set.
azure_client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2024-08-01-preview"),
)

# Display name -> value passed as `model=` to the chat completions call.
LLM_MODELS = {
    "Llama-4-Maverick": "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "DeepSeek-R1": "DeepSeek-R1",
    "GPT-4.1": "gpt-4.1",
    # Disabled entries, kept for reference:
    # "GPT-5-mini": "gpt-5-mini",
    # "Claude-Sonnet-4.5": "claude-sonnet-4-5",  # Not available in Azure deployment
}

print(f"‚úÖ Configured {len(LLM_MODELS)} LLM models")

‚úÖ Configured 3 LLM models


In [61]:
def generate_answer(model_name: str, query: str, documents: List[Dict],
                   temperature: float = 0.2, max_tokens: int = 1000) -> Tuple[str, float]:
    """Generate an answer to `query` with one of the configured LLM deployments.

    Args:
        model_name: Key into `LLM_MODELS`; an unknown key raises KeyError.
        query: The user question inserted into the prompt.
        documents: Retrieved context; each item must provide "pdf_name",
            "page_number" and "content".
        temperature: Sampling temperature. Not sent to "gpt-5*" deployments,
            which only accept their default value.
        max_tokens: Completion token limit. Sent as `max_completion_tokens`
            for "gpt-5*" deployments and as `max_tokens` otherwise.

    Returns:
        `(answer, seconds)` on success. On any API failure, or when the model
        returns no message content, `("ERROR: <reason>", 0.0)` is returned
        instead of raising; callers detect failures by the "ERROR" prefix.
    """
    context = "\n\n".join(
        f"Document {i} (Source: {doc['pdf_name']}, Page {doc['page_number']}):\n{doc['content']}"
        for i, doc in enumerate(documents, 1)
    )

    # Azerbaijani prompt. Roughly: "You are an expert assistant on SOCAR's
    # historical oil and gas documents. Context / Question / Answer in detail
    # and be sure to cite the document sources."
    prompt = f"""Siz SOCAR-ƒ±n tarixi neft v…ô qaz s…ôn…ôdl…ôri √ºzr…ô m√ºt…ôx…ôssis k√∂m…ôk√ßisisiniz.

Kontekst:
{context}

Sual: {query}

∆ètraflƒ± cavab verin v…ô m√ºtl…ôq s…ôn…ôd m…ônb…ôl…ôrin…ô istinad edin."""

    deployment = LLM_MODELS[model_name]

    # Build the request once; only the sampling/limit parameters differ per model family.
    request = {
        "model": deployment,
        "messages": [{"role": "user", "content": prompt}],
    }
    if deployment.startswith("gpt-5"):
        # GPT-5 deployments reject `max_tokens` and a non-default `temperature`,
        # so send only the completion-token limit.
        request["max_completion_tokens"] = max_tokens
    else:
        request["temperature"] = temperature
        request["max_tokens"] = max_tokens

    try:
        # perf_counter is monotonic, so the latency cannot be skewed by clock adjustments.
        started = time.perf_counter()
        response = azure_client.chat.completions.create(**request)
        response_time = time.perf_counter() - started
        answer = response.choices[0].message.content
    except Exception as e:
        # Deliberate catch-all: one failing model/question must not abort the
        # whole benchmark run; the caller skips "ERROR"-prefixed answers.
        return f"ERROR: {e}", 0.0

    if answer is None:
        # Content can be absent; keep the declared (str, float) contract.
        return "ERROR: model returned no message content", 0.0

    return answer, response_time

print("‚úÖ Generation function ready")

‚úÖ Generation function ready


In [62]:
def evaluate_answer(expected: str, generated: str, documents: List[Dict]) -> Dict:
    """Score a generated answer with three heuristic metrics, each in 0-100.

    Args:
        expected: Reference answer. Empty or whitespace-only means there is
            nothing to compare against, so Similarity is 0.
        generated: The model's answer; `None` is treated as an empty string.
        documents: Retrieved documents; only each item's "pdf_name" is read.

    Returns:
        Dict with "Similarity" (100 minus word error rate in percent, floored
        at 0), "Citation_Score" (percentage of retrieved PDF names, without
        the ".pdf" extension, that appear verbatim in the answer),
        "Completeness" (word count scaled so that 50 words = 100, capped at
        100) and "Quality_Score" (0.4 * Similarity + 0.4 * Citation_Score +
        0.2 * Completeness). All values are rounded to 2 decimals.
    """
    def normalize(text):
        return text.lower().strip()

    def pdf_stem(name):
        # Drop only the trailing extension; a ".pdf" inside the name is kept.
        return name[:-len(".pdf")] if name.endswith(".pdf") else name

    generated = generated or ""

    # Similarity: skip WER when the reference is blank after normalization,
    # since an empty reference cannot be scored.
    reference = normalize(expected) if expected else ""
    if reference:
        wer_score = wer(reference, normalize(generated)) * 100
        similarity = max(0, 100 - wer_score)
    else:
        similarity = 0

    # Citations: case-sensitive substring match of each retrieved PDF name.
    pdf_names = [pdf_stem(doc["pdf_name"]) for doc in documents]
    cited_pdfs = sum(1 for pdf in pdf_names if pdf in generated)
    citation_score = (cited_pdfs / len(pdf_names)) * 100 if pdf_names else 0

    # Completeness: a length proxy only; 50+ words counts as fully complete.
    word_count = len(generated.split())
    completeness = min(100, (word_count / 50) * 100)

    return {
        "Similarity": round(similarity, 2),
        "Citation_Score": round(citation_score, 2),
        "Completeness": round(completeness, 2),
        "Quality_Score": round((similarity * 0.4 + citation_score * 0.4 + completeness * 0.2), 2)
    }

print("‚úÖ Evaluation functions ready")

‚úÖ Evaluation functions ready


## Run LLM Benchmark

In [63]:
# Run benchmark: every configured model answers every sample question.
# One row per successful (model, question) pair is appended to `results`.
results = []

# Retrieval depends only on the question, not on the model, so each question's
# context is fetched once and reused for all models. This avoids repeated
# embedding/vector-DB calls and guarantees every model sees identical context.
retrieved_docs = {}

for model_name in LLM_MODELS:
    print("*******")
    print(f"Testing: {model_name}")
    print("**********")

    for example_key, messages in questions.items():
        # Only the most recent user turn is used as the query; earlier turns
        # in `messages` are not passed to retrieval or generation.
        user_msg = [m for m in messages if m["role"] == "user"][-1]
        query = user_msg["content"]

        print(f"  {example_key}: {query[:60]}...")

        # Retrieve (cached per question) and generate
        if example_key not in retrieved_docs:
            retrieved_docs[example_key] = retrieve_documents(query, top_k=3)
        documents = retrieved_docs[example_key]
        answer, response_time = generate_answer(model_name, query, documents)

        # Failed generations are reported and left out of the results, so a
        # model's averages cover only the questions it actually answered.
        if answer is None or answer.startswith("ERROR"):
            print(f"    ‚ùå {answer}")
            continue

        print(f"    ‚úÖ {response_time:.2f}s")

        # Evaluate against the reference answer ("" when none is provided)
        expected = expected_answers.get(example_key, {}).get("Answer", "")
        metrics = evaluate_answer(expected, answer, documents)

        results.append({
            "Model": model_name,
            "Question": example_key,
            "Response_Time": round(response_time, 2),
            **metrics
        })

print("*********")
print("‚úÖ Benchmark complete!")

*******
Testing: Llama-4-Maverick
**********
  Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...
    ✅ 4.31s
  Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...
    ✅ 4.61s
  Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...
    ✅ 3.92s
  Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...
    ✅ 4.13s
  Example5: Bu zonada hansı proseslər baş verir?...
    ✅ 3.50s
*******
Testing: DeepSeek-R1
**********
  Example1: Daha az quyu ilə daha çox hasilat əldə etmək üçün hansı əsas...
    ✅ 10.38s
  Example2: Qərbi Abşeron yatağında suvurma tədbirləri hansı tarixdə və ...
    ✅ 11.32s
  Example3: Pirallahı strukturunda 1253 nömrəli quyudan götürülmüş nümun...
    ✅ 10.45s
  Example4: Bakı arxipelaqı (BA) və Aşağı Kür çökəkliyi (AKÇ) üçün geote...
    ✅ 10.56s
  Example5: Bu zonada hansı proseslə

In [55]:
# Aggregate the per-question rows into one row per model: the mean of every
# metric, rounded to 2 decimals, with the best Quality_Score first.
df = pd.DataFrame(results)
summary = (
    df.groupby("Model")[
        ["Quality_Score", "Similarity", "Citation_Score", "Completeness", "Response_Time"]
    ]
    .mean()
    .round(2)
    .sort_values("Quality_Score", ascending=False)
)

# Print the table framed by 100-character rules.
print(f"\n{'=' * 100}")
print("üìä LLM BENCHMARKING RESULTS")
print("=" * 100)
print(summary.to_string())
print("=" * 100)


üìä LLM BENCHMARKING RESULTS
                  Quality_Score  Similarity  Citation_Score  Completeness  Response_Time
Model                                                                                   
GPT-4.1                   52.00        0.00           80.00         100.0           6.38
Llama-4-Maverick          52.00        0.00           80.00         100.0           4.00
DeepSeek-R1               32.27        1.54           33.33          91.6          10.98


In [56]:
# Persist both tables as UTF-8 CSV under <OUTPUT_DIR>/llm_benchmark, creating
# the folder (and any missing parents) on first use.
output_dir = OUTPUT_DIR.joinpath("llm_benchmark")
output_dir.mkdir(parents=True, exist_ok=True)

# The summary keeps its index because that is where the model names live;
# the detailed table drops its index column.
summary.to_csv(output_dir / "summary.csv", encoding="utf-8")
df.to_csv(output_dir / "detailed_results.csv", encoding="utf-8", index=False)

print("\n‚úÖ Results saved to output/llm_benchmark/")


‚úÖ Results saved to output/llm_benchmark/
