# **Explore LLM Evaluation Techniques**

In [None]:
pip install nltk rouge-score bert-score

# **1- Automated Metrics**
## For Answer Quality (text generation):
## **BERTscore, ROUGE, and BLEU**

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import bert_score

def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    A deliberately simple stand-in for ``nltk.word_tokenize``: punctuation
    is not separated and stays attached to the neighbouring word.

    Returns:
        list[str]: the lower-cased tokens (empty list for blank input).
    """
    lowered = text.lower()
    return lowered.split()

def compute_bleu(reference, candidate):
    """Return the smoothed sentence-level BLEU score of ``candidate``.

    Both strings are split with ``tokenize``. The single reference is wrapped
    in a list before being handed to ``sentence_bleu``, and smoothing
    ``method4`` of NLTK's ``SmoothingFunction`` is applied.

    Args:
        reference: the ground-truth text.
        candidate: the generated text being scored.
    """
    hypothesis = tokenize(candidate)
    references = [tokenize(reference)]
    return sentence_bleu(
        references,
        hypothesis,
        smoothing_function=SmoothingFunction().method4,
    )

def compute_rouge(reference, candidate):
    """Score ``candidate`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    A new ``RougeScorer`` with stemming enabled is built on every call.

    Returns:
        The value produced by ``RougeScorer.score(reference, candidate)``,
        unchanged.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, candidate)

def compute_bert_score(references, candidates):
    """Compute BERTScore for the given texts and average each component.

    Note the argument order: this function takes ``references`` first, but
    ``bert_score.score`` is called with ``candidates`` first.

    Args:
        references: ground-truth texts.
        candidates: generated texts, passed alongside ``references``.

    Returns:
        dict: the mean ``"precision"``, ``"recall"`` and ``"f1"`` as Python
        scalars (via ``.mean().item()``).
    """
    precision, recall, f1 = bert_score.score(candidates, references, lang="en", verbose=False)
    averaged = {}
    for label, values in (("precision", precision), ("recall", recall), ("f1", f1)):
        # Collapse the per-pair scores into one scalar per component.
        averaged[label] = values.mean().item()
    return averaged

if __name__ == "__main__":
    # Demo pair: the same statement with a different word order.
    reference = "The capital of France is Paris."
    candidate = "Paris is the capital of France."

    print("BLEU Score:", compute_bleu(reference, candidate))

    # Report only the F-measure of each ROUGE variant, rounded to 4 decimals.
    rouge_scores = compute_rouge(reference, candidate)
    print(
        "ROUGE Scores:",
        {metric: round(result.fmeasure, 4) for metric, result in rouge_scores.items()},
    )

    # compute_bert_score works on batches, so wrap the single pair in lists.
    bert_scores = compute_bert_score([reference], [candidate])
    print(
        "BERTScore:",
        {metric: round(value, 4) for metric, value in bert_scores.items()},
    )


# **2- LLM As A Judge "G-Eval"**

In [None]:
!pip install openai

In [None]:

def judge_with_openai(question, candidate, reference, *, client=None, model="gpt-4"):
    """Ask an OpenAI chat model to grade ``candidate`` against ``reference``.

    The model is prompted to rate relevance, faithfulness and correctness on a
    1-5 scale and to answer with a JSON object. The reply is printed and also
    returned so callers can parse it.

    Args:
        question: the question that was asked.
        candidate: the answer being graded.
        reference: the ground-truth answer.
        client: object exposing ``chat.completions.create``. When omitted, a
            module-level ``client`` is used if one exists; otherwise a new
            ``openai.OpenAI()`` is created (it reads the API key from the
            ``OPENAI_API_KEY`` environment variable).
        model: name of the chat model used as the judge.

    Returns:
        str: the model's reply with surrounding whitespace removed, or an
        empty string if the reply has no text content. The reply is not
        validated as JSON.
    """
    if client is None:
        # Backward compatibility: the function used to read a global `client`.
        client = globals().get("client")
    if client is None:
        # Lazy import so that defining this function does not require openai.
        from openai import OpenAI
        client = OpenAI()

    prompt = f"""
Evaluate this answer to a question using 1–5 ratings:

Question: {question}
Candidate Answer: {candidate}
Reference Answer: {reference}

Give ratings (1 to 5) for:
- Relevance
- Faithfulness
- Correctness

Return JSON like: {{"relevance": 4, "faithfulness": 5, "correctness": 4}}
"""

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert grader for short answers."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,  # keep the grading as repeatable as possible
        max_tokens=150
    )

    # `content` can be None; fall back to an empty string instead of crashing.
    content = response.choices[0].message.content
    result = (content or "").strip()
    print(result)
    return result

# Example usage: grade a paraphrased but correct answer.
question = "What is the capital of France?"
candidate = "Paris is the capital of France."
reference = "The capital of France is Paris."

judge_with_openai(
    question=question,
    candidate=candidate,
    reference=reference,
)


# **3- Retrieval Component Metrics for RAG**


**Precision@k**: Measures the proportion of relevant documents in the top k retrieved documents.

**Recall@k**: Measures the proportion of all relevant documents that appear among the top k retrieved documents.

In [None]:
import numpy as np

def precision_at_k(retrieved_docs, relevant_docs, k):
    """Precision@k for two position-aligned 0/1 indicator sequences.

    Only the first ``k`` entries of each sequence are inspected. A position is
    a hit when both sequences hold ``1`` there, and the hit count is divided
    by ``k``.

    Args:
        retrieved_docs: sequence of 0/1 flags.
        relevant_docs: sequence of 0/1 flags aligned with ``retrieved_docs``.
        k: cut-off; must be positive.

    Returns:
        The number of hits in the first ``k`` positions divided by ``k``.

    Raises:
        ValueError: if ``k`` is not positive, or if the two slices differ in
            length (which would otherwise broadcast silently or fail with an
            opaque NumPy error).
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    retrieved_at_k = np.asarray(retrieved_docs[:k])
    relevant_at_k = np.asarray(relevant_docs[:k])
    if retrieved_at_k.shape != relevant_at_k.shape:
        raise ValueError(
            "retrieved_docs and relevant_docs must have the same length within the top k"
        )

    true_positives = np.sum((retrieved_at_k == 1) & (relevant_at_k == 1))
    return true_positives / k

def recall_at_k(retrieved_docs, relevant_docs, k):
    """Recall@k for two position-aligned 0/1 indicator sequences.

    Hits are positions among the first ``k`` where both sequences hold ``1``.
    The denominator is the number of ``1`` entries in the whole of
    ``relevant_docs``, not only its first ``k`` entries.

    Args:
        retrieved_docs: sequence of 0/1 flags.
        relevant_docs: sequence of 0/1 flags aligned with ``retrieved_docs``.
        k: cut-off; must not be negative.

    Returns:
        Hits divided by the total number of relevant entries, or ``0.0`` when
        ``relevant_docs`` contains no ``1``.

    Raises:
        ValueError: if ``k`` is negative, or if the two slices differ in
            length (which would otherwise broadcast silently or fail with an
            opaque NumPy error).
    """
    if k < 0:
        raise ValueError(f"k must not be negative, got {k!r}")

    retrieved_at_k = np.asarray(retrieved_docs[:k])
    relevant_at_k = np.asarray(relevant_docs[:k])
    if retrieved_at_k.shape != relevant_at_k.shape:
        raise ValueError(
            "retrieved_docs and relevant_docs must have the same length within the top k"
        )

    true_positives = np.sum((retrieved_at_k == 1) & (relevant_at_k == 1))
    total_relevant = np.sum(np.asarray(relevant_docs) == 1)  # use all, not just top-k
    if total_relevant == 0:
        return 0.0
    return true_positives / total_relevant

# Example usage with 0/1 flags.
relevant_docs = [1, 0, 1, 0, 1]
retrieved_docs = [1, 1, 0, 1, 0]

# Evaluate both metrics at the same cut-off.
precision_k3 = precision_at_k(retrieved_docs, relevant_docs, k=3)
recall_k3 = recall_at_k(retrieved_docs, relevant_docs, k=3)

print(f"Precision at k=3: {precision_k3:.2f}")
print(f"Recall at k=3: {recall_k3:.2f}")


# **4- Benchmark Datasets for RAG and LLM Evaluation**

In [None]:
# MS MARCO (Microsoft MAchine Reading COmprehension):
# Contains a large set of questions and answers to evaluate retrieval-based models.

def precision_at_k(retrieved_docs, relevant_docs, k):
    """Precision@k over document identifiers.

    Args:
        retrieved_docs: ranked sequence of retrieved document IDs.
        relevant_docs: iterable of relevant (hashable) document IDs.
        k: cut-off; must be positive.

    Returns:
        float: the number of the first ``k`` retrieved IDs that are relevant,
        divided by ``k`` (also when fewer than ``k`` documents were retrieved).

    Raises:
        ValueError: if ``k`` is not positive. Previously ``k == 0`` raised
            ``ZeroDivisionError`` and a negative ``k`` returned a negative score.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    relevant_set = set(relevant_docs)
    true_positives = sum(doc in relevant_set for doc in retrieved_docs[:k])
    return true_positives / k

def recall_at_k(retrieved_docs, relevant_docs, k):
    """Recall@k over document identifiers.

    Each relevant document is counted at most once, so a ranking that repeats
    an ID cannot push recall above 1.

    Args:
        retrieved_docs: ranked sequence of retrieved (hashable) document IDs.
        relevant_docs: iterable of relevant (hashable) document IDs.
        k: cut-off; must not be negative.

    Returns:
        The fraction of distinct relevant IDs found among the first ``k``
        retrieved IDs, or ``0.0`` when there are no relevant IDs.

    Raises:
        ValueError: if ``k`` is negative (a negative slice bound would drop
            items from the end of the ranking instead of truncating it).
    """
    if k < 0:
        raise ValueError(f"k must not be negative, got {k!r}")

    relevant_set = set(relevant_docs)
    if not relevant_set:
        return 0.0

    # Set intersection de-duplicates repeated IDs in the ranking.
    found = relevant_set.intersection(retrieved_docs[:k])
    return len(found) / len(relevant_set)

def evaluate_with_msmarco(query, retrieved_docs, ground_truth, k=10):
    """Print Precision@k and Recall@k for one query's retrieval result.

    Args:
        query: the query text. Accepted for context only; it is not used in
            the computation.
        retrieved_docs: ranked sequence of retrieved document IDs.
        ground_truth: iterable of relevant document IDs for the query.
        k: cut-off used for both metrics (defaults to 10).
    """
    precision_k = precision_at_k(retrieved_docs, ground_truth, k=k)
    recall_k = recall_at_k(retrieved_docs, ground_truth, k=k)
    print("🧪 MS MARCO Evaluation:")
    print(f"   - Precision@{k}: {precision_k:.2f}")
    print(f"   - Recall@{k}:    {recall_k:.2f}")

# Example with document IDs: a ranked top-10 list and the relevant set.
query = "What is the capital of France?"
retrieved_docs = [
    'D1', 'D2', 'D5', 'D9', 'D3',
    'D6', 'D7', 'D8', 'D10', 'D11',
]
ground_truth = {'D1', 'D3', 'D5'}  # MS MARCO provides these as QRELs

evaluate_with_msmarco(
    query=query,
    retrieved_docs=retrieved_docs,
    ground_truth=ground_truth,
)
