# SemEval 2026 Task 8: Multi-Turn RAG Evaluation

## Task C: Retrieval-Augmented Generation

This notebook implements **Task C** of the MTRAGEval benchmark, which evaluates the complete RAG pipeline: retrieving relevant documents and generating grounded answers.

---

### Objective

Given a multi-turn conversation:
1. Retrieve the most relevant documents from the corpus
2. Generate an answer that is grounded in the retrieved context

### Evaluation Criteria

| Criterion | Description |
|-----------|-------------|
| **Faithfulness** | Answer is supported by retrieved context |
| **Relevance** | Answer addresses the user's question |
| **Completeness** | All key aspects are covered |
| **Coherence** | Response is well-structured and clear |

---

## 1. Environment Setup

In [None]:
import os
import sys
import json
import zipfile
from tqdm import tqdm

# Locate the project root: the working directory when it contains `src`,
# otherwise the parent of the working directory.
PROJECT_ROOT = os.getcwd() if os.path.exists("src") else os.path.abspath("..")

# Put the root at the front of the import path so `src.*` modules resolve.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.ingestion import load_and_chunk_data, build_vector_store
from src.retrieval import get_retriever, get_qdrant_client

print(f"Project Root: {PROJECT_ROOT}")

---

## 2. Configuration

In [None]:
# ------------------------------------------------------------
# RUN CONFIGURATION
# ------------------------------------------------------------

# Team name (embedded in the submission file name) and corpora to process.
TEAM_NAME = "Gbgers"
DOMAINS = ["govt", "clapnq", "fiqa", "cloud"]

# Values handed to get_retriever() and the name of the Qdrant collection.
TOP_K_RETRIEVE = 20
TOP_K_RERANK = 5
COLLECTION_NAME = "mtrag_unified"

# TEST_MODE caps both the chunks indexed per domain (TEST_SUBSET_SIZE) and
# the conversations answered per domain (TEST_QUERY_LIMIT).
TEST_MODE = True
TEST_SUBSET_SIZE = 1000
TEST_QUERY_LIMIT = 5

# Input locations, all resolved against the project root.
CORPUS_BASE_DIR = os.path.join(PROJECT_ROOT, "dataset/corpora/passage_level")
CONVERSATIONS_FILE = os.path.join(PROJECT_ROOT, "dataset/human/conversations/conversations.json")

# Output locations.
QDRANT_PATH = os.path.join(PROJECT_ROOT, "qdrant_db")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data/submissions")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"submission_TaskC_{TEAM_NAME}.jsonl")

# Create the writable directories up front; existing ones are left alone.
for _required_dir in (OUTPUT_DIR, QDRANT_PATH):
    os.makedirs(_required_dir, exist_ok=True)

_mode_label = 'TEST' if TEST_MODE else 'FULL'
print(f"Mode: {_mode_label}")
print(f"Output: {OUTPUT_FILE}")

---

## 3. Utility Functions

In [None]:
def extract_last_query(messages: list) -> str:
    """Return the text of the latest user turn in a conversation.

    Messages are scanned from newest to oldest; the first one whose
    ``speaker`` is ``"user"`` wins. An empty string is returned when there
    is no user turn, or when that turn has no ``text`` key.
    """
    user_texts = (
        turn.get("text", "")
        for turn in reversed(messages)
        if turn.get("speaker") == "user"
    )
    return next(user_texts, "")


def get_corpus_file(domain: str) -> "str | None":
    """Return the path to a domain's JSONL corpus, unpacking its ZIP if needed.

    Looks for ``<domain>.jsonl`` in ``CORPUS_BASE_DIR``. When that file is
    absent but ``<domain>.jsonl.zip`` exists, the whole archive is extracted
    into ``CORPUS_BASE_DIR`` first.

    Args:
        domain: Corpus name, used as the file stem.

    Returns:
        The path to the JSONL file, or ``None`` when the corpus is not
        available: neither file exists, or the archive did not produce the
        expected ``<domain>.jsonl``.
    """
    jsonl_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl")
    if os.path.exists(jsonl_path):
        return jsonl_path

    zip_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl.zip")
    if not os.path.exists(zip_path):
        return None

    print(f"Extracting {domain}.jsonl...")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(CORPUS_BASE_DIR)

    # The archive may store the corpus under another name or inside a
    # sub-folder; never hand back a path that does not exist.
    if not os.path.exists(jsonl_path):
        print(f"Warning: {zip_path} did not contain {domain}.jsonl")
        return None
    return jsonl_path


def format_conversation_history(messages: list) -> str:
    """Format the turns that precede the current user query.

    The current query is the most recent message whose ``speaker`` is
    ``"user"``. Only the messages before it are rendered, so the query is
    never repeated inside the history and nothing after it is included.
    When there is no user turn at all, every message except the last one is
    rendered.

    Args:
        messages: Conversation turns; each is a dict read through the
            ``speaker`` and ``text`` keys.

    Returns:
        One ``"Speaker: text"`` line per prior turn, joined with newlines.
        Empty when there are no prior turns.
    """
    # Fallback: all but the final message (0 for an empty conversation).
    cutoff = max(len(messages) - 1, 0)
    for idx in range(len(messages) - 1, -1, -1):
        if messages[idx].get("speaker") == "user":
            cutoff = idx
            break

    return "\n".join(
        f"{msg.get('speaker', 'unknown').capitalize()}: {msg.get('text', '')}"
        for msg in messages[:cutoff]
    )

---

## 4. Build Vector Index

In [None]:
# Reuse the collection if the Qdrant store already holds it; otherwise build.
need_build = True

if os.path.exists(QDRANT_PATH):
    try:
        client = get_qdrant_client(QDRANT_PATH)
        collection_present = client.collection_exists(COLLECTION_NAME)
        if collection_present:
            info = client.get_collection(COLLECTION_NAME)
            print(f"Existing collection found: {info.points_count} vectors")
            need_build = False
    except Exception as e:
        # Any failure while probing the store falls through to a rebuild.
        print(f"Warning: {e}")

if need_build:
    print(f"Building collection '{COLLECTION_NAME}'...")
    all_docs = []

    for domain in DOMAINS:
        corpus_path = get_corpus_file(domain)
        if corpus_path:
            print(f"Loading {domain}...")
            docs = load_and_chunk_data(corpus_path)

            # Tag every chunk with the corpus it came from.
            for doc in docs:
                doc.metadata["domain"] = domain

            # Test runs index at most TEST_SUBSET_SIZE chunks per domain.
            if TEST_MODE:
                docs = docs[:TEST_SUBSET_SIZE]

            all_docs.extend(docs)
            print(f"  Added {len(docs)} chunks")

    print(f"Total: {len(all_docs)} documents")
    build_vector_store(
        all_docs,
        persist_dir=QDRANT_PATH,
        collection_name=COLLECTION_NAME,
    )
    print("Index built successfully.")

---

## 5. Initialize Retriever

In [None]:
print("Initializing retriever...")

# Collect the retriever settings in one place before handing them over.
retriever_settings = {
    "qdrant_path": QDRANT_PATH,
    "collection_name": COLLECTION_NAME,
    "top_k_retrieve": TOP_K_RETRIEVE,
    "top_k_rerank": TOP_K_RERANK,
}
retriever = get_retriever(**retriever_settings)

print("Retriever ready.")

---

## 6. Initialize Language Model

In [None]:
import torch
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

print(f"Loading model: {MODEL_ID}")

# 4-bit NF4 quantization with double quantization; matmuls run in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally - confirm it is actually needed for this model.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    # return_full_text=False: the pipeline returns only the generated
    # continuation, not the prompt.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        temperature=0.1,
        do_sample=True,
        repetition_penalty=1.1,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    print("Model loaded successfully.")

except Exception as e:
    # Best-effort fallback so the remaining cells can still run when the
    # model cannot be loaded. Every answer is then a fixed placeholder.
    print(f"Error loading model: {e}")
    print("WARNING: falling back to a dummy LLM; predictions will be placeholders.")
    # Import from langchain_core (a dependency of langchain_huggingface)
    # rather than the legacy `langchain.llms.fake` path, which requires the
    # separate `langchain` package and may not be importable.
    from langchain_core.language_models.fake import FakeListLLM
    llm = FakeListLLM(responses=["[Dummy response]"])

---

## 7. RAG Prompt Template

The prompt is designed for **grounded generation** with strict faithfulness to retrieved context:

- **Role**: Document-grounded QA specialist
- **Context Grounding**: Explicit instruction to use only provided documents
- **Hallucination Prevention**: Clear guidance on handling insufficient context
- **Citation Awareness**: Retrieved passages are labeled `[Document N]` in the context so that answers remain traceable to their sources

In [None]:
# ============================================================
# TASK C PROMPT: RETRIEVAL-AUGMENTED GENERATION
# ============================================================
# Filled in with str.format() by generate_answer(). It has three placeholders:
#   {context}               - the retrieved documents
#   {conversation_history}  - the earlier turns of the conversation
#   {question}              - the current user query
# The text ends at "RESPONSE:", leaving the answer to the model.

PROMPT_TEMPLATE = """You are a document-grounded question answering specialist. Your expertise is synthesizing information from retrieved documents to provide accurate, well-supported answers.

ROLE:
- You are a precise, factual assistant that bases answers strictly on provided documents
- You prioritize accuracy over speculation
- You clearly distinguish between what the documents state and what is uncertain

TASK:
Answer the user's question using ONLY the information in the provided context documents. Follow the grounding rules strictly.

GROUNDING RULES:
1. Base your answer EXCLUSIVELY on the provided context
2. If the context contains sufficient information, synthesize a coherent, complete answer
3. If the context is partially relevant, answer what you can and note what is missing
4. If the context does not contain relevant information, respond: "The provided documents do not contain sufficient information to answer this question."
5. DO NOT introduce information from outside the context, even if you know it to be true
6. Maintain the original meaning and do not over-interpret

RESPONSE FORMAT:
- Provide a direct answer to the question
- Be concise but complete
- Use clear, professional language

---
RETRIEVED CONTEXT:
{context}
---

CONVERSATION HISTORY:
{conversation_history}

USER QUERY: {question}

RESPONSE:"""


def generate_answer(question: str, context: str, conversation_history: str = "") -> str:
    """Fill the RAG prompt and ask the language model for an answer.

    Args:
        question: The current user query.
        context: Retrieved document text; when it is blank, the placeholder
            "[No documents retrieved]" is used instead.
        conversation_history: Earlier turns; when empty, the placeholder
            "[No prior context]" is used instead.

    Returns:
        Whatever ``llm.invoke`` returns for the prompt, or a
        "[Generation error: ...]" string if the call raises.
    """
    context_block = context if context.strip() else "[No documents retrieved]"
    history_block = conversation_history or "[No prior context]"
    prompt = PROMPT_TEMPLATE.format(
        question=question,
        context=context_block,
        conversation_history=history_block,
    )

    try:
        answer = llm.invoke(prompt)
    except Exception as err:
        # Keep the batch going: report the failure in place of the answer.
        answer = f"[Generation error: {err}]"
    return answer

---

## 8. Execute RAG Pipeline

In [None]:
# Load conversations
print("Loading conversations...")
# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default encoding (the submission file is written as UTF-8 as well).
with open(CONVERSATIONS_FILE, 'r', encoding='utf-8') as f:
    all_conversations = json.load(f)
print(f"Loaded {len(all_conversations)} conversations.")

all_results = []

for domain in DOMAINS:
    print(f"\n{'='*50}")
    print(f"Domain: {domain.upper()}")
    print(f"{'='*50}")

    # Filter by domain: case-insensitive substring match on the "domain"
    # field. A missing or null field never matches.
    domain_convs = [
        c for c in all_conversations
        if domain.lower() in (c.get("domain") or "").lower()
    ]
    print(f"Found {len(domain_convs)} conversations.")

    if not domain_convs:
        continue

    if TEST_MODE:
        print(f"Test mode: limiting to {TEST_QUERY_LIMIT} queries.")
        domain_convs = domain_convs[:TEST_QUERY_LIMIT]

    for conv in tqdm(domain_convs, desc=domain):
        messages = conv.get("messages", [])
        query = extract_last_query(messages)

        # Nothing to answer without a user query.
        if not query:
            continue

        # Retrieve documents; a retrieval failure degrades to "no context"
        # instead of aborting the run.
        try:
            docs = retriever.invoke(query)
        except Exception as e:
            print(f"Retrieval error: {e}")
            docs = []

        # Format contexts: one submission record and one prompt section per
        # retrieved document.
        contexts = []
        context_parts = []
        for i, doc in enumerate(docs):
            meta = doc.metadata
            # Prefer the parent passage when the metadata carries one.
            parent_text = meta.get("parent_text") or doc.page_content
            contexts.append({
                "document_id": str(meta.get("doc_id") or meta.get("parent_id") or f"{domain}_{i}"),
                "score": float(meta.get("relevance_score") or 0.0),
                "text": parent_text
            })
            context_parts.append(f"[Document {i+1}]\n{parent_text}")
        context_text = "\n\n".join(context_parts)

        # Format conversation history
        history = format_conversation_history(messages)

        # Generate answer with RAG
        answer = generate_answer(query, context_text.strip(), history)

        # Build result
        # NOTE(review): the conversation's "author" field is used as its ID -
        # confirm this matches the expected submission format.
        all_results.append({
            "conversation_id": conv.get("author"),
            "task_id": f"{conv.get('author')}::1",
            "Collection": f"mt-rag-{domain}",
            # .get() mirrors the lenient access used by the helper functions,
            # so a turn with a missing key does not abort the whole run.
            "input": [
                {"speaker": m.get("speaker"), "text": m.get("text", "")}
                for m in messages
            ],
            "contexts": contexts,
            "predictions": [{"text": answer}]
        })

print(f"\nTotal results: {len(all_results)}")

---

## 9. Save Results

In [None]:
print(f"Saving {len(all_results)} results to {OUTPUT_FILE}...")

# JSONL output: one JSON object per line, non-ASCII characters kept as-is.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + '\n'
        for item in all_results
    )

print("Saved successfully.")

# Validation: spot-check the first record only - both "contexts" and
# "predictions" must be present and be lists.
if all_results:
    sample = all_results[0]
    has_contexts = isinstance(sample.get("contexts"), list)
    has_predictions = isinstance(sample.get("predictions"), list)

    structure_ok = has_contexts and has_predictions
    print(
        "Validation: PASS - Structure correct."
        if structure_ok
        else "Validation: FAIL - Invalid structure."
    )