# SemEval 2026 Task 8: Multi-Turn RAG Evaluation

## Task A: Document Retrieval

This notebook implements **Task A** of the MTRAGEval benchmark, which evaluates the quality of document retrieval in a multi-turn conversational setting.

---

### Objective

Given a multi-turn conversation, retrieve from a domain-specific corpus the top-K documents most relevant to answering the user's latest question.

### Retrieval Pipeline

```
Query --> Dense Retrieval (BGE-M3) --> Top-20 Candidates
                                            |
                                            v
                                   Cross-Encoder Reranking
                                            |
                                            v
                                      Top-5 Documents
```

### Components

| Component | Model | Purpose |
|-----------|-------|--------|
| Embeddings | `BAAI/bge-m3` | Dense vector representation |
| Reranker | `BAAI/bge-reranker-v2-m3` | Cross-encoder scoring |
| Vector Store | Qdrant | Efficient similarity search |

---

## 1. Environment Setup

In [None]:
import json
import os
import sys
import zipfile
from pathlib import Path
from typing import Optional

from tqdm import tqdm

# Resolve the project root: the working directory if it contains "src",
# otherwise its parent directory.
PROJECT_ROOT = os.getcwd() if os.path.exists("src") else os.path.abspath("..")

# The project root must be on sys.path before the "src" package is imported.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from src.ingestion import (
    load_and_chunk_data,
    build_vector_store,
)
from src.retrieval import (
    get_retriever,
    get_qdrant_client,
)

print(f"Project Root: {PROJECT_ROOT}")

---

## 2. Configuration

In [None]:
# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------

# Submission identity and the corpora to process.
TEAM_NAME = "Gbgers"
DOMAINS = ["govt", "clapnq", "fiqa", "cloud"]

# Retrieval depth: candidates returned by dense search, then the number
# kept after cross-encoder reranking.
TOP_K_RETRIEVE = 20
TOP_K_RERANK = 5
COLLECTION_NAME = "mtrag_unified"

# Test mode caps both the chunks indexed per domain (TEST_SUBSET_SIZE) and
# the conversations queried per domain (TEST_QUERY_LIMIT).
TEST_MODE = True
TEST_SUBSET_SIZE = 1000
TEST_QUERY_LIMIT = 10

# Input locations.
CORPUS_BASE_DIR = os.path.join(PROJECT_ROOT, "dataset/corpora/passage_level")
CONVERSATIONS_FILE = os.path.join(PROJECT_ROOT, "dataset/human/conversations/conversations.json")

# Output locations.
QDRANT_PATH = os.path.join(PROJECT_ROOT, "qdrant_db")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data/submissions")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"submission_TaskA_{TEAM_NAME}.jsonl")

# Create the writable directories up front so later cells can assume they exist.
for _required_dir in (OUTPUT_DIR, QDRANT_PATH):
    os.makedirs(_required_dir, exist_ok=True)

print(f"Mode: {'TEST' if TEST_MODE else 'FULL'}")
print(f"Output: {OUTPUT_FILE}")

---

## 3. Utility Functions

In [None]:
def extract_last_query(messages: list) -> str:
    """Return the text of the latest user turn, or "" if there is none.

    Scans ``messages`` from the end and picks the first entry whose
    ``"speaker"`` is ``"user"``. A user entry without a ``"text"`` key
    yields the empty string.
    """
    user_texts = (
        turn.get("text", "")
        for turn in reversed(messages)
        if turn.get("speaker") == "user"
    )
    return next(user_texts, "")


def get_corpus_file(domain: str) -> Optional[str]:
    """Return the path to a domain's JSONL corpus, unzipping it on first use.

    Looks for ``<domain>.jsonl`` inside ``CORPUS_BASE_DIR``. When that file
    is missing but ``<domain>.jsonl.zip`` is present, the archive is
    extracted into ``CORPUS_BASE_DIR`` first.

    Args:
        domain: Corpus name, used as the file stem.

    Returns:
        The path to the JSONL file, or ``None`` when the corpus is
        unavailable: neither the file nor the archive exists, or the
        archive did not contain ``<domain>.jsonl``.

    Raises:
        zipfile.BadZipFile: If the archive exists but is not a valid ZIP.
    """
    jsonl_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl")
    if os.path.exists(jsonl_path):
        return jsonl_path

    zip_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl.zip")
    if not os.path.exists(zip_path):
        return None

    print(f"Extracting {domain}.jsonl...")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(CORPUS_BASE_DIR)

    # The archive may not hold a member named "<domain>.jsonl"; only hand
    # back a path that really exists so callers never load a missing file.
    if not os.path.exists(jsonl_path):
        print(f"Warning: {zip_path} did not contain {domain}.jsonl")
        return None
    return jsonl_path

---

## 4. Build Unified Vector Index

In [None]:
# Decide whether the vector index must be built: reuse the persisted
# collection when it already exists, otherwise build it from the corpora.
# The check does not look at how the collection was built, so an index
# created with TEST_MODE enabled is reused as-is by a later full run.
need_build = True

if os.path.exists(QDRANT_PATH):
    try:
        client = get_qdrant_client(QDRANT_PATH)
        if client.collection_exists(COLLECTION_NAME):
            info = client.get_collection(COLLECTION_NAME)
            print(f"Existing collection found: {info.points_count} vectors")
            need_build = False
    except Exception as e:
        # Best effort: any failure while probing leaves need_build True,
        # so the index is built below.
        print(f"Warning: {e}")
    # NOTE(review): `client` is never closed in this cell — confirm that
    # build_vector_store / get_retriever can open QDRANT_PATH while it is held.

if need_build:
    print(f"Building collection '{COLLECTION_NAME}'...")
    # Chunks from every domain are pooled into one list and indexed together.
    all_docs = []
    
    for domain in DOMAINS:
        corpus_path = get_corpus_file(domain)
        if not corpus_path:
            # A missing corpus skips the domain instead of aborting the build.
            print(f"Warning: Corpus not found for {domain}")
            continue
        
        print(f"Loading {domain}...")
        docs = load_and_chunk_data(corpus_path)
        
        # Tag every chunk with the domain it was loaded from.
        for doc in docs:
            doc.metadata["domain"] = domain
        
        # In test mode keep only the first TEST_SUBSET_SIZE chunks per domain.
        if TEST_MODE and len(docs) > TEST_SUBSET_SIZE:
            print(f"  Limiting to {TEST_SUBSET_SIZE} chunks")
            docs = docs[:TEST_SUBSET_SIZE]
        
        all_docs.extend(docs)
        print(f"  Added {len(docs)} chunks")
    
    print(f"Total: {len(all_docs)} documents")
    # NOTE(review): all_docs is empty when no corpus was found — confirm
    # build_vector_store accepts an empty list.
    build_vector_store(all_docs, persist_dir=QDRANT_PATH, collection_name=COLLECTION_NAME)
    print("Index built successfully.")

---

## 5. Initialize Retriever

In [None]:
# Create the retriever over the persisted collection, passing the configured
# retrieval and reranking depths.
print("Initializing retriever...")
retriever_settings = {
    "qdrant_path": QDRANT_PATH,
    "collection_name": COLLECTION_NAME,
    "top_k_retrieve": TOP_K_RETRIEVE,
    "top_k_rerank": TOP_K_RERANK,
}
retriever = get_retriever(**retriever_settings)
print("Retriever ready.")

---

## 6. Execute Retrieval

In [None]:
# Load conversations. Read the JSON explicitly as UTF-8 rather than relying
# on the platform's default encoding, matching how the results are written.
print("Loading conversations...")
with open(CONVERSATIONS_FILE, 'r', encoding='utf-8') as f:
    all_conversations = json.load(f)
print(f"Loaded {len(all_conversations)} conversations.")

# One entry per processed conversation, across all domains.
all_results = []

for domain in DOMAINS:
    print(f"\n{'='*50}")
    print(f"Domain: {domain.upper()}")
    print(f"{'='*50}")

    # Filter by domain with a case-insensitive substring match. `or ""`
    # keeps a conversation whose "domain" is null from raising on .lower().
    domain_key = domain.lower()
    domain_convs = [
        c for c in all_conversations
        if domain_key in (c.get("domain") or "").lower()
    ]
    print(f"Found {len(domain_convs)} conversations.")

    if not domain_convs:
        continue

    if TEST_MODE:
        print(f"Test mode: limiting to {TEST_QUERY_LIMIT} queries.")
        domain_convs = domain_convs[:TEST_QUERY_LIMIT]

    for conv in tqdm(domain_convs, desc=domain):
        # A null "messages" value is treated like an empty conversation.
        messages = conv.get("messages") or []
        query = extract_last_query(messages)

        # Conversations without a non-empty user turn produce no result row.
        if not query:
            continue

        # Retrieve documents. A failing query is reported and recorded with
        # an empty context list so the remaining conversations still run.
        try:
            docs = retriever.invoke(query)
        except Exception as e:
            print(f"Error: {e}")
            docs = []

        # Format contexts. Missing metadata falls back to a positional id,
        # a score of 0.0, and the chunk's own text.
        contexts = []
        for i, doc in enumerate(docs):
            meta = doc.metadata
            contexts.append({
                "document_id": str(meta.get("doc_id") or meta.get("parent_id") or f"{domain}_{i}"),
                "score": float(meta.get("relevance_score") or 0.0),
                "text": meta.get("parent_text") or doc.page_content
            })

        # Build result.
        # NOTE(review): the conversation id is taken from the "author" field —
        # confirm this is unique per conversation in conversations.json.
        all_results.append({
            "conversation_id": conv.get("author"),
            "task_id": f"{conv.get('author')}::1",
            "Collection": f"mt-rag-{domain}",
            "input": [{"speaker": m["speaker"], "text": m["text"]} for m in messages],
            "contexts": contexts
        })

print(f"\nTotal results: {len(all_results)}")

---

## 7. Save Results

In [None]:
print(f"Saving {len(all_results)} results to {OUTPUT_FILE}...")

# One JSON object per line (JSONL); non-ASCII text is written unescaped.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + '\n' for item in all_results
    )

print("Saved successfully.")

# Sanity check on the first record only: it must hold a list under "contexts".
if all_results:
    sample = all_results[0]
    has_context_list = "contexts" in sample and isinstance(sample["contexts"], list)
    if not has_context_list:
        print("Validation: FAIL - Invalid structure.")
    else:
        print("Validation: PASS - Structure correct.")