# SemEval 2026 Task 8: Multi-Turn RAG Evaluation

## Complete Pipeline for All Tasks

This notebook implements the complete RAG (Retrieval-Augmented Generation) pipeline for the MTRAGEval benchmark, generating submission files for all three evaluation tasks in a single execution.

---

### Task Overview

| Task | Description | Output |
|------|-------------|--------|
| **Task A** | Document Retrieval | Top-K relevant passages for each query |
| **Task B** | Standalone Generation | Answer using only LLM parametric knowledge |
| **Task C** | RAG Generation | Answer using retrieved context + LLM |

### Pipeline Architecture

```
Conversation --> Query Extraction --> Retrieval (Task A)
                                          |
                                          v
                          +---------------+---------------+
                          |                               |
                    Task B: LLM Only              Task C: RAG
                    (No Context)                  (With Context)
                          |                               |
                          v                               v
              submission_TaskB_*.jsonl        submission_TaskC_*.jsonl
```

---

## 0. Environment Setup (Kaggle/Colab)

Execute this cell first when running on Kaggle or Google Colab to clone the repository and install dependencies.

In [None]:
# ============================================================
# KAGGLE/COLAB ENVIRONMENT SETUP
# Uncomment the lines below when running on cloud platforms
# ============================================================

# import os
# 
# # Clone repository if not present
# if not os.path.exists("llm-semeval-task8"):
#     !git clone https://github.com/LookUpMark/llm-semeval-task8.git
# 
# # Change to project directory
# %cd llm-semeval-task8
# !git checkout dev
# 
# # Install dependencies
# !pip install -q \
#     langchain \
#     langchain-community \
#     langchain-huggingface \
#     langchain-qdrant \
#     qdrant-client \
#     sentence-transformers \
#     bitsandbytes \
#     accelerate \
#     transformers \
#     tqdm
# 
# # Verify GPU availability
# import torch
# print(f"CUDA Available: {torch.cuda.is_available()}")
# if torch.cuda.is_available():
#     print(f"GPU Device: {torch.cuda.get_device_name(0)}")
#     print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

---

## 1. Imports and Configuration

In [None]:
import os
import sys
import json
import zipfile
from tqdm import tqdm
from pathlib import Path

# ============================================================
# PROJECT ROOT DETECTION
# Automatically detect project root for both local and cloud
# ============================================================
# Resolution order, all relative to the current working directory:
#   1. "src" exists here                -> the working directory is the root
#   2. "llm-semeval-task8" exists here  -> that folder is the root
#   3. otherwise                        -> fall back to the parent directory
if os.path.exists("src"):
    PROJECT_ROOT = os.getcwd()
elif os.path.exists("llm-semeval-task8"):
    # Resolve to an absolute path like the other two branches, so the paths
    # derived from PROJECT_ROOT and the sys.path entry below keep working
    # even if the working directory changes later in the session.
    PROJECT_ROOT = os.path.abspath("llm-semeval-task8")
else:
    PROJECT_ROOT = os.path.abspath("..")

# Make the project importable (the notebook imports from the "src" package).
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print(f"Project Root: {PROJECT_ROOT}")

In [None]:
# ============================================================
# CONFIGURATION
# ============================================================

# --- Submission identity (embedded in the output file names) ---
TEAM_NAME = "Gbgers"

# --- Domains handled by the indexing and inference loops ---
DOMAINS = ["govt", "clapnq", "fiqa", "cloud"]

# --- Retriever settings ---
TOP_K_RETRIEVE = 20   # Candidates before reranking (high recall)
TOP_K_RERANK = 5      # Final documents after reranking (high precision)
COLLECTION_NAME = "mtrag_unified"

# --- Execution mode ---
# TEST_MODE = True caps both indexing and inference to small subsets;
# switch it to False to produce the full submission.
TEST_MODE = True
TEST_SUBSET_SIZE = 1000   # Chunks per domain (indexing)
TEST_QUERY_LIMIT = 5      # Conversations per domain (inference)

# --- Input and output locations, all anchored at PROJECT_ROOT ---
CORPUS_BASE_DIR = os.path.join(PROJECT_ROOT, "dataset/corpora/passage_level")
CONVERSATIONS_FILE = os.path.join(PROJECT_ROOT, "dataset/human/conversations/conversations.json")
QDRANT_PATH = os.path.join(PROJECT_ROOT, "qdrant_db")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data/submissions")

# --- One submission file per task ---
FILE_A = os.path.join(OUTPUT_DIR, f"submission_TaskA_{TEAM_NAME}.jsonl")
FILE_B = os.path.join(OUTPUT_DIR, f"submission_TaskB_{TEAM_NAME}.jsonl")
FILE_C = os.path.join(OUTPUT_DIR, f"submission_TaskC_{TEAM_NAME}.jsonl")

# Ensure the writable directories exist before any later cell uses them.
for _required_dir in (OUTPUT_DIR, QDRANT_PATH):
    os.makedirs(_required_dir, exist_ok=True)

# Report the active mode and, in test mode, the subset limits in effect.
if TEST_MODE:
    mode_str = "TEST MODE"
else:
    mode_str = "FULL MODE"
print(f"Execution Mode: {mode_str}")
if TEST_MODE:
    print(f"  - Index subset: {TEST_SUBSET_SIZE} chunks/domain")
    print(f"  - Query subset: {TEST_QUERY_LIMIT} conversations/domain")

---

## 2. Utility Functions

In [None]:
def extract_last_query(messages: list) -> str:
    """
    Return the text of the latest message whose speaker is "user".

    The conversation is scanned from its end towards its start, so the first
    match is the most recent user turn.

    Args:
        messages: List of message dictionaries with 'speaker' and 'text' keys.

    Returns:
        The 'text' value of the last user message ("" when that message has
        no 'text' key), or "" when no user message exists.
    """
    user_texts = (
        turn.get("text", "")
        for turn in reversed(messages)
        if turn.get("speaker") == "user"
    )
    # next() stops at the first match, i.e. the most recent user turn.
    return next(user_texts, "")


def get_corpus_file(domain: str) -> "str | None":
    """
    Get the path to a domain corpus file, extracting from ZIP if necessary.

    Looks for ``<domain>.jsonl`` inside ``CORPUS_BASE_DIR``. When that file is
    missing but ``<domain>.jsonl.zip`` is present, the archive is unpacked into
    ``CORPUS_BASE_DIR`` first.

    Args:
        domain: Domain name (e.g., 'govt', 'fiqa').

    Returns:
        Path to the JSONL file, or None when neither the file nor its archive
        exists, or when the archive did not produce the expected file.
    """
    jsonl_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl")
    if os.path.exists(jsonl_path):
        return jsonl_path

    zip_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl.zip")
    if not os.path.exists(zip_path):
        return None

    print(f"Extracting {domain}.jsonl from archive...")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(CORPUS_BASE_DIR)

    # The archive may hold a differently named or nested member; only report
    # success when the expected file really exists, so callers never receive
    # a path that points at nothing.
    if not os.path.exists(jsonl_path):
        print(f"Warning: {zip_path} did not contain {domain}.jsonl")
        return None
    return jsonl_path


def save_jsonl(data: list, path: str) -> None:
    """
    Write each dictionary in ``data`` as one JSON document per line.

    The target file is overwritten, encoded as UTF-8, and non-ASCII characters
    are written verbatim rather than escaped. A one-line summary is printed
    once the file has been written.

    Args:
        data: List of dictionaries to save.
        path: Output file path.
    """
    serialized_lines = (
        json.dumps(record, ensure_ascii=False) + '\n' for record in data
    )
    with open(path, 'w', encoding='utf-8') as sink:
        sink.writelines(serialized_lines)
    print(f"Saved {len(data)} items to {path}")

---

## 3. Build Unified Vector Index

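This step creates a single Qdrant collection containing embeddings from all four domains. The unified index enables cross-domain retrieval and reduces memory overhead. If the collection already exists on disk it is reused as-is, so delete the `qdrant_db` folder after changing `TEST_MODE` to force a rebuild.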

In [None]:
from src.ingestion import load_and_chunk_data, build_vector_store
from src.retrieval import get_retriever, get_qdrant_client

# Check for existing collection
# NOTE(review): an existing collection is reused whatever mode built it, so an
# index created with TEST_MODE = True (subset of chunks) is also picked up by a
# later full run. Remove QDRANT_PATH to force a rebuild.
need_build = True

if os.path.exists(QDRANT_PATH):
    try:
        client = get_qdrant_client(QDRANT_PATH)
        if client.collection_exists(COLLECTION_NAME):
            info = client.get_collection(COLLECTION_NAME)
            print(f"Found existing collection '{COLLECTION_NAME}' with {info.points_count} vectors.")
            need_build = False
    except Exception as e:
        # Best effort: if the store cannot be inspected, report the problem
        # and fall through to a rebuild.
        print(f"Warning: {e}")

if need_build:
    print(f"Building unified collection '{COLLECTION_NAME}'...")
    all_docs = []

    for domain in DOMAINS:
        corpus_path = get_corpus_file(domain)
        if not corpus_path:
            # A missing corpus skips this domain only; the others are still indexed.
            print(f"Warning: Corpus not found for domain '{domain}'")
            continue

        print(f"Loading {domain}...")
        docs = load_and_chunk_data(corpus_path)

        # Trim first so metadata is only attached to chunks that are kept.
        if TEST_MODE and len(docs) > TEST_SUBSET_SIZE:
            print(f"  Limiting to {TEST_SUBSET_SIZE} chunks (test mode)")
            docs = docs[:TEST_SUBSET_SIZE]

        # Add domain metadata for filtering
        for doc in docs:
            doc.metadata["domain"] = domain

        all_docs.extend(docs)
        print(f"  Added {len(docs)} chunks")

    print(f"Total documents to index: {len(all_docs)}")

    # Fail with a clear message instead of handing an empty list to the
    # vector store builder when every corpus was missing or empty.
    if not all_docs:
        raise RuntimeError(
            f"No documents were loaded from {CORPUS_BASE_DIR}; "
            "cannot build the vector index."
        )

    build_vector_store(all_docs, persist_dir=QDRANT_PATH, collection_name=COLLECTION_NAME)
    print("Index construction complete.")

---

## 4. Initialize Retriever and Language Model

In [None]:
# ============================================================
# RETRIEVER INITIALIZATION
# Two-stage retrieval: Dense search -> Cross-encoder reranking
# ============================================================
print("Initializing retriever...")

# Gather the settings in one place, then hand them over as keyword arguments.
retriever_settings = {
    "qdrant_path": QDRANT_PATH,
    "collection_name": COLLECTION_NAME,
    "top_k_retrieve": TOP_K_RETRIEVE,
    "top_k_rerank": TOP_K_RERANK,
}
retriever = get_retriever(**retriever_settings)

print("Retriever initialized successfully.")

In [None]:
# ============================================================
# LANGUAGE MODEL INITIALIZATION
# Using 4-bit quantization for memory efficiency
# ============================================================
import torch
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

print(f"Loading language model: {MODEL_ID}")

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # NOTE(review): trust_remote_code=True lets code shipped with the model
    # repository run locally; confirm it is really needed for this model.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        temperature=0.1,
        do_sample=True,
        repetition_penalty=1.1,
        return_full_text=False,
        # Reuse the EOS token id as the padding id.
        pad_token_id=tokenizer.eos_token_id
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    print("Language model loaded successfully.")

except Exception as e:
    print(f"Error loading model: {e}")
    if not TEST_MODE:
        # A full run must not silently fill the submission files with
        # placeholder answers: surface the failure instead. The dummy model
        # below is only meant for dry runs of the pipeline.
        raise
    print("Falling back to dummy LLM for pipeline testing.")
    from langchain.llms.fake import FakeListLLM
    llm = FakeListLLM(responses=["[Dummy response - model not loaded]"])

---

## 5. Prompt Templates

The following prompts are designed using established prompt engineering principles:

- **Role Assignment**: Defines the assistant's expertise and behavior
- **Task Specification**: Clear instructions on what to produce
- **Format Constraints**: Output structure requirements
- **Grounding**: Context-based answering for Task C

In [None]:
# ============================================================
# TASK B: STANDALONE GENERATION (No Context)
# The model must rely solely on its parametric knowledge.
# ============================================================
# Template placeholder: {question}. generate_answer fills it with str.format
# and uses this template whenever no (or an empty) context is passed.
PROMPT_TASK_B = """You are an expert assistant with comprehensive knowledge across government policy, technology, and finance domains.

Your task is to provide a direct, accurate, and informative answer to the user's question based solely on your training knowledge.

Guidelines:
- Provide factual, well-structured responses
- If uncertain, acknowledge limitations rather than fabricating information
- Be concise but thorough
- Use clear, professional language

Question: {question}

Answer:"""

# ============================================================
# TASK C: RAG GENERATION (With Retrieved Context)
# The model must ground its response in the provided documents.
# ============================================================
# Template placeholders: {context} and {question}. generate_answer fills them
# with str.format and uses this template only when a non-empty context is passed.
PROMPT_TASK_C = """You are an expert assistant specializing in document-grounded question answering.

Your task is to answer the user's question using ONLY the information provided in the context below. Follow these guidelines strictly:

Guidelines:
1. Base your answer exclusively on the provided context
2. If the context contains relevant information, synthesize it into a coherent response
3. If the context does not contain sufficient information to answer the question, explicitly state: "The provided documents do not contain enough information to answer this question."
4. Do not introduce external knowledge not present in the context
5. Maintain accuracy and avoid speculation

---
CONTEXT:
{context}
---

Question: {question}

Answer:"""


def generate_answer(question: str, context: str = None) -> str:
    """
    Build the prompt for a question and run it through the language model.

    A non-empty ``context`` selects the RAG template (PROMPT_TASK_C); a missing
    or empty one selects the standalone template (PROMPT_TASK_B).

    Args:
        question: The user's question.
        context: Optional retrieved context for RAG generation.

    Returns:
        The model output, or a bracketed error string when the model is not
        initialized or the generation call raises.
    """
    if llm is None:
        return "[Error: Language model not initialized]"

    # Choose the template together with exactly the fields it expects.
    template, fields = (
        (PROMPT_TASK_C, {"question": question, "context": context})
        if context
        else (PROMPT_TASK_B, {"question": question})
    )
    prompt = template.format(**fields)

    # Only the model call is guarded; failures are reported in-band as text.
    try:
        answer = llm.invoke(prompt)
    except Exception as exc:
        return f"[Generation error: {exc}]"
    return answer

---

## 6. Pipeline Execution

Process all conversations and generate outputs for all three tasks simultaneously.

In [None]:
# Load conversation data
print("Loading conversation data...")
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (the submission files are written as UTF-8 as well).
with open(CONVERSATIONS_FILE, 'r', encoding='utf-8') as f:
    all_conversations = json.load(f)
print(f"Loaded {len(all_conversations)} conversations.")

# Result containers
results_A = []  # Task A: Retrieval
results_B = []  # Task B: Generation (no context)
results_C = []  # Task C: RAG (with context)

# Process each domain
for domain in DOMAINS:
    print(f"\n{'='*60}")
    print(f"Processing Domain: {domain.upper()}")
    print(f"{'='*60}")

    # Filter conversations by domain (case-insensitive substring match).
    # "or ''" keeps a conversation whose domain is None from crashing the filter.
    domain_key = domain.lower()
    domain_convs = [
        c for c in all_conversations
        if domain_key in (c.get("domain") or "").lower()
    ]
    print(f"Found {len(domain_convs)} conversations.")

    if not domain_convs:
        continue

    # Apply test mode limit
    if TEST_MODE:
        print(f"Test mode: limiting to {TEST_QUERY_LIMIT} conversations.")
        domain_convs = domain_convs[:TEST_QUERY_LIMIT]

    # Process conversations
    for conv in tqdm(domain_convs, desc=f"{domain}"):
        messages = conv.get("messages", [])
        query = extract_last_query(messages)

        # Conversations without a user query produce no output in any task.
        if not query:
            continue

        # --- TASK A: Retrieval ---
        try:
            docs = retriever.invoke(query)
        except Exception as e:
            # Best effort: a failed retrieval yields empty contexts for this
            # conversation rather than aborting the whole run.
            print(f"Retrieval error: {e}")
            docs = []

        # Format retrieved contexts
        contexts = []
        passages = []
        for i, doc in enumerate(docs):
            meta = doc.metadata
            # Use the parent passage when the metadata carries one, otherwise
            # the retrieved chunk itself.
            parent_text = meta.get("parent_text") or doc.page_content
            contexts.append({
                "document_id": str(meta.get("doc_id") or meta.get("parent_id") or f"{domain}_{i}"),
                "score": float(meta.get("relevance_score") or 0.0),
                "text": parent_text
            })
            passages.append(parent_text)
        # Join once instead of growing a string inside the loop.
        context_text = "\n\n".join(passages)

        # --- TASK B: Generation (standalone) ---
        answer_b = generate_answer(query, context=None)

        # --- TASK C: RAG Generation ---
        # An empty context makes generate_answer use the standalone prompt.
        answer_c = generate_answer(query, context=context_text.strip())

        # --- Format Results ---
        # NOTE(review): the identifier is read from the "author" field and the
        # turn suffix is fixed at "::1"; confirm this matches the id scheme the
        # task expects.
        base_result = {
            "conversation_id": conv.get("author"),
            "task_id": f"{conv.get('author')}::1",
            "Collection": f"mt-rag-{domain}",
            "input": [{"speaker": m["speaker"], "text": m["text"]} for m in messages]
        }

        # Task A result
        result_a = base_result.copy()
        result_a["contexts"] = contexts
        results_A.append(result_a)

        # Task B result
        result_b = base_result.copy()
        result_b["predictions"] = [{"text": answer_b}]
        results_B.append(result_b)

        # Task C result
        result_c = base_result.copy()
        result_c["contexts"] = contexts
        result_c["predictions"] = [{"text": answer_c}]
        results_C.append(result_c)

print(f"\n{'='*60}")
print("Processing Complete")
print(f"{'='*60}")
print(f"Task A results: {len(results_A)}")
print(f"Task B results: {len(results_B)}")
print(f"Task C results: {len(results_C)}")

---

## 7. Save Submission Files

In [None]:
print("Saving submission files...")
print()

# Write the three task outputs in A, B, C order, one file each.
for task_results, task_file in (
    (results_A, FILE_A),
    (results_B, FILE_B),
    (results_C, FILE_C),
):
    save_jsonl(task_results, task_file)

print()
print("All submission files saved successfully.")

---

## 8. Output Validation

In [None]:
# Announce the validation stage; the bare print() emits a blank separator line.
print("Validating submission files...")
print()

def validate_task(results: list, task: str) -> tuple:
    """
    Validate the structure of task results.

    Every record is checked, not only the first one, so a malformed entry
    anywhere in the list is reported. Each field a task requires must be
    present and hold a list.

    Args:
        results: Result dictionaries produced for one task.
        task: Task code: "A" (needs 'contexts'), "B" (needs 'predictions'),
            or "C" (needs both).

    Returns:
        (is_valid, message) tuple
    """
    if not results:
        return False, "No results generated"

    required_fields = {
        "A": ("contexts",),
        "B": ("predictions",),
        "C": ("contexts", "predictions"),
    }.get(task)

    # Unknown task codes are never valid.
    if required_fields is None:
        return False, "Invalid structure"

    for index, record in enumerate(results):
        well_formed = isinstance(record, dict) and all(
            isinstance(record.get(field), list) for field in required_fields
        )
        if not well_formed:
            # Name the offending record so it can be located in the output.
            return False, f"Invalid structure (record {index})"

    return True, "Valid"


# (display label, result list, task code) for each task to check.
validation_results = [
    ("Task A (Retrieval)", results_A, "A"),
    ("Task B (Generation)", results_B, "B"),
    ("Task C (RAG)", results_C, "C")
]

# Print one PASS/FAIL line per task and remember each outcome.
outcomes = []
for label, task_results, task_code in validation_results:
    is_ok, detail = validate_task(task_results, task_code)
    print(f"  {label}: {'PASS' if is_ok else 'FAIL'} ({detail})")
    outcomes.append(is_ok)
all_valid = all(outcomes)

print()
if not all_valid:
    print("Validation errors detected. Please review the output.")
else:
    print("All validations passed. Ready for submission.")