# SemEval 2026 Task 8: Multi-Turn RAG Evaluation

## Complete Pipeline for All Tasks

This notebook implements the complete RAG (Retrieval-Augmented Generation) pipeline for the MTRAGEval benchmark, generating submission files for all three evaluation tasks in a single execution.

---

### Task Overview

| Task | Description | Output |
|------|-------------|--------|
| **Task A** | Document Retrieval | Top-K relevant passages for each query |
| **Task B** | Standalone Generation | Answer using only LLM parametric knowledge |
| **Task C** | RAG Generation | Answer using retrieved context + LLM |

### Components

| Component | Source | Description |
|-----------|--------|-------------|
| Retriever | `src.retrieval` | Dense Search (BGE) + Cross-Encoder Reranking |
| Generator | `src.generation` | Llama-3.1-8B with constrained prompts |
| Pipeline | `All_Tasks_Pipeline` | Unified execution loop |

---

## 0. Environment Setup

In [None]:
# ============================================================
# KAGGLE/COLAB SETUP
# ============================================================
# import os
# if not os.path.exists("llm-semeval-task8"):
#     !git clone https://github.com/LookUpMark/llm-semeval-task8.git
# %cd llm-semeval-task8
# !git checkout dev
# !pip install -q langchain langchain-community langchain-huggingface langchain-qdrant qdrant-client sentence-transformers bitsandbytes accelerate transformers tqdm

---

## 1. Imports and Configuration

In [None]:
import json
import os
import sys
import zipfile
from pathlib import Path
from typing import Optional

from tqdm import tqdm

# Project Root Detection
# Three launch locations are recognised: the repository root (a "src" folder is
# visible), the parent of a fresh clone ("llm-semeval-task8" is visible), or a
# sub-directory of the repository (fall back to the parent directory).
# Every branch yields an ABSOLUTE path so that the paths derived from
# PROJECT_ROOT and the sys.path entry stay valid if the working directory
# changes later in the session.
if os.path.exists("src"):
    PROJECT_ROOT = os.getcwd()
elif os.path.exists("llm-semeval-task8"):
    PROJECT_ROOT = os.path.abspath("llm-semeval-task8")
else:
    PROJECT_ROOT = os.path.abspath("..")

# Make the project's `src` package importable, without duplicating the entry
# when the cell is re-run.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print(f"Project Root: {PROJECT_ROOT}")

# Import shared modules
# Every later cell depends on these names, so a failed import is fatal: print
# the hint for the user, then re-raise so execution stops here instead of
# surfacing later as an unrelated NameError.
try:
    from src.ingestion import load_and_chunk_data, build_vector_store
    from src.retrieval import get_retriever, get_qdrant_client
    from src.generation import create_generation_components
    # Import necessary LangChain components for custom chains
    from langchain_core.prompts import PromptTemplate
    from langchain_core.output_parsers import StrOutputParser
except ImportError as e:
    print(f"Error importing modules: {e}")
    print("Ensure you are running from the project root or notebook directory.")
    raise

In [None]:
# ============================================================
# CONFIGURATION
# ============================================================

# Submission identity, the corpora covered, and the single Qdrant collection
# that holds all of them.
TEAM_NAME = "Gbgers"
DOMAINS = ["govt", "clapnq", "fiqa", "cloud"]
COLLECTION_NAME = "mtrag_unified"

# Execution mode. When TEST_MODE is on, indexing keeps at most
# TEST_SUBSET_SIZE chunks per domain and the pipeline processes at most
# TEST_QUERY_LIMIT conversations per domain.
TEST_MODE = True
TEST_SUBSET_SIZE = 1000
TEST_QUERY_LIMIT = 5

# Input data and on-disk vector store, all rooted at PROJECT_ROOT.
CORPUS_BASE_DIR = os.path.join(PROJECT_ROOT, "dataset/corpora/passage_level")
CONVERSATIONS_FILE = os.path.join(PROJECT_ROOT, "dataset/human/conversations/conversations.json")
QDRANT_PATH = os.path.join(PROJECT_ROOT, "qdrant_db")

# Submission outputs: one JSONL file per task.
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data/submissions")
FILE_A = os.path.join(OUTPUT_DIR, f"submission_TaskA_{TEAM_NAME}.jsonl")
FILE_B = os.path.join(OUTPUT_DIR, f"submission_TaskB_{TEAM_NAME}.jsonl")
FILE_C = os.path.join(OUTPUT_DIR, f"submission_TaskC_{TEAM_NAME}.jsonl")

# Create the writable directories up front (no error if they already exist).
for _required_dir in (OUTPUT_DIR, QDRANT_PATH):
    os.makedirs(_required_dir, exist_ok=True)

print(f"Mode: {'TEST' if TEST_MODE else 'FULL'}")

---

## 2. Utility Functions

In [None]:
def extract_last_query(messages: list) -> str:
    """Return the text of the latest message whose speaker is "user".

    Messages are scanned from the end of the conversation backwards. An empty
    string is returned when no user message exists, or when the latest user
    message has no "text" key.
    """
    user_turns = (turn for turn in reversed(messages) if turn.get("speaker") == "user")
    latest = next(user_turns, None)
    if latest is None:
        return ""
    return latest.get("text", "")

def get_corpus_file(domain: str) -> Optional[str]:
    """Return the path of a domain's passage corpus, extracting it if needed.

    The corpus is expected at ``<CORPUS_BASE_DIR>/<domain>.jsonl``. When only
    ``<domain>.jsonl.zip`` is present, the archive is extracted into
    ``CORPUS_BASE_DIR`` first.

    Args:
        domain: Corpus name, used as the file stem.

    Returns:
        The path to the ``.jsonl`` file, or ``None`` when neither the file nor
        the archive exists, or when the archive did not produce the expected
        file.
    """
    jsonl_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl")
    if os.path.exists(jsonl_path):
        return jsonl_path

    zip_path = os.path.join(CORPUS_BASE_DIR, f"{domain}.jsonl.zip")
    if not os.path.exists(zip_path):
        return None

    print(f"Extracting {domain}.jsonl...")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(CORPUS_BASE_DIR)

    # The archive may store the corpus under another name or inside a
    # sub-folder; never hand back a path that does not exist.
    if not os.path.exists(jsonl_path):
        print(f"Warning: {zip_path} did not contain {domain}.jsonl at its top level.")
        return None
    return jsonl_path

def save_jsonl(data: list, path: str) -> None:
    """Write *data* to *path* as JSON Lines and print how many items were saved.

    Each item becomes one UTF-8 encoded line; non-ASCII characters are written
    as-is rather than escaped. An existing file at *path* is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )
    print(f"Saved {len(data)} items to {path}")

---

## 3. Build Unified Vector Index

In [None]:
# Decide whether the unified collection has to be built.
# The build is skipped only when the collection can be opened and is reported
# as existing; any failure while inspecting it falls back to building.
# NOTE(review): only existence is checked, so an index created with
# TEST_MODE=True (truncated corpus) is reused unchanged by a later full run.
# NOTE(review): `client` is left open after this cell. If get_qdrant_client does
# not hand out a shared instance, a second client on the same local path may be
# refused — confirm against src.retrieval.
need_build = True
if os.path.exists(QDRANT_PATH):
    try:
        client = get_qdrant_client(QDRANT_PATH)
        if client.collection_exists(COLLECTION_NAME):
            info = client.get_collection(COLLECTION_NAME)
            print(f"Collection found: {info.points_count} vectors")
            need_build = False
    except Exception as e:
        # Report the reason instead of swallowing it. `Exception` (not a bare
        # except) lets KeyboardInterrupt / SystemExit propagate.
        print(f"Could not inspect existing collection ({e}); it will be rebuilt.")

if need_build:
    # Load every domain corpus, tag each chunk with its domain, and index all
    # of them into one collection.
    print(f"Building collection '{COLLECTION_NAME}'...")
    all_docs = []
    for domain in DOMAINS:
        path = get_corpus_file(domain)
        if not path:
            # Make the omission visible: the index is built without this domain.
            print(f"Warning: no corpus found for domain '{domain}'; skipping.")
            continue
        docs = load_and_chunk_data(path)
        # Truncate first so metadata is only written to chunks that are kept.
        if TEST_MODE:
            docs = docs[:TEST_SUBSET_SIZE]
        for doc in docs:
            doc.metadata["domain"] = domain
        all_docs.extend(docs)

    build_vector_store(all_docs, persist_dir=QDRANT_PATH, collection_name=COLLECTION_NAME)
    print("Index built.")

---

## 4. Initialize Components (Retriever & Generator)

We use the unified `create_generation_components` factory to load the LLM (Llama 3.1) and all associated chains.

In [None]:
# --- Retriever -------------------------------------------------------------
# Built on the persisted Qdrant store and the unified collection.
print("Initializing Retriever...")
retriever = get_retriever(qdrant_path=QDRANT_PATH, collection_name=COLLECTION_NAME)

# --- Generation components (LLM + chains) ----------------------------------
# NOTE(review): the notebook states the model is loaded with 4-bit
# quantization; that happens inside src.generation and is not visible here —
# confirm there.
print("Initializing Generator (Llama 3.1 8B)...")
gen_components = create_generation_components(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct"
)

# --- Task B chain (standalone generation) ----------------------------------
# Task B is not RAG, so it gets its own prompt with no slot for retrieved
# documents; the only input variable is the user's question.
TASK_B_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert assistant with comprehensive knowledge. 
Task: Answer the user's question based solely on your internal knowledge. Do not use external documents.
Response: Be concise, accurate, and professional.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{question}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>"""

task_b_prompt = PromptTemplate(
    input_variables=["question"],
    template=TASK_B_TEMPLATE,
)

# Reuse the LLM already loaded in gen_components rather than loading another.
_task_b_prompted_llm = task_b_prompt | gen_components.llm
task_b_chain = _task_b_prompted_llm | StrOutputParser()

---

## 5. Execute Unified Pipeline

In [None]:
def _format_contexts(docs, domain):
    """Turn retrieved documents into submission contexts and a prompt context.

    Args:
        docs: Documents returned by the retriever; each exposes ``metadata``
            (a dict) and ``page_content``.
        domain: Domain being processed, used to build a fallback document id.

    Returns:
        A ``(contexts, context_text)`` tuple: ``contexts`` is a list of
        ``{"document_id", "score", "text"}`` dicts in retrieval order, and
        ``context_text`` is those texts joined as numbered
        ``[Document N]`` sections for the RAG prompt.
    """
    contexts = []
    sections = []
    for rank, doc in enumerate(docs):
        meta = doc.metadata
        # Prefer the parent passage text when the metadata carries one.
        text = meta.get("parent_text") or doc.page_content
        contexts.append({
            "document_id": str(meta.get("doc_id") or meta.get("parent_id") or f"{domain}_{rank}"),
            "score": float(meta.get("relevance_score") or 0.0),
            "text": text,
        })
        sections.append(f"[Document {rank + 1}]\n{text}\n\n")
    return contexts, "".join(sections)


print("Loading conversations...")
# Explicit UTF-8 so reading does not depend on the platform default encoding
# (matches how save_jsonl writes).
with open(CONVERSATIONS_FILE, 'r', encoding='utf-8') as f:
    all_conversations = json.load(f)

results_A, results_B, results_C = [], [], []
generation_failures = 0

for domain in DOMAINS:
    print(f"\n{'='*40}\nProcessing Domain: {domain.upper()}\n{'='*40}")

    # Case-insensitive substring match; `or ""` tolerates a null/missing domain.
    domain_key = domain.lower()
    domain_convs = [
        c for c in all_conversations
        if domain_key in (c.get("domain") or "").lower()
    ]
    if TEST_MODE:
        domain_convs = domain_convs[:TEST_QUERY_LIMIT]

    for conv in tqdm(domain_convs, desc=domain):
        messages = conv.get("messages") or []
        query = extract_last_query(messages)
        if not query:
            continue

        # NOTE(review): the conversation id is read from the "author" field and
        # the task id always ends in "::1" — confirm both against the dataset
        # schema and the submission format.
        conv_id = conv.get("author")

        # --- TASK A: Retrieval ---
        docs = retriever.invoke(query)
        contexts, context_text = _format_contexts(docs, domain)

        # --- TASK B: Standalone Generation ---
        # A failed generation is reported and recorded as an empty answer, so
        # exception text never ends up in the submission as a prediction.
        try:
            answer_b = task_b_chain.invoke({"question": query})
        except Exception as e:
            generation_failures += 1
            tqdm.write(f"[Task B] generation failed for {conv_id}: {e}")
            answer_b = ""

        # --- TASK C: RAG Generation (using src.generation.generator) ---
        try:
            # Uses key 'context' and 'question' as required by src/generation.py template
            answer_c = gen_components.generator.invoke({"context": context_text, "question": query})
        except Exception as e:
            generation_failures += 1
            tqdm.write(f"[Task C] generation failed for {conv_id}: {e}")
            answer_c = ""

        # --- Collect Results ---
        # Fields shared by the three task records.
        base = {
            "conversation_id": conv_id,
            "task_id": f"{conv_id}::1",
            "Collection": f"mt-rag-{domain}",
            "input": [{"speaker": m["speaker"], "text": m["text"]} for m in messages]
        }

        results_A.append({**base, "contexts": contexts})
        results_B.append({**base, "predictions": [{"text": answer_b}]})
        results_C.append({**base, "contexts": contexts, "predictions": [{"text": answer_c}]})

print(f"\nProcessing Complete. Total Conversations: {len(results_A)}")
if generation_failures:
    print(f"Warning: {generation_failures} generation call(s) failed and were saved as empty answers.")

---

## 6. Save Results

In [None]:
# Write one submission file per task, in task order A, B, C.
print("Saving submission files...")
for _task_results, _task_file in (
    (results_A, FILE_A),
    (results_B, FILE_B),
    (results_C, FILE_C),
):
    save_jsonl(_task_results, _task_file)
print("Done.")