# SemEval 2026 Task 8: Multi-Turn RAG Evaluation

## Task B: Standalone Generation

This notebook implements **Task B** of the MTRAGEval benchmark, which evaluates the quality of answer generation using only the language model's parametric knowledge (no retrieval).

---

### Objective

Given a multi-turn conversation, generate an accurate and informative answer using only the LLM's internal knowledge, without access to external documents.

### Evaluation Focus

- **Factual Accuracy**: Correctness of generated information
- **Relevance**: How well the answer addresses the query
- **Coherence**: Logical structure and clarity
- **Completeness**: Coverage of key aspects

---

## 1. Environment Setup

In [None]:
import os
import sys
import json
from tqdm import tqdm

# Resolve the project root: the current working directory when it contains a
# "src" entry, otherwise the absolute path of its parent directory.
PROJECT_ROOT = os.getcwd() if os.path.exists("src") else os.path.abspath("..")

# Put the root at the front of the import path, but only once.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print(f"Project Root: {PROJECT_ROOT}")

---

## 2. Configuration

In [None]:
# ============================================================
# CONFIGURATION
# ============================================================
# Module-level settings read by the later cells: team/domain selection,
# test-mode limits, and the input/output file locations.

# Team identifier; embedded in the submission file name below.
TEAM_NAME = "Gbgers"
# Domain tags iterated by the generation loop.
DOMAINS = ["govt", "clapnq", "fiqa", "cloud"]

# Execution mode
# When TEST_MODE is True, the generation loop keeps only the first
# TEST_QUERY_LIMIT conversations of each domain.
TEST_MODE = True
TEST_QUERY_LIMIT = 5

# Paths
# All locations are resolved relative to PROJECT_ROOT.
CONVERSATIONS_FILE = os.path.join(PROJECT_ROOT, "dataset/human/conversations/conversations.json")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data/submissions")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"submission_TaskB_{TEAM_NAME}.jsonl")

# Create the output directory up front; no error if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Mode: {'TEST' if TEST_MODE else 'FULL'}")
print(f"Output: {OUTPUT_FILE}")

---

## 3. Utility Functions

In [None]:
def extract_last_query(messages: list) -> str:
    """Return the text of the latest user turn in *messages*.

    The conversation is scanned from the end; the first message whose
    "speaker" equals "user" supplies the result. An empty string is returned
    when there is no user turn, or when that turn has no "text" key.
    """
    user_texts = (
        turn.get("text", "")
        for turn in reversed(messages)
        if turn.get("speaker") == "user"
    )
    return next(user_texts, "")


def format_conversation_history(messages: list) -> str:
    """Format the turns preceding the current query as "Speaker: text" lines.

    The current query is taken to be the last message whose "speaker" is
    "user" (the same turn extract_last_query() returns). Everything from that
    turn onward is left out, so the query is never repeated inside the
    history, even when the conversation ends with a non-user turn.

    Args:
        messages: Conversation turns; each is a dict with optional "speaker"
            and "text" keys.

    Returns:
        The earlier turns joined with newlines, one "Speaker: text" line per
        turn, or "" when nothing precedes the query.
    """
    # Default: drop only the final message (used when there is no user turn).
    cutoff = len(messages) - 1
    for index in range(len(messages) - 1, -1, -1):
        if messages[index].get("speaker") == "user":
            cutoff = index
            break

    history = []
    # max() guards the empty-conversation case, where cutoff would be -1.
    for msg in messages[:max(cutoff, 0)]:
        speaker = msg.get("speaker", "unknown").capitalize()
        text = msg.get("text", "")
        history.append(f"{speaker}: {text}")
    return "\n".join(history)

---

## 4. Initialize Language Model

In [None]:
import torch
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

# Hugging Face model identifier passed to both from_pretrained() calls below.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

print(f"Loading model: {MODEL_ID}")

# 4-bit quantization
# NF4 quantization type, float16 compute dtype, double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Load tokenizer and model, build a text-generation pipeline, and wrap it as
# `llm`. Any exception raised along the way is handled by the fallback below.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    # Generation settings: at most 256 new tokens, sampling enabled at
    # temperature 0.1, repetition penalty 1.1. return_full_text=False asks the
    # pipeline for the generated text only; the EOS token id is reused as the
    # padding token id.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        temperature=0.1,
        do_sample=True,
        repetition_penalty=1.1,
        return_full_text=False,
        pad_token_id=tokenizer.eos_token_id
    )

    llm = HuggingFacePipeline(pipeline=pipe)
    print("Model loaded successfully.")

except Exception as e:
    # Fallback: report the error and bind `llm` to a fake LLM configured with
    # a single canned response, so `llm` is still defined after this cell.
    # NOTE(review): with this fallback active, generated answers are
    # presumably all "[Dummy response]" - confirm before submitting output.
    # NOTE(review): `langchain.llms.fake` looks like a legacy import path -
    # confirm it resolves with the installed LangChain version.
    print(f"Error loading model: {e}")
    from langchain.llms.fake import FakeListLLM
    llm = FakeListLLM(responses=["[Dummy response]"])

---

## 5. Prompt Template

The prompt is designed using established prompt engineering principles:

- **Role Definition**: Expert assistant with domain expertise
- **Task Specification**: Clear instruction on response requirements
- **Behavioral Guidelines**: Constraints on response quality
- **Conversation Awareness**: Multi-turn context handling

In [None]:
# ============================================================
# TASK B PROMPT: STANDALONE GENERATION
# ============================================================
# Template filled with str.format() by generate_answer(). It has exactly two
# placeholders: {conversation_history} and {question}. Because str.format()
# is used, any literal brace added to this text must be doubled ("{{", "}}").

PROMPT_TEMPLATE = """You are an expert assistant with comprehensive knowledge spanning government policy, technology, finance, and general knowledge domains.

ROLE:
- You are a knowledgeable, helpful, and precise assistant
- You provide accurate information based on your training knowledge
- You communicate clearly and professionally

TASK:
Answer the user's question based solely on your internal knowledge. Do not reference or request external documents.

GUIDELINES:
1. Provide a direct, focused answer to the question
2. Be factually accurate - if uncertain, acknowledge limitations
3. Structure your response logically
4. Be concise while ensuring completeness
5. Use clear, professional language

---
CONVERSATION CONTEXT:
{conversation_history}
---

USER QUERY: {question}

RESPONSE:"""


def generate_answer(question: str, conversation_history: str = "") -> str:
    """Fill the prompt template and ask the LLM for an answer.

    An empty *conversation_history* is replaced by the placeholder
    "[No prior context]". If the LLM call raises, the exception is not
    propagated; a "[Generation error: ...]" string is returned instead.
    """
    context = conversation_history or "[No prior context]"
    filled_prompt = PROMPT_TEMPLATE.format(
        conversation_history=context,
        question=question,
    )

    try:
        reply = llm.invoke(filled_prompt)
    except Exception as err:
        return f"[Generation error: {err}]"
    return reply

---

## 6. Execute Generation

In [None]:
# Load conversations
print("Loading conversations...")
# Pin the encoding: JSON text is UTF-8, and without an explicit encoding open()
# falls back to the platform locale, which can fail or mis-decode non-ASCII
# text. The submission file is written as UTF-8 as well.
with open(CONVERSATIONS_FILE, 'r', encoding='utf-8') as f:
    all_conversations = json.load(f)
print(f"Loaded {len(all_conversations)} conversations.")

# Collects one submission record per answered conversation, across all domains.
all_results = []

for domain in DOMAINS:
    print(f"\n{'='*50}")
    print(f"Domain: {domain.upper()}")
    print(f"{'='*50}")

    # Filter by domain: case-insensitive substring match on the "domain" field
    domain_convs = [c for c in all_conversations if domain.lower() in c.get("domain", "").lower()]
    print(f"Found {len(domain_convs)} conversations.")

    if not domain_convs:
        continue

    if TEST_MODE:
        # The limit is applied to conversations before empty queries are skipped.
        print(f"Test mode: limiting to {TEST_QUERY_LIMIT} queries.")
        domain_convs = domain_convs[:TEST_QUERY_LIMIT]

    for conv in tqdm(domain_convs, desc=domain):
        messages = conv.get("messages", [])
        query = extract_last_query(messages)

        # Nothing to answer when the conversation has no user query
        if not query:
            continue

        # Format conversation history
        history = format_conversation_history(messages)

        # Generate answer
        answer = generate_answer(query, history)

        # Build result
        # NOTE(review): the "author" field is used as the conversation
        # identifier and the task id suffix is fixed at "::1" - confirm both
        # against the dataset schema and the submission format.
        all_results.append({
            "conversation_id": conv.get("author"),
            "task_id": f"{conv.get('author')}::1",
            "Collection": f"mt-rag-{domain}",
            "input": [{"speaker": m["speaker"], "text": m["text"]} for m in messages],
            "predictions": [{"text": answer}]
        })

print(f"\nTotal results: {len(all_results)}")

---

## 7. Save Results

In [None]:
print(f"Saving {len(all_results)} results to {OUTPUT_FILE}...")

# JSONL output: one JSON object per line, non-ASCII characters kept verbatim.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
    out.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in all_results
    )

print("Saved successfully.")

# Validation
# Only the first record is inspected: it must hold a list under "predictions".
if all_results:
    first_record = all_results[0]
    predictions_ok = (
        "predictions" in first_record
        and isinstance(first_record["predictions"], list)
    )
    verdict = "PASS - Structure correct." if predictions_ok else "FAIL - Invalid structure."
    print(f"Validation: {verdict}")