# SemEval 2026 Task 8: Multi-Turn RAG Evaluation

## Complete Pipeline for All Tasks (Graph-Enhanced)

This pipeline generates submissions for Tasks A, B, and C.

- **Task A (Retrieval)**: Uses BGE-M3 + Cross-Encoder Reranking.
- **Task B (Generation)**: Uses a direct LLM call (Llama 3.1) with no retrieved context, prompted to answer concisely.
- **Task C (RAG)**: Uses **Self-CRAG Graph** with Hallucination Check & Retries.


In [1]:
# KAGGLE SETUP (uncomment on Kaggle)
# import os
# if not os.path.exists("llm-semeval-task8"):
#     !git clone https://github.com/LookUpMark/llm-semeval-task8.git
# %cd llm-semeval-task8
# !git checkout dev
# !pip install -q langchain langchain-community langchain-huggingface langchain-qdrant qdrant-client sentence-transformers bitsandbytes accelerate transformers tqdm langgraph

In [2]:
import os, sys, json, zipfile
from tqdm import tqdm

# Locate the repository root. Preference order: the current directory when
# it contains `src`, then a local clone directory, else the parent directory.
if os.path.exists("src"):
    PROJECT_ROOT = os.getcwd()
elif os.path.exists("llm-semeval-task8"):
    PROJECT_ROOT = "llm-semeval-task8"
else:
    PROJECT_ROOT = os.path.abspath("..")

# Put the root first on sys.path (once) so the `src` package is importable.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Ingestion & Retrieval
from src.ingestion import load_and_chunk_data, build_vector_store
from src.retrieval import get_retriever, get_qdrant_client

# Task B (Simple Gen) & Helper
from src.generation import create_generation_components
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Task C (Advanced Graph)
from src.graph import initialize_graph

print(f"Project: {PROJECT_ROOT}")

In [3]:
# ============================================================
# CONFIGURATION
# ============================================================
TEAM_NAME = "Gbgers"
DOMAINS = ["govt", "clapnq", "fiqa", "cloud"]
COLLECTION_NAME = "mtrag_unified"

# --- Run size: adjust these for your hardware ---
# TEST_MODE = True  -> small smoke run governed by the TEST_* limits below
# TEST_MODE = False -> submission run capped by MAX_DOCS_PER_DOMAIN
TEST_MODE = False

# Test-mode limits
TEST_CHUNK_LIMIT = 1000      # chunks indexed per domain
TEST_QUERY_LIMIT = 5         # conversations processed per domain

# Full-mode limit. Despite the name it caps the per-domain *chunk* list
# when the index is built (author's estimate: ~2.5h run).
MAX_DOCS_PER_DOMAIN = 25000

# Inputs
CORPUS_DIR = os.path.join(PROJECT_ROOT, "dataset/corpora/passage_level")
CONV_FILE = os.path.join(PROJECT_ROOT, "dataset/human/conversations/conversations.json")

# Outputs: on-disk Qdrant store and one submission file per task
QDRANT_PATH = os.path.join(PROJECT_ROOT, "qdrant_db")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data/submissions")

FILE_A = os.path.join(OUTPUT_DIR, f"submission_TaskA_{TEAM_NAME}.jsonl")
FILE_B = os.path.join(OUTPUT_DIR, f"submission_TaskB_{TEAM_NAME}.jsonl")
FILE_C = os.path.join(OUTPUT_DIR, f"submission_TaskC_{TEAM_NAME}.jsonl")

# Create both output directories up front; existing ones are left alone.
for _out_dir in (OUTPUT_DIR, QDRANT_PATH):
    os.makedirs(_out_dir, exist_ok=True)

print(f"Mode: {'TEST' if TEST_MODE else 'FULL'}")
if not TEST_MODE:
    print(f"Max docs/domain: {MAX_DOCS_PER_DOMAIN}")

In [4]:
def extract_last_query(msgs):
    """Return the text of the most recent user message, or "" if there is none.

    Messages are scanned from last to first; only entries whose "speaker"
    is "user" are considered. A matching entry without a "text" key raises
    KeyError.
    """
    for message in reversed(msgs):
        if message.get("speaker") == "user":
            return message["text"]
    return ""

def get_corpus(domain):
    """Return the path of `<domain>.jsonl` inside CORPUS_DIR, or None.

    If the JSONL file is absent but `<domain>.jsonl.zip` exists, the whole
    archive is extracted into CORPUS_DIR first. None is returned when the
    JSONL file still does not exist afterwards.
    """
    jsonl_path = os.path.join(CORPUS_DIR, f"{domain}.jsonl")
    archive_path = jsonl_path + ".zip"

    if not os.path.exists(jsonl_path) and os.path.exists(archive_path):
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(CORPUS_DIR)

    # Checked again after extraction: the archive may not hold the expected name.
    if os.path.exists(jsonl_path):
        return jsonl_path
    return None

def save_jsonl(data, path):
    """Write `data` to `path` as JSON Lines (one JSON document per line).

    Args:
        data: Sized iterable of JSON-serializable records.
        path: Destination file; it is overwritten if it already exists.

    Records are serialized with ensure_ascii=False, so non-ASCII text is
    written verbatim. The file is therefore opened explicitly as UTF-8
    instead of relying on the platform's default encoding, which could
    raise UnicodeEncodeError or produce a non-UTF-8 file.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in data)
    print(f"Saved {len(data)} -> {path}")

In [5]:
# Build Index
# Reuse the collection when it already exists; otherwise build it from the
# per-domain corpora.
need_build = True
try:
    client = get_qdrant_client(QDRANT_PATH)
    if client.collection_exists(COLLECTION_NAME):
        print(f"Collection exists: {client.get_collection(COLLECTION_NAME).points_count} vectors")
        need_build = False
except Exception as e:
    # Best-effort probe: a failure still falls through to a rebuild, but the
    # reason is reported. (A bare `except` would also swallow Ctrl-C.)
    print(f"Could not inspect '{COLLECTION_NAME}' ({e!r}); rebuilding.")

if need_build:
    print(f"Building '{COLLECTION_NAME}'...")
    all_docs = []
    # Per-domain cap on the number of chunks that get indexed.
    limit = TEST_CHUNK_LIMIT if TEST_MODE else MAX_DOCS_PER_DOMAIN

    for domain in DOMAINS:
        path = get_corpus(domain)
        if not path:
            # Report the gap so an index missing a whole domain is noticed.
            print(f"Skipping {domain}: corpus not found in {CORPUS_DIR}")
            continue
        print(f"Loading {domain}...")
        docs = load_and_chunk_data(path)
        # Tag every chunk with its source domain.
        for d in docs: d.metadata["domain"] = domain

        # Apply limit (keeps the first `limit` chunks in load order)
        if len(docs) > limit:
            print(f"  Limiting: {len(docs)} -> {limit}")
            docs = docs[:limit]

        all_docs.extend(docs)
        print(f"  Added {len(docs)} chunks")

    print(f"Total: {len(all_docs)} chunks")
    build_vector_store(all_docs, persist_dir=QDRANT_PATH, collection_name=COLLECTION_NAME)
    print("Done.")

In [6]:
# Initialize Components
# Creates the three objects the pipeline loop uses: `retriever` (Task A),
# `task_b_chain` (Task B) and `graph_app` (Task C).
print("Loading Retriever...")
# Retriever over the collection stored at QDRANT_PATH.
retriever = get_retriever(qdrant_path=QDRANT_PATH, collection_name=COLLECTION_NAME)

print("Loading LLM for Task B...")
# Only the `.llm` attribute of the returned components is used below.
gen_components = create_generation_components()

print("Initializing Advanced Graph for Task C...")
# Invoked later with a dict holding "question" and "domain".
graph_app = initialize_graph()

# Task B chain (no context)
# The template is whitespace-sensitive runtime text; edit it with care.
# NOTE(review): the published Llama 3 chat format puts a blank line after each
# <|end_header_id|> and no newline between <|eot_id|> and the next header;
# this template differs on both points -- confirm that is intentional.
# NOTE(review): if the LLM wrapper's tokenizer also prepends a BOS token,
# the literal <|begin_of_text|> would be duplicated -- TODO confirm.
task_b_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert assistant. Answer based on your knowledge. Be concise.<|eot_id|>
<|start_header_id|>user<|end_header_id|>
{question}<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question"]
)
# prompt -> LLM -> plain string
task_b_chain = task_b_prompt | gen_components.llm | StrOutputParser()
print("All Systems Ready.")

In [7]:
# Execute Pipeline
# For each domain, every selected conversation produces one Task A record
# (retrieved contexts), one Task B record (context-free answer) and one
# Task C record (contexts + graph answer), all sharing the same base fields.

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(CONV_FILE, encoding="utf-8") as f:
    conversations = json.load(f)
results_A, results_B, results_C = [], [], []

for domain in DOMAINS:
    print(f"\n=== {domain.upper()} ===")
    # Substring match against the lower-cased "domain" field.
    convs = [c for c in conversations if domain in c.get("domain", "").lower()]
    if TEST_MODE:
        convs = convs[:TEST_QUERY_LIMIT]
    print(f"Processing {len(convs)} conversations")

    for conv in tqdm(convs, desc=domain):
        msgs = conv.get("messages", [])
        q = extract_last_query(msgs)
        if not q:
            continue  # nothing to answer: no user turn with text

        # --- Task A: Retrieve ---
        docs = retriever.invoke(q)
        contexts = []
        for i, d in enumerate(docs):
            txt = d.metadata.get("parent_text") or d.page_content
            contexts.append({
                "document_id": str(d.metadata.get("doc_id", f"{domain}_{i}")),
                # Rank-derived score (1, 1/2, 1/3, ...). A constant score for
                # every hit would discard the retriever's ordering for any
                # evaluator that ranks contexts by score.
                "score": 1.0 / (i + 1),
                "text": txt,
            })

        # --- Task B: Generate (simple) ---
        try:
            ans_b = task_b_chain.invoke({"question": q})
        except Exception as e:
            # Report the failure, but keep exception text out of the
            # submission; use the same sentinel Task C uses.
            tqdm.write(f"[{domain}] Task B failed: {e!r}")
            ans_b = "I_DONT_KNOW"

        # --- Task C: RAG Generate (Advanced Graph) ---
        try:
            # Uses Self-CRAG with Retries
            response = graph_app.invoke({"question": q, "domain": domain})
            # `or` also covers a present-but-empty/None "generation".
            ans_c = response.get("generation") or "I_DONT_KNOW"
        except Exception as e:
            tqdm.write(f"[{domain}] Task C failed: {e!r}")
            ans_c = "I_DONT_KNOW"

        # NOTE(review): the conversation's "author" field doubles as its id,
        # and task_id always ends in "::1" -- confirm "author" is unique per
        # conversation in the input file.
        base = {"conversation_id": conv.get("author"), "task_id": f"{conv.get('author')}::1", "Collection": f"mt-rag-{domain}", "input": msgs}
        results_A.append({**base, "contexts": contexts})
        results_B.append({**base, "predictions": [{"text": ans_b}]})
        results_C.append({**base, "contexts": contexts, "predictions": [{"text": ans_c}]})

print(f"\nTotal: {len(results_A)} results")

In [8]:
# Write one JSONL submission file per task, in A, B, C order.
for _rows, _dest in ((results_A, FILE_A), (results_B, FILE_B), (results_C, FILE_C)):
    save_jsonl(_rows, _dest)
print("Done!")