# SemEval 2026 Task 8 - Task A: Retrieval

This notebook focuses on **Task A: Retrieval**. 
Goal: Given a question, retrieve relevant documents from the corpus.

**Output Format**:
Each record (one JSON object per line) of the JSONL submission file must contain a `contexts` field (a list of objects), where each object has:
- `document_id`: The unique ID of the document.
- `score`: The relevance score.
- `text`: The document content (optional but recommended for debugging).

In [None]:
# --- KAGGLE SETUP ---
# Uncomment and run this cell FIRST if you are running on Kaggle.
# It clones the repo, installs dependencies, and sets the working directory.

# import os
# if not os.path.exists("llm-semeval-task8"):
#     !git clone https://github.com/LookUpMark/llm-semeval-task8.git

# %cd llm-semeval-task8
# !git checkout dev
# !pip install -c scripts/evaluation/constraints.txt -r scripts/evaluation/requirements.txt

In [None]:
import os
import json
import sys
from tqdm import tqdm
from typing import List, Dict, Any

# Locate the project root so that the `src` package is importable both
# locally and on Kaggle.
if os.path.exists("src"):
    # The working directory already is the repo root (e.g. Kaggle after %cd).
    project_root = os.getcwd()
else:
    # Otherwise assume the notebook runs from 'notebooks/', one level below the root.
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guarded so that re-running the cell does not add duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import RAG pipeline
try:
    from src.graph import app
except ImportError:
    print("ERROR: Run this notebook from the 'notebooks/' directory or repo root.")
    # Re-raise instead of continuing: swallowing the error would leave `app`
    # undefined, so later cells would fail with a NameError far away from the
    # real cause. The original traceback also shows which module is missing.
    raise

In [None]:
# --- CONFIGURATION ---
# Submission identity; both values are embedded in the output file name.
TEAM_NAME = "Gbgers"
TASK_TYPE = "TaskA"

# Adjust path based on execution environment: use the current directory when
# it contains 'dataset', otherwise fall back to its parent.
if os.path.exists("dataset"):
    base_path = "."
else:
    base_path = ".."

# Questions to run retrieval on, and the location of the submission file.
INPUT_FILE = os.path.join(base_path, "dataset/human/generation_tasks/reference.jsonl")
OUTPUT_DIR = os.path.join(base_path, "data/submissions")
OUTPUT_FILE = os.path.join(
    OUTPUT_DIR,
    f"submission_{TASK_TYPE}_{TEAM_NAME}.jsonl",
)

# Create the output directory up front; a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Target: {OUTPUT_FILE}")

In [None]:
def load_data(path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a UTF-8 encoded text file holding one JSON value
            per line.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Whitespace-only lines carry no record and are skipped.
        return [json.loads(raw) for raw in handle if raw.strip()]

# Parse the input file into a list with one record per non-blank line;
# the retrieval loop reads the "question" key of each record.
test_data = load_data(INPUT_FILE)
print(f"Loaded {len(test_data)} questions.")

In [None]:
def _format_context(doc):
    """Convert one retrieved document into a submission `contexts` entry.

    Args:
        doc: A retrieved document. Its `metadata` attribute (a mapping) and
            `page_content` attribute are read when present.

    Returns:
        A dict with the keys "document_id", "text" and "score". Missing IDs
        become "unknown", missing text becomes "", and a missing, None or
        non-numeric score becomes 0.0.
    """
    # `metadata` may be absent or explicitly None; treat both as empty.
    meta = getattr(doc, "metadata", None) or {}
    raw_score = meta.get("relevance_score", meta.get("score", 0.0))
    try:
        score = float(raw_score)
    except (TypeError, ValueError):
        # One malformed score must not abort the whole (unsaved) run.
        score = 0.0
    return {
        "document_id": meta.get("id", meta.get("document_id", "unknown")),
        "text": getattr(doc, "page_content", ""),
        "score": score,
    }


results = []
failed_questions = []  # questions whose pipeline call raised

print("Running Retrieval Task...")
for item in tqdm(test_data):
    question = item.get("question")

    # Invoke Graph (will retrieve docs)
    try:
        resp = app.invoke({"question": question})
        # `or []` also covers a "documents" key that is present but None.
        raw_docs = resp.get("documents") or []
    except Exception as e:
        # Best effort: keep going so one bad question does not lose the rest,
        # but remember the failure so it can be reported at the end.
        print(f"Error on '{question}': {e}")
        failed_questions.append(question)
        raw_docs = []

    # Keep every input field and attach the formatted contexts.
    output_item = item.copy()
    output_item["contexts"] = [_format_context(doc) for doc in raw_docs]
    results.append(output_item)

print(f"Done. Generated {len(results)} entries.")
if failed_questions:
    print(
        f"WARNING: retrieval failed for {len(failed_questions)} of "
        f"{len(results)} questions; their 'contexts' are empty."
    )

In [None]:
# Save: one JSON object per line (JSON Lines).
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    for x in results:
        json.dump(x, f)
        f.write('\n')
print(f"Saved to {OUTPUT_FILE}")

# Simple Validation
# `sample` is kept for interactive inspection; None when nothing was generated
# (indexing results[0] unconditionally would raise IndexError on empty input).
sample = results[0] if results else None
if not results:
    print("\033[91mVALIDATION FAIL: No results were generated.\033[0m")
elif all(
    "contexts" in entry and isinstance(entry["contexts"], list)
    for entry in results
):
    # Every record is checked, not only the first one.
    print("\033[92mVALIDATION PASS: Structure correct.\033[0m")
else:
    print("\033[91mVALIDATION FAIL: Missing 'contexts'.\033[0m")