# SemEval 2026 Task 8 - Task C: RAG (End-to-End)

This notebook focuses on **Task C: Retrieval-Augmented Generation**. 
Goal: Given a question, retrieve documents AND generate an answer.

**Output Format**:
The submission file must contain BOTH:
- `contexts`: a list of objects, each with `document_id`, `score`, and `text`.
- `predictions`: a list of objects, each with a `text` field holding the generated answer.

In [None]:
# --- KAGGLE SETUP ---
# Uncomment and run this cell FIRST if you are running on Kaggle.
# It clones the repo, installs dependencies, and sets the working directory.

# import os
# if not os.path.exists("llm-semeval-task8"):
#     !git clone https://github.com/LookUpMark/llm-semeval-task8.git

# %cd llm-semeval-task8
# !git checkout dev
# !pip install -c scripts/evaluation/constraints.txt -r scripts/evaluation/requirements.txt

In [None]:
import os
import json
import sys
from tqdm import tqdm

# Locate the project root so that the `src` package is importable whether the
# notebook runs from the repository root or from a subfolder.
if os.path.exists("src"):
    # The working directory already contains `src` (e.g. Kaggle after %cd).
    project_root = os.getcwd()
else:
    # Otherwise assume we are one level below the root (e.g. 'notebooks/').
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

if project_root not in sys.path:
    sys.path.append(project_root)

try:
    from src.graph import app
except ImportError as exc:
    # Stay best-effort (no re-raise), but report the actual cause and the root
    # that was searched; the bare message alone hides which module failed.
    print(f"ERROR: Check imports. ({exc}; project_root={project_root})")

In [None]:
# --- CONFIGURATION ---
# Submission identity; both values are embedded in the output file name.
TEAM_NAME = "Gbgers"
TASK_TYPE = "TaskC"

# Use the current directory when it contains 'dataset'; otherwise fall back
# to the parent directory.
base_path = ".."
if os.path.exists("dataset"):
    base_path = "."

INPUT_FILE = os.path.join(base_path, "dataset/human/generation_tasks/reference.jsonl")
OUTPUT_DIR = os.path.join(base_path, "data/submissions")
OUTPUT_FILE = os.path.join(
    OUTPUT_DIR,
    "submission_{}_{}.jsonl".format(TASK_TYPE, TEAM_NAME),
)

# Create the destination folder up front so the later write cannot fail on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Target:", OUTPUT_FILE)

In [None]:
def _as_score(value):
    """Convert a score to float, falling back to 0.0 when it is unusable."""
    try:
        return float(value)
    except (TypeError, ValueError):
        # None or a non-numeric value must not abort the whole run.
        return 0.0


def _format_contexts(raw_docs):
    """Build the submission's `contexts` list from the retrieved documents.

    Each document is read through its `metadata` and `page_content`
    attributes; missing attributes fall back to an empty dict / empty string.
    A `None` document list or `None` metadata is treated as empty.
    """
    formatted = []
    for doc in raw_docs or []:
        meta = getattr(doc, "metadata", None) or {}
        formatted.append({
            "document_id": meta.get("id", meta.get("document_id", "unknown")),
            "text": getattr(doc, "page_content", ""),
            "score": _as_score(meta.get("relevance_score", meta.get("score", 0.0))),
        })
    return formatted


# Read the input as UTF-8 explicitly: the default encoding is locale-dependent,
# while the submission file is written as UTF-8.
with open(INPUT_FILE, encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

results = []
print(f"Running RAG for {len(test_data)} items...")

for item in tqdm(test_data):
    question = item.get("question")

    try:
        resp = app.invoke({"question": question})
        raw_docs = resp.get("documents", [])
        gen_text = resp.get("generation", "")
    except Exception as e:
        # Deliberately broad: one failing item is reported and gets a
        # placeholder record instead of stopping the whole run.
        print(f"Error: {e}")
        raw_docs = []
        gen_text = "Error"

    # Format Contexts
    contexts = _format_contexts(raw_docs)

    # Format Prediction (an empty generation becomes the "No Answer" marker)
    predictions = [{"text": gen_text if gen_text else "No Answer"}]

    # Keep every original field of the input record and add the two outputs.
    output_item = item.copy()
    output_item["contexts"] = contexts
    output_item["predictions"] = predictions
    results.append(output_item)

print("RAG Execution Completed.")

In [None]:
# Write the submission as JSON Lines: one JSON object per line.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    for x in results:
        json.dump(x, f)
        f.write('\n')
print(f"Saved to {OUTPUT_FILE}")

# Validation
# Guard the empty case: indexing results[0] on an empty run would raise
# IndexError after the (empty) file has already been written.
sample = results[0] if results else None

# Check every record, not only the first one: both fields must be lists.
valid_c = bool(results) and all(
    isinstance(record.get("contexts"), list)
    and isinstance(record.get("predictions"), list)
    for record in results
)

if valid_c:
    print("\033[92mVALIDATION PASS: Structure correct for Task C.\033[0m")
elif not results:
    print("\033[91mVALIDATION FAIL: No results to validate.\033[0m")
else:
    print("\033[91mVALIDATION FAIL: Missing fields.\033[0m")