# SemEval 2026 Task 5

This notebook trains a **Transformer** model to predict plausibility scores (1–5) for word senses in narrative contexts.

Metrics reported:
- Spearman correlation
- Accuracy within standard deviation 

In [None]:
#!pip install --upgrade --force-reinstall torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130

# %pip install -q transformers datasets accelerate evaluate scipy

In [None]:
from __future__ import annotations

import json
import statistics
import sys
from pathlib import Path
from typing import Any

import numpy as np
from scipy.stats import spearmanr

import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
 )

if 'google.colab' in sys.modules:
    print('Running in Google Colab. Mounting Google Drive...')
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/My Drive/Colab Notebooks/semeval26-05-scripts')
else:
    print('Not running in Google Colab.')
    PROJECT_ROOT = Path.cwd()

DATA_DIR = PROJECT_ROOT / 'data'
TRAIN_JSON = DATA_DIR / 'train.json'
DEV_JSON = DATA_DIR / 'dev.json'
TEST_JSON = DATA_DIR / 'test.json'

assert TEST_JSON.exists(), f'Missing: {TEST_JSON}'
assert TRAIN_JSON.exists(), f'Missing: {TRAIN_JSON}'
assert DEV_JSON.exists(), f'Missing: {DEV_JSON}'

print('Python:', sys.executable)
print('Torch:', torch.__version__)
print('Torch CUDA build:', torch.version.cuda)
print('CUDA available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))

print('Train file:', TRAIN_JSON)
print('Dev file:', DEV_JSON)
print('Test file:', TEST_JSON)

Running in Google Colab. Mounting Google Drive...
Mounted at /content/drive
Python: /usr/bin/python3
Torch: 2.9.0+cu126
Torch CUDA build: 12.6
CUDA available: True
GPU: Tesla T4
Train file: /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/data/train.json
Dev file: /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/data/dev.json
Test file: /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/data/test.json


In [3]:
from pathlib import Path

drive_path = Path('/content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/data')

if drive_path.exists():
    print(f"Contents of {drive_path}:")
    for item in drive_path.iterdir():
        print(f"- {item.name}")
else:
    print(f"Directory not found: {drive_path}. Please ensure your Google Drive is mounted correctly.")

Contents of /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/data:
- train.json
- dev.json
- test.json


## 2) Data loading
Load the JSON files and convert them into flat examples.

In [None]:
def load_split(path):
    with path.open('r', encoding='utf-8') as f:
        return json.load(f)

def iter_sorted_items(raw):
    for k in sorted(raw.keys(), key=lambda x: int(x)):
        yield k, raw[k]

train_raw = load_split(TRAIN_JSON)
dev_raw = load_split(DEV_JSON)
test_raw = load_split(TEST_JSON)

print('Train samples:', len(train_raw))
print('Dev samples:', len(dev_raw))
print('Test samples:', len(test_raw))
print('Example fields:', list(next(iter(train_raw.values())).keys()))

Train samples: 2280
Dev samples: 588
Test samples: 930
Example fields: ['homonym', 'judged_meaning', 'precontext', 'sentence', 'ending', 'choices', 'average', 'stdev', 'nonsensical', 'sample_id', 'example_sentence']


## 3) Data preprocessing
Build the model input text and labels.

We fine-tune a Transformer **classifier** to predict an integer score (1–5).
To help the model, we format inputs with explicit sections (precontext/sentence/ending/etc.) plus a direct question.

In [None]:
from typing import Any
import numpy as np
import json
from pathlib import Path

PROJECT_ROOT = Path('/content/drive/My Drive/Colab Notebooks/semeval26-05-scripts')
DATA_DIR = PROJECT_ROOT / 'data'
TRAIN_JSON = DATA_DIR / 'train.json'
DEV_JSON = DATA_DIR / 'dev.json'

def load_split(path):
    with path.open('r', encoding='utf-8') as f:
        return json.load(f)

def iter_sorted_items(raw):
    for k in sorted(raw.keys(), key=lambda x: int(x)):
        yield k, raw[k]

train_raw = load_split(TRAIN_JSON)
dev_raw = load_split(DEV_JSON)

import re
def mark_homonym(text, hom):
    hom = (hom or "").strip()
    if not hom:
        return text
    pattern = re.compile(rf"\b({re.escape(hom)})\b", flags=re.IGNORECASE)
    return pattern.sub(r"<t> \1 </t>", text, count=1)

def build_pair(sample):
    pre = str(sample.get('precontext', '')).strip()
    sent = str(sample.get('sentence', '')).strip()
    end = str(sample.get('ending', '')).strip()

    hom = str(sample.get('homonym', '')).strip()
    meaning = str(sample.get('judged_meaning', '')).strip()
    ex = str(sample.get('example_sentence', '')).strip()

    story = " ".join(x for x in [pre, sent, end] if x)
    story = mark_homonym(story, hom)

    nons = sample.get("nonsensical", [])
    n_rate = sum(bool(x) for x in nons) / max(1, len(nons))
    sense = f"{hom} = {meaning}. Example: {ex}. Nonsense_votes: {n_rate:.2f}"
    return story, sense

def choices_to_soft(choices):
    counts = np.zeros(5, dtype=np.float32)
    for c in choices:
        counts[int(c) - 1] += 1.0
    probs = counts / counts.sum()
    return probs.tolist()

def clip_round_to_1_5(x):
    return int(np.clip(int(round(float(x))), 1, 5))

train_ids, train_a, train_b, train_labels_soft = [], [], [], []
for k, s in iter_sorted_items(train_raw):
    a, b = build_pair(s)
    train_ids.append(k)
    train_a.append(a)
    train_b.append(b)
    train_labels_soft.append(choices_to_soft(list(map(int, s['choices']))))

dev_ids, dev_a, dev_b, dev_labels_soft = [], [], [], []
dev_avg, dev_choices = [], []
for k, s in iter_sorted_items(dev_raw):
    a, b = build_pair(s)
    dev_ids.append(k)
    dev_a.append(a)
    dev_b.append(b)
    dev_labels_soft.append(choices_to_soft(list(map(int, s['choices']))))
    dev_avg.append(float(s['average']))
    dev_choices.append(list(map(int, s['choices'])))

test_ids, test_a, test_b = [], [], []

for k, s in iter_sorted_items(test_raw):
    a, b = build_pair(s)
    sid = str(s.get("sample_id", k))
    test_ids.append(sid)
    test_a.append(a)
    test_b.append(b)


print('Train size:', len(train_ids), 'Dev size:', len(dev_ids))
print('Dev rounded distribution:', {i: sum(clip_round_to_1_5(a) == i for a in dev_avg) for i in range(1, 6)})
print('\nSample story (A):\n', train_a[0][:300])
print('\nSample sense  (B):\n', train_b[0][:200])
print("Sample test A:", test_a[0][:200])
print("Sample test B:", test_b[0][:200])

Train size: 2280 Dev size: 588
Dev rounded distribution: {1: 68, 2: 133, 3: 145, 4: 147, 5: 95}

Sample story (A):
 The old machine hummed in the corner of the workshop. Clara examined its dusty dials with a furrowed brow. She wondered if it could be brought back to life. The <t> potential </t> couldn't be measured. She collected a battery reader and looked on earnestly, willing some life back into the old machin

Sample sense  (B):
 potential = the difference in electrical charge between two points in a circuit expressed in volts. Example: The circuit has a high potential difference.. Nonsense_votes: 0.00
Sample test A: Maya had always been fascinated by the old mansion on the hill. She finally got a chance to explore it when the new owners offered guided tours. As she walked through the grand hallways, she marveled 
Sample test B: levels = a structure consisting of a room or set of rooms at a single position along a vertical scale. Example: What level is the conference room on?. Nons

## 4) Tokenization
Tokenize the text with a pretrained Transformer tokenizer.

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding, set_seed
from datasets import Dataset
import torch

MODEL_NAME = "microsoft/deberta-v3-large"

MAX_LENGTH = 320
LEARNING_RATE = 6.605331684042691e-06
NUM_EPOCHS = 8
WEIGHT_DECAY = 0.038665688189882155
WARMUP_RATIO = 0.07954725255425056
SEED = 42

set_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

tokenizer.add_tokens(["<t>", "</t>"])

train_ds = Dataset.from_dict({
    'id': train_ids,
    'text_a': train_a,
    'text_b': train_b,
    'labels': train_labels_soft,
})
dev_ds = Dataset.from_dict({
    'id': dev_ids,
    'text_a': dev_a,
    'text_b': dev_b,
    'labels': dev_labels_soft,
})

test_ds = Dataset.from_dict({
    "id": test_ids,
    "text_a": test_a,
    "text_b": test_b,
})

def tokenize_batch(batch):
    return tokenizer(
        batch['text_a'],
        batch['text_b'],
        truncation='only_first',
        max_length=MAX_LENGTH
    )

train_tok = train_ds.map(tokenize_batch, batched=True, remove_columns=['text_a', 'text_b'])
dev_tok = dev_ds.map(tokenize_batch, batched=True, remove_columns=['text_a', 'text_b'])
test_tok = test_ds.map(tokenize_batch, batched=True, remove_columns=["text_a", "text_b"])

USE_CUDA = torch.cuda.is_available()
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8 if USE_CUDA else None,
)

print(f'Model: {MODEL_NAME} | MAX_LENGTH={MAX_LENGTH} | epochs={NUM_EPOCHS} | lr={LEARNING_RATE}')
print(train_tok)
print(dev_tok)
print(test_tok)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



Map:   0%|          | 0/2280 [00:00<?, ? examples/s]

Map:   0%|          | 0/588 [00:00<?, ? examples/s]

Map:   0%|          | 0/930 [00:00<?, ? examples/s]

Model: microsoft/deberta-v3-large | MAX_LENGTH=320 | epochs=8 | lr=6.605331684042691e-06
Dataset({
    features: ['id', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2280
})
Dataset({
    features: ['id', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 588
})
Dataset({
    features: ['id', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 930
})


## 5) Model + training
We fine-tune a pretrained model as a **5-class classifier** (labels 1–5).

In [None]:
import statistics
import numpy as np
import torch
import torch.nn.functional as F
from scipy.stats import spearmanr
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

use_cuda = torch.cuda.is_available()
print("CUDA:", use_cuda)
if use_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

# if use_cuda:
#    torch.cuda.empty_cache()

def is_within_standard_deviation(prediction, labels):
    avg = sum(labels) / len(labels)
    stdev = statistics.stdev(labels)
    if (avg - stdev) < prediction < (avg + stdev):
        return True
    if abs(avg - prediction) < 1:
        return True
    return False

def compute_metrics(eval_pred):
    logits, _ = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()

    pred_argmax = (probs.argmax(axis=-1) + 1).astype(int)

    weights = np.arange(1, 6, dtype=np.float32)
    ev = (probs * weights).sum(axis=1)
    pred_ev = np.clip(np.digitize(ev, [1.5, 2.5, 3.5, 4.5]) + 1, 1, 5)

    counts = np.bincount(pred_ev, minlength=6)[1:6]
    print("Counts ratings 1..5 (EV):", counts)

    spearman_corr_ev, _ = spearmanr(pred_ev.tolist(), np.asarray(dev_avg, dtype=float))
    acc_within_sd_ev = sum(
        is_within_standard_deviation(int(p), choices)
        for p, choices in zip(pred_ev.tolist(), dev_choices)
    ) / len(dev_choices)

    spearman_corr_am, _ = spearmanr(pred_argmax.tolist(), np.asarray(dev_avg, dtype=float))
    acc_within_sd_am = sum(
        is_within_standard_deviation(int(p), choices)
        for p, choices in zip(pred_argmax.tolist(), dev_choices)
    ) / len(dev_choices)

    return {
        "spearman_ev": float(spearman_corr_ev) if spearman_corr_ev == spearman_corr_ev else 0.0,
        "acc_within_sd_ev": float(acc_within_sd_ev),
        "spearman_argmax": float(spearman_corr_am) if spearman_corr_am == spearman_corr_am else 0.0,
        "acc_within_sd_argmax": float(acc_within_sd_am),
        "acc_within_sd": float(max(acc_within_sd_ev, acc_within_sd_am)),
    }

def compute_loss_func(outputs, labels, num_items_in_batch=None):
    logits = outputs.logits
    soft = labels.to(logits.device)
    hard = soft.argmax(dim=-1)

    loss_hard = F.cross_entropy(logits, hard)
    log_probs = F.log_softmax(logits, dim=-1)
    loss_soft = -(soft * log_probs).sum(dim=-1).mean()
    p = 0.10490799853094769
    return p * loss_soft + (1 - p) * loss_hard

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
model.resize_token_embeddings(len(tokenizer))

PER_DEVICE_TRAIN_BS = 8 if use_cuda else 2
GRAD_ACCUM = 1 if use_cuda else 4

training_args = TrainingArguments(
    output_dir=str(PROJECT_ROOT / "transformer_runs_best"),
    overwrite_output_dir=True,
    do_eval=True,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
    per_device_eval_batch_size=8 if use_cuda else 4,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type="cosine",
    fp16=use_cuda,
    optim="adamw_torch",
    dataloader_pin_memory=use_cuda,
    dataloader_num_workers=0,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="acc_within_sd",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=200,
    report_to=[],
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=dev_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_loss_func=compute_loss_func,
    compute_metrics=compute_metrics,
)

print(f"Training {MODEL_NAME} | bs={PER_DEVICE_TRAIN_BS} | grad_accum={GRAD_ACCUM} | epochs={NUM_EPOCHS}")
trainer.train()

eval_result = trainer.evaluate()
print("\n=== Final Evaluation (best checkpoint) ===")
print(eval_result)

CUDA: True
GPU: Tesla T4


pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Training microsoft/deberta-v3-large | bs=8 | grad_accum=1 | epochs=8


Epoch,Training Loss,Validation Loss,Spearman Ev,Acc Within Sd Ev,Spearman Argmax,Acc Within Sd Argmax,Acc Within Sd
1,1.5798,1.55665,0.0,0.527211,0.01313,0.336735,0.527211
2,1.563,1.539608,0.397428,0.62585,0.366388,0.421769,0.62585
3,1.4407,1.490701,0.449231,0.630952,0.393176,0.391156,0.630952
4,1.321,1.455682,0.611271,0.714286,0.59968,0.588435,0.714286
5,1.1363,1.43655,0.64786,0.765306,0.653789,0.617347,0.765306
6,1.05,1.520589,0.665318,0.746599,0.659502,0.678571,0.746599
7,0.9629,1.560222,0.660661,0.731293,0.656972,0.683673,0.731293
8,0.8608,1.57162,0.662127,0.722789,0.65825,0.67517,0.722789


Counts ratings 1..5 (EV): [  0   0 588   0   0]


  spearman_corr_ev, _ = spearmanr(pred_ev.tolist(), np.asarray(dev_avg, dtype=float))


Counts ratings 1..5 (EV): [  0   0 263 325   0]
Counts ratings 1..5 (EV): [  0 419  61 108   0]
Counts ratings 1..5 (EV): [  0 176 141 145 126]
Counts ratings 1..5 (EV): [  7 217  88 196  80]
Counts ratings 1..5 (EV): [ 54 161 121 129 123]
Counts ratings 1..5 (EV): [ 86 138 124 106 134]
Counts ratings 1..5 (EV): [ 94 140 118 100 136]


Counts ratings 1..5 (EV): [  7 217  88 196  80]

=== Final Evaluation (best checkpoint) ===
{'eval_loss': 1.4365500211715698, 'eval_spearman_ev': 0.647860316691713, 'eval_acc_within_sd_ev': 0.7653061224489796, 'eval_spearman_argmax': 0.6537885954416616, 'eval_acc_within_sd_argmax': 0.6173469387755102, 'eval_acc_within_sd': 0.7653061224489796, 'eval_runtime': 9.8717, 'eval_samples_per_second': 59.564, 'eval_steps_per_second': 7.496, 'epoch': 8.0}


# Parameter tuning with Optuna

In [None]:
import optuna
import torch
import numpy as np
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed

use_cuda = torch.cuda.is_available()
print("CUDA:", use_cuda)
if use_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

def tokenize_with_length(max_len):
    def tok(batch):
        return tokenizer(
            batch["text_a"],
            batch["text_b"],
            truncation="only_first",
            max_length=max_len,
        )
    tr = train_ds.map(tok, batched=True, remove_columns=["text_a", "text_b"])
    dv = dev_ds.map(tok, batched=True, remove_columns=["text_a", "text_b"])
    return tr, dv

def objective(trial):

    trial_seed = SEED + trial.number
    set_seed(trial_seed)

    lr = trial.suggest_float("learning_rate", 3e-6, 3e-5, log=True)
    wd = trial.suggest_float("weight_decay", 0.0, 0.05)
    warmup = trial.suggest_float("warmup_ratio", 0.03, 0.20)
    epochs = trial.suggest_int("num_train_epochs", 3, 8)

    max_len = trial.suggest_categorical("max_length", [256, 320, 384, 512])
    bs = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 24, 32])
    grad_accum = trial.suggest_categorical("gradient_accumulation_steps", [1, 2])

    soft_w = trial.suggest_float("soft_weight", 0.05, 0.25)

    train_tok, dev_tok = tokenize_with_length(max_len)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=5)
    model.resize_token_embeddings(len(tokenizer))

    def trial_compute_loss(outputs, labels, num_items_in_batch=None):
        logits = outputs.logits
        soft = labels.to(logits.device)

        weights = torch.arange(1, 6, device=logits.device, dtype=soft.dtype)
        hard = torch.clamp(torch.round((soft * weights).sum(dim=-1)), 1, 5).long() - 1

        loss_hard = F.cross_entropy(logits, hard)
        log_probs = F.log_softmax(logits, dim=-1)
        loss_soft = -(soft * log_probs).sum(dim=-1).mean()

        return soft_w * loss_soft + (1.0 - soft_w) * loss_hard

    args = TrainingArguments(
        output_dir=str(PROJECT_ROOT / f"optuna_trial_{trial.number:03d}"),
        overwrite_output_dir=True,
        do_eval=True,

        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=min(bs, 32),
        gradient_accumulation_steps=grad_accum,
        num_train_epochs=epochs,
        weight_decay=wd,
        warmup_ratio=warmup,
        lr_scheduler_type="cosine",

        bf16=use_cuda,
        fp16=False,

        optim="adamw_torch",
        dataloader_pin_memory=use_cuda,
        dataloader_num_workers=2,

        eval_strategy="epoch",
        save_strategy="no",
        save_total_limit=1,
        load_best_model_at_end=False,
        metric_for_best_model="acc_within_sd",
        greater_is_better=True,

        logging_strategy="steps",
        logging_steps=200,
        report_to=[],
        seed=trial_seed,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_tok,
        eval_dataset=dev_tok,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_loss_func=trial_compute_loss,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    metrics = trainer.evaluate()
    score = float(metrics["eval_acc_within_sd"])

    trial.report(score, step=epochs)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("\n=== BEST ===")
print("best value:", study.best_value)
print("best params:", study.best_params)


## 6) Generate predictions.jsonl
Export test predictions as required by the SemEval format: one JSON per line with `id` and integer `prediction` in [1..5].

In [None]:
import json
import numpy as np
import torch

pred_out = trainer.predict(test_tok)
logits = pred_out.predictions
probs = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()

weights = np.arange(1, 6, dtype=np.float32)
ev = (probs * weights).sum(axis=1)
pred_int = np.clip(np.digitize(ev, [1.5, 2.5, 3.5, 4.5]) + 1, 1, 5).astype(int).tolist()

out_path = PROJECT_ROOT / "predictions_test.jsonl"
id = 0
with out_path.open("w", encoding="utf-8") as f:
    for sid, p in zip(test_ids, pred_int):
        f.write(json.dumps({"id": str(id), "prediction": int(p)}) + "\n")
        id += 1

print("Wrote:", out_path)
print("Sample preds:", list(zip(test_ids[:10], pred_int[:10])))


Wrote: /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/predictions_test.jsonl
Sample preds: [('2017', 5), ('2018', 3), ('2019', 3), ('2020', 2), ('2021', 4), ('2022', 3), ('3127', 2), ('3128', 3), ('3129', 2), ('3130', 3)]


## 7) (Optional) Run official scorer
This validates formatting and reports official metrics on dev.

In [9]:
import sys
import subprocess

scoring_script = PROJECT_ROOT / 'semeval26-05-scripts' / 'scoring.py'
gold = PROJECT_ROOT / 'semeval26-05-scripts' / 'input' / 'ref' / 'solution.jsonl'
preds = PROJECT_ROOT / 'semeval26-05-scripts' / 'input' / 'res' / 'predictions.jsonl'
scores_out = PROJECT_ROOT / 'semeval26-05-scripts' / 'output' / 'scores.json'
scores_out.parent.mkdir(parents=True, exist_ok=True)

cmd = [
    str(Path(sys.executable)),
    str(scoring_script),
    str(gold),
    str(preds),
    str(scores_out),
]

print('Running:', ' '.join(cmd))
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)
print('Scores JSON:', scores_out)

Running: /usr/bin/python3 /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/semeval26-05-scripts/scoring.py /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/semeval26-05-scripts/input/ref/solution.jsonl /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/semeval26-05-scripts/input/res/predictions.jsonl /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/semeval26-05-scripts/output/scores.json

/usr/bin/python3: can't open file '/content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/semeval26-05-scripts/scoring.py': [Errno 2] No such file or directory

Scores JSON: /content/drive/My Drive/Colab Notebooks/semeval26-05-scripts/semeval26-05-scripts/output/scores.json
