In [13]:
from ollama import chat
from ollama import ChatResponse
from pydantic import BaseModel
import subprocess
import gc
import pandas as pd
import json
from icecream import ic
import os
from tqdm import tqdm
import time

In [None]:
# Ones installed on my pc
model_names = [
    "llama3.1",
    "olmo-3:7b-instruct",
    "olmo-3",        # Cannot disable the thinking here but it's essentially the preivous one
    "granite3.3",
    "ministral-3",
    "qwen3",
    "qwen2.5-coder",
    "deepseek-r1",   # here with no thinking
    "deepseek-r1",
    "gemma3",
]

model_thinking = [
    False,
    False,
    True,
    False,
    False,
    False,
    False,
    True,
    False,
    False,
]

In [15]:
SYSTEM_PROMPT = (
    """
    You are an expert NLU annotator. Your job is to rate how plausible a candidate meaning (sense)
    is for the HOMONYM used in the target sentence within the short story.

    Return ONLY a single JSON object with one key: "score" and an integer value 1, 2, 3, 4 or 5.
    Integer mapping:
      1 = Definitely not
      2 = Probably not
      3 = Ambiguous / Unsure
      4 = Probably yes
      5 = Definitely yes

    The response must be a JSON object and nothing else, for example: {{"score": 4}}
    """
)

USER_PROMPT = (
    """
    [STORY]
    {full_story_text}

    [HOMONYM]
    {homonym}

    [CANDIDATE SENSE]
    {sense_text}

    [TASK]
    Based on the STORY above, decide how plausible it is that the HOMONYM is used with the
    CANDIDATE SENSE in the target sentence.

    Return ONLY a single JSON object with one key "score" and an integer value (1-5)
    as described by the system message. Example output: {{"score": 3}}
    """
)

In [16]:
def create_full_story_text(item):
    """Compose the story text used as context for rating.

    Uses `precontext`, `sentence`, and `ending` fields when available and joins them into a single string.
    """
    fullstory = f"{item.get('precontext', '')} {item.get('sentence', '')} {item.get('ending', '')}"
    return fullstory.strip()


def create_message(item):
    sense = f"{item.get('judged_meaning', '')} as in \"{item.get('example_sentence', '')}\"".strip()
    homonym = item.get("homonym", "")
    full_story_text = create_full_story_text(item)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT.format(
            full_story_text=full_story_text,
            homonym=homonym,
            sense_text=sense,
        )},
    ]
    return messages

In [17]:
TRAIN_JSON_FILE = "../data/train.json"
DEV_JSON_FILE = "../data/dev.json"

def load_data(file_path):
    """
    Loads the json containing the dataset and return a pandas dataframe.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Transpose because the json is {id: {features...}, ...}
    df = pd.DataFrame(data).T
    # Ensure 'average' is float
    df['average'] = df['average'].astype(float)
    # Ensure 'choices' is list (for scoring later)
    return df

df_train = load_data(TRAIN_JSON_FILE)
df_dev = load_data(DEV_JSON_FILE)

In [18]:
class Score(BaseModel):
    score: int

In [19]:
# Random element in the dataset
item = df_train.sample(1).iloc[0].to_dict()

messages = create_message(item)

model_number = 6  # change to try different models

response: ChatResponse = chat(model="qwen2.5-coder",
                              messages=messages,
                              think=False,
                              format=Score.model_json_schema(),
                              options={
                                  "temperature": 0
                              }
                              )

# ic(messages)
ic(response.model)
ic(response.total_duration * 10e-9)  # convert from ns to s
ic(response.message.role)
ic(response.message.content)
ic(response.message.thinking)
pass

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmodel[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36mqwen2.5-coder[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mtotal_duration[39m[38;5;245m [39m[38;5;245m*[39m[38;5;245m [39m[38;5;36m10e-9[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m5.1274763100000005[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mrole[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36massistant[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mcontent[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36m{[39m[38;5;36m"[39m[38;5;36mscore[39m[38;5;36m"[39m[38;5;36m:

In [20]:
def get_dev_predictions(model_name, df, max_examples=None, think=False):
    preds = []
    failed_ids = []
    ids = list(df.index.astype(str))
    if max_examples is not None:
        ids = ids[:max_examples]

    total_start = time.perf_counter()
    per_item_times = []

    for idx in tqdm(ids):
        item = df.loc[idx].to_dict()
        messages = create_message(item)

        start = time.perf_counter()
        try:
            response: ChatResponse = chat(model=model_name,
                                          messages=messages,
                                          think=think,
                                          format=Score.model_json_schema(),
                                          options={
                                              "temperature": 0
                                          }
                                          )
            elapsed = time.perf_counter() - start

            per_item_times.append((idx, elapsed))

            content = response.message.content
            try:
                s = Score.model_validate_json(content)
                pred = int(s.score)
                if pred < 1 or pred > 5:
                    raise ValueError("score out of range")
            except Exception:
                # Keep id of failed element so it can be removed from evaluation
                print("Invalid JSON or missing/invalid score for item id:", idx, "content:", content)
                pred = None
                failed_ids.append(str(idx))

        except Exception as e:
            elapsed = time.perf_counter() - start
            print(f"Error calling model {model_name} for id {idx}: {e}")
            pred = None
            failed_ids.append(str(idx))
            per_item_times.append((idx, elapsed))

        preds.append({"id": str(idx), "prediction": pred, "time": elapsed})

    total_elapsed = time.perf_counter() - total_start
    # attach summary timings as metadata and failed ids
    return {
        "predictions": preds,
        "failed_ids": failed_ids,
        "total_time": total_elapsed,
        "per_item_times": per_item_times,
        "avg_time": sum(t for _, t in per_item_times) / len(per_item_times) if per_item_times else 0,
    }

In [21]:
for model_name, think in zip(model_names, model_thinking):
    if think:
        MAX_EXAMPLES = 100  # set to an int to limit samples per model
    else:
        MAX_EXAMPLES = None  # set to None for not thinking models

    print(f"\n=== Running model: {model_name} ===")

    # If deepseek check if thinking to have different directoies
    if think:
        OUT_DIR = f"../llm-ollama/zero-shot/{model_name.replace(':', '-')}-think"
        os.makedirs(OUT_DIR, exist_ok=True)
    else:
        OUT_DIR = f"../llm-ollama/zero-shot/{model_name.replace(':', '-')}"
        os.makedirs(OUT_DIR, exist_ok=True)

    # get predictions (may take a while if MAX_EXAMPLES is None)
    # res = get_dev_predictions(model_name, df_dev, max_examples=MAX_EXAMPLES)
    res = get_dev_predictions(model_name, df_dev, max_examples=MAX_EXAMPLES, think=think)

    preds = res["predictions"]

    pred_file = os.path.join(OUT_DIR, "predictions.jsonl")
    # Only write successful predictions (skip None) so pred/ref sizes are aligned
    with open(pred_file, "w") as f:
        for p in preds:
            if p["prediction"] is None:
                # skip failed predictions (they are recorded in failed_ids)
                continue
            f.write(json.dumps({"id": p["id"], "prediction": p["prediction"]}) + "\n")

    # Save failed ids so they can be excluded from scoring
    failed_file = os.path.join(OUT_DIR, "failed_ids.jsonl")
    with open(failed_file, "w") as f:
        for fid in res.get("failed_ids", []):
            f.write(json.dumps({"id": fid}) + "\n")

    # Save timing info and per-item times
    timing_file = os.path.join(OUT_DIR, "timing.txt")
    with open(timing_file, "w") as f:
        f.write(f"total_time_sec: {res['total_time']:.4f}\n")
        f.write(f"avg_time_sec: {res['avg_time']:.4f}\n")
        f.write("per_item_times_sec:\n")
        for idx, t in res["per_item_times"]:
            f.write(f"{idx}: {t:.4f}\n")

    # Create ref.jsonl from df_dev (respect MAX_EXAMPLES) inside the model folder
    # Exclude any ids that failed JSON parsing so they won't be evaluated
    failed_set = set(res.get("failed_ids", []))
    ref_file = os.path.join(OUT_DIR, "ref.jsonl")
    with open(ref_file, "w") as f:
        for idx, row in df_dev.iterrows():
            if MAX_EXAMPLES is not None and int(idx) >= MAX_EXAMPLES:
                break
            if str(idx) in failed_set:
                # skip items that produced invalid JSON for this model
                continue
            f.write(json.dumps({"id": str(idx), "label": row["choices"]}) + "\n")

    print(f"Predictions saved to {pred_file}")
    print(f"Gold data saved to {ref_file}")
    print(f"Timing saved to {timing_file}")

    # Sanity check: warn if counts differ
    n_preds = sum(1 for _ in open(pred_file, "r"))
    n_refs = sum(1 for _ in open(ref_file, "r"))
    if n_preds != n_refs:
        print(f"Warning: #preds ({n_preds}) != #refs ({n_refs}). failed_ids_len={len(res.get('failed_ids', []))}")

    # If there is failed attemp rewrite all of the ids so that they are consecutive. This is needed for the scoring script
    if len(res.get('failed_ids', [])) > 0:
        # Rewrite pred_file with consecutive ids
        new_pred_file = os.path.join(OUT_DIR, "predictions_consecutive_ids.jsonl")
        with open(pred_file, "r") as fin, open(new_pred_file, "w") as fout:
            for new_id, line in enumerate(fin):
                obj = json.loads(line)
                obj["id"] = str(new_id)
                fout.write(json.dumps(obj) + "\n")
        pred_file = new_pred_file

        # Rewrite ref_file with consecutive ids
        new_ref_file = os.path.join(OUT_DIR, "ref_consecutive_ids.jsonl")
        with open(ref_file, "r") as fin, open(new_ref_file, "w") as fout:
            for new_id, line in enumerate(fin):
                obj = json.loads(line)
                obj["id"] = str(new_id)
                fout.write(json.dumps(obj) + "\n")
        ref_file = new_ref_file

    # Run scoring script for this model outputs
    res = subprocess.run(["python", "../score/scoring.py", ref_file, pred_file, os.path.join(OUT_DIR, "score.json")], capture_output=True, text=True)
    print(res.stdout)
    if res.stderr:
        print("Scoring STDERR:")
        print(res.stderr)

    # 
    subprocess.run(["ollama", "stop", model_name], check=False)
    gc.collect()




=== Running model: llama3.1 ===


100%|██████████| 588/588 [04:30<00:00,  2.17it/s]


Predictions saved to ../llm-ollama/zero-shot/llama3.1/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/llama3.1/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/llama3.1/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/llama3.1/predictions.jsonl on ../llm-ollama/zero-shot/llama3.1/ref.jsonl
----------
Spearman Correlation: 0.4943482359983337
Spearman p-Value: 1.4601503972407373e-37
----------
Accuracy: 0.7193877551020408 (423/588)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h


=== Running model: olmo-3:7b-instruct ===


100%|██████████| 588/588 [03:35<00:00,  2.73it/s]


Predictions saved to ../llm-ollama/zero-shot/olmo-3-7b-instruct/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/olmo-3-7b-instruct/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/olmo-3-7b-instruct/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/olmo-3-7b-instruct/predictions.jsonl on ../llm-ollama/zero-shot/olmo-3-7b-instruct/ref.jsonl
----------
Spearman Correlation: 0.36649972972283523
Spearman p-Value: 3.931040774206021e-20
----------
Accuracy: 0.5578231292517006 (328/588)
Results dumped into scores.json successfully.


=== Running model: olmo-3 ===


100%|██████████| 100/100 [43:41<00:00, 26.22s/it] 


Predictions saved to ../llm-ollama/zero-shot/olmo-3-think/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/olmo-3-think/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/olmo-3-think/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/olmo-3-think/predictions.jsonl on ../llm-ollama/zero-shot/olmo-3-think/ref.jsonl
----------
Spearman Correlation: 0.6231983653659102
Spearman p-Value: 4.382612224619235e-12
----------
Accuracy: 0.68 (68/100)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h


=== Running model: granite3.3 ===


100%|██████████| 588/588 [04:00<00:00,  2.44it/s]


Predictions saved to ../llm-ollama/zero-shot/granite3.3/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/granite3.3/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/granite3.3/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/granite3.3/predictions.jsonl on ../llm-ollama/zero-shot/granite3.3/ref.jsonl
----------
Spearman Correlation: 0.4828563815068942
Spearman p-Value: 1.1264137444260875e-35
----------
Accuracy: 0.5799319727891157 (341/588)
Results dumped into scores.json successfully.


=== Running model: ministral-3 ===


100%|██████████| 588/588 [04:44<00:00,  2.06it/s]


Predictions saved to ../llm-ollama/zero-shot/ministral-3/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/ministral-3/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/ministral-3/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/ministral-3/predictions.jsonl on ../llm-ollama/zero-shot/ministral-3/ref.jsonl
----------
Spearman Correlation: 0.5830923225755714
Spearman p-Value: 7.545086025322315e-55
----------
Accuracy: 0.5476190476190477 (322/588)
Results dumped into scores.json successfully.


=== Running model: qwen3 ===


100%|██████████| 588/588 [04:05<00:00,  2.39it/s]


Predictions saved to ../llm-ollama/zero-shot/qwen3/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/qwen3/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/qwen3/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/qwen3/predictions.jsonl on ../llm-ollama/zero-shot/qwen3/ref.jsonl
----------
Spearman Correlation: 0.5733452609839235
Spearman p-Value: 1.0959055070644954e-52
----------
Accuracy: 0.6598639455782312 (388/588)
Results dumped into scores.json successfully.


=== Running model: qwen2.5-coder ===


100%|██████████| 588/588 [04:31<00:00,  2.17it/s]


Predictions saved to ../llm-ollama/zero-shot/qwen2.5-coder/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/qwen2.5-coder/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/qwen2.5-coder/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/qwen2.5-coder/predictions.jsonl on ../llm-ollama/zero-shot/qwen2.5-coder/ref.jsonl
----------
Spearman Correlation: 0.47792729606676804
Spearman p-Value: 6.910551336970857e-35
----------
Accuracy: 0.7074829931972789 (416/588)
Results dumped into scores.json successfully.


=== Running model: deepseek-r1 ===


  7%|▋         | 7/100 [17:48<8:38:11, 334.32s/it]

Invalid JSON or missing/invalid score for item id: 6 content: 


100%|██████████| 100/100 [36:32<00:00, 21.93s/it]  


Predictions saved to ../llm-ollama/zero-shot/deepseek-r1-think/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/deepseek-r1-think/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/deepseek-r1-think/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/deepseek-r1-think/predictions_consecutive_ids.jsonl on ../llm-ollama/zero-shot/deepseek-r1-think/ref_consecutive_ids.jsonl
----------
Spearman Correlation: 0.6529396484772456
Spearman p-Value: 2.409629156925232e-13
----------
Accuracy: 0.696969696969697 (69/99)
Results dumped into scores.json successfully.


=== Running model: deepseek-r1 ===


100%|██████████| 588/588 [04:04<00:00,  2.41it/s]


Predictions saved to ../llm-ollama/zero-shot/deepseek-r1/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/deepseek-r1/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/deepseek-r1/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/deepseek-r1/predictions.jsonl on ../llm-ollama/zero-shot/deepseek-r1/ref.jsonl
----------
Spearman Correlation: 0.5477998321708117
Spearman p-Value: 2.3758214682578052e-47
----------
Accuracy: 0.6513605442176871 (383/588)
Results dumped into scores.json successfully.


=== Running model: gemma3 ===


100%|██████████| 588/588 [05:43<00:00,  1.71it/s]


Predictions saved to ../llm-ollama/zero-shot/gemma3/predictions.jsonl
Gold data saved to ../llm-ollama/zero-shot/gemma3/ref.jsonl
Timing saved to ../llm-ollama/zero-shot/gemma3/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/zero-shot/gemma3/predictions.jsonl on ../llm-ollama/zero-shot/gemma3/ref.jsonl
----------
Spearman Correlation: 0.4673515563127601
Spearman p-Value: 3.0703377733995533e-33
----------
Accuracy: 0.6105442176870748 (359/588)
Results dumped into scores.json successfully.



[?2026h[?25l[1G⠙ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h