In [9]:
from ollama import chat
from ollama import ChatResponse
from pydantic import BaseModel
import subprocess
import gc
import pandas as pd
import json
from icecream import ic
import os
from tqdm import tqdm
import time

In [10]:
# Ones installed on my pc
model_names = [
    "llama3.1:latest",
    "olmo-3:7b-instruct",
    # "olmo-3:latest",        # Cannot disable the thinking here but it's essentially the preivous one
    "granite3.3:latest",
    "ministral-3:latest",
    "qwen3:latest",   
    "deepseek-r1:latest",   # here with no thinking
    "gemma3:latest",
]

# Just to know if they are thinking models
support_thinking = [
    False,
    False,
    # True,
    False,
    False,
    True,
    False,
]

In [11]:
SYSTEM_PROMPT = (
    """
    You are an expert NLU annotator. Your job is to rate how plausible a candidate meaning (sense)
    is for the HOMONYM used in the target sentence within the short story.

    Return ONLY a single JSON object with one key: "score" and an integer value 1, 2, 3, 4 or 5.
    Integer mapping:
      1 = Definitely not
      2 = Probably not
      3 = Ambiguous / Unsure
      4 = Probably yes
      5 = Definitely yes

    The response must be a JSON object and nothing else, for example: {{"score": 4}}
    """
)

USER_PROMPT = (
    """
    [STORY]
    {full_story_text}

    [HOMONYM]
    {homonym}

    [CANDIDATE SENSE]
    {sense_text}

    [TASK]
    Based on the STORY above, decide how plausible it is that the HOMONYM is used with the
    CANDIDATE SENSE in the target sentence.

    Return ONLY a single JSON object with one key "score" and an integer value (1-5)
    as described by the system message. Example output: {{"score": 3}}
    """
)


def create_full_story_text(item):
    """Compose the story text used as context for rating.

    Uses `precontext`, `sentence`, and `ending` fields when available and joins them into a single string.
    """
    fullstory = f"{item.get('precontext', '')} {item.get('sentence', '')} {item.get('ending', '')}"
    return fullstory.strip()


def create_message(item):
    sense = f"{item.get('judged_meaning', '')} as in \"{item.get('example_sentence', '')}\"".strip()
    homonym = item.get("homonym", "")
    full_story_text = create_full_story_text(item)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT.format(
            full_story_text=full_story_text,
            homonym=homonym,
            sense_text=sense,
        )},
    ]
    return messages

In [12]:
TRAIN_JSON_FILE = "../data/train.json"
DEV_JSON_FILE = "../data/dev.json"

def load_data(file_path):
    """
    Loads the json containing the dataset and return a pandas dataframe.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Transpose because the json is {id: {features...}, ...}
    df = pd.DataFrame(data).T
    # Ensure 'average' is float
    df['average'] = df['average'].astype(float)
    # Ensure 'choices' is list (for scoring later)
    return df

df_train = load_data(TRAIN_JSON_FILE)
df_dev = load_data(DEV_JSON_FILE)

In [13]:
class Score(BaseModel):
    score: int

In [14]:
# Random element in the dataset
item = df_train.sample(1).iloc[0].to_dict()

messages = create_message(item)

model_number = 6  # change to try different models

response: ChatResponse = chat(model=model_names[model_number],
                              messages=messages,
                              think=False,
                              format=Score.model_json_schema(),
                              options={
                                  "temperature": 0
                              }
                              )

# ic(messages)
ic(response.model)
ic(response.total_duration * 10e-9)  # convert from ns to s
ic(response.message.role)
ic(response.message.content)
ic(response.message.thinking)
pass

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmodel[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36mgemma3:latest[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mtotal_duration[39m[38;5;245m [39m[38;5;245m*[39m[38;5;245m [39m[38;5;36m10e-9[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m47.26069995[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mrole[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36massistant[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mcontent[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'''[39m[38;5;36m{[39m[38;5;36m"[39m[38;5;36mscore[39m[38;5;36m"[39m[38;5;36m: 2}[

In [15]:
def get_dev_predictions(model_name, df, max_examples=None):
    preds = []
    failed_ids = []
    ids = list(df.index.astype(str))
    if max_examples is not None:
        ids = ids[:max_examples]

    total_start = time.perf_counter()
    per_item_times = []

    for idx in tqdm(ids):
        item = df.loc[idx].to_dict()
        messages = create_message(item)

        start = time.perf_counter()
        try:
            response: ChatResponse = chat(model=model_name,
                                          messages=messages,
                                          think=False,          # No thinking because otherwise takes ~ 6hours for the whole dev dataset per thinking model
                                          format=Score.model_json_schema(),
                                          options={
                                              "temperature": 0
                                          }
                                          )
            elapsed = time.perf_counter() - start

            per_item_times.append((idx, elapsed))

            content = response.message.content
            try:
                s = Score.model_validate_json(content)
                pred = int(s.score)
                if pred < 1 or pred > 5:
                    raise ValueError("score out of range")
            except Exception:
                # Keep id of failed element so it can be removed from evaluation
                print("Invalid JSON or missing/invalid score for item id:", idx, "content:", content)
                pred = None
                failed_ids.append(str(idx))

        except Exception as e:
            elapsed = time.perf_counter() - start
            print(f"Error calling model {model_name} for id {idx}: {e}")
            pred = None
            failed_ids.append(str(idx))
            per_item_times.append((idx, elapsed))

        preds.append({"id": str(idx), "prediction": pred, "time": elapsed})

    total_elapsed = time.perf_counter() - total_start
    # attach summary timings as metadata and failed ids
    return {
        "predictions": preds,
        "failed_ids": failed_ids,
        "total_time": total_elapsed,
        "per_item_times": per_item_times,
        "avg_time": sum(t for _, t in per_item_times) / len(per_item_times) if per_item_times else 0,
    }

### Run this cell and manually change the model

In [None]:
# Run smoke-test for each model in `model_names`
MAX_EXAMPLES = None  # set to an int to limit samples per model

for model_name in model_names:
    print(f"\n=== Running model: {model_name} ===")
    OUT_DIR = f"../llm-ollama/zero-shot/{model_name.replace(':', '-') }"
    os.makedirs(OUT_DIR, exist_ok=True)

    # get predictions (may take a while if MAX_EXAMPLES is None)
    res = get_dev_predictions(model_name, df_dev, max_examples=MAX_EXAMPLES)

    preds = res["predictions"]

    pred_file = os.path.join(OUT_DIR, "predictions.jsonl")
    with open(pred_file, "w") as f:
        for p in preds:
            f.write(json.dumps({"id": p["id"], "prediction": p["prediction"]}) + "\n")

    # Save failed ids so they can be excluded from scoring
    failed_file = os.path.join(OUT_DIR, "failed_ids.jsonl")
    with open(failed_file, "w") as f:
        for fid in res.get("failed_ids", []):
            f.write(json.dumps({"id": fid}) + "\n")

    # Save timing info and per-item times
    timing_file = os.path.join(OUT_DIR, "timing.txt")
    with open(timing_file, "w") as f:
        f.write(f"total_time_sec: {res['total_time']:.4f}\n")
        f.write(f"avg_time_sec: {res['avg_time']:.4f}\n")
        f.write("per_item_times_sec:\n")
        for idx, t in res["per_item_times"]:
            f.write(f"{idx}: {t:.4f}\n")

    # Create ref.jsonl from df_dev (respect MAX_EXAMPLES) inside the model folder
    # Exclude any ids that failed JSON parsing so they won't be evaluated
    failed_set = set(res.get("failed_ids", []))
    ref_file = os.path.join(OUT_DIR, "ref.jsonl")
    with open(ref_file, "w") as f:
        for idx, row in df_dev.iterrows():
            if MAX_EXAMPLES is not None and int(idx) >= MAX_EXAMPLES:
                break
            if str(idx) in failed_set:
                # skip items that produced invalid JSON for this model
                continue
            f.write(json.dumps({"id": str(idx), "label": row["choices"]}) + "\n")

    print(f"Predictions saved to {pred_file}")
    print(f"Gold data saved to {ref_file}")
    print(f"Timing saved to {timing_file}")

    # Run scoring script for this model outputs
    res = subprocess.run(["python", "../score/scoring.py", ref_file, pred_file, os.path.join(OUT_DIR, "score.json")], capture_output=True, text=True)
    print(res.stdout)
    if res.stderr:
        print("Scoring STDERR:")
        print(res.stderr)

    # 
    subprocess.run(["ollama", "stop", model_name], check=False)
    gc.collect()


=== Running model: llama3.1:latest ===


100%|██████████| 588/588 [03:18<00:00,  2.97it/s]



Predictions saved to ../llm-ollama/llama3.1-latest/predictions.jsonl
Gold data saved to ../llm-ollama/llama3.1-latest/ref.jsonl
Timing saved to ../llm-ollama/llama3.1-latest/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/llama3.1-latest/predictions.jsonl on ../llm-ollama/llama3.1-latest/ref.jsonl
----------
Spearman Correlation: 0.4943482359983337
Spearman p-Value: 1.4601503972407373e-37
----------
Accuracy: 0.7193877551020408 (423/588)
Results dumped into scores.json successfully.


=== Running model: olmo-3:7b-instruct ===


100%|██████████| 588/588 [03:30<00:00,  2.79it/s]



Predictions saved to ../llm-ollama/olmo-3-7b-instruct/predictions.jsonl
Gold data saved to ../llm-ollama/olmo-3-7b-instruct/ref.jsonl
Timing saved to ../llm-ollama/olmo-3-7b-instruct/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/olmo-3-7b-instruct/predictions.jsonl on ../llm-ollama/olmo-3-7b-instruct/ref.jsonl
----------
Spearman Correlation: 0.36649972972283523
Spearman p-Value: 3.931040774206021e-20
----------
Accuracy: 0.5578231292517006 (328/588)
Results dumped into scores.json successfully.


=== Running model: granite3.3:latest ===


100%|██████████| 588/588 [03:19<00:00,  2.95it/s]



Predictions saved to ../llm-ollama/granite3.3-latest/predictions.jsonl
Gold data saved to ../llm-ollama/granite3.3-latest/ref.jsonl
Timing saved to ../llm-ollama/granite3.3-latest/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/granite3.3-latest/predictions.jsonl on ../llm-ollama/granite3.3-latest/ref.jsonl
----------
Spearman Correlation: 0.4828563815068942
Spearman p-Value: 1.1264137444260875e-35
----------
Accuracy: 0.5799319727891157 (341/588)
Results dumped into scores.json successfully.


=== Running model: ministral-3:latest ===


100%|██████████| 588/588 [04:36<00:00,  2.13it/s]



Predictions saved to ../llm-ollama/ministral-3-latest/predictions.jsonl
Gold data saved to ../llm-ollama/ministral-3-latest/ref.jsonl
Timing saved to ../llm-ollama/ministral-3-latest/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/ministral-3-latest/predictions.jsonl on ../llm-ollama/ministral-3-latest/ref.jsonl
----------
Spearman Correlation: 0.5830923225755714
Spearman p-Value: 7.545086025322315e-55
----------
Accuracy: 0.5476190476190477 (322/588)
Results dumped into scores.json successfully.


=== Running model: qwen3:latest ===


100%|██████████| 588/588 [03:59<00:00,  2.45it/s]



Predictions saved to ../llm-ollama/qwen3-latest/predictions.jsonl
Gold data saved to ../llm-ollama/qwen3-latest/ref.jsonl
Timing saved to ../llm-ollama/qwen3-latest/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/qwen3-latest/predictions.jsonl on ../llm-ollama/qwen3-latest/ref.jsonl
----------
Spearman Correlation: 0.5733452609839235
Spearman p-Value: 1.0959055070644954e-52
----------
Accuracy: 0.6598639455782312 (388/588)
Results dumped into scores.json successfully.


=== Running model: deepseek-r1:latest ===


100%|██████████| 588/588 [04:00<00:00,  2.44it/s]



Predictions saved to ../llm-ollama/deepseek-r1-latest/predictions.jsonl
Gold data saved to ../llm-ollama/deepseek-r1-latest/ref.jsonl
Timing saved to ../llm-ollama/deepseek-r1-latest/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/deepseek-r1-latest/predictions.jsonl on ../llm-ollama/deepseek-r1-latest/ref.jsonl
----------
Spearman Correlation: 0.5477998321708117
Spearman p-Value: 2.3758214682578052e-47
----------
Accuracy: 0.6513605442176871 (383/588)
Results dumped into scores.json successfully.


=== Running model: gemma3:latest ===


100%|██████████| 588/588 [05:29<00:00,  1.78it/s]



Predictions saved to ../llm-ollama/gemma3-latest/predictions.jsonl
Gold data saved to ../llm-ollama/gemma3-latest/ref.jsonl
Timing saved to ../llm-ollama/gemma3-latest/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/gemma3-latest/predictions.jsonl on ../llm-ollama/gemma3-latest/ref.jsonl
----------
Spearman Correlation: 0.4673515563127601
Spearman p-Value: 3.0703377733995533e-33
----------
Accuracy: 0.6105442176870748 (359/588)
Results dumped into scores.json successfully.

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/gemma3-latest/predictions.jsonl on ../llm-ollama/gemma3-latest/ref.jsonl
----------
Spearman Correlation: 0.4673515563127601
Spearman p-Value: 3.0703377733995533e-33
----------
Accuracy: 0.6105442176870748 (359/588)
Results dumped into scores.json successfully.



[?2026h[?25l[1G⠙ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h