In [1]:
from ollama import chat
from ollama import ChatResponse
from pydantic import BaseModel
import subprocess
import gc
import pandas as pd
import json
from icecream import ic
import os
from tqdm import tqdm
import time

In [2]:
# Ones installed on my pc
model_names = [
    "llama3.1",
    "olmo-3:7b-instruct",
    "olmo-3",        # Cannot disable the thinking here but it's essentially the preivous one
    "granite3.3",
    "ministral-3",
    "qwen3",
    "qwen2.5-coder",
    "deepseek-r1",  # Thinking
    "deepseek-r1",  # No thinking
    "gemma3",
]

model_thinking = [
    False,
    False,
    True,
    False,
    False,
    False,
    False,
    True,
    False,
    False,
]

In [3]:
SYSTEM_PROMPT = (
    """
    You are an expert NLU annotator. Your job is to rate how plausible a candidate meaning (sense)
    is for the HOMONYM used in the target sentence within the short story.

    Return ONLY a single JSON object with one key: "score" and an integer value 1, 2, 3, 4 or 5.
    Integer mapping:
      1 = Definitely not
      2 = Probably not
      3 = Ambiguous / Unsure
      4 = Probably yes
      5 = Definitely yes

    The response must be a JSON object and nothing else, for example: {{"score": 4}}

    [EXAMPLES]
    {few_shot_examples}
    """
)

USER_PROMPT = (
    """
    Now, label this new instance:

    [STORY]
    {full_story_text}

    [HOMONYM]
    {homonym}

    [CANDIDATE SENSE]
    {sense_text}

    [TASK]
    Based on the STORY above, decide how plausible it is that the HOMONYM is used with the
    CANDIDATE SENSE in the target sentence.

    Return ONLY a single JSON object with one key "score" and an integer value (1-5)
    as described by the system message. Example output: {{"score": 3}}
    """
)

def build_few_shot_examples(df):
    """Build a five-shot examples string (one example per score 1-5) from `df`.

    Uses rounded average to determine the representative score for an item.
    """
    examples = []
    used_ids = set()

    for score in range(1, 6):
        found = False
        for idx, row in df.iterrows():
            if str(idx) in used_ids:
                continue
            try:
                avg = float(row.get("average", 0))
            except Exception:
                avg = 0.0
            if round(avg) == score:
                full = create_full_story_text(row)
                sense = f"{row.get('judged_meaning', '')} as in \"{row.get('example_sentence','')}\""
                ex = (
                    "[STORY]\n"
                    f"{full}\n\n"
                    "[HOMONYM]\n"
                    f"{row.get('homonym', '')}\n\n"
                    "[CANDIDATE SENSE]\n"
                    f"{sense}\n\n"
                    "[EXAMPLE OUTPUT]\n"
                    f'{{"score": {score}}}'
                )
                examples.append(ex)
                used_ids.add(str(idx))
                found = True
                break
        if not found:
            # fallback: include a short template example if no example found for this score
            examples.append(f"[EXAMPLE OUTPUT]\n{{\"score\": {score}}}")

    return "\n\n".join(examples)

def create_full_story_text(item):
    """Compose the story text used as context for rating.

    Uses `precontext`, `sentence`, and `ending` fields when available and joins them into a single string.
    """
    fullstory = f"{item.get('precontext', '')} {item.get('sentence', '')} {item.get('ending', '')}"
    return fullstory.strip()



In [4]:
TRAIN_JSON_FILE = "../data/train.json"
DEV_JSON_FILE = "../data/dev.json"

def load_data(file_path):
    """
    Loads the json containing the dataset and return a pandas dataframe.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Transpose because the json is {id: {features...}, ...}
    df = pd.DataFrame(data).T
    # Ensure 'average' is float
    df['average'] = df['average'].astype(float)
    # Ensure 'choices' is list (for scoring later)
    return df

df_train = load_data(TRAIN_JSON_FILE)
df_dev = load_data(DEV_JSON_FILE)

In [5]:
# Build and expose the few-shot examples string used by prompts
few_shot_examples = build_few_shot_examples(df_train)
ic(few_shot_examples)

def create_message(item):
    sense = f"{item.get('judged_meaning', '')} as in \"{item.get('example_sentence', '')}\"".strip()
    homonym = item.get("homonym", "")
    full_story_text = create_full_story_text(item)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT.format(
            few_shot_examples = few_shot_examples
        )},
        {"role": "user", "content": USER_PROMPT.format(
            full_story_text=full_story_text,
            homonym=homonym,
            sense_text=sense
        )},
    ]
    return messages

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mfew_shot_examples[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'''[39m[38;5;36m[STORY][39m
[38;5;36m                        The troops were stationed at a remote camp for training. To pass the time, they often engaged in recreational sports. One afternoon, they decided to play a game on the field next to their tents. The soldiers were playing a bat and ball game, which involved them running towards the base. It didn[39m[38;5;36m'[39m[38;5;36mt take long for the winning team to get a home run.[39m
[38;5;36m                        [39m
[38;5;36m                        [HOMONYM][39m
[38;5;36m                        base[39m
[38;5;36m                        [39m
[38;5;36m                        [CANDIDATE SENSE][39m
[38;5;36m                        installation from which a military force initiates operations as in [39m[38;5;36m"[39m[38;5;36mThe troops strategized at their forward base.[39m[38;5;36m

In [6]:
class Score(BaseModel):
    score: int

In [7]:
# Random element in the dataset
item = df_train.sample(1).iloc[0].to_dict()

messages = create_message(item)

model_number = 6  # change to try different models

response: ChatResponse = chat(model=model_names[model_number],
                              messages=messages,
                              think=False,
                              format=Score.model_json_schema(),
                              options={
                                  "temperature": 0
                              }
                              )

# ic(messages)
ic(response.model)
ic(response.total_duration * 10e-9)  # convert from ns to s
ic(response.message.role)
ic(response.message.content)
ic(response.message.thinking)
pass

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmodel[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36mqwen2.5-coder[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mtotal_duration[39m[38;5;245m [39m[38;5;245m*[39m[38;5;245m [39m[38;5;36m10e-9[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m131.6115557[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mrole[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36massistant[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mcontent[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36m{[39m[38;5;36m"[39m[38;5;36mscore[39m[38;5;36m"[39m[38;5;36m: 4}[39

In [8]:
def get_dev_predictions(model_name, df, max_examples=None, think=False):
    preds = []
    failed_ids = []
    ids = list(df.index.astype(str))
    if max_examples is not None:
        ids = ids[:max_examples]

    total_start = time.perf_counter()
    per_item_times = []

    for idx in tqdm(ids):
        item = df.loc[idx].to_dict()
        messages = create_message(item)

        start = time.perf_counter()
        try:
            response: ChatResponse = chat(model=model_name,
                                          messages=messages,
                                          think=think,          # No thinking because otherwise takes ~ 6hours for the whole dev dataset per thinking model
                                          format=Score.model_json_schema(),
                                          options={
                                              "temperature": 0
                                          }
                                          )
            elapsed = time.perf_counter() - start

            per_item_times.append((idx, elapsed))

            content = response.message.content
            try:
                s = Score.model_validate_json(content)
                pred = int(s.score)
                if pred < 1 or pred > 5:
                    raise ValueError("score out of range")
            except Exception:
                # Keep id of failed element so it can be removed from evaluation
                print("Invalid JSON or missing/invalid score for item id:", idx, "content:", content)
                pred = None
                failed_ids.append(str(idx))

        except Exception as e:
            elapsed = time.perf_counter() - start
            print(f"Error calling model {model_name} for id {idx}: {e}")
            pred = None
            failed_ids.append(str(idx))
            per_item_times.append((idx, elapsed))

        preds.append({"id": str(idx), "prediction": pred, "time": elapsed})

    total_elapsed = time.perf_counter() - total_start
    # attach summary timings as metadata and failed ids
    return {
        "predictions": preds,
        "failed_ids": failed_ids,
        "total_time": total_elapsed,
        "per_item_times": per_item_times,
        "avg_time": sum(t for _, t in per_item_times) / len(per_item_times) if per_item_times else 0,
    }

In [9]:
for model_name, think in zip(model_names, model_thinking):
    if think:
        MAX_EXAMPLES = 100  # set to an int to limit samples per model
    else:
        MAX_EXAMPLES = None  # set to None for not thinking models

    print(f"\n=== Running model: {model_name} ===")

    # If deepseek check if thinking to have different directoies
    if think:
        OUT_DIR = f"../llm-ollama/five-shot/{model_name.replace(':', '-')}-think"
        os.makedirs(OUT_DIR, exist_ok=True)
    else:
        OUT_DIR = f"../llm-ollama/five-shot/{model_name.replace(':', '-')}"
        os.makedirs(OUT_DIR, exist_ok=True)

    # get predictions (may take a while if MAX_EXAMPLES is None)
    # res = get_dev_predictions(model_name, df_dev, max_examples=MAX_EXAMPLES)
    res = get_dev_predictions(model_name, df_dev, max_examples=MAX_EXAMPLES, think=think)

    preds = res["predictions"]

    pred_file = os.path.join(OUT_DIR, "predictions.jsonl")
    with open(pred_file, "w") as f:
        for p in preds:
            f.write(json.dumps({"id": p["id"], "prediction": p["prediction"]}) + "\n")

    # Save failed ids so they can be excluded from scoring
    failed_file = os.path.join(OUT_DIR, "failed_ids.jsonl")
    with open(failed_file, "w") as f:
        for fid in res.get("failed_ids", []):
            f.write(json.dumps({"id": fid}) + "\n")

    # Save timing info and per-item times
    timing_file = os.path.join(OUT_DIR, "timing.txt")
    with open(timing_file, "w") as f:
        f.write(f"total_time_sec: {res['total_time']:.4f}\n")
        f.write(f"avg_time_sec: {res['avg_time']:.4f}\n")
        f.write("per_item_times_sec:\n")
        for idx, t in res["per_item_times"]:
            f.write(f"{idx}: {t:.4f}\n")

    # Create ref.jsonl from df_dev (respect MAX_EXAMPLES) inside the model folder
    # Exclude any ids that failed JSON parsing so they won't be evaluated
    failed_set = set(res.get("failed_ids", []))
    ref_file = os.path.join(OUT_DIR, "ref.jsonl")
    with open(ref_file, "w") as f:
        for idx, row in df_dev.iterrows():
            if MAX_EXAMPLES is not None and int(idx) >= MAX_EXAMPLES:
                break
            if str(idx) in failed_set:
                # skip items that produced invalid JSON for this model
                continue
            f.write(json.dumps({"id": str(idx), "label": row["choices"]}) + "\n")

    print(f"Predictions saved to {pred_file}")
    print(f"Gold data saved to {ref_file}")
    print(f"Timing saved to {timing_file}")

    # If there is failed attemp rewrite all of the ids so that they are consecutive. This is needed for the scoring script
    if len(res.get('failed_ids', [])) > 0:
        # Rewrite pred_file with consecutive ids
        new_pred_file = os.path.join(OUT_DIR, "predictions_consecutive_ids.jsonl")
        with open(pred_file, "r") as fin, open(new_pred_file, "w") as fout:
            for new_id, line in enumerate(fin):
                obj = json.loads(line)
                obj["id"] = str(new_id)
                fout.write(json.dumps(obj) + "\n")
        pred_file = new_pred_file

        # Rewrite ref_file with consecutive ids
        new_ref_file = os.path.join(OUT_DIR, "ref_consecutive_ids.jsonl")
        with open(ref_file, "r") as fin, open(new_ref_file, "w") as fout:
            for new_id, line in enumerate(fin):
                obj = json.loads(line)
                obj["id"] = str(new_id)
                fout.write(json.dumps(obj) + "\n")
        ref_file = new_ref_file
        
    # Run scoring script for this model outputs
    res = subprocess.run(["python", "../score/scoring.py", ref_file, pred_file, os.path.join(OUT_DIR, "score.json")], capture_output=True, text=True)
    print(res.stdout)
    if res.stderr:
        print("Scoring STDERR:")
        print(res.stderr)

    # 
    subprocess.run(["ollama", "stop", model_name], check=False)
    gc.collect()


=== Running model: llama3.1 ===


100%|██████████| 588/588 [04:41<00:00,  2.09it/s]


Predictions saved to ../llm-ollama/five-shot/llama3.1/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/llama3.1/ref.jsonl
Timing saved to ../llm-ollama/five-shot/llama3.1/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/llama3.1/predictions.jsonl on ../llm-ollama/five-shot/llama3.1/ref.jsonl
----------
Spearman Correlation: 0.31508308957374087
Spearman p-Value: 5.106304522515722e-15
----------
Accuracy: 0.6479591836734694 (381/588)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h


=== Running model: olmo-3:7b-instruct ===


100%|██████████| 588/588 [03:43<00:00,  2.63it/s]


Predictions saved to ../llm-ollama/five-shot/olmo-3-7b-instruct/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/olmo-3-7b-instruct/ref.jsonl
Timing saved to ../llm-ollama/five-shot/olmo-3-7b-instruct/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/olmo-3-7b-instruct/predictions.jsonl on ../llm-ollama/five-shot/olmo-3-7b-instruct/ref.jsonl
----------
Spearman Correlation: 0.41489872936938266
Spearman p-Value: 7.204535289671061e-26
----------
Accuracy: 0.6275510204081632 (369/588)
Results dumped into scores.json successfully.


=== Running model: olmo-3 ===


100%|██████████| 100/100 [1:00:52<00:00, 36.53s/it]


Predictions saved to ../llm-ollama/five-shot/olmo-3-think/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/olmo-3-think/ref.jsonl
Timing saved to ../llm-ollama/five-shot/olmo-3-think/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/olmo-3-think/predictions.jsonl on ../llm-ollama/five-shot/olmo-3-think/ref.jsonl
----------
Spearman Correlation: 0.4913225231409942
Spearman p-Value: 2.0980337824087246e-07
----------
Accuracy: 0.62 (62/100)
Results dumped into scores.json successfully.


=== Running model: granite3.3 ===


100%|██████████| 588/588 [03:13<00:00,  3.03it/s]


Predictions saved to ../llm-ollama/five-shot/granite3.3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/granite3.3/ref.jsonl
Timing saved to ../llm-ollama/five-shot/granite3.3/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/granite3.3/predictions.jsonl on ../llm-ollama/five-shot/granite3.3/ref.jsonl
----------
Spearman Correlation: 0.48629300429948846
Spearman p-Value: 3.124566219001688e-36
----------
Accuracy: 0.6258503401360545 (368/588)
Results dumped into scores.json successfully.


=== Running model: ministral-3 ===


100%|██████████| 588/588 [04:45<00:00,  2.06it/s]


Predictions saved to ../llm-ollama/five-shot/ministral-3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/ministral-3/ref.jsonl
Timing saved to ../llm-ollama/five-shot/ministral-3/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/ministral-3/predictions.jsonl on ../llm-ollama/five-shot/ministral-3/ref.jsonl
----------
Spearman Correlation: 0.6168662767618583
Spearman p-Value: 6.160192035217521e-63
----------
Accuracy: 0.576530612244898 (339/588)
Results dumped into scores.json successfully.


=== Running model: qwen3 ===


100%|██████████| 588/588 [04:08<00:00,  2.37it/s]


Predictions saved to ../llm-ollama/five-shot/qwen3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/qwen3/ref.jsonl
Timing saved to ../llm-ollama/five-shot/qwen3/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/qwen3/predictions.jsonl on ../llm-ollama/five-shot/qwen3/ref.jsonl
----------
Spearman Correlation: 0.5372580369215413
Spearman p-Value: 2.7984587446528573e-45
----------
Accuracy: 0.6598639455782312 (388/588)
Results dumped into scores.json successfully.


=== Running model: qwen2.5-coder ===


100%|██████████| 588/588 [04:40<00:00,  2.09it/s]


Predictions saved to ../llm-ollama/five-shot/qwen2.5-coder/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/qwen2.5-coder/ref.jsonl
Timing saved to ../llm-ollama/five-shot/qwen2.5-coder/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/qwen2.5-coder/predictions.jsonl on ../llm-ollama/five-shot/qwen2.5-coder/ref.jsonl
----------
Spearman Correlation: 0.507488887797628
Spearman p-Value: 8.253353707346186e-40
----------
Accuracy: 0.7142857142857143 (420/588)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h


=== Running model: deepseek-r1 ===


100%|██████████| 100/100 [24:32<00:00, 14.72s/it]


Predictions saved to ../llm-ollama/five-shot/deepseek-r1-think/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/deepseek-r1-think/ref.jsonl
Timing saved to ../llm-ollama/five-shot/deepseek-r1-think/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/deepseek-r1-think/predictions.jsonl on ../llm-ollama/five-shot/deepseek-r1-think/ref.jsonl
----------
Spearman Correlation: 0.6298055837217469
Spearman p-Value: 2.22512049732196e-12
----------
Accuracy: 0.72 (72/100)
Results dumped into scores.json successfully.


=== Running model: deepseek-r1 ===


100%|██████████| 588/588 [04:05<00:00,  2.40it/s]


Predictions saved to ../llm-ollama/five-shot/deepseek-r1/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/deepseek-r1/ref.jsonl
Timing saved to ../llm-ollama/five-shot/deepseek-r1/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/deepseek-r1/predictions.jsonl on ../llm-ollama/five-shot/deepseek-r1/ref.jsonl
----------
Spearman Correlation: 0.5288611239625444
Spearman p-Value: 1.109321351632556e-43
----------
Accuracy: 0.6700680272108843 (394/588)
Results dumped into scores.json successfully.


=== Running model: gemma3 ===


100%|██████████| 588/588 [05:40<00:00,  1.73it/s]


Predictions saved to ../llm-ollama/five-shot/gemma3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot/gemma3/ref.jsonl
Timing saved to ../llm-ollama/five-shot/gemma3/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot/gemma3/predictions.jsonl on ../llm-ollama/five-shot/gemma3/ref.jsonl
----------
Spearman Correlation: 0.4673959399881929
Spearman p-Value: 3.022670690391306e-33
----------
Accuracy: 0.6360544217687075 (374/588)
Results dumped into scores.json successfully.



[?2026h[?25l[1G⠙ [K[?25h[?2026l[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h