In [10]:
from ollama import chat
from ollama import ChatResponse
from pydantic import BaseModel
import subprocess
import gc
import pandas as pd
import json
from icecream import ic
import os
from tqdm import tqdm
import time

In [11]:
# Ones installed on my pc
model_names = [
    "llama3.1",
    "olmo-3:7b-instruct",
    "olmo-3",        # Cannot disable the thinking here but it's essentially the preivous one
    "granite3.3",
    "ministral-3",
    "qwen3",
    "qwen2.5-coder",
    "deepseek-r1",   # here with no thinking
    "deepseek-r1",
    "gemma3",
    "phi4-mini",
]

model_thinking = [
    False,
    False,
    True,
    False,
    False,
    False,
    False,
    True,
    False,
    False,
    False
]

In [12]:
SYSTEM_PROMPT = (
    """
    You are an expert NLU annotator. Your job is to rate how plausible a candidate meaning (sense)
    is for the HOMONYM used in the target sentence within the short story.

    Return ONLY a single JSON object with one key: "score" and an integer value 1, 2, 3, 4 or 5.
    Integer mapping:
      1 = Definitely not
      2 = Probably not
      3 = Ambiguous / Unsure
      4 = Probably yes
      5 = Definitely yes

    The response must be a JSON object and nothing else, for example: {{"score": 4}}

    [EXAMPLES]
    {few_shot_examples}
    """
)

USER_PROMPT = (
    """
    Now, label this new instance:

    [STORY]
    {full_story_text}

    [HOMONYM]
    {homonym}

    [CANDIDATE SENSE]
    {sense_text}

    [ADDITIONAL CONTEXT]
    A DeBERTa model (Accuracy: {deberta_accuracy:.2f}, Spearman Correlation: {deberta_spearman:.2f}) predicted a score of {deberta_prediction:.2f} for this example.
    You can use this information to guide your decision, but rely on your own judgment if the context strongly suggests otherwise.

    [TASK]
    Based on the STORY above, decide how plausible it is that the HOMONYM is used with the
    CANDIDATE SENSE in the target sentence.

    Return ONLY a single JSON object with one key "score" and an integer value (1-5)
    as described by the system message. Example output: {{"score": 3}}
    """
)

In [13]:
def create_full_story_text(item):
    """Compose the story text used as context for rating.

    Uses `precontext`, `sentence`, and `ending` fields when available and joins them into a single string.
    """
    fullstory = f"{item.get('precontext', '')} {item.get('sentence', '')} {item.get('ending', '')}"
    return fullstory.strip()

def build_few_shot_examples(df):
    """Build a five-shot examples string (one example per score 1-5) from `df`.

    Uses rounded average to determine the representative score for an item.
    """
    examples = []
    used_ids = set()

    for score in range(1, 6):
        found = False
        for idx, row in df.iterrows():
            if str(idx) in used_ids:
                continue
            try:
                avg = float(row.get("average", 0))
            except Exception:
                avg = 0.0
            if round(avg) == score:
                full = create_full_story_text(row)
                sense = f"{row.get('judged_meaning', '')} as in \"{row.get('example_sentence','')}\""
                ex = (
                    "[STORY]\n"
                    f"{full}\n\n"
                    "[HOMONYM]\n"
                    f"{row.get('homonym', '')}\n\n"
                    "[CANDIDATE SENSE]\n"
                    f"{sense}\n\n"
                    "[EXAMPLE OUTPUT]\n"
                    f'{{"score": {score}}}'
                )
                examples.append(ex)
                used_ids.add(str(idx))
                found = True
                break
        if not found:
            # fallback: include a short template example if no example found for this score
            examples.append(f"[EXAMPLE OUTPUT]\n{{\"score\": {score}}}")

    return "\n\n".join(examples)

def create_message(item, deberta_pred, deberta_scores):
    sense = f"{item.get('judged_meaning', '')} as in \"{item.get('example_sentence', '')}\"".strip()
    homonym = item.get("homonym", "")
    full_story_text = create_full_story_text(item)

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT.format(few_shot_examples=few_shot_examples)},
        {"role": "user", "content": USER_PROMPT.format(
            full_story_text=full_story_text,
            homonym=homonym,
            sense_text=sense,
            deberta_accuracy=deberta_scores['accuracy'],
            deberta_spearman=deberta_scores['spearman'],
            deberta_prediction=deberta_pred
        )},
    ]
    return messages

In [14]:
TRAIN_JSON_FILE = "../data/train.json"
DEV_JSON_FILE = "../data/dev.json"
DEBERTA_PRED_FILE = "../deberta-finetune-2/predictions.jsonl"
DEBERTA_SCORE_FILE = "../deberta-finetune-2/score.json"

def load_data(file_path):
    """
    Loads the json containing the dataset and return a pandas dataframe.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Transpose because the json is {id: {features...}, ...}
    df = pd.DataFrame(data).T
    # Ensure 'average' is float
    df['average'] = df['average'].astype(float)
    # Ensure 'choices' is list (for scoring later)
    return df

def load_deberta_results(pred_file, score_file):
    with open(score_file, 'r') as f:
        scores = json.load(f)
    
    preds = {}
    with open(pred_file, 'r') as f:
        for line in f:
            item = json.loads(line)
            preds[str(item['id'])] = item['prediction']
            
    return scores, preds

df_train = load_data(TRAIN_JSON_FILE)
df_dev = load_data(DEV_JSON_FILE)
deberta_scores, deberta_preds = load_deberta_results(DEBERTA_PRED_FILE, DEBERTA_SCORE_FILE)

# Build and expose the few-shot examples string used by prompts
few_shot_examples = build_few_shot_examples(df_train)
ic(few_shot_examples)

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mfew_shot_examples[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'''[39m[38;5;36m[STORY][39m
[38;5;36m                        The troops were stationed at a remote camp for training. To pass the time, they often engaged in recreational sports. One afternoon, they decided to play a game on the field next to their tents. The soldiers were playing a bat and ball game, which involved them running towards the base. It didn[39m[38;5;36m'[39m[38;5;36mt take long for the winning team to get a home run.[39m
[38;5;36m                        [39m
[38;5;36m                        [HOMONYM][39m
[38;5;36m                        base[39m
[38;5;36m                        [39m
[38;5;36m                        [CANDIDATE SENSE][39m
[38;5;36m                        installation from which a military force initiates operations as in [39m[38;5;36m"[39m[38;5;36mThe troops strategized at their forward base.[39m[38;5;36m

'[STORY]\nThe troops were stationed at a remote camp for training. To pass the time, they often engaged in recreational sports. One afternoon, they decided to play a game on the field next to their tents. The soldiers were playing a bat and ball game, which involved them running towards the base. It didn\'t take long for the winning team to get a home run.\n\n[HOMONYM]\nbase\n\n[CANDIDATE SENSE]\ninstallation from which a military force initiates operations as in "The troops strategized at their forward base."\n\n[EXAMPLE OUTPUT]\n{"score": 1}\n\n[STORY]\nThe old machine hummed in the corner of the workshop. Clara examined its dusty dials with a furrowed brow. She wondered if it could be brought back to life. The potential couldn\'t be measured. The machine could make such wonderful clothing if it were in working order.\n\n[HOMONYM]\npotential\n\n[CANDIDATE SENSE]\nthe difference in electrical charge between two points in a circuit expressed in volts as in "The circuit has a high poten

In [15]:
class Score(BaseModel):
    score: int

In [16]:
# Random element in the dataset
sample = df_dev.sample(1)
idx = str(sample.index[0])
item = sample.iloc[0].to_dict()

# Get deberta prediction
deberta_pred = deberta_preds.get(idx, 0.0)

messages = create_message(item, deberta_pred, deberta_scores)

model_number = 6  # change to try different models

response: ChatResponse = chat(model="qwen2.5-coder",
                              messages=messages,
                              think=False,
                              format=Score.model_json_schema(),
                              options={
                                  "temperature": 0
                              }
                              )

# ic(messages)
ic(response.model)
ic(response.total_duration * 10e-9)  # convert from ns to s
ic(response.message.role)
ic(response.message.content)
ic(response.message.thinking)
ic(messages)

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmodel[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36mqwen2.5-coder[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mtotal_duration[39m[38;5;245m [39m[38;5;245m*[39m[38;5;245m [39m[38;5;36m10e-9[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m76.38477409000001[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mrole[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36massistant[39m[38;5;36m'[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mresponse[39m[38;5;245m.[39m[38;5;247mmessage[39m[38;5;245m.[39m[38;5;247mcontent[39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m'[39m[38;5;36m{[39m[38;5;36m"[39m[38;5;36mscore[39m[38;5;36m"[39m[38;5;36m: 

[{'role': 'system',
  'content': '\n    You are an expert NLU annotator. Your job is to rate how plausible a candidate meaning (sense)\n    is for the HOMONYM used in the target sentence within the short story.\n\n    Return ONLY a single JSON object with one key: "score" and an integer value 1, 2, 3, 4 or 5.\n    Integer mapping:\n      1 = Definitely not\n      2 = Probably not\n      3 = Ambiguous / Unsure\n      4 = Probably yes\n      5 = Definitely yes\n\n    The response must be a JSON object and nothing else, for example: {"score": 4}\n\n    [EXAMPLES]\n    [STORY]\nThe troops were stationed at a remote camp for training. To pass the time, they often engaged in recreational sports. One afternoon, they decided to play a game on the field next to their tents. The soldiers were playing a bat and ball game, which involved them running towards the base. It didn\'t take long for the winning team to get a home run.\n\n[HOMONYM]\nbase\n\n[CANDIDATE SENSE]\ninstallation from which a mil

In [17]:
def get_dev_predictions(model_name, df, deberta_preds, deberta_scores, max_examples=None, think=False):
    preds = []
    failed_ids = []
    ids = list(df.index.astype(str))
    if max_examples is not None:
        ids = ids[:max_examples]

    total_start = time.perf_counter()
    per_item_times = []

    for idx in tqdm(ids):
        item = df.loc[idx].to_dict()
        
        deberta_pred = deberta_preds.get(str(idx))
        if deberta_pred is None:
            print(f"Warning: No DeBERTa prediction for id {idx}")
            deberta_pred = 0.0

        messages = create_message(item, deberta_pred, deberta_scores)

        start = time.perf_counter()
        try:
            response: ChatResponse = chat(model=model_name,
                                          messages=messages,
                                          think=think,
                                          format=Score.model_json_schema(),
                                          options={
                                              "temperature": 0
                                          }
                                          )
            elapsed = time.perf_counter() - start

            per_item_times.append((idx, elapsed))

            content = response.message.content
            try:
                s = Score.model_validate_json(content)
                pred = int(s.score)
                if pred < 1 or pred > 5:
                    raise ValueError("score out of range")
            except Exception:
                # Keep id of failed element so it can be removed from evaluation
                print("Invalid JSON or missing/invalid score for item id:", idx, "content:", content)
                pred = None
                failed_ids.append(str(idx))

        except Exception as e:
            elapsed = time.perf_counter() - start
            print(f"Error calling model {model_name} for id {idx}: {e}")
            pred = None
            failed_ids.append(str(idx))
            per_item_times.append((idx, elapsed))

        preds.append({"id": str(idx), "prediction": pred, "time": elapsed})

    total_elapsed = time.perf_counter() - total_start
    # attach summary timings as metadata and failed ids
    return {
        "predictions": preds,
        "failed_ids": failed_ids,
        "total_time": total_elapsed,
        "per_item_times": per_item_times,
        "avg_time": sum(t for _, t in per_item_times) / len(per_item_times) if per_item_times else 0,
    }

In [18]:
for model_name, think in zip(model_names, model_thinking):
    if think:
        MAX_EXAMPLES = 100  # set to an int to limit samples per model
    else:
        MAX_EXAMPLES = None  # set to None for not thinking models

    print(f"\n=== Running model: {model_name} ===")

    # If deepseek check if thinking to have different directoies
    if think:
        OUT_DIR = f"../llm-ollama/five-shot-deberta/{model_name.replace(':', '-')}-think"
        os.makedirs(OUT_DIR, exist_ok=True)
    else:
        OUT_DIR = f"../llm-ollama/five-shot-deberta/{model_name.replace(':', '-')}"
        os.makedirs(OUT_DIR, exist_ok=True)

    # get predictions (may take a while if MAX_EXAMPLES is None)
    # res = get_dev_predictions(model_name, df_dev, max_examples=MAX_EXAMPLES)
    res = get_dev_predictions(model_name, df_dev, deberta_preds, deberta_scores, max_examples=MAX_EXAMPLES, think=think)

    preds = res["predictions"]

    pred_file = os.path.join(OUT_DIR, "predictions.jsonl")
    # Only write successful predictions (skip None) so pred/ref sizes are aligned
    with open(pred_file, "w") as f:
        for p in preds:
            if p["prediction"] is None:
                # skip failed predictions (they are recorded in failed_ids)
                continue
            f.write(json.dumps({"id": p["id"], "prediction": p["prediction"]}) + "\n")

    # Save failed ids so they can be excluded from scoring
    failed_file = os.path.join(OUT_DIR, "failed_ids.jsonl")
    with open(failed_file, "w") as f:
        for fid in res.get("failed_ids", []):
            f.write(json.dumps({"id": fid}) + "\n")

    # Save timing info and per-item times
    timing_file = os.path.join(OUT_DIR, "timing.txt")
    with open(timing_file, "w") as f:
        f.write(f"total_time_sec: {res['total_time']:.4f}\n")
        f.write(f"avg_time_sec: {res['avg_time']:.4f}\n")
        f.write("per_item_times_sec:\n")
        for idx, t in res["per_item_times"]:
            f.write(f"{idx}: {t:.4f}\n")

    # Create ref.jsonl from df_dev (respect MAX_EXAMPLES) inside the model folder
    # Exclude any ids that failed JSON parsing so they won't be evaluated
    failed_set = set(res.get("failed_ids", []))
    ref_file = os.path.join(OUT_DIR, "ref.jsonl")
    with open(ref_file, "w") as f:
        for idx, row in df_dev.iterrows():
            if MAX_EXAMPLES is not None and int(idx) >= MAX_EXAMPLES:
                break
            if str(idx) in failed_set:
                # skip items that produced invalid JSON for this model
                continue
            f.write(json.dumps({"id": str(idx), "label": row["choices"]}) + "\n")

    print(f"Predictions saved to {pred_file}")
    print(f"Gold data saved to {ref_file}")
    print(f"Timing saved to {timing_file}")

    # Sanity check: warn if counts differ
    n_preds = sum(1 for _ in open(pred_file, "r"))
    n_refs = sum(1 for _ in open(ref_file, "r"))
    if n_preds != n_refs:
        print(f"Warning: #preds ({n_preds}) != #refs ({n_refs}). failed_ids_len={len(res.get('failed_ids', []))}")

    # If there is failed attemp rewrite all of the ids so that they are consecutive. This is needed for the scoring script
    if len(res.get('failed_ids', [])) > 0:
        # Rewrite pred_file with consecutive ids
        new_pred_file = os.path.join(OUT_DIR, "predictions_consecutive_ids.jsonl")
        with open(pred_file, "r") as fin, open(new_pred_file, "w") as fout:
            for new_id, line in enumerate(fin):
                obj = json.loads(line)
                obj["id"] = str(new_id)
                fout.write(json.dumps(obj) + "\n")
        pred_file = new_pred_file

        # Rewrite ref_file with consecutive ids
        new_ref_file = os.path.join(OUT_DIR, "ref_consecutive_ids.jsonl")
        with open(ref_file, "r") as fin, open(new_ref_file, "w") as fout:
            for new_id, line in enumerate(fin):
                obj = json.loads(line)
                obj["id"] = str(new_id)
                fout.write(json.dumps(obj) + "\n")
        ref_file = new_ref_file

    # Run scoring script for this model outputs
    res = subprocess.run(["python", "../score/scoring.py", ref_file, pred_file, os.path.join(OUT_DIR, "score.json")], capture_output=True, text=True)
    print(res.stdout)
    if res.stderr:
        print("Scoring STDERR:")
        print(res.stderr)

    # 
    subprocess.run(["ollama", "stop", model_name], check=False)
    gc.collect()


=== Running model: llama3.1 ===


100%|██████████| 588/588 [04:55<00:00,  1.99it/s]



Predictions saved to ../llm-ollama/five-shot-deberta/llama3.1/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/llama3.1/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/llama3.1/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/llama3.1/predictions.jsonl on ../llm-ollama/five-shot-deberta/llama3.1/ref.jsonl
----------
Spearman Correlation: 0.21327403858869737
Spearman p-Value: 1.7815208390395438e-07
----------
Accuracy: 0.6309523809523809 (371/588)
Results dumped into scores.json successfully.

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/llama3.1/predictions.jsonl on ../llm-ollama/five-shot-deberta/llama3.1/ref.jsonl
----------
Spearman Correlation: 0.21327403858869737
Spearman p-Value: 1.7815208390395438e-07
----------
Accuracy: 0.6309523809523809 (371/588)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h


=== Running model: olmo-3:7b-instruct ===


100%|██████████| 588/588 [04:01<00:00,  2.44it/s]



Predictions saved to ../llm-ollama/five-shot-deberta/olmo-3-7b-instruct/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/olmo-3-7b-instruct/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/olmo-3-7b-instruct/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/olmo-3-7b-instruct/predictions.jsonl on ../llm-ollama/five-shot-deberta/olmo-3-7b-instruct/ref.jsonl
----------
Spearman Correlation: 0.3893768282030252
Spearman p-Value: 1.0028660563744057e-22
----------
Accuracy: 0.6054421768707483 (356/588)
Results dumped into scores.json successfully.


=== Running model: olmo-3 ===


  2%|▏         | 2/100 [20:21<19:00:42, 698.40s/it]

Invalid JSON or missing/invalid score for item id: 1 content: 


100%|██████████| 100/100 [1:28:09<00:00, 52.90s/it]



Predictions saved to ../llm-ollama/five-shot-deberta/olmo-3-think/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/olmo-3-think/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/olmo-3-think/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/olmo-3-think/predictions_consecutive_ids.jsonl on ../llm-ollama/five-shot-deberta/olmo-3-think/ref_consecutive_ids.jsonl
----------
Spearman Correlation: 0.7032649628484955
Spearman p-Value: 4.819658776971543e-16
----------
Accuracy: 0.7373737373737373 (73/99)
Results dumped into scores.json successfully.


=== Running model: granite3.3 ===
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/olmo-3-think/predictions_consecutive_ids.jsonl on ../llm-ollama/five-shot-deberta/olmo-3-think/ref_consecutive_ids.jsonl
----------
Spearman Correlation: 0.7032649628484955
Spearman p-Value: 4.819658776971543e

100%|██████████| 588/588 [04:00<00:00,  2.44it/s][?25h



Predictions saved to ../llm-ollama/five-shot-deberta/granite3.3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/granite3.3/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/granite3.3/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/granite3.3/predictions.jsonl on ../llm-ollama/five-shot-deberta/granite3.3/ref.jsonl
----------
Spearman Correlation: 0.5341352167758447
Spearman p-Value: 1.1130518155277229e-44
----------
Accuracy: 0.6581632653061225 (387/588)
Results dumped into scores.json successfully.

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/granite3.3/predictions.jsonl on ../llm-ollama/five-shot-deberta/granite3.3/ref.jsonl
----------
Spearman Correlation: 0.5341352167758447
Spearman p-Value: 1.1130518155277229e-44
----------
Accuracy: 0.6581632653061225 (387/588)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h


=== Running model: ministral-3 ===


100%|██████████| 588/588 [05:09<00:00,  1.90it/s]



Predictions saved to ../llm-ollama/five-shot-deberta/ministral-3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/ministral-3/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/ministral-3/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/ministral-3/predictions.jsonl on ../llm-ollama/five-shot-deberta/ministral-3/ref.jsonl
----------
Spearman Correlation: 0.6031249998091451
Spearman p-Value: 1.5743713916968723e-59
----------
Accuracy: 0.5680272108843537 (334/588)
Results dumped into scores.json successfully.

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/ministral-3/predictions.jsonl on ../llm-ollama/five-shot-deberta/ministral-3/ref.jsonl
----------
Spearman Correlation: 0.6031249998091451
Spearman p-Value: 1.5743713916968723e-59
----------
Accuracy: 0.5680272108843537 (334/588)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h


=== Running model: qwen3 ===


100%|██████████| 588/588 [04:27<00:00,  2.20it/s]



Predictions saved to ../llm-ollama/five-shot-deberta/qwen3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/qwen3/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/qwen3/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/qwen3/predictions.jsonl on ../llm-ollama/five-shot-deberta/qwen3/ref.jsonl
----------
Spearman Correlation: 0.5964809802232212
Spearman p-Value: 6.109637912088627e-58
----------
Accuracy: 0.7023809523809523 (413/588)
Results dumped into scores.json successfully.


=== Running model: qwen2.5-coder ===


100%|██████████| 588/588 [04:56<00:00,  1.98it/s]



Predictions saved to ../llm-ollama/five-shot-deberta/qwen2.5-coder/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/qwen2.5-coder/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/qwen2.5-coder/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/qwen2.5-coder/predictions.jsonl on ../llm-ollama/five-shot-deberta/qwen2.5-coder/ref.jsonl
----------
Spearman Correlation: 0.5518309094830965
Spearman p-Value: 3.6661867213544433e-48
----------
Accuracy: 0.7517006802721088 (442/588)
Results dumped into scores.json successfully.


=== Running model: deepseek-r1 ===


 40%|████      | 40/100 [26:10<5:07:55, 307.92s/it]

Invalid JSON or missing/invalid score for item id: 39 content: 


 45%|████▌     | 45/100 [43:56<5:31:43, 361.88s/it]

Invalid JSON or missing/invalid score for item id: 44 content: 


100%|██████████| 100/100 [58:01<00:00, 34.81s/it]  



Predictions saved to ../llm-ollama/five-shot-deberta/deepseek-r1-think/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/deepseek-r1-think/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/deepseek-r1-think/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/deepseek-r1-think/predictions_consecutive_ids.jsonl on ../llm-ollama/five-shot-deberta/deepseek-r1-think/ref_consecutive_ids.jsonl
----------
Spearman Correlation: 0.726248416441722
Spearman p-Value: 2.622886724811437e-17
----------
Accuracy: 0.8163265306122449 (80/98)
Results dumped into scores.json successfully.


=== Running model: deepseek-r1 ===


100%|██████████| 588/588 [04:24<00:00,  2.22it/s]



Predictions saved to ../llm-ollama/five-shot-deberta/deepseek-r1/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/deepseek-r1/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/deepseek-r1/timing.txt


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/deepseek-r1/predictions.jsonl on ../llm-ollama/five-shot-deberta/deepseek-r1/ref.jsonl
----------
Spearman Correlation: 0.5653370548068162
Spearman p-Value: 5.791112711132222e-51
----------
Accuracy: 0.7210884353741497 (424/588)
Results dumped into scores.json successfully.


=== Running model: gemma3 ===


100%|██████████| 588/588 [07:48<00:00,  1.26it/s]



Predictions saved to ../llm-ollama/five-shot-deberta/gemma3/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/gemma3/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/gemma3/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/gemma3/predictions.jsonl on ../llm-ollama/five-shot-deberta/gemma3/ref.jsonl
----------
Spearman Correlation: 0.6062423136260214
Spearman p-Value: 2.7456753541621685e-60
----------
Accuracy: 0.6836734693877551 (402/588)
Results dumped into scores.json successfully.


=== Running model: phi4-mini ===
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/gemma3/predictions.jsonl on ../llm-ollama/five-shot-deberta/gemma3/ref.jsonl
----------
Spearman Correlation: 0.6062423136260214
Spearman p-Value: 2.7456753541621685e-60
----------
Accuracy: 0.6836734693877551 (402/588)
Results dumped into scores.json successfully.


=

100%|██████████| 588/588 [04:50<00:00,  2.02it/s]6h[?25l[1G[K[?25h[?2026l[2K[1G[?25h



Predictions saved to ../llm-ollama/five-shot-deberta/phi4-mini/predictions.jsonl
Gold data saved to ../llm-ollama/five-shot-deberta/phi4-mini/ref.jsonl
Timing saved to ../llm-ollama/five-shot-deberta/phi4-mini/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/phi4-mini/predictions.jsonl on ../llm-ollama/five-shot-deberta/phi4-mini/ref.jsonl
----------
Spearman Correlation: 0.4256846968440348
Spearman p-Value: 2.7886112233007414e-27
----------
Accuracy: 0.6904761904761905 (406/588)
Results dumped into scores.json successfully.

Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/five-shot-deberta/phi4-mini/predictions.jsonl on ../llm-ollama/five-shot-deberta/phi4-mini/ref.jsonl
----------
Spearman Correlation: 0.4256846968440348
Spearman p-Value: 2.7886112233007414e-27
----------
Accuracy: 0.6904761904761905 (406/588)
Results dumped into scores.json successfully.



[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h