In [1]:
from langchain.messages import HumanMessage, SystemMessage
from langchain_core.messages import ChatMessage
from langchain_ollama import ChatOllama
from langchain.agents import create_agent
from pydantic import BaseModel, Field
import subprocess
import gc
import pandas as pd
import json
from icecream import ic
import os
from tqdm import tqdm
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ones installed on my pc
model_names = [
    "llama3.1:latest",
    "olmo-3:7b-instruct",
    "olmo-3:latest",
    "granite3.3:latest",
    "ministral-3:latest",
    "qwen3:latest",   
    "deepseek-r1:latest",
    "gemma3:latest",
]

In [3]:
llm = ChatOllama(
    model=model_names[0],
    temperature=0,
    # other params...
)
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
print(ai_msg)

subprocess.run(["ollama", "stop", model_names[0]], check=False)
gc.collect()

content='The translation of "I love programming" in French is:\n\n"J\'adore le programmation."' additional_kwargs={} response_metadata={'model': 'llama3.1:latest', 'created_at': '2025-12-21T13:41:59.532371463Z', 'done': True, 'done_reason': 'stop', 'total_duration': 638740622, 'load_duration': 123912538, 'prompt_eval_count': 35, 'prompt_eval_duration': 42621121, 'eval_count': 22, 'eval_duration': 446041227, 'logprobs': None, 'model_name': 'llama3.1:latest', 'model_provider': 'ollama'} id='lc_run--019b4125-3feb-7432-b8b5-00a8bef54c1f-0' usage_metadata={'input_tokens': 35, 'output_tokens': 22, 'total_tokens': 57}


[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

45

In [4]:
ic(ai_msg)
ic(ai_msg.response_metadata)

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mai_msg[39m[38;5;245m:[39m[38;5;245m [39m[38;5;247mAIMessage[39m[38;5;245m([39m[38;5;247mcontent[39m[38;5;245m=[39m[38;5;36m'[39m[38;5;36mThe translation of [39m[38;5;36m"[39m[38;5;36mI love programming[39m[38;5;36m"[39m[38;5;36m in French is:[39m
[38;5;245m            [39m
[38;5;245m            [39m[38;5;36m"[39m[38;5;36mJ[39m[38;5;166m\'[39m[38;5;36madore le programmation.[39m[38;5;36m"[39m[38;5;36m'[39m[38;5;36m, additional_kwargs=[39m[38;5;166m{}[39m[38;5;36m, response_metadata=[39m[38;5;36m{[39m[38;5;36m'[39m[38;5;247mmodel[39m[38;5;36m'[39m[38;5;36m: [39m[38;5;36m'[39m[38;5;247mllama3[39m[38;5;36m.1[39m[38;5;245m:[39m[38;5;247mlatest[39m[38;5;36m'[39m[38;5;36m, [39m[38;5;36m'[39m[38;5;247mcreated_at[39m[38;5;36m'[39m[38;5;36m: [39m[38;5;36m'[39m[38;5;36m2025[39m[38;5;245m-[39m[38;5;36m12[39m[38;5;245m-[39m[38;5;36m21[39m[38;5;24

{'model': 'llama3.1:latest',
 'created_at': '2025-12-21T13:41:59.532371463Z',
 'done': True,
 'done_reason': 'stop',
 'total_duration': 638740622,
 'load_duration': 123912538,
 'prompt_eval_count': 35,
 'prompt_eval_duration': 42621121,
 'eval_count': 22,
 'eval_duration': 446041227,
 'logprobs': None,
 'model_name': 'llama3.1:latest',
 'model_provider': 'ollama'}

In [5]:
SYSTEM_PROMPT = (
    """
    You are an expert NLU annotator. Your job is to rate how plausible a candidate meaning (sense)
    is for the HOMONYM used in the target sentence within the short story.

    Return ONLY a single JSON object with one key: "score" and an integer value 1, 2, 3, 4 or 5.
    Integer mapping:
      1 = Definitely not
      2 = Probably not
      3 = Ambiguous / Unsure
      4 = Probably yes
      5 = Definitely yes

    The response must be a JSON object and nothing else, for example: {{"score": 4}}
    """
)

USER_PROMPT = (
    """
    [STORY]
    {full_story_text}

    [HOMONYM]
    {homonym}

    [CANDIDATE SENSE]
    {sense_text}

    [TASK]
    Based on the STORY above, decide how plausible it is that the HOMONYM is used with the
    CANDIDATE SENSE in the target sentence.

    Return ONLY a single JSON object with one key "score" and an integer value (1-5)
    as described by the system message. Example output: {{"score": 3}}
    """
)


def create_full_story_text(item):
    """Compose the story text used as context for rating.

    Uses `precontext`, `sentence`, and `ending` fields when available and joins them into a single string.
    """
    fullstory = f"{item.get('precontext', '')} {item.get('sentence', '')} {item.get('ending', '')}"
    return fullstory.strip()


def create_message(item):
    sense = f"{item.get('judged_meaning', '')} as in {item.get('example_sentence', '')}".strip()
    homonym = item.get("homonym", "")
    full_story_text = create_full_story_text(item)
    system_message = SystemMessage(content=SYSTEM_PROMPT)
    human_message = HumanMessage(
        content=USER_PROMPT.format(
            full_story_text=full_story_text,
            homonym=homonym,
            sense_text=sense,
        )
    )
    # Return langchain Message objects for use with chat models
    return [system_message, human_message]


def create_agent_input(item):
    """Return the dict input shape expected by `create_agent`.

    The agent expects an input dict with a `messages` key containing a list of
    dicts with `role` and `content` fields (e.g., {"messages": [{"role":"user","content":"..."}]}).
    """
    sys_msg, human_msg = create_message(item)
    messages = [
        {"role": "system", "content": sys_msg.content},
        {"role": "user", "content": human_msg.content},
    ]
    return {"messages": messages}

In [6]:
TRAIN_JSON_FILE = "../data/train.json"
DEV_JSON_FILE = "../data/dev.json"

def load_data(file_path):
    """
    Loads the json containing the dataset and return a pandas dataframe.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)
    # Transpose because the json is {id: {features...}, ...}
    df = pd.DataFrame(data).T
    # Ensure 'average' is float
    df['average'] = df['average'].astype(float)
    # Ensure 'choices' is list (for scoring later)
    return df

df_train = load_data(TRAIN_JSON_FILE)
df_dev = load_data(DEV_JSON_FILE)

In [7]:
class Score(BaseModel):
    score: int = Field(
        description="Rating of the candidate sense for the homonym in the story, from 1 to 5."
    )

# Create agent with the LLM and output parser
llm = ChatOllama(
    model=model_names[0],
    temperature=0,
    # other params...
)

agent = create_agent(
    model=llm,
    response_format=Score
)

In [8]:
# Random element in the dataset
item = df_train.sample(1).iloc[0].to_dict()

# Build the agent input in the expected dict format
input_data = create_agent_input(item)

# Invoke the agent with the correctly shaped input (dict with 'messages')
ai_msg = agent.invoke(input_data)

# Extract structured response if available
structured = None
structured = ai_msg.get("structured_response")
ic(structured)
ic(item["average"])
ic(item["choices"])
ic(create_message(item))
# ic(ai_msg["messages"])
pass

[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mstructured[39m[38;5;245m:[39m[38;5;245m [39m[38;5;247mScore[39m[38;5;245m([39m[38;5;247mscore[39m[38;5;245m=[39m[38;5;36m2[39m[38;5;245m)[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mitem[39m[38;5;245m[[39m[38;5;36m"[39m[38;5;36maverage[39m[38;5;36m"[39m[38;5;245m][39m[38;5;245m:[39m[38;5;245m [39m[38;5;36m5.0[39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mitem[39m[38;5;245m[[39m[38;5;36m"[39m[38;5;36mchoices[39m[38;5;36m"[39m[38;5;245m][39m[38;5;245m:[39m[38;5;245m [39m[38;5;245m[[39m[38;5;36m5[39m[38;5;245m,[39m[38;5;245m [39m[38;5;36m5[39m[38;5;245m,[39m[38;5;245m [39m[38;5;36m5[39m[38;5;245m,[39m[38;5;245m [39m[38;5;36m5[39m[38;5;245m,[39m[38;5;245m [39m[38;5;36m5[39m[38;5;245m][39m
[38;5;247mic[39m[38;5;245m|[39m[38;5;245m [39m[38;5;247mcreate_message[39m[38;5;245m([39m[38;5;247mitem[39m

In [9]:
def get_dev_predictions(agent, df, max_examples=None):
    """Run the agent on up to `max_examples` items from df and return a list of dicts.

    Returns list of {"id": str(id), "prediction": int, "time": float}
    """
    preds = []
    ids = list(df.index.astype(str))
    if max_examples is not None:
        ids = ids[:max_examples]

    total_start = time.perf_counter()
    per_item_times = []

    for idx in tqdm(ids):
        item = df.loc[idx].to_dict()
        input_data = create_agent_input(item)

        start = time.perf_counter()
        ai_msg = agent.invoke(input_data)
        elapsed = time.perf_counter() - start

        per_item_times.append((idx, elapsed))

        # extract structured response if available
        structured = ai_msg.get("structured_response")
        try:
            pred = int(structured.score)
        except Exception:
            print("Error parsing structured response:", structured, "for item id:", idx)
            pred = None

        preds.append({"id": str(idx), "prediction": pred, "time": elapsed})

    total_elapsed = time.perf_counter() - total_start
    # attach summary timings as metadata
    return {
        "predictions": preds,
        "total_time": total_elapsed,
        "per_item_times": per_item_times,
        "avg_time": sum(t for _, t in per_item_times) / len(per_item_times) if per_item_times else 0,
    }

In [10]:
# Smoke test: run on 10 dev items and save predictions + ref, then call scoring.py
OUT_DIR = f"""../llm-ollama/{model_names[0].replace(":", "-")}"""
os.makedirs(OUT_DIR, exist_ok=True)

MAX_EXAMPLES = None

res = get_dev_predictions(agent, df_dev, max_examples=MAX_EXAMPLES)

preds = res["predictions"]

pred_file = os.path.join(OUT_DIR, "predictions.jsonl")
with open(pred_file, "w") as f:
    for p in preds:
        f.write(json.dumps({"id": p["id"], "prediction": p["prediction"]}) + "\n")

# Save timing info
timing_file = os.path.join(OUT_DIR, "timing.txt")
with open(timing_file, "w") as f:
    f.write(f"total_time_sec: {res['total_time']:.4f}\n")
    f.write(f"avg_time_sec: {res['avg_time']:.4f}\n")
    f.write("per_item_times_sec:\n")
    for idx, t in res["per_item_times"]:
        f.write(f"{idx}: {t:.4f}\n")

# Create ref.jsonl from df_dev
ref_file = os.path.join(OUT_DIR, "ref.jsonl")
with open(ref_file, "w") as f:
    for idx, row in df_dev.iterrows():
        if MAX_EXAMPLES is not None and int(idx) >= MAX_EXAMPLES:
            break
        f.write(json.dumps({"id": str(idx), "label": row["choices"]}) + "\n")

print(f"Predictions saved to {pred_file}")
print(f"Gold data saved to {ref_file}")
print(f"Timing saved to {timing_file}")

# Run scoring script
res = subprocess.run(["python", "../score/scoring.py", ref_file, pred_file, os.path.join(OUT_DIR, "score.json")], capture_output=True, text=True)
print(res.stdout)
print(res.stderr)

100%|██████████| 588/588 [05:05<00:00,  1.92it/s]



Predictions saved to ../llm-ollama/llama3.1-latest/predictions.jsonl
Gold data saved to ../llm-ollama/llama3.1-latest/ref.jsonl
Timing saved to ../llm-ollama/llama3.1-latest/timing.txt
Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/llama3.1-latest/predictions.jsonl on ../llm-ollama/llama3.1-latest/ref.jsonl
----------
Spearman Correlation: 0.37265048792920324
Spearman p-Value: 8.26821845294649e-21
----------
Accuracy: 0.6717687074829932 (395/588)
Results dumped into scores.json successfully.


Importing...
Starting Scoring script...
Everything looks OK. Evaluating file ../llm-ollama/llama3.1-latest/predictions.jsonl on ../llm-ollama/llama3.1-latest/ref.jsonl
----------
Spearman Correlation: 0.37265048792920324
Spearman p-Value: 8.26821845294649e-21
----------
Accuracy: 0.6717687074829932 (395/588)
Results dumped into scores.json successfully.


