In [None]:
# Minify the cleaned data
def minify_json_data(data):
    """Serialize `data` as the most compact JSON text.

    Items are separated by "," and keys from values by ":" with no padding
    spaces, and non-ASCII characters are written as-is rather than escaped.
    """
    compact_separators = (",", ":")
    encoder = json.JSONEncoder(separators=compact_separators, ensure_ascii=False)
    return encoder.encode(data)

# Create minified version.
# NOTE: `data_clean` is not created in this cell; it is the deep-copied,
# field-stripped dict built by the cleaning cell (copy.deepcopy + drop_fields),
# so that cell has to be executed before this one.
data_cleaned_minified = minify_json_data(data_clean)

# Save to file (UTF-8, because the JSON text may contain non-ASCII characters)
with open('data_cleaned_minified.json', 'w', encoding='utf-8') as f:
    f.write(data_cleaned_minified)

print("✓ data_cleaned_minified.json created!")
# len() of the JSON string is a character count, which is what the message reports
print(f"\nFile size: {len(data_cleaned_minified)} characters")
print(f"\nPreview (first 200 chars):\n{data_cleaned_minified[:200]}...")

✓ data_cleaned_minified.json created!

File size: 282211 characters

Preview (first 200 chars):
{"0":{"homonym":"track","judged_meaning":"a pair of parallel rails providing a runway for wheels","precontext":"The detectives arrived at the abandoned train station. They were looking for signs of th...


In [None]:
# Create solution.jsonl from original data
def create_solution_file(data, output_path='solution.jsonl'):
    """
    Write the labels found in `data` to a JSON Lines file.

    `data` maps sample ids to entries. For every entry that is a dict and has
    a 'choices' key, one line holding {"id": <sample id>, "label": <choices>}
    is written to `output_path`; all other entries are skipped silently.
    A confirmation message is printed once the file has been written.
    """
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for sample_id, entry in data.items():
            # Guard clause: only dict entries carrying 'choices' are exported
            if not (isinstance(entry, dict) and 'choices' in entry):
                continue
            record = {"id": sample_id, "label": entry['choices']}
            out_file.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"✓ {output_path} created!")

# Write solution.jsonl (the default output path) from the original data
create_solution_file(data)

# Read the file back and echo its first five lines as a sanity check
print("\nPreview of solution.jsonl:")
with open('solution.jsonl', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 5:
            break
        print(line.strip())
print("...")

✓ solution.jsonl created!

Preview of solution.jsonl:
{"id": "0", "label": [4, 5, 3, 1, 5]}
{"id": "1", "label": [3, 3, 4, 4, 4]}
{"id": "2", "label": [5, 5, 2, 3, 4]}
{"id": "3", "label": [4, 5, 4, 3, 5]}
{"id": "4", "label": [1, 5, 4, 4, 1]}
...


In [None]:
import json
import pandas as pd
import random

# ---- CONFIG ----
# Passed as `shuffle=` to expand_annotations: shuffles each sample's annotations.
SHUFFLE_LABELS = True  # Shuffle annotation order (order not important)
# Passed as `seed=` to expand_annotations, which seeds Python's `random` with it.
RANDOM_SEED = 42       # For reproducibility

def build_input(row):
    """Render one sample as the multi-line text given to the model.

    `row` only needs a dict-style `.get(key, default)`. Each field is written
    on its own "<Label>: <value>" line, a missing field is rendered as an
    empty string, and surrounding whitespace is stripped from the result.
    """
    labelled_fields = (
        ("Homonym", "homonym"),
        ("Judged meaning", "judged_meaning"),
        ("Context", "precontext"),
        ("Sentence", "sentence"),
        ("Ending", "ending"),
        ("Example meaning sentence", "example_sentence"),
    )
    lines = [f"{label}: {row.get(key, '')}" for label, key in labelled_fields]
    return "\n".join(lines).strip()

def expand_annotations(merged_df, shuffle=True, seed=42):
    """
    Create a training sample for EACH individual annotation.

    Args:
        merged_df: DataFrame with columns input_text and label. A label cell
            that is not a list is treated as a single annotation.
        shuffle: If True, shuffle annotation order within each sample
        seed: Seed passed to random.seed() for reproducibility. Every integer,
            including 0, is honoured; None leaves the RNG state untouched.

    Returns:
        List of dictionaries {"text": ..., "label": float}, one per valid
        annotation. Annotations that are not numbers in [1, 5] (booleans
        included) are dropped; rows left without a valid annotation are skipped.
    """
    # `if seed:` would silently ignore seed=0, so compare against None instead.
    if seed is not None:
        random.seed(seed)

    output_records = []

    for _, row in merged_df.iterrows():
        text = row['input_text']
        labels = row['label']

        # Accept a single scalar annotation as a one-element list
        if not isinstance(labels, list):
            labels = [labels]

        # Keep numeric ratings in 1..5 only. bool is a subclass of int, so it
        # is excluded explicitly: True must not be counted as a rating of 1.
        valid_labels = [
            value for value in labels
            if isinstance(value, (int, float))
            and not isinstance(value, bool)
            and 1 <= value <= 5
        ]

        if not valid_labels:
            continue

        # valid_labels is already a fresh list, so shuffling it in place
        # cannot modify the DataFrame's data.
        if shuffle:
            random.shuffle(valid_labels)

        # One record per annotation, all sharing the same text
        output_records.extend(
            {"text": text, "label": float(value)} for value in valid_labels
        )

    return output_records

# Load minified cleaned data.
# NOTE: this rebinds `data_cleaned_minified` to the parsed dict; the minify
# cell binds the same name to the JSON string.
with open('data_cleaned_minified.json', 'r', encoding='utf-8') as f:
    data_cleaned_minified = json.load(f)

dev_df = pd.DataFrame.from_dict(data_cleaned_minified, orient="index")
# Reset index to make sample_id a column
dev_df = dev_df.reset_index().rename(columns={'index': 'sample_id'})
print(f"✓ Loaded DEV set: {len(dev_df)} rows")

# Load solution data
sol_df = pd.read_json('solution.jsonl', lines=True)
sol_df = sol_df.rename(columns={"id": "sample_id"})
print(f"✓ Loaded solution set: {len(sol_df)} rows")

# Cast both join keys to str so they compare equal regardless of how each
# loader parsed them
dev_df["sample_id"] = dev_df["sample_id"].astype(str)
sol_df["sample_id"] = sol_df["sample_id"].astype(str)

# Merge datasets (inner join: samples without a solution row are dropped)
merged = dev_df.merge(sol_df, on="sample_id", how="inner")
print(f"✓ Merged dataset: {len(merged)} samples")

# Warn about lost samples
lost = len(dev_df) - len(merged)
if lost > 0:
    print(f"⚠ Warning: lost {lost} samples during merge ({lost/len(dev_df)*100:.1f}%)")

# Build text fields
merged["input_text"] = merged.apply(build_input, axis=1)

# Expand: create one sample per annotation
output_records = expand_annotations(
    merged,
    shuffle=SHUFFLE_LABELS,
    seed=RANDOM_SEED
)

print(f"✓ Expanded to {len(output_records)} training samples from {len(merged)} unique texts")
# Guard the average: an empty merge (e.g. no matching ids) used to raise
# ZeroDivisionError here and hide the real problem.
if len(merged) > 0:
    print(f"  → Average {len(output_records)/len(merged):.1f} annotations per sample")
else:
    print("⚠ Warning: merge produced no samples; nothing to average")

# Save output (one JSON object per line)
with open('train_llm_regression.jsonl', "w", encoding="utf-8") as f:
    for record in output_records:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✓ Saved {len(output_records)} samples → train_llm_regression.jsonl")

# Display preview
print("\nPreview of train_llm_regression.jsonl:")
with open('train_llm_regression.jsonl', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < 3:  # Show first 3 lines
            print(json.loads(line))
        else:
            break

✓ Loaded DEV set: 588 rows
✓ Loaded solution set: 588 rows
✓ Merged dataset: 588 samples
✓ Expanded to 2952 training samples from 588 unique texts
  → Average 5.0 annotations per sample
✓ Saved 2952 samples → train_llm_regression.jsonl

Preview of train_llm_regression.jsonl:
{'text': 'Homonym: track\nJudged meaning: a pair of parallel rails providing a runway for wheels\nContext: The detectives arrived at the abandoned train station. They were looking for signs of the missing artifact. A faint trail caught their attention.\nSentence: They followed the track.\nEnding: They began to run along the abandoned railway line, hopping from wooden sleeper to sleeper to avoid twisting an ankle.\nExample meaning sentence: The train glided smoothly along the track.', 'label': 1.0}
{'text': 'Homonym: track\nJudged meaning: a pair of parallel rails providing a runway for wheels\nContext: The detectives arrived at the abandoned train station. They were looking for signs of the missing artifact. A fain

In [None]:
from google.colab import files
uploaded = files.upload()

import json

# Load the data.
# files.upload() returns a dict mapping each uploaded file's original name to
# its bytes. When a dev.json already exists on disk, Colab stores the new
# upload under a de-duplicated name (e.g. "dev (1).json"), so reading
# 'dev.json' from disk would silently use the OLD file. Prefer the freshly
# uploaded bytes and fall back to the file on disk only when no file named
# dev.json was uploaded.
if 'dev.json' in uploaded:
    # json.loads accepts UTF-8 encoded bytes directly
    data = json.loads(uploaded['dev.json'])
else:
    with open('dev.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

print("Original data:")
print(json.dumps(data, indent=2, ensure_ascii=False)[:500])  # Show preview

# Function to clean the data
def drop_fields(entries, fields):
    """Delete the given keys from every dict entry, in place.

    `entries` is a mapping whose values are inspected one by one; values that
    are not dicts are left untouched, and keys absent from an entry are
    ignored. Nothing is returned.
    """
    unwanted = set(fields)

    for record in entries.values():
        if isinstance(record, dict):
            # The intersection is a new set, so deleting while looping is safe
            for key in unwanted.intersection(record):
                del record[key]

# List of fields to remove
fields_to_remove = ["choices", "average", "stdev", "nonsensical", "sample_id"]

# Create cleaned data (deep copy, because drop_fields mutates its argument
# and `data` must keep its 'choices' for the solution file)
import copy
data_clean = copy.deepcopy(data)

# Clean the data
drop_fields(data_clean, fields_to_remove)

print("\n" + "="*50)
print("Cleaned data:")
# Preview only: printing the whole cleaned dataset floods the cell output, so
# cap it at 500 characters like the "Original data" preview.
print(json.dumps(data_clean, indent=4, ensure_ascii=False)[:500])  # Show preview

print(f"\n✓ data_clean is ready to use!")

Saving dev.json to dev (1).json
Original data:
{
  "0": {
    "homonym": "track",
    "judged_meaning": "a pair of parallel rails providing a runway for wheels",
    "precontext": "The detectives arrived at the abandoned train station. They were looking for signs of the missing artifact. A faint trail caught their attention.",
    "sentence": "They followed the track.",
    "ending": "They began to run along the abandoned railway line, hopping from wooden sleeper to sleeper to avoid twisting an ankle.",
    "choices": [
      4,
      5,
   

Cleaned data:
{
    "0": {
        "homonym": "track",
        "judged_meaning": "a pair of parallel rails providing a runway for wheels",
        "precontext": "The detectives arrived at the abandoned train station. They were looking for signs of the missing artifact. A faint trail caught their attention.",
        "sentence": "They followed the track.",
        "ending": "They began to run along the abandoned railway line, hopping from wooden sl

In [None]:
# Install required package
!pip install -q bitsandbytes accelerate

import json
import os
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import AutoTokenizer, AutoModel
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error


# -------------------------
# Dataset
# -------------------------
class JsonlRegressionDataset(Dataset):
    """In-memory regression dataset read from a JSON Lines file.

    Each non-blank line must be a JSON object with a "text" field and a
    "label" field; the label is converted to float. Items are returned as
    {"text": ..., "label": float} dicts, in file order.
    """

    def __init__(self, jsonl_path):
        """Read every record of `jsonl_path` (UTF-8) into `self.samples`."""
        self.samples = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Tolerate blank lines (e.g. extra newlines at the end of the
                # file): json.loads("") would raise JSONDecodeError.
                if not line:
                    continue
                record = json.loads(line)
                self.samples.append({"text": record["text"], "label": float(record["label"])})

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the record dict stored at position `idx`."""
        return self.samples[idx]


# -------------------------
# Regressor head
# -------------------------
class Regressor(nn.Module):
    """Three-layer MLP head mapping a `dim`-sized vector to one score.

    Layout: Linear(dim, 512) -> GELU -> Dropout(0.3) -> Linear(512, 128)
    -> GELU -> Dropout(0.3) -> Linear(128, 1). The trailing size-1 axis of
    the output is squeezed away in `forward`.
    """

    def __init__(self, dim):
        super().__init__()
        # Layers are created in forward order and stored under `net`
        layers = [
            nn.Linear(dim, 512),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        scores = self.net(x)
        return scores.squeeze(-1)


# -------------------------
# Mean pooling
# -------------------------
def mean_pooling(last_hidden_state, attention_mask):
    """Mask-weighted mean of `last_hidden_state` along dim 1.

    `attention_mask` gets a trailing axis and is cast to float, multiplied
    into the hidden states, and the result is summed over dim 1 and divided
    by the mask total (clamped to at least 1e-9 to avoid division by zero).
    Expected shapes are (batch, seq, dim) and (batch, seq) — TODO confirm
    against the embedder's output.
    """
    weights = attention_mask.unsqueeze(-1).float()
    weighted_sum = (last_hidden_state * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / weight_total


# -------------------------
# Evaluation
# -------------------------
@torch.no_grad()
def evaluate(embedder, regressor, loader, device):
    """Score the regressor on every batch of `loader`.

    Both modules are put in eval mode. Each batch is embedded, mean-pooled
    with its attention mask and passed through the regressor; predictions and
    labels are collected over all batches.

    Returns:
        (mse, spearman): mean squared error and Spearman correlation between
        the collected labels and predictions.
    """
    embedder.eval()
    regressor.eval()

    predictions, targets = [], []
    for batch in loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        gold = batch["labels"].to(device)

        hidden = embedder(input_ids=ids, attention_mask=mask).last_hidden_state
        scores = regressor(mean_pooling(hidden, mask))

        predictions += scores.cpu().numpy().tolist()
        targets += gold.cpu().numpy().tolist()

    return (
        mean_squared_error(targets, predictions),
        spearmanr(targets, predictions).correlation,
    )


# -------------------------
# Configuration
# -------------------------
TRAIN_JSONL = "train_llm_regression.jsonl"   # one {"text", "label"} record per line
MODEL_NAME = "unsloth/llama-3.1-8b-instruct-bnb-4bit"
OUTPUT_DIR = "models/llama_regressor_unsloth"  # best checkpoint + meta.json go here
BATCH_SIZE = 28
EPOCHS = 4
LR = 3e-4          # learning rate of the regressor's AdamW optimizer
MAX_LENGTH = 512   # tokenizer truncation length

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fail fast: without this check a missing training file is only noticed after
# the multi-gigabyte model has been downloaded and loaded.
if not os.path.isfile(TRAIN_JSONL):
    raise FileNotFoundError(
        f"Training file '{TRAIN_JSONL}' not found; run the data-preparation "
        "cell that writes it before starting training."
    )

# -------------------------
# Load tokenizer and model
# -------------------------
print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# collate_fn tokenizes with padding=True, which needs a pad token; reuse EOS
# when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model (already 4-bit quantized)
# NOTE(review): trust_remote_code=True presumably permits repo-supplied Python
# code to run during loading — confirm it is actually required for MODEL_NAME.
embedder = AutoModel.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True
)
# The backbone serves purely as a frozen feature extractor: switch it to eval
# mode and disable gradients so that only the regressor head is trained.
embedder.eval()
for p in embedder.parameters():
    p.requires_grad = False

# -------------------------
# Dataset preparation
# -------------------------
print("Loading dataset...")
dataset = JsonlRegressionDataset(TRAIN_JSONL)

VAL_FRACTION = 0.1   # share of unique texts held out for validation
SPLIT_SEED = 42      # fixed seed so the split is reproducible

# The training file holds one record per annotation, so the same text occurs
# several times. A per-sample random split would put copies of one text into
# both train and validation and inflate the validation metrics, so split by
# unique text instead and keep all annotations of a text on the same side.
indices_by_text = {}
for sample_idx, sample_record in enumerate(dataset.samples):
    indices_by_text.setdefault(sample_record["text"], []).append(sample_idx)

unique_texts = list(indices_by_text)
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
text_order = torch.randperm(len(unique_texts), generator=split_generator).tolist()
num_val_texts = int(VAL_FRACTION * len(unique_texts))

val_indices = sorted(
    sample_idx
    for text_idx in text_order[:num_val_texts]
    for sample_idx in indices_by_text[unique_texts[text_idx]]
)
train_indices = sorted(
    sample_idx
    for text_idx in text_order[num_val_texts:]
    for sample_idx in indices_by_text[unique_texts[text_idx]]
)

val_size = len(val_indices)
# Subset is the same wrapper type random_split returns, so the DataLoaders
# built from train_ds / val_ds work unchanged.
train_ds = torch.utils.data.Subset(dataset, train_indices)
val_ds = torch.utils.data.Subset(dataset, val_indices)

def collate_fn(batch):
    """Turn a list of {"text", "label"} samples into one model-ready batch.

    The texts are tokenized together with the module-level `tokenizer`
    (truncated to MAX_LENGTH, padded, returned as PyTorch tensors) and the
    labels are stacked into a float32 tensor.

    Returns:
        dict with "input_ids", "attention_mask" and "labels".
    """
    batch_texts = [item["text"] for item in batch]
    batch_labels = [item["label"] for item in batch]

    encoded = tokenizer(
        batch_texts,
        truncation=True,
        max_length=MAX_LENGTH,
        padding=True,
        return_tensors="pt",
    )

    collated = {name: encoded[name] for name in ("input_ids", "attention_mask")}
    collated["labels"] = torch.tensor(batch_labels, dtype=torch.float32)
    return collated

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# -------------------------
# Get embedding dimension
# -------------------------
# One collated batch; only fed through the model in the fallback path below.
sample = next(iter(train_loader))

# Read the hidden size from the model config when it is available. This avoids
# pushing a whole batch through the large backbone just to learn one integer.
dim = getattr(getattr(embedder, "config", None), "hidden_size", None)
if dim is None:
    # Fallback: measure the last axis of the hidden states of a real batch.
    with torch.no_grad():
        dim = embedder(**{k: v.to(embedder.device) for k, v in sample.items() if k != "labels"}).last_hidden_state.shape[-1]

# -------------------------
# Initialize regressor
# -------------------------
# Device used for the regressor head and for the batches moved in the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
regressor = Regressor(dim).to(device)
# Only the regressor's parameters are handed to the optimizer
optimizer = torch.optim.AdamW(regressor.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# Starting value for the best validation Spearman; the training loop replaces
# it only when an epoch scores strictly higher.
best_spearman = -1.0
print(f"Training on {len(train_ds)} samples | Val: {len(val_ds)} | Embedding dim: {dim}")

# -------------------------
# Training loop
# -------------------------
# Train the regressor head for EPOCHS epochs, validating after each one and
# keeping the checkpoint with the highest validation Spearman correlation.
for epoch in range(1, EPOCHS + 1):
    regressor.train()  # evaluate() switches the regressor to eval mode, so re-enable training mode
    losses = []
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Remove the targets so the remaining keys can be splatted into the embedder
        labels = batch.pop("labels")

        # The embedder's parameters do not require gradients, so its forward
        # pass is run without building an autograd graph.
        with torch.no_grad():
            out = embedder(**batch)
            embeds = mean_pooling(out.last_hidden_state, batch["attention_mask"])

        preds = regressor(embeds)
        loss = loss_fn(preds, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

    val_mse, val_spearman = evaluate(embedder, regressor, val_loader, device)
    print(f"Epoch {epoch} | Loss: {np.mean(losses):.4f} | Val MSE: {val_mse:.4f} | Spearman: {val_spearman:.4f}")

    # Checkpoint only on a strict improvement of the validation Spearman. A NaN
    # score compares False here, so such an epoch is never saved.
    if val_spearman > best_spearman:
        best_spearman = val_spearman
        torch.save(regressor.state_dict(), os.path.join(OUTPUT_DIR, "regressor_best.pt"))
        # meta.json records which backbone and score the saved head belongs to
        with open(os.path.join(OUTPUT_DIR, "meta.json"), "w") as f:
            json.dump({"model_name": MODEL_NAME, "spearman": best_spearman}, f)
        print(f"✓ NEW BEST → Spearman = {best_spearman:.4f}")

print(f"\n✓ Training complete! Best Spearman: {best_spearman:.4f}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hLoading tokenizer and model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: 'train_llm_regression.jsonl'