In [1]:
# %% [markdown] ----------------------------------------------------------
# # Task 5 – CVSS Regressor
# Uses the frozen CodeBERT encoder + a 2-layer MLP head to predict a
# CVSS v3 score (0–10).  Expected runtime: <15 min on RTX-3060.

# %% [code] 0 Imports & paths
import pathlib, torch, time, json, evaluate
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from torch import nn
from datasets import load_dataset, Value
from src.train_classifier_utils import seed_everything          # same seed helper
from src.reg_utils import regression_metrics
import math # new helper
import sklearn.metrics as sk

TRAIN_JSONL = "../data/splits/train.jsonl"
VALID_JSONL = "../data/splits/valid.jsonl"
OUT_DIR     = pathlib.Path("../models/cvss_regressor")
OUT_DIR.mkdir(parents=True, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
seed_everything(42)
print("Device:", DEVICE)

# %% [code] 1 Load splits & tokenise
tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")
MAX_LEN = 512

def tokenize(batch):
    return tok(batch["Function before"],
               truncation=True, padding="longest", max_length=MAX_LEN)

ds = load_dataset("json",
                  data_files={"train": TRAIN_JSONL,
                              "validation": VALID_JSONL})
ds = ds.filter(lambda x: x["cvss"] is not None and not math.isnan(x["cvss"]))
ds = ds.map(tokenize, batched=True, remove_columns=["Function before"])

ds = ds.rename_column("cvss", "labels")      # Trainer now sees 'labels'
ds = ds.cast_column("labels", Value("float32"))     # regression needs float
ds.set_format(type="torch")

# %% [code] 2 Build the frozen encoder + MLP head
class CodeBERTRegressor(nn.Module):
    def __init__(self, base_model_name="microsoft/codebert-base"):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)   # 768-d CLS
        for p in self.encoder.parameters():          # freeze
            p.requires_grad = False
        self.mlp = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        cls = self.encoder(input_ids=input_ids,
                           attention_mask=attention_mask).last_hidden_state[:, 0, :]
        pred = self.mlp(cls).squeeze(-1)             # (batch,)
        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(pred, labels.float())
            return {"loss": loss, "logits": pred}
        return {"logits": pred}

model = CodeBERTRegressor().to(DEVICE)
print("Trainable parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

# %% [code] 3 TrainingArguments
args = TrainingArguments(
    output_dir=str(OUT_DIR),
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-4,
    fp16=(DEVICE == "cuda"),
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_mae",
    greater_is_better=False,            # lower MAE is better
    logging_steps=50,
    seed=42
)

# %% [code] 4 HF Trainer + custom metrics
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = preds.flatten()
    labels = labels.flatten()

    mae  = sk.mean_absolute_error(labels, preds)
    mse  = sk.mean_squared_error(labels, preds)   # always exists
    rmse = math.sqrt(mse)
    r2   = sk.r2_score(labels, preds)

    return {"mae": mae, "rmse": rmse, "r2": r2}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics,
)

t0 = time.time()
trainer.train()
print(f"⏱ Finished in {(time.time()-t0)/60:.1f} min")

# %% [code] 5 Save artefacts
trainer.save_model(str(OUT_DIR))
metrics = trainer.evaluate(ds["validation"])
with open(OUT_DIR / "metrics.json", "w") as w:
    json.dump(metrics, w, indent=2)
metrics


  from .autonotebook import tqdm as notebook_tqdm


Device: cpu


Generating train split: 563 examples [00:00, 16082.28 examples/s]
Generating validation split: 70 examples [00:00, 4999.00 examples/s]
Filter: 100%|██████████| 563/563 [00:00<00:00, 13089.40 examples/s]
Filter: 100%|██████████| 70/70 [00:00<00:00, 1941.82 examples/s]
Map: 100%|██████████| 282/282 [00:00<00:00, 1336.20 examples/s]
Map: 100%|██████████| 35/35 [00:00<00:00, 538.69 examples/s]
Casting the dataset: 100%|██████████| 282/282 [00:00<00:00, 11053.32 examples/s]
Casting the dataset: 100%|██████████| 35/35 [00:00<00:00, 1166.33 examples/s]


Trainable parameters: 197121




Epoch,Training Loss,Validation Loss,Mae,Rmse,R2
1,No log,2.674279,1.37588,1.635322,-2.494392
2,No log,1.481547,1.214155,1.217188,-0.935889
3,7.166000,1.462202,1.205687,1.209215,-0.910611




⏱ Finished in 21.2 min


SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })