In [2]:
# %% [markdown] -------------------------------------------------------------
# # Task 4 – Fine-tune binary vulnerability classifier
# • Base model : microsoft/codebert-base (110 M)
# • Adapter   : LoRA  (rank = 8)
# • Output    : models/codebert_mini_lora/  +  metrics.json
# Expected run-time on RTX-3060: ≈ 45 min (5 epochs, batch 8, fp16)

# %% [code] ▸ 0 Imports & paths
import pathlib, time, json, torch, evaluate
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from peft import LoraConfig, get_peft_model
from src.train_classifier_utils import seed_everything, load_splits

TRAIN_JSONL = "../data/splits/train.jsonl"
VALID_JSONL = "../data/splits/valid.jsonl"
OUT_DIR     = pathlib.Path("../models/codebert_mini_lora")
OUT_DIR.mkdir(parents=True, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
seed_everything(42)
print("Device:", DEVICE)

# %% [code] ▸ 1 Load splits
ds = load_splits(TRAIN_JSONL, VALID_JSONL)
print(ds)

# %% [code] ▸ 2 Tokeniser & map
tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")
MAX_LEN = 512

def tokenize(batch):
    return tok(batch["Function before"],
               truncation=True, max_length=MAX_LEN, padding="max_length")

ds = ds.map(tokenize, batched=True, remove_columns=["Function before"])
ds = ds.rename_column("is_vuln", "labels")
ds.set_format(type="torch")

# %% [code] ▸ 3 Base model + LoRA adapter
base = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=2
)

lora_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["query","value"],  # attention layers
    bias="none", task_type="SEQ_CLS"
)
model = get_peft_model(base, lora_cfg).to(DEVICE)
model.print_trainable_parameters()    # sanity: ~0.5 M params trainable

# %% [code] ▸ 4 Metric callback
sk_metrics = evaluate.combine(["precision", "recall", "f1"])
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return sk_metrics.compute(predictions=preds, references=labels)

# be sure to import evaluate earlier:
#   import evaluate
#   sk_metrics = evaluate.combine(["precision", "recall", "f1"])

# %% [code] ▸ 5 HF Trainer setup
args = TrainingArguments(
    output_dir=str(OUT_DIR),
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    warmup_ratio=0.05,
    fp16=(DEVICE=="cuda"),
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42
)

# data_collator = DataCollatorWithPadding(tokenizer=tok, padding="longest")

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics
)

t0 = time.time()
trainer.train()
print(f"⏱ Finished in {(time.time()-t0)/60:.1f} min")

# %% [code] ▸ 6 Save weights & metrics
trainer.save_model(str(OUT_DIR))            # saves only LoRA adapter
metrics = trainer.evaluate(ds["validation"])
with open(OUT_DIR / "metrics.json", "w") as w:
    json.dump(metrics, w, indent=2)
metrics


Device: cpu


Generating train split: 160 examples [00:00, 11211.16 examples/s]
Generating validation split: 20 examples [00:00, 3377.06 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Function before', 'is_vuln'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['Function before', 'is_vuln'],
        num_rows: 20
    })
})


Map: 100%|██████████| 160/160 [00:00<00:00, 1673.17 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 777.75 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.703443,0.0,0.0,0.0
2,No log,0.706722,0.0,0.0,0.0
3,0.732200,0.77943,0.5,1.0,0.666667
4,0.732200,0.710156,0.0,0.0,0.0
5,0.721500,0.699326,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


⏱ Finished in 24.0 min




{'eval_loss': 0.7794302105903625,
 'eval_precision': 0.5,
 'eval_recall': 1.0,
 'eval_f1': 0.6666666666666666,
 'eval_runtime': 6.4945,
 'eval_samples_per_second': 3.08,
 'eval_steps_per_second': 0.462,
 'epoch': 5.0}

In [None]:
from transformers import TrainingArguments
TrainingArguments(
    output_dir="tmp",
    eval_strategy="epoch",
    logging_steps=10,
    per_device_train_batch_size=2,
    num_train_epochs=1
)
