In [1]:
 # %% [markdown] -------------------------------------------------------------
# # Task 4 – Fine-tune binary vulnerability classifier
# • Base model : microsoft/codebert-base (110 M)
# • Adapter   : LoRA  (rank = 8)
# • Output    : models/codebert_mini_lora/  +  metrics.json
# Expected run-time on RTX-3060: ≈ 45 min (5 epochs, batch 8, fp16)

# %% [code] ▸ 0 Imports & paths
import pathlib, time, json, torch, evaluate
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)
from peft import LoraConfig, get_peft_model
from src.train_classifier_utils import seed_everything, load_splits
from transformers import DataCollatorWithPadding
from torch.nn import CrossEntropyLoss



TRAIN_JSONL = "../data/splits/train.jsonl"
VALID_JSONL = "../data/splits/valid.jsonl"
OUT_DIR     = pathlib.Path("../models/codebert_mini_lora")
OUT_DIR.mkdir(parents=True, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
seed_everything(42)
print("Device:", DEVICE)

# %% [code] ▸ 1 Load splits
ds = load_splits(TRAIN_JSONL, VALID_JSONL)
print(ds)

# %% [code] ▸ 2 Tokeniser & map
tok = AutoTokenizer.from_pretrained("microsoft/codebert-base")
data_collator = DataCollatorWithPadding(tokenizer=tok, padding="longest")
MAX_LEN = 512

def tokenize(batch):
    return tok(batch["Function before"],
               truncation=True, max_length=MAX_LEN, padding="max_length")

ds = ds.map(tokenize, batched=True, remove_columns=["Function before"])
ds = ds.rename_column("is_vuln", "labels")
ds.set_format(type="torch")

# %% [code] ▸ 3 Base model + LoRA adapter
base = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=2
)

lora_cfg = LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.05,
    target_modules=["query","value"],  # attention layers
    bias="none", task_type="SEQ_CLS"
)
model = get_peft_model(base, lora_cfg).to(DEVICE)
# compute class imbalance on the train split
labels = ds["train"]["labels"]
pos = int((labels == 1).sum())   # number of vulnerable samples
neg = int((labels == 0).sum())   # number of clean samples
clean_weight = pos / neg if neg else 1.0        # >1 if many vulns
loss_weights = torch.tensor([clean_weight, 1.0]).to(DEVICE)

# patch into the model (PyTorch CROssEntropyLoss supports weight=…)
model.classifier.loss_fct = CrossEntropyLoss(weight=loss_weights)
print(f"Class weights → clean: {clean_weight:.2f}   vuln: 1.0")

model.print_trainable_parameters()    # sanity: ~0.5 M params trainable

# %% [code] ▸ 4 Metric callback
sk_metrics = evaluate.combine(["precision", "recall", "f1"])
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return sk_metrics.compute(predictions=preds, references=labels)

# be sure to import evaluate earlier:
#   import evaluate
#   sk_metrics = evaluate.combine(["precision", "recall", "f1"])

# %% [code] ▸ 5 HF Trainer setup
args = TrainingArguments(
    output_dir=str(OUT_DIR),
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    warmup_ratio=0.05,
    fp16=(DEVICE=="cuda"),
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    seed=42
)

# data_collator = DataCollatorWithPadding(tokenizer=tok, padding="longest")

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

t0 = time.time()
trainer.train()
print(f"⏱ Finished in {(time.time()-t0)/60:.1f} min")

# %% [code] ▸ 6 Save weights & metrics
trainer.save_model(str(OUT_DIR))            # saves only LoRA adapter
metrics = trainer.evaluate(ds["validation"])
with open(OUT_DIR / "metrics.json", "w") as w:
    json.dump(metrics, w, indent=2)
metrics


Device: cpu
DatasetDict({
    train: Dataset({
        features: ['Function before', 'is_vuln', 'cvss'],
        num_rows: 563
    })
    validation: Dataset({
        features: ['Function before', 'is_vuln', 'cvss'],
        num_rows: 70
    })
})


Map: 100%|██████████| 70/70 [00:00<00:00, 795.28 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class weights → clean: 1.00   vuln: 1.0
trainable params: 887,042 || all params: 125,534,212 || trainable%: 0.7066


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.7486,0.696745,0.0,0.0,0.0
2,0.7153,0.69379,1.0,0.028571,0.055556
3,0.7192,0.692939,0.833333,0.142857,0.243902


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


⏱ Finished in 48.1 min




Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0001,6e-06,1.0,1.0,1.0
2,0.0,4e-06,1.0,1.0,1.0
3,0.0,3e-06,1.0,1.0,1.0


{'eval_loss': 0.6929389834403992,
 'eval_precision': 0.8333333333333334,
 'eval_recall': 0.14285714285714285,
 'eval_f1': 0.24390243902439024,
 'eval_runtime': 22.0597,
 'eval_samples_per_second': 3.173,
 'eval_steps_per_second': 0.408,
 'epoch': 3.0}



⏱ Finished in 152.2 min




{'eval_loss': 6.17799651081441e-06,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 59.7813,
 'eval_samples_per_second': 3.346,
 'eval_steps_per_second': 0.418,
 'epoch': 3.0}

In [None]:
from transformers import TrainingArguments
TrainingArguments(
    output_dir="tmp",
    eval_strategy="epoch",
    logging_steps=10,
    per_device_train_batch_size=2,
    num_train_epochs=1
)
