In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [12]:
import os
import urllib.request
import zipfile
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import evaluate

In [13]:
# Single seed shared by every stochastic step in this notebook (dataset splits,
# weight initialisation, training).
SEED = 42

# Seed the global NumPy and PyTorch generators immediately. The classification
# head is randomly initialised when the model is loaded further down, so the
# generators must already be seeded by then for two runs to start from the same
# weights.
np.random.seed(SEED)
torch.manual_seed(SEED)

In [14]:
# Source archive of the Big-Vul dataset and the JSON file expected inside it.
bugvul_zip_url = "https://raw.githubusercontent.com/Meerschwein/Automating-SE/refs/heads/main/Big-Vul-dataset.zip"
data_path = "Big-Vul-dataset/data.json"

zip_path = "Big-Vul-dataset.zip"
extract_dir = "Big-Vul-dataset"

# Download once. The archive is fetched under a temporary name and renamed only
# after the transfer finished, so an interrupted download is never mistaken for a
# complete archive on the next run.
if not os.path.exists(zip_path):
    partial_path = zip_path + ".part"
    urllib.request.urlretrieve(bugvul_zip_url, partial_path)
    os.replace(partial_path, zip_path)

# Unpack once. The guard checks the file that is actually loaded below rather than
# just the directory, so a half-finished extraction is repaired on re-run.
if not os.path.exists(data_path):
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)

ds = load_dataset("json", data_files={"train": data_path}, split="train")
print(ds)

ds = ds.remove_columns(["flaw_line_no", "bigvul_id"])  # we don't need these columns

# The Trainer expects the target column to be called "labels". It is encoded as a
# class label so it can be used for the stratified splits below
# (NOTE(review): stratify_by_column presumably requires a ClassLabel feature - confirm).
ds = ds.rename_column("vul", "labels")
ds = ds.class_encode_column("labels")

# 80/20 stratified train/validation split.
ds = ds.train_test_split(test_size=0.2, stratify_by_column="labels", seed=SEED)
train_ds = ds["train"]
val_ds = ds["test"]

# Fraction of the training split that is actually used for fine-tuning.
train_sample_frac = 0.5
if not 0 < train_sample_frac <= 1:
    raise ValueError(f"train_sample_frac must be in (0, 1], got {train_sample_frac}")
# A fraction of 1.0 means "use everything"; splitting with test_size=0 is not a
# valid request, so the subsampling step is skipped in that case.
if train_sample_frac < 1:
    train_ds = train_ds.train_test_split(
        test_size=1 - train_sample_frac,
        stratify_by_column="labels",
        seed=SEED,
    )["train"]

In [15]:
# Checkpoint identifier used for both the tokenizer and the classifier below.
# NOTE(review): presumably a CodeBERT variant pretrained on C/C++ code - confirm on the model card.
MODEL_NAME = "neulab/codebert-cpp"
# Fast RoBERTa tokenizer loaded from the same checkpoint as the model.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize the "code" field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        batch: mapping with a "code" entry holding the source snippets to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these inputs.
    """
    # Alternative that was tried but is not used: padding="longest" with no
    # explicit max_length.
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(batch["code"], **encoding_options)

# Encode the training split first, then the validation split, in batched mode.
train_ds, val_ds = [
    split.map(tokenize, batched=True) for split in (train_ds, val_ds)
]

# Sequence classifier with a two-class output head on top of the checkpoint.
model = RobertaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

Map:   0%|          | 0/74612 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Load the four classification metrics reported during evaluation, in the order
# accuracy, precision, recall, f1. ROC AUC ("roc_auc") is currently disabled.
accuracy_metric, precision_metric, recall_metric, f1_metric = [
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
]

def compute_metrics(eval_pred):
    """Score a set of predictions with accuracy, precision, recall and F1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class for each example
            is the argmax of ``logits`` along the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Each metric object reports its score under its own name.
    scorers = {
        "accuracy": accuracy_metric,
        "precision": precision_metric,
        "recall": recall_metric,
        "f1": f1_metric,
    }
    return {
        key: scorer.compute(predictions=predicted_classes, references=labels)[key]
        for key, scorer in scorers.items()
    }

In [17]:
# Number of full passes over the training split.
epochs = 3

training_args = TrainingArguments(
    output_dir="./content/bigvul_trainer",
    learning_rate=2e-5,
    # Evaluate and checkpoint once per epoch; the two strategies are kept in step
    # so that the best checkpoint can be restored at the end of training.
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    save_strategy="epoch",
    logging_dir="./content/logs",
    # Keep the checkpoint with the highest validation F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # No external experiment-tracking integration.
    report_to="none",
    # Use the notebook-wide seed explicitly instead of relying on the library
    # default, so the data split and the training run cannot drift apart if SEED
    # is changed.
    seed=SEED,
)

# The tokenizer is handed over as the processing class so it is available to the
# Trainer alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [18]:
# Fine-tune the model, then score it once more on the validation split.
# NOTE(review): load_best_model_at_end=True is set in the training arguments, so the
# evaluation and the saved model presumably refer to the best-F1 checkpoint - confirm.
trainer.train()
metrics = trainer.evaluate()
print(metrics)

# Persist the final model to a directory relative to the current working directory.
trainer.save_model("./content/bigvul_vuln_detector")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0734,0.076948,0.98397,0.938776,0.706083,0.80597
2,0.0464,0.067743,0.985713,0.925694,0.757817,0.833385
3,0.0441,0.077691,0.9859,0.915711,0.77203,0.837754


{'eval_loss': 0.07769133895635605, 'eval_accuracy': 0.9859003913579585, 'eval_precision': 0.9157113958192852, 'eval_recall': 0.7720295622512792, 'eval_f1': 0.83775447254781, 'eval_runtime': 265.1523, 'eval_samples_per_second': 140.697, 'eval_steps_per_second': 8.795, 'epoch': 3.0}


In [22]:
# Package the saved model directory as a zip archive and download it.
# Start from a clean slate: delete any archive left over from a previous run.
!rm -f /content/bigvul_vuln_detector.zip
# NOTE(review): the model was saved to the relative path "./content/bigvul_vuln_detector";
# the absolute /content/content/... path used here presumably assumes the working
# directory is /content - confirm before running elsewhere.
!zip -r /content/bigvul_vuln_detector.zip /content/content/bigvul_vuln_detector
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import files
files.download("/content/bigvul_vuln_detector.zip")

  adding: content/content/bigvul_vuln_detector/ (stored 0%)
  adding: content/content/bigvul_vuln_detector/config.json (deflated 51%)
  adding: content/content/bigvul_vuln_detector/tokenizer_config.json (deflated 76%)
  adding: content/content/bigvul_vuln_detector/training_args.bin (deflated 52%)
  adding: content/content/bigvul_vuln_detector/model.safetensors (deflated 7%)
  adding: content/content/bigvul_vuln_detector/merges.txt (deflated 53%)
  adding: content/content/bigvul_vuln_detector/vocab.json (deflated 59%)
  adding: content/content/bigvul_vuln_detector/special_tokens_map.json (deflated 85%)
  adding: content/content/bigvul_vuln_detector/tokenizer.json (deflated 82%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>