In [None]:
!pip install evaluate

In [3]:
import os
import urllib.request
import zipfile
import numpy as np
import torch
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import evaluate

In [4]:
# Single seed shared by every seeded dataset split in this notebook.
SEED = 42

In [None]:
bugvul_zip_url = "https://raw.githubusercontent.com/Meerschwein/Automating-SE/refs/heads/main/Big-Vul-dataset.zip"
zip_path = "Big-Vul-dataset.zip"
extract_dir = "Big-Vul-dataset"
data_path = "Big-Vul-dataset/data.json"

# Download under a temporary name and rename only once the transfer finished,
# so an interrupted download never leaves a truncated archive that later runs
# would mistake for a complete one.
if not os.path.exists(zip_path):
    partial_zip_path = zip_path + ".part"
    urllib.request.urlretrieve(bugvul_zip_url, partial_zip_path)
    os.replace(partial_zip_path, zip_path)

# Key the extraction on the data file itself rather than on the directory:
# a directory left behind by an aborted extraction must not skip this step.
if not os.path.exists(data_path):
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)

ds = load_dataset("json", data_files={"train": data_path}, split="train")

# we don't need these columns
ds = ds.remove_columns(["flaw_line_no", "bigvul_id"])

# the Trainer expects the target column to be called "labels"
ds = ds.rename_column("vul", "labels")
# encode the target as a class-label column so it can be used for stratification
ds = ds.class_encode_column("labels")

# 80/20 train/validation split, stratified on the label and seeded for repeatability
ds = ds.train_test_split(test_size=0.2, stratify_by_column="labels", seed=SEED)
train_ds = ds["train"]
val_ds = ds["test"]

# Keep only a stratified fraction of the training split; the validation split
# is left at full size.
train_sample_frac = 0.1
train_ds = train_ds.train_test_split(test_size=1-train_sample_frac, stratify_by_column="labels", seed=SEED)["train"]

In [None]:
# Checkpoint name shared by the tokenizer and the classification model.
MODEL_NAME = "microsoft/codebert-base"

# Fast RoBERTa tokenizer loaded from the same checkpoint as the model.
tokenizer = RobertaTokenizerFast.from_pretrained(
    MODEL_NAME,
)

def tokenize(batch):
    """Tokenize the "code" field of a batch for the model.

    Every sequence is truncated and padded to exactly 256 tokens.

    Args:
        batch: mapping with a "code" entry holding the source text(s) to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those inputs.
    """
    source_texts = batch["code"]
    encoded = tokenizer(
        source_texts,
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize both splits up front; batched=True passes whole batches to `tokenize`.
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

# Seed torch before building the model: any weights that are not restored from
# the checkpoint (presumably the 2-way classification head) are initialised
# randomly, so without this the starting point differs on every run even
# though the data splits are seeded.
torch.manual_seed(SEED)
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [12]:
# Metric objects already loaded, keyed by metric name.
_LOADED_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _LOADED_METRICS:
        _LOADED_METRICS[name] = evaluate.load(name)
    return _LOADED_METRICS[name]


def compute_metrics(eval_pred):
    """Compute accuracy, F1 and recall for one evaluation pass.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax
            of `logits` over the last axis.

    Returns:
        dict with the keys "accuracy", "f1" and "recall".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Metrics are fetched through the cache so repeated evaluations do not
    # reload them every time this function is called.
    return {
        name: _get_metric(name).compute(predictions=predictions, references=labels)[name]
        for name in ("accuracy", "f1", "recall")
    }

In [13]:
# Number of passes over the (sub-sampled) training split.
epochs = 1

training_args = TrainingArguments(
    output_dir="./content/bigvul_trainer",
    logging_dir="./content/logs",
    report_to="none",
    # Use the notebook-wide seed explicitly so training and the dataset
    # splits are driven by the same constant.
    seed=SEED,
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on the same schedule so the best checkpoint
    # (by F1, higher is better) can be reloaded at the end of training.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    # Mixed precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

# Wire the model, data, tokenizer and metric function into a single Trainer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_ds,
    "eval_dataset": val_ds,
    "processing_class": tokenizer,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
trainer.train()

# Keep the evaluation result instead of discarding it.
eval_metrics = trainer.evaluate()

trainer.save_model("./content/bigvul_vuln_detector")

# Last expression of the cell, so the final metrics are displayed as its output.
eval_metrics