In [None]:
# Environment setup: install (or upgrade to the latest release of) the
# `evaluate`, `datasets` and `fsspec` packages before anything imports them.
!pip install -U evaluate datasets fsspec

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstal

In [None]:
import os
import urllib.request
import zipfile
import numpy as np
import torch
import pandas as pd
from torch.utils.data import Dataset as TorchDataset
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import evaluate
from tqdm import tqdm
from sklearn.metrics import classification_report

In [None]:
# --- Experiment configuration ------------------------------------------------
# Seed used for the dataset splits, the negative undersampling and the RNGs below.
seed = 42

# Fraction of the full dataset to keep; values outside (0, 1) keep everything.
dataset_percent = 0.2

# Train / validation / test proportions; they must add up to 1.
train_percent = 0.75
eval_percent  = 0.10
test_percent  = 0.15

epochs = 3

# Checkpoint identifiers passed to `from_pretrained` for tokenizer and model.
tokenizer_name = "neulab/codebert-cpp"
model_name     = "neulab/codebert-cpp"

# When True, the export cells save, zip and download the trained models.
download_model = True

# Compare with a tolerance: an exact `== 1` on a float sum rejects valid
# splits such as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
assert abs(train_percent + eval_percent + test_percent - 1.0) < 1e-9, \
    "train_percent + eval_percent + test_percent must sum to 1"

# Trailing `;` keeps the Generator returned by torch.manual_seed out of the cell output.
np.random.seed(seed)
torch.manual_seed(seed);

In [None]:
# Path of the JSON file inside the extracted folder, and the URL of the zipped dataset.
data_path = "Big-Vul-dataset/data.json"
bugvul_zip_url = "https://raw.githubusercontent.com/Meerschwein/Automating-SE/refs/heads/main/Big-Vul-dataset.zip"

# Download the archive only when it is not on disk yet.
if not os.path.exists("Big-Vul-dataset.zip"):
    urllib.request.urlretrieve(bugvul_zip_url, "Big-Vul-dataset.zip")

# Unpack only when the target folder does not exist yet.
if not os.path.exists("Big-Vul-dataset"):
    with zipfile.ZipFile("Big-Vul-dataset.zip", "r") as archive:
        archive.extractall("Big-Vul-dataset")

In [None]:
# Load the JSON export as a single split.
ds = load_dataset("json", data_files={"train": data_path}, split="train")

# Drop the unused id column, rename `vul` to `labels` and class-encode it;
# `labels` is the column every split below is stratified on.
ds = (
    ds.remove_columns("bigvul_id")
      .rename_column("vul", "labels")
      .class_encode_column("labels")
)

# Optionally shrink the dataset (smaller for training).
if 0 < dataset_percent < 1:
    ds = ds.train_test_split(
        test_size=1-dataset_percent,
        stratify_by_column="labels",
        seed=seed,
    )["train"]

# First cut: training set versus the pool shared by validation and test.
train_eval_test_split = ds.train_test_split(
    train_size=train_percent,
    stratify_by_column="labels",
    seed=seed,
)
train_ds = train_eval_test_split["train"]

# Second cut: divide the remaining pool between validation and test.
eval_test_split = train_eval_test_split["test"].train_test_split(
    test_size=test_percent/(test_percent+eval_percent),
    stratify_by_column="labels",
    seed=seed,
)
eval_ds, test_ds = eval_test_split["train"], eval_test_split["test"]

# Report each split's share of `ds` and its absolute size.
print(f"Training Dataset   {((len(train_ds)/len(ds))*100):.2f}% {len(train_ds)}")
print(f"Validation Dataset {((len(eval_ds)/len(ds))*100):.2f}% {len(eval_ds)}")
print(f"Test Dataset       {((len(test_ds)/len(ds))*100):.2f}% {len(test_ds)}")

Generating train split: 0 examples [00:00, ? examples/s]

Stringifying the column:   0%|          | 0/186530 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/186530 [00:00<?, ? examples/s]

Training Dataset   75.00% 27979
Validation Dataset 10.00% 3730
Test Dataset       15.00% 5597


In [None]:
# Load the pretrained tokenizer and the function-level classifier (2 labels).
# The classification head weights are newly initialised, so the model must be
# fine-tuned before its predictions are meaningful.
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def function_tokenize(batch):
    """Tokenize the ``code`` entries of a batch.

    Every sequence is truncated and padded to exactly 512 tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    source_texts = batch["code"]
    return tokenizer(
        source_texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )

# Tokenize every split in batches; each name is rebound to the mapped dataset,
# which `function_tokenize` produces from the `code` column.
train_ds = train_ds.map(function_tokenize, batched=True)
eval_ds  = eval_ds.map(function_tokenize, batched=True)
test_ds  = test_ds.map(function_tokenize, batched=True)

Map:   0%|          | 0/27979 [00:00<?, ? examples/s]

Map:   0%|          | 0/3730 [00:00<?, ? examples/s]

Map:   0%|          | 0/5597 [00:00<?, ? examples/s]

In [None]:
# Metric objects from the `evaluate` library, loaded once and reused by
# `compute_metrics` and by the test-set evaluation.
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
mcc_metric = evaluate.load("matthews_correlation")
auc_metric = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Turn ``(logits, labels)`` into the metric dict reported during evaluation.

    Args:
        eval_pred: pair ``(logits, labels)``; the logits are indexed as
            ``[:, 1]`` after a softmax over dim 1, so they are 2-D with the
            positive (vulnerable) class in column 1.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``mcc`` and ``auc``.
    """
    logits, labels = eval_pred

    # Hard class decisions for the threshold-based metrics.
    hard_preds = np.argmax(logits, axis=-1)
    # Probability of class 1 (vulnerable), used as the ranking score for ROC AUC.
    positive_scores = torch.nn.functional.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()

    # (reported name, metric object, key inside the metric's result dict)
    label_based = (
        ("accuracy", accuracy_metric, "accuracy"),
        ("precision", precision_metric, "precision"),
        ("recall", recall_metric, "recall"),
        ("f1", f1_metric, "f1"),
        ("mcc", mcc_metric, "matthews_correlation"),
    )
    results = {
        reported_name: metric.compute(predictions=hard_preds, references=labels)[result_key]
        for reported_name, metric, result_key in label_based
    }
    results["auc"] = auc_metric.compute(prediction_scores=positive_scores, references=labels)["roc_auc"]
    return results

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [None]:
# Training configuration: evaluate and checkpoint once per epoch and, when
# training ends, reload the checkpoint with the highest "mcc" value
# (the key returned by `compute_metrics`; greater is better).
training_args = TrainingArguments(
    output_dir="./bigvul_trainer",
    learning_rate=2e-5,
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="mcc",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),  # fp16 is enabled only when CUDA is available
    report_to="none",
)

# Function-level trainer: fine-tunes `model` on the tokenized training split
# and evaluates on the validation split with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the function-level classifier.
trainer.train()

# Score the held-out test split.
predictions_output = trainer.predict(test_ds)
logits, labels = predictions_output.predictions, predictions_output.label_ids

# Hard decisions, plus the probability of class 1 used for ROC AUC.
preds = np.argmax(logits, axis=-1)
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()

# Per-class precision / recall / F1 table.
report = classification_report(
    labels,
    preds,
    target_names=["Non-vulnerable", "Vulnerable"],
)
print(report)

# Headline metrics on the test split.
accuracy = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
mcc = mcc_metric.compute(predictions=preds, references=labels)["matthews_correlation"]
auc = auc_metric.compute(prediction_scores=probs, references=labels)["roc_auc"]

print(f"Accuracy: {accuracy:.4f}")
print(f"MCC:      {mcc:.4f}")
print(f"ROC AUC:  {auc:.4f}")

from google.colab import files
if download_model:
  trainer.save_model("./bigvul_vuln_detector_seed"+str(seed))
  !rm -f ./bigvul_vuln_detector.zip
  !zip -r ./bigvul_vuln_detector.zip ./bigvul_vuln_detector_seed{seed}
  files.download("bigvul_vuln_detector.zip")

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auc
1,0.0663,0.0623,0.984788,0.903553,0.758523,0.82471,0.82024,0.971935
2,0.0548,0.06146,0.986061,0.906557,0.785511,0.841705,0.836798,0.97351
3,0.0436,0.066686,0.987401,0.93,0.792614,0.855828,0.852252,0.971649


                precision    recall  f1-score   support

Non-vulnerable       0.99      1.00      0.99     21329
    Vulnerable       0.92      0.79      0.85      1055

      accuracy                           0.99     22384
     macro avg       0.96      0.89      0.92     22384
  weighted avg       0.99      0.99      0.99     22384

Accuracy: 0.9870
MCC:      0.8480
ROC AUC:  0.9740
  adding: bigvul_vuln_detector_seed42/ (stored 0%)
  adding: bigvul_vuln_detector_seed42/model.safetensors (deflated 7%)
  adding: bigvul_vuln_detector_seed42/vocab.json (deflated 59%)
  adding: bigvul_vuln_detector_seed42/tokenizer_config.json (deflated 76%)
  adding: bigvul_vuln_detector_seed42/config.json (deflated 51%)
  adding: bigvul_vuln_detector_seed42/merges.txt (deflated 53%)
  adding: bigvul_vuln_detector_seed42/tokenizer.json (deflated 82%)
  adding: bigvul_vuln_detector_seed42/special_tokens_map.json (deflated 85%)
  adding: bigvul_vuln_detector_seed42/training_args.bin (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files
if download_model:
  trainer.save_model("./bigvul_vuln_detector_seed"+str(seed))
  !rm -f ./bigvul_vuln_detector.zip
  !zip -r ./bigvul_vuln_detector.zip ./bigvul_vuln_detector_seed{seed}
  files.download("bigvul_vuln_detector.zip")

  adding: bigvul_vuln_detector_seed42/ (stored 0%)
  adding: bigvul_vuln_detector_seed42/model.safetensors (deflated 7%)
  adding: bigvul_vuln_detector_seed42/vocab.json (deflated 59%)
  adding: bigvul_vuln_detector_seed42/tokenizer_config.json (deflated 76%)
  adding: bigvul_vuln_detector_seed42/config.json (deflated 51%)
  adding: bigvul_vuln_detector_seed42/merges.txt (deflated 53%)
  adding: bigvul_vuln_detector_seed42/tokenizer.json (deflated 82%)
  adding: bigvul_vuln_detector_seed42/special_tokens_map.json (deflated 85%)
  adding: bigvul_vuln_detector_seed42/training_args.bin (deflated 52%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.utils import resample

# 1) line-level dataset builder
def make_line_level(hf_ds):
    """Explode a function-level dataset into one row per source line.

    Args:
        hf_ds: iterable of examples exposing ``code`` (source text of one
            function), ``labels`` (1 marks a vulnerable function) and
            ``flaw_line_no`` (line numbers of the flawed lines, compared
            against 1-based positions in ``code``).

    Returns:
        pandas.DataFrame with the columns ``function_id`` (position of the
        function inside ``hf_ds``), ``line`` (text of the line) and ``label``
        (1 when the line number is listed in ``flaw_line_no`` of a function
        labelled vulnerable, otherwise 0). The three columns exist even when
        no row is produced, so attribute access such as ``df.label`` keeps
        working on empty input.
    """
    columns = ["function_id", "line", "label"]
    records = []
    for function_id, ex in enumerate(hf_ds):
        # Flaw lines only count for functions labelled vulnerable; a missing
        # (None) list is treated as "no flawed lines" instead of raising.
        flaws = set(ex["flaw_line_no"] or ()) if ex["labels"] == 1 else set()
        # Lines are numbered from 1 to match the flaw line numbers.
        for line_no, line in enumerate(ex["code"].splitlines(), start=1):
            records.append({
                "function_id": function_id,
                "line":         line,
                "label":        int(line_no in flaws)
            })
    return pd.DataFrame(records, columns=columns)

# 2) build one line-level frame per split
train_df = make_line_level(train_ds)
val_df   = make_line_level(eval_ds)
test_df  = make_line_level(test_ds)

pos_df = train_df[train_df.label == 1]   # vulnerable
neg_df = train_df[train_df.label == 0]   # non-vulnerable

desired_ratio = 0.10   # 10% vul

# 3) compute how many negatives to keep; capped at the number of available
#    negatives because DataFrame.sample raises ValueError when asked for more
#    rows than exist (sampling is done without replacement)
neg_keep = min(len(neg_df), int(len(pos_df) * (1 - desired_ratio) / desired_ratio))

# 4) undersample negatives
neg_down = neg_df.sample(n=neg_keep, random_state=seed)

# 5) combine (keep all positives, drop most negatives) and shuffle
train_sampled_df = (
    pd.concat([pos_df, neg_down])
      .sample(frac=1.0, random_state=seed)
      .reset_index(drop=True)
)

print(
    f"After sampling: {train_sampled_df.label.sum()}/"
    f"{len(train_sampled_df)} positives "
    f"({train_sampled_df.label.mean():.2%})"
)


# 6) turn back into a HF Dataset; only the training split is undersampled,
#    validation and test keep every line
train_ds_sampled = Dataset.from_pandas(train_sampled_df)
val_ds_lines     = Dataset.from_pandas(val_df)
test_ds_lines    = Dataset.from_pandas(test_df)

line_datasets = DatasetDict({
    "train":      train_ds_sampled,
    "validation": val_ds_lines,
    "test":       test_ds_lines,
})

After sampling: 21893/218930 positives (10.00%)


In [None]:
def tokenize_line(batch):
    """Tokenize the ``line`` entries of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    line_texts = batch["line"]
    return tokenizer(
        line_texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Tokenize every split of `line_datasets` in batches. `line_datasets` must
# already exist in the kernel: the cell that builds it has to run first,
# otherwise this line raises NameError.
tokenized_lines = line_datasets.map(tokenize_line, batched=True)


NameError: name 'line_datasets' is not defined

In [None]:
# Fresh classifier for line-level prediction (2 labels), started from the same
# pretrained checkpoint as the function-level model.
line_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# NOTE(review): `training_args` is shared with the function-level trainer, so
# both runs write their checkpoints under the same `output_dir` - confirm this
# is intended.
line_trainer = Trainer(
    model=line_model,
    args=training_args,
    train_dataset=tokenized_lines["train"],
    eval_dataset=tokenized_lines["validation"],
    # `processing_class` matches the function-level Trainer in this notebook;
    # the older `tokenizer=` keyword is deprecated.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)
line_trainer.train()

# Metrics on the validation split.
line_eval = line_trainer.evaluate()
print(line_eval)

# Metrics on the full (unfiltered) line-level test split.
line_test = line_trainer.predict(tokenized_lines["test"])
print(line_test.metrics)
# Cascade evaluation: use the function-level trainer to decide which functions
# are passed on to the line-level model.
# `func_preds` has to be computed here; it was previously only defined in
# commented-out code, which made the filter below fail with NameError.
func_predictions_output = trainer.predict(test_ds)
func_preds = np.argmax(func_predictions_output.predictions, axis=-1)

# `function_id` in `test_df` is the position of the function inside `test_ds`
# (assigned by make_line_level); `test_ds` itself has no such column, so the
# ids are taken from the prediction positions.
vul_func_ids = [position for position, pred in enumerate(func_preds) if pred == 1]
filtered_lines_df = test_df[test_df["function_id"].isin(vul_func_ids)].reset_index(drop=True)
print(f"Filtered {len(filtered_lines_df)} lines \n" f"from {len(test_df)} total lines for cascade evaluation")

if len(filtered_lines_df) > 0:
    evaluate_lines = Dataset.from_pandas(filtered_lines_df)
    tkn_eval_lines = evaluate_lines.map(tokenize_line,batched=True)

    # Line-level metrics restricted to functions flagged by the first stage.
    line_predictions = line_trainer.predict(tkn_eval_lines)
    print(line_predictions.metrics)
else:
    # Nothing to score when the first stage flags no function.
    print("No function was predicted vulnerable; skipping cascade evaluation")

# Optionally save the line-level model, zip it and download it.
# The imports sit inside the guard so that this code still runs outside Colab
# when `download_model` is False.
if download_model:
    import shutil
    from google.colab import files

    output_dir = f"./bigvul_vuln_detector_line_level_seed{seed}"
    line_trainer.save_model(output_dir)

    tokenizer.save_pretrained(output_dir)

    # Save the state of the trainer whose model is exported here; the previous
    # code stored the function-level `trainer` state next to the line model.
    line_trainer.state.save_to_json(f"{output_dir}/trainer_state.json")

    # The archive is written next to the output folder as `<zip_base>.zip`.
    zip_base = f"./bigvul_vuln_detector_line_level_seed{seed}"
    shutil.make_archive(base_name=zip_base,
                        format="zip",
                        root_dir=output_dir)
    zip_path = zip_base + ".zip"


    files.download(zip_path)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  line_trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
def make_windowed_line_level(hf_ds, window=1):
    """Build one labelled snippet per source line, with surrounding context.

    Args:
        hf_ds: iterable of examples exposing ``code`` (source text of one
            function), ``labels`` (1 marks a vulnerable function) and
            ``flaw_line_no`` (line numbers of the flawed lines, compared
            against 1-based positions in ``code``).
        window: number of context lines taken before and after each line;
            negative values behave like 0 (the line alone).

    Returns:
        pandas.DataFrame with the columns ``text`` (the line plus its context,
        joined with newlines) and ``label`` (1 when the centre line is listed
        in ``flaw_line_no`` of a function labelled vulnerable, otherwise 0).
        Both columns exist even when no row is produced.
    """
    # A negative window previously degenerated to "just the line"; clamp to
    # keep that behaviour with the single-slice form below.
    window = max(window, 0)
    records = []
    for ex in hf_ds:
        lines = ex["code"].splitlines()
        # Flaw lines only count for functions labelled vulnerable; a missing
        # (None) list is treated as "no flawed lines" instead of raising.
        flaws = set(ex["flaw_line_no"] or ()) if ex["labels"] == 1 else set()
        for i in range(len(lines)):
            # Context bounds, clipped to the function's first and last line.
            start = max(0, i - window)
            end   = min(len(lines), i + window + 1)
            records.append({
                "text":  "\n".join(lines[start:end]),
                "label": int(i+1 in flaws)
            })
    return pd.DataFrame(records, columns=["text", "label"])

# Build the windowed line-level frames (one context line on each side) for
# every split.
train_win_df = make_windowed_line_level(train_ds, window=1)
val_win_df   = make_windowed_line_level(eval_ds,   window=1)
test_win_df  = make_windowed_line_level(test_ds,   window=1)

# NOTE: this rebinds `ds`, which earlier in the notebook held the
# function-level dataset; from here on `ds` is the windowed DatasetDict.
ds = DatasetDict({
    "train":      Dataset.from_pandas(train_win_df),
    "validation": Dataset.from_pandas(val_win_df),
    "test":       Dataset.from_pandas(test_win_df),
})

# 3) Tokenizer for the windowed "text" snippets
def tokenize_snippet(batch):
    """Tokenize the ``text`` entries of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.
    Returns whatever the module-level ``tokenizer`` returns for the batch.
    """
    snippets = batch["text"]
    return tokenizer(snippets, padding="max_length", truncation=True, max_length=128)

# Tokenize every split in batches and drop the raw "text" column afterwards.
ds = ds.map(
    tokenize_snippet,
    batched=True,
    remove_columns=["text"]
)

# 4) Set PyTorch format: expose only the model inputs and the label as tensors
ds.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

# 5) Train a fresh 2-label classifier on the windowed snippets.
#    NOTE: rebinds `line_model` / `line_trainer` from the plain line-level
#    cell and reuses the shared `training_args`. The whole windowed training
#    split is used here (no undersampling is applied in this cell).
line_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
line_trainer = Trainer(
    model=line_model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    compute_metrics=compute_metrics,
)

line_trainer.train()
# Metrics on the validation split.
line_eval = line_trainer.evaluate()
print(line_eval)

# line_test = line_trainer.predict(tokenized_lines["test"])
# print(line_test.metrics)
# #use the function level trainer to decide if a function should pass to line level or not
# func_predictions_output = trainer.predict(test_ds)
# func_logits = func_predictions_output.predictions
# func_labels = func_predictions_output.label_ids
# func_preds = np.argmax(func_logits, axis=-1)
# func_probs = torch.nn.functional.softmax(torch.tensor(func_logits), dim=1)[:, 1].numpy()

# # # Classification report
# print(classification_report(func_labels, func_preds, target_names=["Non-vulnerable", "Vulnerable"]))
# accuracy = accuracy_metric.compute(predictions=func_preds, references=func_labels)["accuracy"]
# mcc = mcc_metric.compute(predictions=func_preds, references=func_labels)["matthews_correlation"]
# auc = auc_metric.compute(prediction_scores=func_probs, references=func_labels)["roc_auc"]

# print(f"Accuracy: {accuracy:.4f}")
# print(f"MCC:      {mcc:.4f}")
# print(f"ROC AUC:  {auc:.4f}")
# vul_func_ids = [func["function_id"] for func, label in zip(test_ds, func_preds) if label == 1]
# filtered_lines_df = test_df[test_df["function_id"].isin(vul_func_ids)].reset_index(drop=True)
# print(f"Filtered {len(filtered_lines_df)} lines \n" f"from {len(test_df)} total lines for cascade evaluation")


# evaluate_lines = Dataset.from_pandas(filtered_lines_df)
# tkn_eval_lines = evaluate_lines.map(tokenize_line,batched=True)


# line_predictions = line_trainer.predict(tkn_eval_lines)
# print(line_predictions.metrics)

# Optionally save the windowed line-level model, zip it and download it.
# The imports sit inside the guard so that this code still runs outside Colab
# when `download_model` is False.
if download_model:
    import shutil
    from google.colab import files

    # 1) choose an output dir
    #    NOTE(review): same path as the plain line-level export, so running
    #    both cells overwrites the earlier export - confirm this is intended.
    output_dir = f"./bigvul_vuln_detector_line_level_seed{seed}"

    # 2) save the model weights + config
    #    (this works whether your Trainer wraps a SequenceClassification,
    #     TokenClassification, or any other HuggingFace model)
    line_trainer.save_model(output_dir)

    # 3) save the tokenizer so you get vocabulary, merges, special tokens, etc.
    tokenizer.save_pretrained(output_dir)

    # 4) (optional) snapshot the trainer state if you ever want to resume;
    #    it must come from `line_trainer` - the previous code stored the
    #    function-level `trainer` state next to the line-level model
    line_trainer.state.save_to_json(f"{output_dir}/trainer_state.json")

    # 5) bundle it all into one .zip for easy download
    zip_base = f"./bigvul_vuln_detector_line_level_seed{seed}"
    shutil.make_archive(base_name=zip_base,
                        format="zip",
                        root_dir=output_dir)
    zip_path = zip_base + ".zip"

    # 6) trigger the browser download
    files.download(zip_path)

Map:   0%|          | 0/676484 [00:00<?, ? examples/s]

Map:   0%|          | 0/90766 [00:00<?, ? examples/s]

Map:   0%|          | 0/136818 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auc
1,0.0427,0.09239,0.983694,0.590206,0.147742,0.236326,0.28985,0.778382
2,0.0346,0.109631,0.983276,0.529091,0.187742,0.277143,0.308573,0.763311


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auc
1,0.0427,0.09239,0.983694,0.590206,0.147742,0.236326,0.28985,0.778382
2,0.0346,0.109631,0.983276,0.529091,0.187742,0.277143,0.308573,0.763311
