<a href="https://colab.research.google.com/github/Meerschwein/Automating-SE/blob/main/Balanced_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import os
import urllib.request
import zipfile
import numpy as np
import torch
from torch.utils.data import Dataset as TorchDataset
from datasets import load_dataset, Dataset, ClassLabel
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    RobertaForTokenClassification,
    Trainer,
    TrainingArguments,
)
import evaluate
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import pandas as pd
from google.colab import files
import shutil

In [14]:
seed = 42

dataset_percent = 1 # 0.1 => 10%

train_percent = 0.75
eval_percent  = 0.10
test_percent  = 0.15

# <1 => undersampling
# >1 => oversampling
# "vuln"/"nonvuln" => adjust number of samples to this dataset
training_sample_nonvuln = 0.5
training_sample_vuln    = "nonvuln"

epochs = 3

tokenizer_name        = "neulab/codebert-cpp"
fn_level_model_name   = "neulab/codebert-cpp"
line_level_model_name = "neulab/codebert-cpp"

use_tokenizer_max_length = True # if False: use below
tokenizer_max_length     = 2048

download_model = True

fn_level_trainer_args = TrainingArguments(
    output_dir="./fn-level",
    learning_rate=2e-5,
    eval_strategy="epoch",
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    gradient_accumulation_steps=4,
    num_train_epochs=epochs,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="mcc",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

line_level_trainer_args = TrainingArguments(
    output_dir="./line-level",
    learning_rate=2e-5,
    eval_strategy="epoch",
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    gradient_accumulation_steps=4,
    num_train_epochs=epochs,
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="mcc",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [4]:
assert train_percent + eval_percent + test_percent == 1
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [5]:
bugvul_zip_url = "https://raw.githubusercontent.com/Meerschwein/Automating-SE/refs/heads/main/Big-Vul-dataset.zip"
data_path = "Big-Vul-dataset/data.json"

if not os.path.exists("Big-Vul-dataset.zip"):
    urllib.request.urlretrieve(bugvul_zip_url, "Big-Vul-dataset.zip")

if not os.path.exists("Big-Vul-dataset"):
    with zipfile.ZipFile("Big-Vul-dataset.zip", "r") as zip_ref:
        zip_ref.extractall("Big-Vul-dataset")

In [6]:
df = pd.read_json(data_path, dtype={"vul": "int8"})

df = (df.drop(["bigvul_id"], axis=1)
        .rename(columns={"vul": "labels"})
        .dropna(subset=["code", "labels"])
        .drop_duplicates("code")
        .reset_index(drop=True))

if 0 < dataset_percent < 1: # smaller for training
    df, _ = train_test_split(df, test_size=1-dataset_percent, stratify=df['labels'], random_state=seed)

train_df, eval_test_df = train_test_split(df, train_size=train_percent, stratify=df['labels'], random_state=seed)
eval_df, test_df = train_test_split(eval_test_df, test_size=test_percent/(test_percent+eval_percent), stratify=eval_test_df['labels'], random_state=seed)

In [7]:
def print_class_distribution(name, df):
    class_counts = df["labels"].value_counts().sort_index()
    total = len(df)
    print(f"{name} - {total}")
    for label, count in class_counts.items():
        percent = (count / total) * 100
        l = "Vuln   " if label == 1 else "Nonvuln"
        print(f"    {l} {count} ({percent:.2f}%)")

def print_all_class_distributions():
    print_class_distribution("Training", train_df)
    print_class_distribution("Validation", eval_df)
    print_class_distribution("Test", test_df)

print_all_class_distributions()

Training - 139706
    Nonvuln 133120 (95.29%)
    Vuln    6586 (4.71%)
Validation - 18627
    Nonvuln 17749 (95.29%)
    Vuln    878 (4.71%)
Test - 27942
    Nonvuln 26624 (95.28%)
    Vuln    1318 (4.72%)


In [8]:
train_vuln    = train_df[train_df['labels'] == 1]
train_nonvuln = train_df[train_df['labels'] == 0]

def sample_class(df_class, rule):
    if isinstance(rule, float) and rule != 1:
        if rule < 1:
            return df_class.sample(frac=rule, random_state=seed)
        else:
            return resample(df_class, replace=True, n_samples=int(len(df_class) * rule), random_state=seed)
    elif isinstance(rule, int):
        if rule == 1:
            return df_class
        else:
            return resample(df_class, replace=True, n_samples=len(df_class) * rule, random_state=seed)
    elif rule == "vuln":
        return resample(df_class, replace=True, n_samples=len(train_vuln), random_state=seed)
    elif rule == "nonvuln":
        return resample(df_class, replace=True, n_samples=len(train_nonvuln), random_state=seed)
    else:
        raise ValueError(f"Invalid rule: {rule}")

train_nonvuln = sample_class(train_nonvuln, training_sample_nonvuln)
train_vuln    = sample_class(train_vuln, training_sample_vuln)

train_df = pd.concat([train_vuln, train_nonvuln]).sample(frac=1, random_state=seed).reset_index(drop=True)

print_all_class_distributions()

raw_train_ds = Dataset.from_pandas(train_df, preserve_index=False)
raw_eval_ds  = Dataset.from_pandas(eval_df, preserve_index=False)
raw_test_ds  = Dataset.from_pandas(test_df, preserve_index=False)

Training - 133120
    Nonvuln 66560 (50.00%)
    Vuln    66560 (50.00%)
Validation - 18627
    Nonvuln 17749 (95.29%)
    Vuln    878 (4.71%)
Test - 27942
    Nonvuln 26624 (95.28%)
    Vuln    1318 (4.72%)


In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [10]:
def tokenize(batch):
    max_length = tokenizer.model_max_length if use_tokenizer_max_length else tokenizer_max_length
    return tokenizer(batch["code"], padding="max_length", truncation=True, max_length=max_length)

fn_level_train_ds = raw_train_ds.map(tokenize, batched=True, remove_columns=["code"])
fn_level_eval_ds  = raw_eval_ds.map(tokenize, batched=True, remove_columns=["code"])
fn_level_test_ds  = raw_test_ds.map(tokenize, batched=True, remove_columns=["code"])

Map:   0%|          | 0/133120 [00:00<?, ? examples/s]

Map:   0%|          | 0/18627 [00:00<?, ? examples/s]

Map:   0%|          | 0/27942 [00:00<?, ? examples/s]

In [11]:
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")
mcc_metric = evaluate.load("matthews_correlation")
auc_metric = evaluate.load("roc_auc")

metrics_include_report = False

def fn_level_compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()  # Probability of class 1 (vulnerable)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    mcc = mcc_metric.compute(predictions=predictions, references=labels)["matthews_correlation"]
    auc = auc_metric.compute(prediction_scores=probs, references=labels)["roc_auc"]

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mcc": mcc,
        "auc": auc,
    }

    if metrics_include_report:
        report = classification_report(labels, predictions, target_names=["Non-vulnerable", "Vulnerable"])
        metrics["report"] = report

    return metrics

def test_model(trainer, test_dataset):
    global metrics_include_report
    metrics_include_report = True
    evaluation_results = trainer.evaluate(test_dataset)
    evaluation_df = pd.DataFrame([evaluation_results])
    evaluation_df.columns = evaluation_df.columns.str.replace('eval_', '')
    evaluation_df = evaluation_df.drop(["samples_per_second", "steps_per_second", "epoch", "runtime", "report", "loss"], axis=1)
    display(evaluation_df)
    print(evaluation_results["eval_report"])
    metrics_include_report = False

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [12]:
fn_level_model = RobertaForSequenceClassification.from_pretrained(fn_level_model_name, num_labels=2)

config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
fn_level_trainer = Trainer(
    args=fn_level_trainer_args,
    model=fn_level_model,
    train_dataset=fn_level_train_ds,
    eval_dataset=fn_level_eval_ds,
    processing_class=tokenizer,
    compute_metrics=fn_level_compute_metrics,
)

fn_level_trainer.train()
fn_level_trainer.save_model("fn-level-model")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auc
1,0.1905,0.161367,0.937564,0.424723,0.915718,0.580296,0.59924,0.978873
2,0.097,0.092752,0.973318,0.667546,0.864465,0.75335,0.746402,0.976387
3,0.0385,0.101049,0.974016,0.674028,0.869021,0.759204,0.752433,0.975369


In [16]:
test_model(fn_level_trainer, fn_level_test_ds)

Unnamed: 0,accuracy,precision,recall,f1,mcc,auc
0,0.974984,0.684997,0.869499,0.766299,0.759264,0.972987


                precision    recall  f1-score   support

Non-vulnerable       0.99      0.98      0.99     26624
    Vulnerable       0.68      0.87      0.77      1318

      accuracy                           0.97     27942
     macro avg       0.84      0.92      0.88     27942
  weighted avg       0.98      0.97      0.98     27942



In [35]:
def download_model(dir):
    files.download(shutil.make_archive(dir, 'zip', dir))

In [36]:
if download_model:
    download_model("./fn-level-model")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
def add_token_labels(example):
    code        = example["code"]
    vuln_lines  = set(example["flaw_line_no"]) # [] if benign

    max_length = tokenizer.model_max_length if use_tokenizer_max_length else tokenizer_max_length
    enc = tokenizer(
        code,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    # map every token to its source-code line
    labels = np.full(len(enc["input_ids"]), -100, dtype=np.int8) # pad value
    line_start = [0] + [i + 1 for i, c in enumerate(code) if c == "\n"]

    for idx, (start, _) in enumerate(enc["offset_mapping"]):
        if start == 0 and idx == 0: # [CLS] token => keep -100
            continue
        # line numbers are 1-based
        line_no = 1 + sum(start >= ls for ls in line_start)
        labels[idx] = int(line_no in vuln_lines)

    enc.pop("offset_mapping")
    enc["labels"] = labels.tolist()
    return enc

line_level_train_ds = raw_train_ds.map(add_token_labels, remove_columns=list(train_df.columns))
line_level_eval_ds  = raw_eval_ds.map(add_token_labels, remove_columns=list(train_df.columns))
line_level_test_ds  = raw_test_ds.map(add_token_labels, remove_columns=list(train_df.columns))

Map:   0%|          | 0/133120 [00:00<?, ? examples/s]

Map:   0%|          | 0/18627 [00:00<?, ? examples/s]

Map:   0%|          | 0/27942 [00:00<?, ? examples/s]

In [20]:
metrics_include_report = False
def line_level_metrics(eval_pred):
    logits, y = eval_pred
    logits_flat = logits.reshape(-1, logits.shape[-1]) # Flatten logits for masking
    p = logits.argmax(-1).flatten()
    y = y.flatten()
    mask = y != -100 # ignore padding tokens
    predictions, labels = p[mask], y[mask]
    logits_masked = logits_flat[mask]
    probs = torch.nn.functional.softmax(torch.tensor(logits_masked), dim=1)[:, 1].numpy()  # Probability of class 1 (vulnerable)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
    precision = precision_metric.compute(predictions=predictions, references=labels)["precision"]
    recall = recall_metric.compute(predictions=predictions, references=labels)["recall"]
    f1 = f1_metric.compute(predictions=predictions, references=labels)["f1"]
    mcc = mcc_metric.compute(predictions=predictions, references=labels)["matthews_correlation"]
    auc = auc_metric.compute(prediction_scores=probs, references=labels)["roc_auc"]

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mcc": mcc,
        "auc": auc,
    }

    if metrics_include_report:
        report = classification_report(labels, predictions, target_names=["Non-vulnerable", "Vulnerable"])
        metrics["report"] = report

    return metrics

In [21]:
line_level_model = RobertaForTokenClassification.from_pretrained(line_level_model_name, num_labels=2)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at neulab/codebert-cpp and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
line_level_trainer = Trainer(
    args=line_level_trainer_args,
    model=line_level_model,
    train_dataset=line_level_train_ds,
    eval_dataset=line_level_eval_ds,
    processing_class=tokenizer,
    compute_metrics=line_level_metrics,
)

line_level_trainer.train()
line_level_trainer.save_model("line-level-model")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auc
1,0.2412,0.043942,0.984539,0.42956,0.555464,0.484465,0.480811,0.959173
2,0.1237,0.046646,0.987288,0.511151,0.642734,0.56944,0.566882,0.958239
3,0.0696,0.052843,0.986581,0.490444,0.668078,0.565643,0.56589,0.95848


In [23]:
test_model(line_level_trainer, line_level_test_ds)

Unnamed: 0,accuracy,precision,recall,f1,mcc,auc
0,0.987828,0.528586,0.636737,0.577643,0.574072,0.956524


                precision    recall  f1-score   support

Non-vulnerable       1.00      0.99      0.99  14091704
    Vulnerable       0.53      0.64      0.58    186658

      accuracy                           0.99  14278362
     macro avg       0.76      0.81      0.79  14278362
  weighted avg       0.99      0.99      0.99  14278362



In [34]:
if download_model:
    download_model("line-level-model")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
uploaded = files.upload()
for filename in uploaded.keys():
    if filename.endswith(".zip"):
        folder_name = filename.replace(".zip", "")
        os.makedirs(folder_name, exist_ok=True)
        !unzip -q "$filename" -d "$folder_name"

KeyboardInterrupt: 

In [26]:
trained_fn_level_model = RobertaForSequenceClassification.from_pretrained("./fn-level-model")
trained_line_level_model = RobertaForTokenClassification.from_pretrained("./line-level-model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = trained_fn_level_model.to(device).eval()
_ = trained_line_level_model.to(device).eval()

In [27]:
def get_vuln_lines(example):
    code = example["code"]

    # Function-level classification
    fn_inputs = tokenizer(code, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    fn_inputs = {k: v.to(device) for k, v in fn_inputs.items()}
    with torch.no_grad():
        fn_outputs = trained_fn_level_model(**fn_inputs)
        fn_probs = torch.softmax(fn_outputs.logits, dim=1)
    is_vulnerable = fn_probs[0][1].item() > 0.5  # Class 1 = vulnerable

    if not is_vulnerable:
        return {"vulnerable": False, "lines": []}

    # Line-level classification
    enc = tokenizer(code, return_offsets_mapping=True, return_tensors="pt",
                    truncation=True, padding="max_length", max_length=512)
    offset_mapping = enc.pop("offset_mapping")[0]
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        line_outputs = trained_line_level_model(**enc)
    line_logits = line_outputs.logits
    line_preds = torch.argmax(line_logits, dim=-1)[0]  # shape: [seq_len]

    # Map tokens to line numbers
    lines = code.split('\n')
    line_start_positions = [0]
    for line in lines:
        line_start_positions.append(line_start_positions[-1] + len(line) + 1)

    line_indices = set()
    for idx, (start_offset, _) in enumerate(offset_mapping):
        if start_offset == 0 and idx == 0:  # [CLS] token
            continue
        if line_preds[idx].item() == 1:
            start = start_offset.item()
            line_no = 1 + sum(start >= pos for pos in line_start_positions)
            line_indices.add(line_no)

    return {"vulnerable": True, "lines": sorted(line_indices)}

def display_vulnerability_result(example, predicted_lines):
    code_lines = example["code"].split("\n")
    actual_lines = set(example.get("flaw_line_no", []))
    predicted_lines = set(predicted_lines)

    max_line_no_width = len(str(len(code_lines)))

    print(f"lines{sorted(actual_lines)} pred{sorted(predicted_lines)}")
    for i, line in enumerate(code_lines, start=1):
        line_no = str(i).rjust(max_line_no_width)
        actual_flag = "v" if i in actual_lines else " "
        predicted_flag = "p" if i in predicted_lines else " "
        print(f"{line_no} {actual_flag}{predicted_flag}|{line}")

small_vuln_examples = df[
    (df["labels"] == 1) &
    (df["code"].apply(lambda c: len(c.splitlines()) <= 10))  # max 7 lines
]
examples_to_test = small_vuln_examples.sample(n=5, random_state=seed).to_dict(orient="records")

for ex in examples_to_test:
    result = get_vuln_lines(ex)
    print(f"vuln {ex['labels']==1} pred {result['vulnerable']}")
    display_vulnerability_result(ex, result["lines"])
    print()

vuln True pred True
lines[4] pred[4, 5]
1   | void ImageTokenizer::stopParsing()
2   | {
3   |     Tokenizer::stopParsing();
4 vp|    m_imageElement->cachedImage()->error();
5  p| }
6   |

vuln True pred True
lines[2, 3, 4] pred[2, 3, 4, 5]
1   |bool Track::GetLacing() const
2 vp|{
3 vp|    return m_info.lacing;
4 vp|}
5  p|

vuln True pred True
lines[2, 3, 4] pred[2, 3, 4, 5]
1   |void CrosLibrary::TestApi::SetPowerLibrary(
2 vp|    PowerLibrary* library, bool own) {
3 vp|  library_->power_lib_.SetImpl(library, own);
4 vp|}
5  p|

vuln True pred True
lines[2, 3, 4] pred[2, 3, 4, 5]
1   |CuePoint::~CuePoint()
2 vp|{
3 vp|    delete[] m_track_positions;
4 vp|}
5  p|

vuln True pred True
lines[2] pred[2, 3, 4, 5]
1   |   void CreatePersistentMemoryAllocator() {
2 vp|    GlobalHistogramAllocator::GetCreateHistogramResultHistogram();
3  p|     GlobalHistogramAllocator::CreateWithLocalMemory(
4  p|         kAllocatorMemorySize, 0, "SparseHistogramAllocatorTest");
5  p|     allocator_ = Glob

keep colab running
```js
function ClickConnect() {
    console.log("Working");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click();
}
var clicker = setInterval(ClickConnect, 60000);
```