<a href="https://colab.research.google.com/github/Meerschwein/Automating-SE/blob/main/longformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q evaluate peft

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m113.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os, urllib.request, zipfile, shutil, math, random, gc
import numpy as np, pandas as pd, torch
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    LongformerTokenizerFast,
    LongformerForSequenceClassification,
    LongformerForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate, sklearn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from google.colab import files

In [3]:
seed = 42

dataset_percent = 0.5
train_percent   = 0.75
eval_percent    = 0.10
test_percent    = 0.15
epochs          = 3

tokenizer_name        = "allenai/longformer-base-4096"
fn_level_model_name   = "allenai/longformer-base-4096"
line_level_model_name = "allenai/longformer-base-4096"

use_tokenizer_max_length = False   # When set to False , it uses the tokenizer_max_length on the next line
tokenizer_max_length     = 2048
prob_threshold           = None

download_model = True

fn_level_trainer_args = TrainingArguments(
    output_dir                 = "./fn-level",
    learning_rate              = 2e-5,
    eval_strategy              = "epoch",
    per_device_train_batch_size= 150,
    per_device_eval_batch_size = 150,
    gradient_accumulation_steps = 4,
    num_train_epochs           = epochs,
    save_strategy              = "epoch",
    logging_dir                = "./logs",
    load_best_model_at_end     = True,
    metric_for_best_model      = "mcc",
    greater_is_better          = True,
    fp16                       = False,
    bf16                       = True,
    report_to                  = "none",
)

line_level_trainer_args = TrainingArguments(
    output_dir                 = "./line-level",
    learning_rate              = 2e-5,
    eval_strategy              = "epoch",
    per_device_train_batch_size= 150,
    per_device_eval_batch_size = 150,
    gradient_accumulation_steps = 4,
    num_train_epochs           = epochs,
    save_strategy              = "epoch",
    logging_dir                = "./logs",
    load_best_model_at_end     = True,
    metric_for_best_model      = "mcc",
    greater_is_better          = True,
    fp16                       = False,
    bf16                       = True,
    report_to                  = "none",
)


In [4]:
assert math.isclose(train_percent + eval_percent + test_percent, 1.0)

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [5]:
bugvul_zip_url = (
    "https://raw.githubusercontent.com/Meerschwein/Automating-SE/refs/heads/main/Big-Vul-dataset.zip"
)
data_path = "Big-Vul-dataset/data.json"

if not os.path.exists("Big-Vul-dataset.zip"):
    urllib.request.urlretrieve(bugvul_zip_url, "Big-Vul-dataset.zip")
if not os.path.exists("Big-Vul-dataset"):
    with zipfile.ZipFile("Big-Vul-dataset.zip", "r") as z:
        z.extractall("Big-Vul-dataset")

In [6]:

df = pd.read_json(data_path, dtype={"vul": "int8"})
df = (
    df.drop(["bigvul_id"], axis=1)
      .rename(columns={"vul": "labels"})
      .dropna(subset=["code", "labels"])
      .drop_duplicates("code")
      .reset_index(drop=True)
)

if 0 < dataset_percent < 1:
    df, _ = train_test_split(df, test_size=1-dataset_percent,
                             stratify=df["labels"], random_state=seed)

train_df, eval_test_df = train_test_split(
    df, train_size=train_percent, stratify=df["labels"], random_state=seed
)
eval_df, test_df = train_test_split(
    eval_test_df,
    test_size=test_percent / (test_percent + eval_percent),
    stratify=eval_test_df["labels"],
    random_state=seed,
)

raw_train_ds = Dataset.from_pandas(train_df, preserve_index=False)
raw_eval_ds  = Dataset.from_pandas(eval_df , preserve_index=False)
raw_test_ds  = Dataset.from_pandas(test_df , preserve_index=False)

print(f"Training Dataset   {((len(raw_train_ds)/len(df))*100):.2f}% {len(raw_train_ds)}")
print(f"Validation Dataset {((len(raw_eval_ds)/len(df))*100):.2f}% {len(raw_eval_ds)}")
print(f"Test Dataset       {((len(raw_test_ds)/len(df))*100):.2f}% {len(raw_test_ds)}")

Training Dataset   75.00% 69852
Validation Dataset 10.00% 9314
Test Dataset       15.00% 13971


In [7]:
# ───────────────────────────────────────────
#  Tokeniser & models
# ───────────────────────────────────────────
tokenizer = LongformerTokenizerFast.from_pretrained(tokenizer_name)
tokenizer.model_max_length = 4096          # keep full length for inference

fn_level_model  = LongformerForSequenceClassification.from_pretrained(fn_level_model_name, num_labels=2)
line_level_model = LongformerForTokenClassification.from_pretrained(line_level_model_name, num_labels=2)

# ── LoRA (PEFT) ──────────────────────────

lora_targets = ["query", "key", "value"]

fn_lora_cfg = LoraConfig(
    task_type       = TaskType.SEQ_CLS,     # sequence-level task
    r               = 8,
    lora_alpha      = 16,
    lora_dropout    = 0.05,
    target_modules  = lora_targets,
)

line_lora_cfg = LoraConfig(
    task_type       = TaskType.TOKEN_CLS,   # token-level task
    r               = 8,
    lora_alpha      = 16,
    lora_dropout    = 0.05,
    target_modules  = lora_targets,
)

fn_level_model   = get_peft_model(fn_level_model,  fn_lora_cfg)
line_level_model = get_peft_model(line_level_model, line_lora_cfg)

for m in (fn_level_model, line_level_model):
    for n, p in m.named_parameters():
        if ("lora_" not in n) and ("classifier" not in n):
            p.requires_grad_(False)

# ── Gradient-checkpointing ──────────────────
fn_level_model.gradient_checkpointing_enable()
line_level_model.gradient_checkpointing_enable()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def _with_global_attention(enc):
    if isinstance(enc["input_ids"][0], list):
        enc["global_attention_mask"] = [[1]+[0]*(len(ids)-1) for ids in enc["input_ids"]]
    elif isinstance(enc["input_ids"], torch.Tensor):
        g = torch.zeros_like(enc["input_ids"])
        g[..., 0] = 1
        enc["global_attention_mask"] = g
    else:
        enc["global_attention_mask"] = [1]+[0]*(len(enc["input_ids"])-1)
    return enc

def tokenize(batch):
    max_len = tokenizer_max_length if not use_tokenizer_max_length else tokenizer.model_max_length
    enc = tokenizer(batch["code"], padding="max_length", truncation=True, max_length=max_len)
    return _with_global_attention(enc)

fn_level_train_ds = raw_train_ds.map(tokenize, batched=True, remove_columns=["code"])
fn_level_eval_ds  = raw_eval_ds .map(tokenize, batched=True, remove_columns=["code"])
fn_level_test_ds  = raw_test_ds .map(tokenize, batched=True, remove_columns=["code"])

Map:   0%|          | 0/69852 [00:00<?, ? examples/s]

Map:   0%|          | 0/9314 [00:00<?, ? examples/s]

Map:   0%|          | 0/13971 [00:00<?, ? examples/s]

In [9]:
# ───────────────────────────────────────────
#  Class-weight for loss
# ───────────────────────────────────────────
maj, min_ = np.bincount(train_df.labels)
class_weights = torch.tensor([1.0, maj/min_], dtype=torch.float)

class WeightedCELossTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights.to(self.args.device)

    # NEW: accept **kwargs (or num_items_in_batch=None)
    def compute_loss(
        self,
        model,
        inputs,
        return_outputs: bool = False,
        **kwargs              # ← swallows num_items_in_batch and future additions
    ):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

In [10]:
accuracy_metric  = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric    = evaluate.load("recall")
f1_metric        = evaluate.load("f1")
mcc_metric       = evaluate.load("matthews_correlation")
auc_metric       = evaluate.load("roc_auc")

metrics_include_report = False

def _base_scores(preds, labels):
    return {
        "accuracy":  accuracy_metric .compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision_metric.compute(predictions=preds, references=labels)["precision"],
        "recall":    recall_metric   .compute(predictions=preds, references=labels)["recall"],
        "f1":        f1_metric       .compute(predictions=preds, references=labels)["f1"],
        "mcc":       mcc_metric      .compute(predictions=preds, references=labels)["matthews_correlation"],
    }

def _safe_auc(probs, labels):
    return (auc_metric.compute(prediction_scores=probs, references=labels)["roc_auc"]
            if len(np.unique(labels))==2 else float("nan"))

def fn_level_compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()

    if prob_threshold is None:                 # ← NEW
        preds = (logits.argmax(-1))            # argmax = class with max prob
    else:
        preds = (probs >= prob_threshold).astype(int)

    scores = _base_scores(preds, labels)
    scores["auc"] = _safe_auc(probs, labels)
    return scores

def test_model(trainer, test_dataset):
    global metrics_include_report
    metrics_include_report = True
    evaluation_results = trainer.evaluate(test_dataset)
    evaluation_df = pd.DataFrame([evaluation_results])
    evaluation_df.columns = evaluation_df.columns.str.replace('eval_', '')
    evaluation_df = evaluation_df.drop(["samples_per_second", "steps_per_second", "epoch", "runtime", "report", "loss"], axis=1, errors="ignore" )
    display(evaluation_df)
    print(evaluation_results["eval_report"])
    metrics_include_report = False

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [11]:
fn_level_trainer = WeightedCELossTrainer(
    class_weights,
    model=fn_level_model,
    args=fn_level_trainer_args,
    train_dataset=fn_level_train_ds,
    eval_dataset=fn_level_eval_ds,
    tokenizer=tokenizer,
    compute_metrics=fn_level_compute_metrics,
)

fn_level_trainer.train()
fn_level_trainer.save_model("fn-level-model")

  super().__init__(*args, **kwargs)
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Mcc,Auc
1,No log,0.654157,0.87041,0.130769,0.309795,0.183908,0.139917,0.719474
2,No log,0.647741,0.873631,0.133929,0.307517,0.186593,0.142676,0.725323
3,No log,0.646134,0.877174,0.136223,0.300683,0.1875,0.14325,0.726995




In [12]:
test_model(fn_level_trainer, fn_level_test_ds)

Unnamed: 0,accuracy,precision,recall,f1,mcc,auc
0,0.87152,0.135897,0.3217,0.191077,0.148383,0.730389


KeyError: 'eval_report'

In [None]:
def _zip_download(dir):
    if download_model:
        files.download(shutil.make_archive(dir, "zip", dir))

In [None]:
 _zip_download("fn-level-model")

In [None]:
def add_token_labels(example):
    code  = example["code"]; vuln = set(example["flaw_line_no"])
    max_len = tokenizer_max_length if not use_tokenizer_max_length else tokenizer.model_max_length
    enc = tokenizer(code, return_offsets_mapping=True, truncation=True, max_length=max_len, padding="max_length")
    enc = _with_global_attention(enc)
    labels = np.full(len(enc["input_ids"]), -100, np.int8)
    starts = [0]+[i+1 for i,c in enumerate(code) if c=="\n"]
    for idx,(beg,_) in enumerate(enc["offset_mapping"]):
        if beg==0 and idx==0: continue
        line = 1+sum(beg>=s for s in starts); labels[idx]=int(line in vuln)
    enc.pop("offset_mapping"); enc["labels"]=labels.tolist(); return enc

line_level_train_ds = raw_train_ds.map(add_token_labels, remove_columns=list(train_df.columns))
line_level_eval_ds  = raw_eval_ds .map(add_token_labels, remove_columns=list(train_df.columns))
line_level_test_ds  = raw_test_ds .map(add_token_labels, remove_columns=list(train_df.columns))



In [None]:
metrics_include_report = False

def line_level_metrics(eval_pred):
    logits, y = eval_pred
    probs  = torch.softmax(
                torch.tensor(logits.reshape(-1, 2)), dim=1)[:, 1].numpy()
    labels = y.flatten()
    mask   = labels != -100
    labels = labels[mask]

    if prob_threshold is None:                 # ← NEW
        preds = logits.reshape(-1, 2).argmax(-1)[mask]
    else:
        preds = (probs[mask] >= prob_threshold).astype(int)

    scores = _base_scores(preds, labels)
    scores["auc"] = _safe_auc(probs[mask], labels)
    return scores

In [None]:
line_level_trainer = Trainer(
    args=line_level_trainer_args,
    model=line_level_model,
    train_dataset=line_level_train_ds,
    eval_dataset=line_level_eval_ds,
    tokenizer=tokenizer,
    compute_metrics=line_level_metrics,
)

line_level_trainer.train()
line_level_trainer.save_model("line-level-model")

In [None]:
test_model(line_level_trainer, line_level_test_ds)

In [None]:
 _zip_download("line-level-model")

In [None]:
uploaded = files.upload()
for filename in uploaded.keys():
    if filename.endswith(".zip"):
        folder_name = filename.replace(".zip", "")
        os.makedirs(folder_name, exist_ok=True)
        !unzip -q "$filename" -d "$folder_name"

In [None]:
trained_fn_level_model   = LongformerForSequenceClassification.from_pretrained("./fn-level-model")
trained_line_level_model = LongformerForTokenClassification.from_pretrained("./line-level-model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = trained_fn_level_model.to(device).eval()
_ = trained_line_level_model.to(device).eval()

In [None]:
def get_vuln_lines(example):
    code = example["code"]

    # Function-level classification
    fn_inputs = tokenizer(code, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    fn_inputs = _with_global_attention(fn_inputs)
    fn_inputs = {k: v.to(device) for k, v in fn_inputs.items()}
    with torch.no_grad():
        fn_outputs = trained_fn_level_model(**fn_inputs)
        fn_probs = torch.softmax(fn_outputs.logits, dim=1)
    is_vulnerable = fn_probs[0, 1].item() > (
       0.5 if prob_threshold is None else prob_threshold)

    if not is_vulnerable:
        return {"vulnerable": False, "lines": []}

    # Line-level classification
    enc = tokenizer(code, return_offsets_mapping=True, return_tensors="pt",
                    truncation=True, padding="max_length", max_length=512)
    enc = _with_global_attention(enc)
    offset_mapping = enc.pop("offset_mapping")[0]
    enc = {k: v.to(device) for k, v in enc.items()}

    with torch.no_grad():
        line_outputs = trained_line_level_model(**enc)
    line_logits = line_outputs.logits
    line_preds = torch.argmax(line_logits, dim=-1)[0]  # shape: [seq_len]

    # Map tokens to line numbers
    lines = code.split('\n')
    line_start_positions = [0]
    for line in lines:
        line_start_positions.append(line_start_positions[-1] + len(line) + 1)

    line_indices = set()
    for idx, (start_offset, _) in enumerate(offset_mapping):
        if start_offset == 0 and idx == 0:  # [CLS] token
            continue
        if line_preds[idx].item() == 1:
            start = start_offset.item()
            line_no = 1 + sum(start >= pos for pos in line_start_positions)
            line_indices.add(line_no)

    return {"vulnerable": True, "lines": sorted(line_indices)}

def display_vulnerability_result(example, predicted_lines):
    code_lines = example["code"].split("\n")
    actual_lines = set(example.get("flaw_line_no", []))
    predicted_lines = set(predicted_lines)

    max_line_no_width = len(str(len(code_lines)))

    print(f"lines{sorted(actual_lines)} pred{sorted(predicted_lines)}")
    for i, line in enumerate(code_lines, start=1):
        line_no = str(i).rjust(max_line_no_width)
        actual_flag = "v" if i in actual_lines else " "
        predicted_flag = "p" if i in predicted_lines else " "
        print(f"{line_no} {actual_flag}{predicted_flag}|{line}")

small_vuln_examples = df[
    (df["labels"] == 1) &
    (df["code"].apply(lambda c: len(c.splitlines()) <= 10))  # max 7 lines
]
examples_to_test = small_vuln_examples.sample(n=5, random_state=seed).to_dict(orient="records")

for ex in examples_to_test:
    result = get_vuln_lines(ex)
    print(f"vuln {ex['labels']==1} pred {result['vulnerable']}")
    display_vulnerability_result(ex, result["lines"])
    print()