In [None]:
!pip install datasets==2.16

In [27]:
import os
import urllib.request
import zipfile

import matplotlib.pyplot as plt
import numpy as np
from datasets import load_dataset
from transformers import RobertaTokenizerFast

In [28]:
bugvul_zip_url = "https://raw.githubusercontent.com/Meerschwein/Automating-SE/refs/heads/main/Big-Vul-dataset.zip"
zip_path = "Big-Vul-dataset.zip"
extract_dir = "Big-Vul-dataset"
data_path = "Big-Vul-dataset/data.json"

# Fetch the archive only when it is not already on disk. The download is
# written under a temporary name and renamed once it has finished, so an
# interrupted transfer cannot leave a truncated archive that the existence
# check would accept on the next run.
if not os.path.exists(zip_path):
    partial_zip_path = zip_path + ".part"
    urllib.request.urlretrieve(bugvul_zip_url, partial_zip_path)
    os.replace(partial_zip_path, zip_path)

# Key the extraction on the data file itself rather than on the directory:
# a run interrupted before data.json was written is then repeated instead of
# being skipped because the (incomplete) directory already exists.
if not os.path.exists(data_path):
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)

In [29]:
# Load the extracted JSON file through the generic "json" builder and select
# the "train" split that the data_files mapping assigns it to.
ds = load_dataset(
    path="json",
    data_files={"train": data_path},
    split="train",
)

In [36]:
# Tokenizer loaded from the "neulab/codebert-cpp" checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "neulab/codebert-cpp",
)

In [None]:
def count_tokens(batch):
    """Return the token count of every code snippet in a batch.

    Args:
        batch: Mapping with a "code" entry holding a list of source strings.

    Returns:
        A dict with a single "token_count" list, one length per snippet,
        in the same order as ``batch["code"]``.
    """
    # No truncation or padding, so each length reflects the full snippet.
    encoded = tokenizer(batch["code"], truncation=False, padding=False)
    lengths = list(map(len, encoded["input_ids"]))
    return {"token_count": lengths}

# Attach a "token_count" column, computed batch-wise by count_tokens.
ds = ds.map(count_tokens, batched=True)

# Split by label. input_columns hands the predicate just the "vul" value
# instead of a dict holding every column of the example.
vuln_ds = ds.filter(lambda vul: vul == 1, input_columns=["vul"])
non_vuln_ds = ds.filter(lambda vul: vul == 0, input_columns=["vul"])


In [None]:
def analyze(name, dataset, thresholds=(512, 1024, 2048, 4096, 8192)):
    """Print summary statistics of the per-example token counts of a dataset.

    Args:
        name: Label shown in the section header.
        dataset: Object that exposes its token counts through
            ``dataset["token_count"]`` and supports ``len()``.
        thresholds: Token limits for which the number and share of examples
            at or below the limit are reported. Defaults to the limits the
            notebook originally hard-coded.

    Returns:
        None. All results are written to stdout.
    """
    counts = np.asarray(dataset["token_count"])
    print(f"\n=== {name} ===")
    print(f"Total examples: {len(dataset)}")

    # An empty array makes max() raise ValueError and turns the coverage
    # percentage into a division by zero, so stop after the header.
    if counts.size == 0:
        print("No examples to analyze.")
        return

    print(f"Mean tokens: {counts.mean():.1f}")
    print(f"Median tokens: {np.median(counts):.1f}")
    print(f"Max tokens: {counts.max()}")

    print("\nCumulative Token Coverage:")
    for threshold in thresholds:
        # Inclusive bound: an example with exactly `threshold` tokens counts.
        num_under = (counts <= threshold).sum()
        percent_under = (num_under / len(counts)) * 100
        print(f"<= {threshold:4} tokens: {num_under:6} examples ({percent_under:.1f}%)")

# Report the same statistics for the full dataset and for each label subset.
for label, subset in (
    ("All", ds),
    ("Vulnerable", vuln_ds),
    ("Non-Vulnerable", non_vuln_ds),
):
    analyze(label, subset)


=== All ===
Total examples: 186530
Mean tokens: 394.9
Median tokens: 167.0
Max tokens: 143833

Cumulative Token Coverage:
≤  512 tokens: 153318 examples (82.2%)
≤ 1024 tokens: 172939 examples (92.7%)
≤ 2048 tokens: 181935 examples (97.5%)
≤ 4096 tokens: 185064 examples (99.2%)
≤ 8192 tokens: 186081 examples (99.8%)

=== Vulnerable ===
Total examples: 8794
Mean tokens: 1094.8
Median tokens: 418.0
Max tokens: 55807

Cumulative Token Coverage:
≤  512 tokens:   5036 examples (57.3%)
≤ 1024 tokens:   6729 examples (76.5%)
≤ 2048 tokens:   7802 examples (88.7%)
≤ 4096 tokens:   8357 examples (95.0%)
≤ 8192 tokens:   8619 examples (98.0%)

=== Non-Vulnerable ===
Total examples: 177736
Mean tokens: 360.3
Median tokens: 161.0
Max tokens: 143833

Cumulative Token Coverage:
≤  512 tokens: 148282 examples (83.4%)
≤ 1024 tokens: 166210 examples (93.5%)
≤ 2048 tokens: 174133 examples (98.0%)
≤ 4096 tokens: 176707 examples (99.4%)
≤ 8192 tokens: 177462 examples (99.8%)
