In [None]:
from datasets import load_dataset

ds = load_dataset("bstee615/bigvul")

train_set = ds["train"]
print("Number of examples in the training set:", len(train_set))
print("Features before modification:")
for key in train_set.features:
    print(f"{key}: {train_set.features[key]}")

train_set = train_set.remove_columns(["codeLink", "commit_id", "CVE Page", "lang", "commit_message", 'vul'])
print("\nFeatures after modification:")
for key in train_set.features:
    print(f"{key}: {train_set.features[key]}")




Number of examples in the training set: 150908
Features before modification:
CVE ID: Value(dtype='string', id=None)
CVE Page: Value(dtype='string', id=None)
CWE ID: Value(dtype='string', id=None)
codeLink: Value(dtype='string', id=None)
commit_id: Value(dtype='string', id=None)
commit_message: Value(dtype='string', id=None)
func_after: Value(dtype='string', id=None)
func_before: Value(dtype='string', id=None)
lang: Value(dtype='string', id=None)
project: Value(dtype='string', id=None)
vul: Value(dtype='int8', id=None)

Features after modification:
CVE ID: Value(dtype='string', id=None)
CWE ID: Value(dtype='string', id=None)
func_after: Value(dtype='string', id=None)
func_before: Value(dtype='string', id=None)
project: Value(dtype='string', id=None)


In [None]:
# Remove entries with missing or empty CWE ID
train_set = train_set.filter(lambda example: example["CWE ID"] not in (None, "", "None"))

print("Number of examples in filtered set:", len(train_set))
unique_cwes = set(train_set['CWE ID'])
print(f"Number of unique CWE IDs: {len(unique_cwes)}")
for cwe in unique_cwes:
    print(cwe)

Filter: 100%|██████████| 121933/121933 [00:02<00:00, 46263.80 examples/s]


Number of examples in filtered set: 121933
Number of unique CWE IDs: 90
CWE-399
CWE-664
CWE-89
CWE-209
CWE-320
CWE-77
CWE-416
CWE-824
CWE-269
CWE-862
CWE-19
CWE-674
CWE-120
CWE-352
CWE-290
CWE-287
CWE-129
CWE-330
CWE-611
CWE-172
CWE-770
CWE-285
CWE-254
CWE-400
CWE-200
CWE-755
CWE-18
CWE-369
CWE-834
CWE-772
CWE-93
CWE-665
CWE-134
CWE-327
CWE-436
CWE-918
CWE-78
CWE-682
CWE-415
CWE-732
CWE-346
CWE-404
CWE-362
CWE-22
CWE-345
CWE-787
CWE-358
CWE-189
CWE-191
CWE-125
CWE-94
CWE-16
CWE-706
CWE-284
CWE-388
CWE-532
CWE-494
CWE-1021
CWE-90
CWE-361
CWE-190
CWE-693
CWE-668
CWE-20
CWE-119
CWE-311
CWE-59
CWE-255
CWE-909
CWE-74
CWE-502
CWE-79
CWE-354
CWE-426
CWE-252
CWE-835
CWE-754
CWE-601
CWE-264
CWE-617
CWE-704
CWE-17
CWE-310
CWE-522
CWE-347
CWE-281
CWE-763
CWE-295
CWE-769
CWE-476


In [26]:
unique_projects = set(train_set['project'])
print(f"Number of unique projects: {len(unique_projects)}")
print("Unique projects:")
projects_list = list(unique_projects)
for project in unique_projects:
    print(project)

Number of unique projects: 298
Unique projects:
torque
rpm
memcached
libmspack
ioq3
aircrack-ng
strongswan
libuv
proxychains-ng
moodle
netfilter
libfep
musl
gpmf-parser
gnome-session
bubblewrap
Onigmo
unixODBC
minisphere
exempi
flatpak
knc
quagga
git
libcomps
LibRaw-demosaic-pack-GPL2
fontconfig
oniguruma
mongoose-os
libxsmm
feh
mujs
jabberd2
libu2f-host
openjpeg
shibboleth
ImageMagick6
yubico-pam
nmap
rufus
iperf
libx11
pngquant
tinc
libinfinity
libming
FFmpeg
hexchat
libxml2
NetworkManager
exim
jansson
ntp
Espruino
openssh-portable
cups
opa-ff
xserver
libXfixes
nagioscore
pigz
mapserver
gpac
httpd
xcursor
tnef
gnulib
util-linux
imageworsener
libetpan
postgres
Android
launchpad
wildmidi
PDFGen
infradead
lynx-snapshots
pupnp-code
libevt
qemu
busybox
radvd
kamailio
harfbuzz
squashfs-tools
dbus
cgminer
libmodbus
media-tree
nfdump
mbedtls
ovs
rdesktop
quassel
yara
poppler
udisks
profanity
lxde
libXpm
thor
libXv
pure-ftpd
charybdis
shadowsocks-libev
libgd
capstone
libevent
suricata
polarss

In [51]:
from pydantic import BaseModel, Field
class ProjectResponse(BaseModel):
    applicable_projects: list[str] = Field(
        ...,
        description="List of projects that are applicable in automotive context."
    )
    not_applicable_projects: list[str] = Field(
        ...,
        description="List of projects that are not applicable in automotive context."
    )
    justification_per_project: list[str] = Field(
        ...,
        description="Justification for each project regarding its applicability in automotive context."
    )

def get_projects_prompt(projects_list_slice):
    prompt = f"""
You are an expert in automotive software development.
Based on the provided list of project names, use your expertise to analyze the projects and determine their use and applicability in the automotive context.
Applicability can be in different areas such as safety, security, performance, or compliance with automotive standards.
For example, linux is applicable in automotive context as it is used in many ECUs as a an operating system.
Strongswan is applicable in automotive context as it is used for secure communication in vehicle networks.
Project needs to be applicable in the on-board context, not in the cloud, server, or simulation context. In short, the project should be used in the vehicle codebase.
Please provide a detailed analysis of each project, including whether it is applicable or not, and the reasons for your assessment.
Keep justification short and consice, ideally 1-2 sentences per project.
Here is the list of projects:
{'\n'.join(projects_list_slice)}"""
    return prompt

In [None]:
from pydantic import BaseModel, Field
from google import genai

with open('token', 'r') as file:
    token = file.read().strip()
    
client = genai.Client(api_key=token)

def determine_applicability_of_projects(prompt: str) -> ProjectResponse | None:
    """Determine if the vulnerability is applicable to automotive context."""
    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-05-20",
            contents=prompt,
            config={
                "response_mime_type": "application/json",
                "response_schema": ProjectResponse,
            },
        )
        # if server error, return empty list
    except genai.errors.ServerError as e:
        print(f"Server error: {e}")
        return None
    return response.parsed

In [55]:
print(f'Length of projects list: {len(projects_list)}')
projects_list_slice = projects_list[:10]  # Adjust the slice size as needed

response = determine_applicability_of_projects(get_projects_prompt(projects_list_slice))

def print_response(response: ProjectResponse | None):
    """Print the response from the model."""
    if response:
        print("Applicable projects:", response.applicable_projects)
        print("Not applicable projects:", response.not_applicable_projects)
        for justification in response.justification_per_project:
            print("Justification:", justification)
    else:
        print("No response received or an error occurred.")

print_response(response)

Length of projects list: 298
Applicable projects: ['strongswan', 'libuv']
Not applicable projects: ['torque', 'rpm', 'memcached', 'libmspack', 'ioq3', 'aircrack-ng', 'proxychains-ng', 'moodle']
Justification: HPC job scheduler, not relevant for embedded automotive systems.
Justification: Package management system, generally not used for runtime in on-board vehicle software.
Justification: Distributed memory caching system, primarily for server environments, not typical for embedded vehicle ECUs.
Justification: Library for Microsoft compression formats, not relevant for on-board automotive system functionality.
Justification: Open-source game engine, not applicable for automotive vehicle control or core functions.
Justification: Wireless network auditing tool, used for security testing, not integrated into on-board vehicle software.
Justification: IPsec-based VPN solution, critical for secure communication in connected vehicle networks (e.g., V2X, telematics, inter-ECU).
Justification: 

In [None]:
from time import sleep
import json

chunk_step = 50
applicable = []
not_applicable = []
justifications = []
# Process the projects in chunks
for i in range(0, len(projects_list), chunk_step):
    projects_list_slice = projects_list[i:i + chunk_step]
    print(f"Processing projects {i} to {i + chunk_step}...")
    while True:
        response = determine_applicability_of_projects(get_projects_prompt(projects_list_slice))
        if response:
            print(f"Processed projects {i} to {i + chunk_step}")
            print(f'More applicable projects: {len(response.applicable_projects)}')
            print(f'More not applicable projects: {len(response.not_applicable_projects)}')
            print(f'Appending {len(response.applicable_projects)} applicable projects and {len(response.not_applicable_projects)} not applicable projects.')
            applicable += response.applicable_projects
            not_applicable += response.not_applicable_projects
            justifications += response.justification_per_project
            print("Saving current results to file...")
            results = {
                "applicable_projects": applicable,
                "not_applicable_projects": not_applicable,
                "justifications": justifications
            }
            with open('applicable_projects.json', 'w') as f:
                json.dump(results, f, indent=4)
            print("Results saved successfully.")
            break
        else:
            print(f"Retrying for projects {i} to {i + chunk_step} due to server error...")
            sleep(5)



Processing projects 0 to 50...
Processed projects 0 to 50
More applicable projects: 19
More not applicable projects: 31
Appending 19 applicable projects and 31 not applicable projects.
Saving current results to file...
Results saved successfully.
Processing projects 50 to 100...
Processed projects 50 to 100
More applicable projects: 23
More not applicable projects: 27
Appending 23 applicable projects and 27 not applicable projects.
Saving current results to file...
Results saved successfully.
Processing projects 100 to 150...
Processed projects 100 to 150
More applicable projects: 16
More not applicable projects: 34
Appending 16 applicable projects and 34 not applicable projects.
Saving current results to file...
Results saved successfully.
Processing projects 150 to 200...
Processed projects 150 to 200
More applicable projects: 27
More not applicable projects: 23
Appending 27 applicable projects and 23 not applicable projects.
Saving current results to file...
Results saved successful

In [63]:
with open('applicable_projects.json', 'r') as f:
    projects_applicability_data = json.load(f)

stats = {
    "total_applicable": len(projects_applicability_data["applicable_projects"]),
    "total_not_applicable": len(projects_applicability_data["not_applicable_projects"]),
}

print("Total applicable projects:", stats["total_applicable"])
print("Total not applicable projects:", stats["total_not_applicable"])
# Print the first 10 applicable projects
print("\nFirst 10 applicable projects:")
for project in projects_applicability_data["applicable_projects"][:10]:
    print(project)
# Print the first 10 not applicable projects
print("\nFirst 10 not applicable projects:")
for project in projects_applicability_data["not_applicable_projects"][:10]:
    print(project)

Total applicable projects: 168
Total not applicable projects: 230

First 10 applicable projects:
strongswan
libuv
netfilter
musl
bubblewrap
Onigmo
quagga
LibRaw-demosaic-pack-GPL2
fontconfig
oniguruma

First 10 not applicable projects:
torque
rpm
memcached
libmspack
ioq3
aircrack-ng
proxychains-ng
moodle
libfep
gpmf-parser


In [64]:
# Filter the dataset based on the applicability of projects
applicable_projects_set = set(projects_applicability_data["applicable_projects"])

print("Number of applicable projects:", len(applicable_projects_set))
print("size before filtering:", len(train_set))
project_filtered_train_set = train_set.filter(lambda example: example["project"] in applicable_projects_set)

print("size after filtering:", len(project_filtered_train_set))

Number of applicable projects: 135
size before filtering: 121933


Filter: 100%|██████████| 121933/121933 [00:03<00:00, 31436.95 examples/s]

size after filtering: 95263



