In [1]:
from datasets import load_dataset

ds = load_dataset("bstee615/bigvul")

train_set = ds["train"]
print("Number of examples in the training set:", len(train_set))
print("Features before modification:")
for key in train_set.features:
    print(f"{key}: {train_set.features[key]}")

train_set = train_set.remove_columns(["codeLink", "commit_id", "CVE Page", "lang", "commit_message", 'vul'])
print("\nFeatures after modification:")
for key in train_set.features:
    print(f"{key}: {train_set.features[key]}")




  from .autonotebook import tqdm as notebook_tqdm


Number of examples in the training set: 150908
Features before modification:
CVE ID: Value(dtype='string', id=None)
CVE Page: Value(dtype='string', id=None)
CWE ID: Value(dtype='string', id=None)
codeLink: Value(dtype='string', id=None)
commit_id: Value(dtype='string', id=None)
commit_message: Value(dtype='string', id=None)
func_after: Value(dtype='string', id=None)
func_before: Value(dtype='string', id=None)
lang: Value(dtype='string', id=None)
project: Value(dtype='string', id=None)
vul: Value(dtype='int8', id=None)

Features after modification:
CVE ID: Value(dtype='string', id=None)
CWE ID: Value(dtype='string', id=None)
func_after: Value(dtype='string', id=None)
func_before: Value(dtype='string', id=None)
project: Value(dtype='string', id=None)


In [2]:
# Remove entries with missing or empty CWE ID
train_set = train_set.filter(lambda example: example["CWE ID"] not in (None, "", "None"))

print("Number of examples in filtered set:", len(train_set))
unique_cwes = set(train_set['CWE ID'])
print(f"Number of unique CWE IDs: {len(unique_cwes)}")
for cwe in unique_cwes:
    print(cwe)

Number of examples in filtered set: 121933
Number of unique CWE IDs: 90
CWE-532
CWE-494
CWE-59
CWE-763
CWE-125
CWE-330
CWE-290
CWE-17
CWE-769
CWE-90
CWE-119
CWE-436
CWE-354
CWE-787
CWE-824
CWE-345
CWE-755
CWE-200
CWE-191
CWE-254
CWE-502
CWE-665
CWE-311
CWE-909
CWE-732
CWE-134
CWE-918
CWE-674
CWE-190
CWE-388
CWE-682
CWE-310
CWE-352
CWE-209
CWE-522
CWE-77
CWE-1021
CWE-664
CWE-426
CWE-327
CWE-835
CWE-400
CWE-369
CWE-617
CWE-706
CWE-476
CWE-94
CWE-284
CWE-320
CWE-346
CWE-255
CWE-834
CWE-416
CWE-285
CWE-16
CWE-252
CWE-704
CWE-772
CWE-269
CWE-287
CWE-19
CWE-611
CWE-120
CWE-22
CWE-668
CWE-693
CWE-770
CWE-264
CWE-399
CWE-362
CWE-189
CWE-129
CWE-78
CWE-862
CWE-281
CWE-20
CWE-74
CWE-79
CWE-18
CWE-361
CWE-89
CWE-358
CWE-93
CWE-404
CWE-415
CWE-601
CWE-347
CWE-754
CWE-172
CWE-295


In [3]:
unique_projects = set(train_set['project'])
print(f"Number of unique projects: {len(unique_projects)}")
print("Unique projects:")
projects_list = list(unique_projects)
for project in unique_projects:
    print(project)

Number of unique projects: 298
Unique projects:
libXrandr
savannah
OpenSC
gpac
pdfresurrect
libvips
file
rawstudio
zlib
util-linux
curl
libvirt
bzrtp
libx11
libbsd
libmysofa
php-src
ioq3
xcursor
collectd
jerryscript
rdesktop
uwsgi
oniguruma
yubico-pam
libxfont
ngiflib
libreport
enlightment
Openswan
lighttpd1.4
memcached
mod_auth_mellon
leptonica
libtomcrypt
torque
ntp
sleuthkit
ppp
libXtst
zfs
moodle
tartarus
libxsmm
nedmalloc
unixODBC
xserver
unrealircd
MAC-Telnet
libxml2
postgres
das_watchdog
neomutt
capstone
flatpak
httpd
gifsicle
gstreamer
udisks
busybox
pgbouncer
aircrack-ng
redis
sgminer
harfbuzz
mapserver
drm
fontforge
ovs
libevt
libplist
texlive-source
charybdis
pngquant
lysator
linux-nfs
musl
FFmpeg
bdwgc
mruby
php
uriparser
libevent
openvpn
WavPack
libxkbcommon
liblouis
proftpd
media-tree
mod_auth_openidc
pixman
mstdlib
exfat
jabberd2
jasper
wpitchoune
libXi
weechat
feh
libu2f-host
pam-u2f
faad2
gnome-session
libpcap
shibboleth
Little-CMS
bubblewrap
libpng
libimobiledevice
al

In [4]:
from pydantic import BaseModel, Field
class ProjectResponse(BaseModel):
    applicable_projects: list[str] = Field(
        ...,
        description="List of projects that are applicable in automotive context."
    )
    not_applicable_projects: list[str] = Field(
        ...,
        description="List of projects that are not applicable in automotive context."
    )
    justification_per_project: list[str] = Field(
        ...,
        description="Justification for each project regarding its applicability in automotive context."
    )

def get_projects_prompt(projects_list_slice):
    prompt = f"""
You are an expert in automotive software development.
Based on the provided list of project names, use your expertise to analyze the projects and determine their use and applicability in the automotive context.
Applicability can be in different areas such as safety, security, performance, or compliance with automotive standards.
For example, linux is applicable in automotive context as it is used in many ECUs as a an operating system.
Strongswan is applicable in automotive context as it is used for secure communication in vehicle networks.
Project needs to be applicable in the on-board context, not in the cloud, server, or simulation context. In short, the project should be used in the vehicle codebase.
Please provide a detailed analysis of each project, including whether it is applicable or not, and the reasons for your assessment.
Keep justification short and consice, ideally 1-2 sentences per project.
Here is the list of projects:
{'\n'.join(projects_list_slice)}"""
    return prompt

In [6]:
from pydantic import BaseModel, Field
from google import genai

with open('token', 'r') as file:
    token = file.read().strip()
    
client = genai.Client(api_key=token)

def determine_applicability_of_projects(prompt: str) -> ProjectResponse | None:
    """Determine if the vulnerability is applicable to automotive context."""
    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-05-20",
            contents=prompt,
            config={
                "response_mime_type": "application/json",
                "response_schema": ProjectResponse,
            },
        )
        # if server error, return empty list
    except genai.errors.ServerError as e:
        print(f"Server error: {e}")
        return None
    return response.parsed

In [7]:
print(f'Length of projects list: {len(projects_list)}')
projects_list_slice = projects_list[:10]  # Adjust the slice size as needed

response = determine_applicability_of_projects(get_projects_prompt(projects_list_slice))

def print_response(response: ProjectResponse | None):
    """Print the response from the model."""
    if response:
        print("Applicable projects:", response.applicable_projects)
        print("Not applicable projects:", response.not_applicable_projects)
        for justification in response.justification_per_project:
            print("Justification:", justification)
    else:
        print("No response received or an error occurred.")

print_response(response)

Length of projects list: 298
Applicable projects: ['OpenSC', 'gpac', 'libvips', 'file', 'zlib', 'util-linux']
Not applicable projects: ['libXrandr', 'savannah', 'pdfresurrect', 'rawstudio']
Justification: libXrandr: Primarily for X Window System display management, which is not a common or direct component of embedded automotive systems.
Justification: savannah: A project hosting and management platform, not a software component for on-board vehicle systems.
Justification: OpenSC: Could be applicable for secure authentication, digital key management, or secure element interaction in automotive systems.
Justification: gpac: A multimedia framework capable of handling various formats, relevant for in-vehicle infotainment (IVI) systems for audio and video playback.
Justification: pdfresurrect: A forensic tool for recovering data from PDF files, which has no direct use in an on-board automotive system.
Justification: libvips: A fast image processing library, applicable for camera systems, A

In [8]:
from time import sleep
import json

chunk_step = 50
applicable = []
not_applicable = []
justifications = []
# Process the projects in chunks
for i in range(0, len(projects_list), chunk_step):
    projects_list_slice = projects_list[i:i + chunk_step]
    print(f"Processing projects {i} to {i + chunk_step}...")
    while True:
        response = determine_applicability_of_projects(get_projects_prompt(projects_list_slice))
        if response:
            print(f"Processed projects {i} to {i + chunk_step}")
            print(f'More applicable projects: {len(response.applicable_projects)}')
            print(f'More not applicable projects: {len(response.not_applicable_projects)}')
            print(f'Appending {len(response.applicable_projects)} applicable projects and {len(response.not_applicable_projects)} not applicable projects.')
            applicable += response.applicable_projects
            not_applicable += response.not_applicable_projects
            justifications += response.justification_per_project
            print("Saving current results to file...")
            results = {
                "applicable_projects": applicable,
                "not_applicable_projects": not_applicable,
                "justifications": justifications
            }
            with open('applicable_projects.json', 'w') as f:
                json.dump(results, f, indent=4)
            print("Results saved successfully.")
            break
        else:
            print(f"Retrying for projects {i} to {i + chunk_step} due to server error...")
            sleep(5)



Processing projects 0 to 50...
Processed projects 0 to 50
More applicable projects: 20
More not applicable projects: 30
Appending 20 applicable projects and 30 not applicable projects.
Saving current results to file...
Results saved successfully.
Processing projects 50 to 100...
Processed projects 50 to 100
More applicable projects: 22
More not applicable projects: 28
Appending 22 applicable projects and 28 not applicable projects.
Saving current results to file...
Results saved successfully.
Processing projects 100 to 150...
Processed projects 100 to 150
More applicable projects: 20
More not applicable projects: 30
Appending 20 applicable projects and 30 not applicable projects.
Saving current results to file...
Results saved successfully.
Processing projects 150 to 200...
Processed projects 150 to 200
More applicable projects: 15
More not applicable projects: 35
Appending 15 applicable projects and 35 not applicable projects.
Saving current results to file...
Results saved successful

In [9]:
with open('applicable_projects.json', 'r') as f:
    projects_applicability_data = json.load(f)

stats = {
    "total_applicable": len(projects_applicability_data["applicable_projects"]),
    "total_not_applicable": len(projects_applicability_data["not_applicable_projects"]),
}

print("Total applicable projects:", stats["total_applicable"])
print("Total not applicable projects:", stats["total_not_applicable"])
# Print the first 10 applicable projects
print("\nFirst 10 applicable projects:")
for project in projects_applicability_data["applicable_projects"][:10]:
    print(project)
# Print the first 10 not applicable projects
print("\nFirst 10 not applicable projects:")
for project in projects_applicability_data["not_applicable_projects"][:10]:
    print(project)

Total applicable projects: 114
Total not applicable projects: 183

First 10 applicable projects:
gpac
libvips
file
zlib
util-linux
curl
libmysofa
collectd
jerryscript
oniguruma

First 10 not applicable projects:
libXrandr
savannah
OpenSC
pdfresurrect
rawstudio
libvirt
bzrtp
libx11
libbsd
php-src


In [10]:
# Filter the dataset based on the applicability of projects
applicable_projects_set = set(projects_applicability_data["applicable_projects"])

print("Number of applicable projects:", len(applicable_projects_set))
print("size before filtering:", len(train_set))
project_filtered_train_set = train_set.filter(lambda example: example["project"] in applicable_projects_set)

print("size after filtering:", len(project_filtered_train_set))

Number of applicable projects: 114
size before filtering: 121933


Filter: 100%|██████████| 121933/121933 [00:03<00:00, 34030.99 examples/s]

size after filtering: 53658



