In [1]:
from datasets import load_from_disk
from pathlib import Path

RAW_PATH = Path("../data/raw/") / "bigvul.raw"

ds = load_from_disk(RAW_PATH)

train_set = ds["train"]
validation_set = ds["validation"]
test_set = ds["test"]

print(f"Size of train {len(train_set)}")
print(f"Size of validation {len(validation_set)}")
print(f"Size of test set {len(test_set)}")


  from .autonotebook import tqdm as notebook_tqdm


Size of train 150908
Size of validation 33049
Size of test set 33050


In [2]:
# Stripping columns
train_set = train_set.remove_columns(["codeLink", "commit_id", "CVE Page", "lang", "commit_message", 'vul'])
validation_set = validation_set.remove_columns(["codeLink", "commit_id", "CVE Page", "lang", "commit_message", 'vul'])
test_set = test_set.remove_columns(["codeLink", "commit_id", "CVE Page", "lang", "commit_message", 'vul'])

In [3]:
# Remove entries with missing or empty CWE ID
train_set = train_set.filter(lambda example: example["CWE ID"] not in (None, "", "None"))
validation_set = validation_set.filter(lambda example: example["CWE ID"] not in (None, "", "None"))
test_set = test_set.filter(lambda example: example["CWE ID"] not in (None, "", "None"))

print(f"Size of train {len(train_set)}")
print(f"Size of validation {len(validation_set)}")
print(f"Size of test set {len(test_set)}")

Size of train 121933
Size of validation 26739
Size of test set 26626


In [4]:
# Classification of applicable projects


from pydantic import BaseModel, Field
from google import genai

class ProjectResponse(BaseModel):
    applicable_projects: list[str] = Field(
        ...,
        description="List of projects that are applicable in automotive context."
    )
    not_applicable_projects: list[str] = Field(
        ...,
        description="List of projects that are not applicable in automotive context."
    )
    justification_per_project: list[str] = Field(
        ...,
        description="Justification for each project regarding its applicability in automotive context."
    )


def get_projects_prompt(projects_list_slice):
    prompt = f"""
You are an expert in automotive software development.
Based on the provided list of project names, use your expertise to analyze the projects and determine their use and applicability in the automotive context.
Applicability can be in different areas such as safety, security, performance, or compliance with automotive standards.
For example, linux is applicable in automotive context as it is used in many ECUs as a an operating system.
Strongswan is applicable in automotive context as it is used for secure communication in vehicle networks.
Project needs to be applicable in the on-board context, not in the cloud, server, or simulation context. In short, the project should be used in the vehicle codebase.
Please provide a detailed analysis of each project, including whether it is applicable or not, and the reasons for your assessment.
Keep justification short and consice, ideally 1-2 sentences per project.
Here is the list of projects:
{'\n'.join(projects_list_slice)}"""
    return prompt


with open('token', 'r') as file:
    token = file.read().strip()
    

client = genai.Client(api_key=token)


def determine_applicability_of_projects(prompt: str) -> ProjectResponse | None:
    """Determine if the vulnerability is applicable to automotive context."""
    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-05-20",
            contents=prompt,
            config={
                "response_mime_type": "application/json",
                "response_schema": ProjectResponse,
            },
        )
        # if server error, return empty list
    except genai.errors.ServerError as e:
        print(f"Server error: {e}")
        return None
    return response.parsed


from time import sleep
import json
from pathlib import Path

APPL_PROJECTS_PATH = Path("../data/meta/") / "applicable_projects.json"

unique_projects = set(train_set['project'])
projects_list = list(unique_projects)

def classify_projects_in_chunks():
    chunk_step = 50
    applicable = []
    not_applicable = []
    justifications = []
    # Process the projects in chunks
    for i in range(0, len(projects_list), chunk_step):
        projects_list_slice = projects_list[i:i + chunk_step]
        print(f"Processing projects {i} to {i + chunk_step}...")
        while True:
            response = determine_applicability_of_projects(get_projects_prompt(projects_list_slice))
            if response:
                print(f"Processed projects {i} to {i + chunk_step}")
                print(f'More applicable projects: {len(response.applicable_projects)}')
                print(f'More not applicable projects: {len(response.not_applicable_projects)}')
                print(f'Appending {len(response.applicable_projects)} applicable projects and {len(response.not_applicable_projects)} not applicable projects.')
                applicable += response.applicable_projects
                not_applicable += response.not_applicable_projects
                justifications += response.justification_per_project
                print("Saving current results to file...")
                results = {
                    "applicable_projects": applicable,
                    "not_applicable_projects": not_applicable,
                    "justifications": justifications
                }
                with open(APPL_PROJECTS_PATH, 'w') as f:
                    json.dump(results, f, indent=4)
                print("Results saved successfully.")
                break
            else:
                print(f"Retrying for projects {i} to {i + chunk_step} due to server error...")
                sleep(5)


with open(APPL_PROJECTS_PATH, 'r') as f:
    projects_applicability_data = json.load(f)

if not projects_applicability_data:
    print("Projects have not been classified yet. Doing it now.")
    classify_projects_in_chunks()
    with open(APPL_PROJECTS_PATH, 'r') as f:
        projects_applicability_data = json.load(f)

stats = {
    "total_applicable": len(projects_applicability_data["applicable_projects"]),
    "total_not_applicable": len(projects_applicability_data["not_applicable_projects"]),
}

print("Total applicable projects:", stats["total_applicable"])
print("Total not applicable projects:", stats["total_not_applicable"])
# Print the first 10 applicable projects
print("\nFirst 10 applicable projects:")
for project in projects_applicability_data["applicable_projects"][:10]:
    print(project)
# Print the first 10 not applicable projects
print("\nFirst 10 not applicable projects:")
for project in projects_applicability_data["not_applicable_projects"][:10]:
    print(project)

Total applicable projects: 114
Total not applicable projects: 183

First 10 applicable projects:
gpac
libvips
file
zlib
util-linux
curl
libmysofa
collectd
jerryscript
oniguruma

First 10 not applicable projects:
libXrandr
savannah
OpenSC
pdfresurrect
rawstudio
libvirt
bzrtp
libx11
libbsd
php-src


In [5]:
# Filter the dataset based on the applicability of projects
applicable_projects_set = set(projects_applicability_data["applicable_projects"])



train_set = train_set.filter(lambda example: example["project"] in applicable_projects_set)
validation_set = validation_set.filter(lambda example: example["project"] in applicable_projects_set)
test_set = test_set.filter(lambda example: example["project"] in applicable_projects_set)

print(f"Size of train {len(train_set)}")
print(f"Size of validation {len(validation_set)}")
print(f"Size of test set {len(test_set)}")

Size of train 53658
Size of validation 12445
Size of test set 12304


In [6]:
from datasets import DatasetDict
from pathlib import Path

# Assuming you filtered train, test, validation separately
qualified_dataset = DatasetDict({
    "train": train_set,
    "test": validation_set,
    "validation": test_set,
})

PATH_TO_QUALIFIED = Path("../data/processed/") / "with_cwe_projects_qualified"

ds.save_to_disk(PATH_TO_QUALIFIED)

Saving the dataset (1/1 shards): 100%|██████████| 150908/150908 [00:00<00:00, 282286.60 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33049/33049 [00:00<00:00, 145889.10 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33050/33050 [00:00<00:00, 260011.01 examples/s]
