In [None]:
# Configuration
from pathlib import Path
import json

# Data directories, relative to the notebook's working directory
RAW_DATA_DIR = Path("../data/raw")
PROCESSED_DATA_DIR = Path("../data/processed")

# Read the CWE metadata file into `hash_map`; later cells look entries up by
# CWE-ID string (e.g. hash_map["122"]).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open(RAW_DATA_DIR / "cwec_metadata.json", "r", encoding="utf-8") as f:
    hash_map = json.load(f)

# Containers filled further down:
#   examples           -> {'vulnerable_code', 'fixed_code'} dicts used as prompt examples
#   json_out_structure -> full records that are written to the JSON output file
examples = []
json_out_structure = []


# --- Generation settings ---------------------------------------------------

# CWE-ID (as a string) for which code pairs are generated
cwe_id = "122"

# Total number of code pairs that should exist once generation has run
target_pairs_number = 10

# When True, every generated pair is printed while it is produced
verbose_generation = True

# --- Gemini credentials ----------------------------------------------------

# File that holds the Gemini API token
TOKEN_PATH = Path("token")

# Load the token, dropping surrounding whitespace such as a trailing newline
token = TOKEN_PATH.read_text().strip()

# Prepare / read the JSON output file
# Loads existing data or initializes a new structure
output_file_path = RAW_DATA_DIR / f"cwe_{cwe_id}_code_pairs.json"
if output_file_path.exists():
    with open(output_file_path, "r") as f:
        existing_text = f.read()
    try:
        json_out_structure = json.loads(existing_text)
    except json.JSONDecodeError as err:
        # Fall back to an empty structure, but do not do so silently when the
        # file actually had content: the generation step rewrites this file,
        # so whatever is in it now would be lost without any notice.
        json_out_structure = []
        if existing_text.strip():
            print(
                f"Warning: {output_file_path} is not valid JSON ({err}); "
                "starting from an empty list. The file will be overwritten "
                "when the next code pair is saved."
            )
else:
    # create the file if it does not exist
    with open(output_file_path, "w") as f:
        json.dump([], f, indent=2, ensure_ascii=False)
    print(f"File {output_file_path} does not exist, creating...")


# Populate examples from the loaded JSON file (only the two code fields are
# needed as prompt examples)
examples.extend(
    {'vulnerable_code': item['vulnerable_code'], 'fixed_code': item['fixed_code']}
    for item in json_out_structure
)


In [None]:
# Prompt generation function
def generate_prompt(cwe_id: str, examples: list[dict] | None = None) -> str:
    prompt = f"""
You are a professional vulnerability researcher specializing in automotive software security.

Your task is to generate a C or C++ code pair demonstrating a security flaw and its corresponding fix, based on the provided CWE entry.

Requirements:

1. Each code pair must consist of:
   - A **vulnerable** code snippet that reflects the CWE in a subtle and realistic way.
   - A **corrected** code snippet that resolves the issue.

2. The vulnerable code must:
   - Be embedded within a **larger, non-trivial function or class**, not as a standalone fragment.
   - Be situated within a realistic **automotive software context**, using terminology and logic from real-world automotive domains such as (but not limited to):
    - **ECU firmware update handling**
    - **CAN/LIN/FlexRay communication parsing**
    - **ADAS sensor fusion**
    - **Infotainment system control**
    - **BMS (Battery Management System) parameter decoding**
    - **Vehicle diagnostics (UDS, OBD-II)**
    - **In-vehicle memory management (e.g., persistent flash, heap buffers)**
    - other automotive contexts, like **vehicle-to-everything (V2X) communication**, **powertrain control**, or **vehicle security protocols**, etc.
   - Use **authentic naming and structural patterns** consistent with automotive systems (e.g., `EcuMessage_t`, `applyDiagnosticPayload()`, `processCanFrame()`).
   - Introduce the vulnerability through complex control/data flow, indirection, or interaction between components — not through simple or obvious coding errors.

3. The fixed code should:
   - Preserve the surrounding structure and logic while removing or mitigating the vulnerability.
   - Maintain the same automotive context for consistency.

4. Output code structure:
   - The output code pair **MUST NOT** CONTAIN ANY COMMENTS or hints that indicate the presence of a vulnerability or its fix.

Context:
CWE Info: 
```
{hash_map.get(cwe_id)}
```
"""
    if examples:
        prompt += f"""
Here are some examples of previously generated code pairs for CWE-{cwe_id}.

Structural Diversity:
- Each new code pair must introduce a **distinct code structure** and vulnerability pattern, even if demonstrating the same CWE.
- Avoid reusing the same memory layout, control flow, and vulnerability location across examples.
- You must vary at least one of the following dimensions from previous examples:
  - Use of **different memory management approaches** (e.g., stack buffers, dynamic arrays, memory pools, shared memory, or DMA buffers).
  - Different **data structures** (e.g., nested structs, linked lists, ring buffers, or message queues).
  - Different **control flow complexity** (e.g., state machines, callbacks, conditionals, or recursive logic).
  - Split the vulnerability across **multiple functions, classes, or helper utilities**, instead of keeping it in one place.
  - Add **realistic domain-specific distractions** (e.g., logging, validation functions, state tracking, etc.) that make the vulnerability less obvious.
- Even if using the same unsafe function (e.g., `memcpy`), the **way it leads to the flaw must be significantly different** from prior examples.
```
{examples}
```
"""
    return prompt.strip()


In [None]:
# Gemini Initialisation

from pydantic import BaseModel, Field
from google import genai

# Initialize the Gemini client with the API token read from TOKEN_PATH in the
# configuration cell; get_code_pair sends its requests through this client
client = genai.Client(api_key=token)

# Response model for one generated code pair. get_code_pair passes this class
# as `response_schema` and returns `response.parsed`.
# NOTE(review): documented with `#` comments on purpose — a class docstring
# would presumably end up in the schema sent to the API; confirm before adding one.
class CodePairResponse(BaseModel):
    # CWE identifier as reported in the reply; only printed in verbose mode,
    # the saved record uses the cwe_id passed to generate_code_pairs instead
    cwe_id: str = Field(
        ...,
        description="The CWE ID associated with the code pair."
    )
    # Source text of the flawed variant
    vulnerable_code: str = Field(
        ...,
        description="The vulnerable code snippet that contains a security flaw."
    )
    # Source text of the corrected variant
    fixed_code: str = Field(
        ...,
        description="The fixed code snippet that resolves the security flaw."
    )

# Define the function to get a code pair based on the CWE ID
def get_code_pair(cwe_id: str, examples: dict | None = None) -> CodePairResponse:
    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-05-20",
            contents=generate_prompt(cwe_id, examples),
            config={
                "response_mime_type": "application/json",
                "response_schema": CodePairResponse,
            },
        )
        # if server error, return empty list
    except genai.errors.ServerError as e:
        print(f"Server error: {e}")
        return None
    return response.parsed


In [None]:
import random
from pathlib import Path

def generate_code_pairs(cwe_id: str, target_pairs_number: int, max_prompt_examples: int = 8) -> None:
    """Generate code pairs until up to ``target_pairs_number`` examples exist.

    One request is made per missing pair. Every successful pair is appended to
    the module-level ``examples`` and ``json_out_structure`` lists, and the
    output file is rewritten after each success so an interrupted run keeps
    what was already generated. Failed requests are reported and not retried,
    so fewer than ``target_pairs_number`` pairs may exist afterwards.

    Args:
        cwe_id: CWE identifier; must be a key of ``hash_map``.
        target_pairs_number: Desired total number of examples, including the
            ones already loaded.
        max_prompt_examples: Upper bound (>= 0) on how many distinct existing
            examples are shown to the model per request.

    Returns:
        None. Results are stored in ``examples`` / ``json_out_structure`` and
        in the file at ``output_file_path``.

    Raises:
        KeyError: If ``cwe_id`` is not in ``hash_map`` or its entry has neither
            'Extended_Description' nor 'Description'. Raised before any request
            is sent.
    """
    missing = target_pairs_number - len(examples)
    if missing <= 0:
        print(f"Already have {len(examples)} examples, no need to generate more.")
        return  # Return early if we already have enough examples
    print(f"Generate {missing} more code pairs for CWE-{cwe_id}...")

    # The description is the same for every pair: resolve it once, and before
    # any request is made, so an unknown CWE-ID fails fast.
    cwe_entry = hash_map[cwe_id]
    if 'Extended_Description' in cwe_entry:
        cwe_description = cwe_entry['Extended_Description']
    else:
        cwe_description = cwe_entry['Description']

    for i in range(missing):
        # Sample WITHOUT replacement and never more than what exists, so the
        # prompt does not repeat the same example several times.
        if examples:
            prompt_examples = random.sample(examples, k=min(max_prompt_examples, len(examples)))
        else:
            prompt_examples = None

        pair = get_code_pair(cwe_id, prompt_examples)
        if not pair:
            print("Failed to retrieve code pair.")
            continue

        examples.append({'vulnerable_code': pair.vulnerable_code, 'fixed_code': pair.fixed_code})
        if verbose_generation:
            print(f"Generated code pair {i + 1}:")
            print(f"CWE ID: {pair.cwe_id}")
            print("\nVulnerable Code:")
            print(pair.vulnerable_code)
            print("\nFixed Code:")
            print(pair.fixed_code)
            print("\n" + "="*50 + "\n")
        json_out_structure.append({
            "cwe_id": f"CWE-{cwe_id}",
            "cwe_description": cwe_description,
            "vulnerable_code": pair.vulnerable_code,
            "fixed_code": pair.fixed_code,
        })
        # Persist after every pair so progress survives an interruption
        with open(output_file_path, "w") as f:
            json.dump(json_out_structure, f, indent=2, ensure_ascii=False)


In [None]:
# Run the generation: requests the code pairs still missing to reach
# `target_pairs_number` for `cwe_id` (both set in the configuration cell)
generate_code_pairs(cwe_id, target_pairs_number)