In [None]:
%pip install pyarrow
%pip install pandas
%pip install openai

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pandas
  Downloading pandas-2.3.1-cp311-cp311-win_amd64.whl (11.3 MB)
     ---------------------------------------- 11.3/11.3 MB 5.9 MB/s eta 0:00:00
Collecting numpy>=1.23.2
  Downloading numpy-2.3.2-cp311-cp311-win_amd64.whl (13.1 MB)
     ---------------------------------------- 13.1/13.1 MB 4.6 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
     -------------------------------------- 509.2/509.2 kB 5.3 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
     -------------------------------------- 347.8/347.8 kB 4.3 MB/s eta 0:00:00
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.3.2 pandas-2.3.1 pytz-2025.2 tzdata-2025.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import pyarrow.parquet as pq
import pandas as pd

file_path = 'prompts_full_dataset.parquet'
parquet_file = pq.ParquetFile(file_path)

schema = parquet_file.schema
unique_columns = []
seen = set()
for field in schema:
    if field.name not in seen:
        unique_columns.append(field.name)
        seen.add(field.name)


table = parquet_file.read(columns=unique_columns)
df = table.to_pandas()

print("Columns in the dataset:", df.columns)
print("\nFirst few rows of the dataset:")
print(df.loc[0, 'prompt'])

Columns in the dataset: Index(['project', 'commit_id', 'func', 'vul', 'CVE ID', 'CWE ID', 'CWE Name',
       'CWE Description', 'Potential Mitigation', 'None', 'None', 'prompt',
       'prompt_char_count', 'prompt_token_count'],
      dtype='object')

First few rows of the dataset:
You are a professional cybersecurity analyst with expertise in static code analysis and Common Weakness Enumeration (CWE) classification.

You will be provided with:
- One **input function**, whose vulnerability status you must assess.
- Several **reference examples**, each containing:
  - A code function
  - A known vulnerability label (1 for vulnerable, 0 for not)
  - CWE ID (if vulnerable)
  - CWE Name (if vulnerable)

---

### TASK

Analyze the **structure, logic, and behavior** of the **input function**. Use deep comparison and reasoning based on the structure, intent, and usage patterns in the reference examples. Focus on the logical flow, operations performed, and overall behavior of the code to asses

In [10]:
print("Sum of tokens in all prompts:", df['prompt_token_count'].sum())

Sum of tokens in all prompts: 3948621


In [3]:
unique_cwe_ids = df['CWE ID'].nunique()
print(f"Number of unique CWE IDs: {unique_cwe_ids}")

num_functions = df['func'].nunique()
print(f"Number of unique functions: {num_functions}")

num_prompts = df['prompt'].nunique()
print(f"Number of unique prompts: {num_prompts}")

avg_char_count = df['prompt_char_count'].mean()
avg_token_count = df['prompt_token_count'].mean()
print(f"Average character count of prompts: {avg_char_count:.2f}")
print(f"Average token count of prompts: {avg_token_count:.2f}")

unique_projects = df['project'].nunique()
print(f"Number of unique projects: {unique_projects}")

Number of unique CWE IDs: 13
Number of unique functions: 2253
Number of unique prompts: 2253
Average character count of prompts: 7011.89
Average token count of prompts: 1752.61
Number of unique projects: 377


In [7]:
def check_func_in_prompt(row):
    func = row['func']
    prompt = row['prompt']
    if isinstance(func, str) and isinstance(prompt, str):
        return func in prompt
    return False

df['func_in_prompt'] = df.apply(check_func_in_prompt, axis=1)

print("\nSample of functions and their presence in prompts:")
print(df[['func', 'prompt', 'func_in_prompt']].head(10))

num_funcs_in_prompt = df['func_in_prompt'].sum()
print(f"\nNumber of functions present in their corresponding prompts: {num_funcs_in_prompt}")
print(f"Number of functions NOT present in their corresponding prompts: {len(df) - num_funcs_in_prompt}")


Sample of functions and their presence in prompts:
                                                func  \
0  FLAC__bool read_residual_partitioned_rice_(FLA...   
1  NeXTPreDecode(TIFF* tif, uint16 s)\n{\n\tstati...   
2  int ncp_open_create_file_or_subdir(struct ncp_...   
3  static MagickBooleanType IsWEBPImageLossless(c...   
4  static Image *ReadTIFFImage(const ImageInfo *i...   
5  bool ImageBitmap::isAccelerated() const {\n  r...   
6  int vfs_fallocate(struct file *file, int mode,...   
7  HTMLElement& toHTMLElement(FormAssociatedEleme...   
8  static void download_one_url(const char *url)\...   
9  static av_cold int vqa_decode_init(AVCodecCont...   

                                              prompt  func_in_prompt  
0  You are a professional cybersecurity analyst w...            True  
1  You are a professional cybersecurity analyst w...            True  
2  You are a professional cybersecurity analyst w...            True  
3  You are a professional cybersecurity analyst

In [9]:
def check_func_in_prompt(row):
    func = row['func']
    prompt = row['prompt']
    if isinstance(func, str) and isinstance(prompt, str):
        return func in prompt
    return False

In [10]:
df['func_in_prompt'] = df.apply(check_func_in_prompt, axis=1)

print("\nSample of functions and their presence in prompts (first 5 rows):")
print(df[['func', 'prompt', 'func_in_prompt']].head(5))

# Inspect rows where func is NOT in prompt
print("\nSample of rows where function is NOT present in prompt:")
not_present = df[~df['func_in_prompt']][['func', 'prompt']].head(5)
for idx, row in not_present.iterrows():
    print(f"\nRow {idx}:")
    print(f"Function: {row['func']}")
    print(f"Prompt: {row['prompt']}")

num_funcs_in_prompt = df['func_in_prompt'].sum()
num_funcs_not_in_prompt = len(df) - num_funcs_in_prompt
print(f"\nNumber of functions present in prompts: {num_funcs_in_prompt}")
print(f"Number of functions NOT present in prompts: {num_funcs_not_in_prompt}")

print(f"\nMissing func values: {df['func'].isna().sum()}")
print(f"Missing prompt values: {df['prompt'].isna().sum()}")


Sample of functions and their presence in prompts (first 5 rows):
                                                func  \
0  FLAC__bool read_residual_partitioned_rice_(FLA...   
1  NeXTPreDecode(TIFF* tif, uint16 s)\n{\n\tstati...   
2  int ncp_open_create_file_or_subdir(struct ncp_...   
3  static MagickBooleanType IsWEBPImageLossless(c...   
4  static Image *ReadTIFFImage(const ImageInfo *i...   

                                              prompt  func_in_prompt  
0  You are a professional cybersecurity analyst w...            True  
1  You are a professional cybersecurity analyst w...            True  
2  You are a professional cybersecurity analyst w...            True  
3  You are a professional cybersecurity analyst w...            True  
4  You are a professional cybersecurity analyst w...            True  

Sample of rows where function is NOT present in prompt:

Row 28:
Function:  static void copyMono8(
         short *dst,
        const int *const *src,
         unsigned 

In [11]:
def check_func_in_prompt_normalized(row):
    func = row['func']
    prompt = row['prompt']
    if isinstance(func, str) and isinstance(prompt, str):
        func = ' '.join(func.split())
        prompt = ' '.join(prompt.split())
        return func in prompt
    return False

df['func_in_prompt_normalized'] = df.apply(check_func_in_prompt_normalized, axis=1)
print("\nWith normalized text:")
print(f"Number of functions present in prompts: {df['func_in_prompt_normalized'].sum()}")
print(f"Number of functions NOT present in prompts: {len(df) - df['func_in_prompt_normalized'].sum()}")


With normalized text:
Number of functions present in prompts: 2253
Number of functions NOT present in prompts: 0


In [2]:
import os
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

prompt = "Write a one-sentence bedtime story about a unicorn"

response = client.responses.create(
    model="gpt-4o-mini",
    input=prompt,
)

print(response)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}