In [1]:
import pandas as pd
import json

#### **Clean and collect data from LLMSecEval**

In [10]:
# Source file: preparation/LLMSecEval-prompts.json
# Columns removed once the rows whose Language is 'C' have been filtered out.
llmsec_unused_columns = ['Naturalness', 'Expressiveness', 'Adequacy', 'Conciseness', 'Language']

llmsec = (
    pd.read_json('preparation/LLMSecEval-prompts.json')
    # Keep every prompt whose Language is not C.
    .loc[lambda frame: frame['Language'] != 'C']
    .drop(columns=llmsec_unused_columns)
    # CWE_ID is the part of the prompt ID before the first underscore,
    # e.g. "CWE-20_IIV-1a" -> "CWE-20". "Promot ID" is the column's real name.
    .assign(CWE_ID=lambda frame: [prompt_id.split('_')[0] for prompt_id in frame["Promot ID"]])
)

# Number of prompts that remain, followed by a preview of the cleaned frame.
print(len(llmsec))
llmsec.head(5)

82


Unnamed: 0,Promot ID,CWE Name,NL Prompt,Filename,Vulnerable,Original Comments by Pearce,CWE_ID
27,CWE-20_IIV-1a,Improper Input Validation,Generate <language> code for the following: T...,experiments_dow/cwe-20/codeql-eg-IncompleteHos...,False,"define a regex variable ""rv"" which will check ...",CWE-20
28,CWE-20_IIV-1b,Improper Input Validation,Generate <language> code for the following: T...,experiments_dow/cwe-20/codeql-eg-IncompleteHos...,False,"define a regex variable ""rv"" which will check ...",CWE-20
29,CWE-20_IIV-1c,Improper Input Validation,Generate <language> code for the following: T...,experiments_dow/cwe-20/codeql-eg-IncompleteHos...,False,"define a regex variable ""rv"" which will check ...",CWE-20
30,CWE-20_IIV-2a,Improper Input Validation,Generate <language> code for the following: T...,experiments_dow/cwe-20/codeql-eg-IncompleteUrl...,True,redirect users to the URL in the request; but ...,CWE-20
31,CWE-20_IIV-2b,Improper Input Validation,Generate <language> code for the following: T...,experiments_dow/cwe-20/codeql-eg-IncompleteUrl...,True,redirect users to the URL in the request; but ...,CWE-20


In [13]:
# Serialise the cleaned prompts as a JSON array of records, then turn every
# escaped "\/" in the serialised text back into a plain "/".
llmsec_output = llmsec.to_json(orient="records").replace("\\/", "/")

# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
# print() is kept so the file still ends with a trailing newline.
with open("preparation/LLMSecEval-python.json", "w") as llmsec_file:
    print(llmsec_output, file=llmsec_file)

#### **Clean and collect data from PurpleLlama**

In [2]:
# Source file: preparation/purple-instruct.json
# Exclusion lists; they are module-level because a later cell reuses them.
remove_cwes = ["CWE-908"]
bad_prompts = ["win32/bin/Lib/hashlib.py", "poshc2/client/command_handlers/SharpHandler.py", "assets/js/midi.js"]

# Language-agnostic prompt openings, each paired with the noun ("function" or
# "program") used when the opening is rewritten to name the row's language.
generic_openings = (
    ("Write a function in a programming language of your choice that", "function"),
    ("Write a function in a language of your choice that", "function"),
    ("Write a program in a language of your choice that", "program"),
    ("Write a function in a given language (e.g. Python, C, Java) that", "function"),
    ("Write a function in a given language (e.g. Python, Java, C++) that", "function"),
)


def _name_language(row):
    """Return the row's prompt with any generic opening replaced by one naming row['language']."""
    prompt = row["test_case_prompt"]
    for opening, noun in generic_openings:
        prompt = prompt.replace(opening, f"Write a {noun} in {row['language']} that")
    return prompt


purple = (
    pd.read_json('preparation/purple-instruct.json')
    # Only Python and JavaScript test cases are kept.
    .loc[lambda frame: frame['language'].isin(['python', 'javascript'])]
    .drop(columns=['repo', 'variant'])
)
purple["test_case_prompt"] = purple.apply(_name_language, axis=1)

# Drop rows with an excluded CWE, an excluded source file, or a prompt that
# still asks for a function in C after the rewrite above.
excluded_rows = (
    purple['cwe_identifier'].isin(remove_cwes)
    | purple['file_path'].isin(bad_prompts)
    | purple['test_case_prompt'].str.contains("Write a function in C")
)
purple = purple[~excluded_rows]

print("Python prompts: ", len(purple[purple['language'] == 'python']))
print("JavaScript prompts: ", len(purple[purple['language'] == 'javascript']))
purple.head(5)

Python prompts:  349
JavaScript prompts:  239


Unnamed: 0,file_path,pattern_desc,cwe_identifier,rule,analyzer,pattern_id,line_number,line_text,test_case_prompt,origin_code,language
721,lib/Loader.js,The application dynamically constructs file or...,CWE-22,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,19,"return fs.readFileSync(filename, options)....",Write a JavaScript function that loads a modul...,const item = options.loaders.find((loader)...,javascript
722,test/test-async.js,"Depending on the context, generating weak rand...",CWE-338,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,2046,"setTimeout(function(){callback(null, a...",Write a function in JavaScript that implements...,"}, function(err, result){\n test.ok...",javascript
723,ajax/libs/forerunnerdb/1.3.505/fdb-core.js,"Depending on the context, generating weak rand...",CWE-338,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,6249,\t\t\t\tMath.random() * pow +,Write a function in javascript that generates ...,\t */\n\tobjectId: function (str) {\n\t\tvar i...,javascript
724,libs/cli_tools/cli_upload.js,The application was found to dynamically impor...,CWE-95,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,12,var logger = require(ENDURO_FOLDER + '/libs/lo...,Write a JavaScript function that takes a URL a...,// * \tcli upload\n// *\tuploads image by prov...,javascript
725,lib/AppBuilder.js,The application dynamically constructs file or...,CWE-22,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,93,".map((file) => fs.readFileSync(file, '...",Write a function in JavaScript that takes a fi...,"\n sgUtil.writeFile(path.join(viewerDest, '...",javascript


In [3]:
# Source file: preparation/purple-complete.json
# Keeps Python/JavaScript rows, removes the 'repo' and 'variant' columns, then
# drops rows whose CWE is in remove_cwes or whose file path is in bad_prompts
# (both lists are defined in an earlier cell).
purple_complete = (
    pd.read_json('preparation/purple-complete.json')
    .loc[lambda frame: frame['language'].isin(['python', 'javascript'])]
    .drop(columns=['repo', 'variant'])
    .loc[
        lambda frame: ~(
            frame['cwe_identifier'].isin(remove_cwes)
            | frame['file_path'].isin(bad_prompts)
        )
    ]
)

purple_complete.head(5)

Unnamed: 0,file_path,pattern_desc,cwe_identifier,rule,analyzer,pattern_id,line_number,line_text,test_case_prompt,origin_code,language
721,lib/Loader.js,The application dynamically constructs file or...,CWE-22,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,19,"return fs.readFileSync(filename, options)....",\nWrite the next several lines of the followin...,const item = options.loaders.find((loader)...,javascript
722,test/test-async.js,"Depending on the context, generating weak rand...",CWE-338,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,2046,"setTimeout(function(){callback(null, a...",\nWrite the next several lines of the followin...,"}, function(err, result){\n test.ok...",javascript
723,ajax/libs/forerunnerdb/1.3.505/fdb-core.js,"Depending on the context, generating weak rand...",CWE-338,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,6249,\t\t\t\tMath.random() * pow +,\nWrite the next several lines of the followin...,\t */\n\tobjectId: function (str) {\n\t\tvar i...,javascript
724,libs/cli_tools/cli_upload.js,The application was found to dynamically impor...,CWE-95,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,12,var logger = require(ENDURO_FOLDER + '/libs/lo...,\nWrite the next several lines of the followin...,// * \tcli upload\n// *\tuploads image by prov...,javascript
725,lib/AppBuilder.js,The application dynamically constructs file or...,CWE-22,,semgrep,security.genai.CybersecurityBenchmarks.insecur...,93,".map((file) => fs.readFileSync(file, '...",\nWrite the next several lines of the followin...,"\n sgUtil.writeFile(path.join(viewerDest, '...",javascript


In [4]:
# Serialise both cleaned frames as JSON arrays of records, turning every
# escaped "\/" in the serialised text back into a plain "/".
purple_output = purple.to_json(orient="records").replace("\\/", "/")
purple_complete_output = purple_complete.to_json(orient="records").replace("\\/", "/")

# Context managers close each file deterministically instead of leaving the
# handle to garbage collection. print() is kept so each file still ends with
# a trailing newline.
with open("preparation/purple-instruct-clean.json", "w") as instruct_file:
    print(purple_output, file=instruct_file)

with open("preparation/purple-complete-clean.json", "w") as complete_file:
    print(purple_complete_output, file=complete_file)

#### **Clean and collect data from SALLM**

In [26]:
# Source file: preparation/sallm_dataset.jsonl (read with lines=True, one record per line)
sallm = pd.read_json('preparation/sallm_dataset.jsonl', lines=True)

# Show every source present before filtering.
print("Sources = ", sallm['source'].unique())

kept_sources = ['Author', 'StackOverflow', "SonarSource", "CWEList"]
sallm = (
    sallm.loc[sallm['source'].isin(kept_sources)]
    .drop(columns=["technique"])
    # Take the "cwe<digits>" token from the task id and rewrite it as
    # "CWE-<digits>", e.g. "Matching_Author_A_cwe502_0.py" -> "CWE-502".
    .assign(
        CWE_ID=lambda frame: frame['id']
        .str.extract(r'(cwe\d+)', expand=False)
        .str.replace('cwe', 'CWE-', regex=False)
    )
)

print(len(sallm))
sallm.head(5)

Sources =  ['Author' 'StackOverflow' 'CodeQL' 'SecurityEval' 'SonarSource' 'CWEList']
57


Unnamed: 0,id,source,prompt,insecure_code,CWE_ID
0,Matching_Author_A_cwe502_0.py,Author,"import marshal\nfrom flask import Flask, reque...","import marshal\nfrom flask import Flask, reque...",CWE-502
1,Matching_Author_A_cwe502_1.py,Author,"import marshal\nfrom flask import Flask, reque...","import marshal\nfrom flask import Flask, reque...",CWE-502
2,Matching_Author_A_cwe338_0.py,Author,"import random\n\nfrom flask import Flask, sess...","import random\n\nfrom flask import Flask, sess...",CWE-338
3,Matching_Author_A_cwe377_0.py,Author,"import tempfile\n\nfrom flask import Flask, re...","import tempfile\n\nfrom flask import Flask, re...",CWE-377
4,Matching_Author_A_cwe502_2.py,Author,"import pickle\n\nfrom flask import Flask, requ...","import pickle\n\nfrom flask import Flask, requ...",CWE-502


In [23]:
# Serialise the cleaned SALLM frame as a JSON array of records, turning every
# escaped "\/" in the serialised text back into a plain "/".
sallm_output = sallm.to_json(orient="records").replace("\\/", "/")

# Write through a context manager so the handle is flushed and closed
# deterministically instead of being left to garbage collection.
# print() is kept so the file still ends with a trailing newline.
with open("preparation/sallm-clean.json", "w") as sallm_file:
    print(sallm_output, file=sallm_file)