In [1]:
import os
import json
import pandas as pd 

In [None]:
def extract_vulns_from_json_files(directory_path):
    """
    Iterates through JSON files in a directory, extracts those with CWE IDs,
    and returns a dictionary mapping file names to CWE data.

    Args:
        directory_path (str): The path to the directory containing JSON files.

    Returns:
        dict: A dictionary where keys are file names and values are lists of CWE data.
    """

    descriptions_df = {
        "pubYear": list(),
        "Status": list(),
        "Description": list(),
        "CWE-ID": list(),
        "vectorString": list(),
        "attackVector": list(),
        "attackComplexity": list(),
        "privilegesRequired": list(),
        "userInteraction": list(),
        "scope": list(),
        "confidentialityImpact": list(),
        "integrityImpact": list(),
        "availabilityImpact": list(),
    }
    years_list = list(range(2010, 2025))
    for year in years_list:
        for filename in os.listdir(os.path.join(directory_path, str(year))):
            if filename.endswith(".json"):
                filepath = os.path.join(directory_path, str(year), filename)
                print(filepath)
                with open(filepath, "r") as file:
                    try:
                        data = json.load(file)
                    except json.JSONDecodeError:
                        print(f"Skipping invalid JSON file: {filename}")
                        continue  # Skip to the next file if JSON is invalid
                    for entry in data["vulnerabilities"]:
                        # try:
                        descriptions_df["pubYear"].append(
                            entry["cve"]["published"].split("-")[0])  # Extract publication year
                        descriptions_df["Description"].append(
                            entry["cve"]["descriptions"][0]["value"])
                        descriptions_df["Status"].append(
                            entry["cve"]["vulnStatus"])
                        if "weaknesses" in entry["cve"].keys():
                            descriptions_df["CWE-ID"].append(
                                entry["cve"]["weaknesses"][0]["description"][0]["value"])
                        else:
                            descriptions_df["CWE-ID"].append("None")

                        if "metrics" in entry["cve"].keys() and "cvssMetricV30" in entry["cve"]["metrics"].keys():
                            nist_metric = entry["cve"]["metrics"]["cvssMetricV30"]
                            print(nist_metric)
                            descriptions_df["vectorString"].append(
                                nist_metric[0]["cvssData"]["vectorString"] if nist_metric != [] else "None")
                            descriptions_df["attackVector"].append(
                                nist_metric[0]["cvssData"]["attackVector"] if nist_metric != [] else "None")
                            descriptions_df["attackComplexity"].append(
                                nist_metric[0]["cvssData"]["attackComplexity"] if nist_metric != [] else "None")
                            descriptions_df["privilegesRequired"].append(
                                nist_metric[0]["cvssData"]["privilegesRequired"] if nist_metric != [] else "None")
                            descriptions_df["userInteraction"].append(
                                nist_metric[0]["cvssData"]["userInteraction"] if nist_metric != [] else "None")
                            descriptions_df["scope"].append(
                                nist_metric[0]["cvssData"]["scope"] if nist_metric != [] else "None")
                            descriptions_df["confidentialityImpact"].append(
                                nist_metric[0]["cvssData"]["confidentialityImpact"] if nist_metric != [] else "None")
                            descriptions_df["integrityImpact"].append(
                                nist_metric[0]["cvssData"]["integrityImpact"] if nist_metric != [] else "None")
                            descriptions_df["availabilityImpact"].append(
                                nist_metric[0]["cvssData"]["availabilityImpact"] if nist_metric != [] else "None")
                        else:
                            descriptions_df["vectorString"].append("None")
                            descriptions_df["attackVector"].append("None")
                            descriptions_df["attackComplexity"].append("None")
                            descriptions_df["privilegesRequired"].append(
                                "None")
                            descriptions_df["userInteraction"].append("None")
                            descriptions_df["scope"].append("None")
                            descriptions_df["confidentialityImpact"].append(
                                "None")
                            descriptions_df["integrityImpact"].append("None")
                            descriptions_df["availabilityImpact"].append(
                                "None")
                        # except Exception as e:
                        #    print(f"exception:{e}")
                        #    continue

    return descriptions_df


directory = "API responses"  # Update with your actual directory path
cwe_data = extract_vulns_from_json_files(directory)

In [11]:
cwe_data.keys()

dict_keys(['pubYear', 'Status', 'Description', 'CWE-ID', 'vectorString', 'attackVector', 'attackComplexity', 'privilegesRequired', 'userInteraction', 'scope', 'confidentialityImpact', 'integrityImpact', 'availabilityImpact'])

In [8]:
import pandas as pd

In [13]:
data = pd.DataFrame(cwe_data)

In [16]:
data["Status"].value_counts()

Status
Analyzed               119663
Modified                61535
Rejected                12160
Undergoing Analysis        14
Name: count, dtype: int64

In [17]:
data["CWE-ID"].value_counts()

CWE-ID
NVD-CWE-noinfo    25116
CWE-79            22846
None              13506
CWE-119            9516
CWE-787            9017
                  ...  
CWE-643               1
CWE-324               1
CWE-451               1
CWE-456               1
CWE-1393              1
Name: count, Length: 405, dtype: int64

In [None]:
data["vectorString"].value_counts()

In [None]:

# Drop rows where 'Status' is 'Rejected' or 'Undergoing Analysis'
data = data[~data['Status'].isin(['Rejected', 'Undergoing Analysis'])]

# Now filter for rows where 'CWE-ID' is 'None'
misc1= data[data['CWE-ID'] == 'None']

# Now filter for rows where 'CVSSnone' is 'None'
misc2 = data[data['vectorString'] == 'None']

# Save the filtered data to a new CSV file
misc1.to_csv('misc1.csv', index=False)

# Save the filtered data to a new CSV file
misc2.to_csv('misc2.csv', index=False)


In [39]:
data.to_csv('filtered_cve.csv',index=False)

AttributeError: 'dict' object has no attribute 'to_csv'

In [48]:
df=pd.read_csv("filtered_cve.csv")

In [49]:
df["CWE-ID"].unique()

array(['CWE-79', 'CWE-89', 'CWE-94', 'CWE-264', 'CWE-20', 'CWE-119',
       'CWE-352', 'CWE-310', 'NVD-CWE-noinfo', 'CWE-22', 'CWE-287',
       'NVD-CWE-Other', 'CWE-399', 'CWE-255', 'CWE-312', 'CWE-189',
       'CWE-787', 'CWE-16', 'CWE-200', 'CWE-416', 'CWE-134', 'CWE-476',
       'CWE-667', 'CWE-362', 'CWE-190', 'CWE-191', 'CWE-77', 'CWE-78',
       'CWE-59', 'CWE-400', 'CWE-843', 'CWE-295', 'CWE-835', 'CWE-120',
       'CWE-732', 'CWE-798', 'CWE-918', 'CWE-401', 'CWE-17', 'CWE-252',
       'CWE-19', 'CWE-863', 'CWE-917', 'CWE-908', 'CWE-829', 'CWE-129',
       'CWE-681', 'CWE-824', 'CWE-426', 'CWE-502', 'CWE-193', 'CWE-611',
       'CWE-617', 'CWE-415', 'CWE-269', 'CWE-209', 'CWE-909', 'CWE-704',
       'CWE-404', 'CWE-369', 'CWE-1284', 'CWE-276', 'CWE-125', 'CWE-665',
       'CWE-772', 'CWE-1021', 'CWE-770', 'CWE-532', 'CWE-668', 'CWE-776',
       'CWE-254', 'CWE-74', 'CWE-326', 'CWE-346', 'CWE-565', 'CWE-697',
       'CWE-682', 'CWE-347', 'CWE-319', 'CWE-306', 'CWE-913', 'CWE-521

In [50]:
# Drop rows where 'Status' is 'Rejected' or 'Undergoing Analysis'
df = df[~df['CWE-ID'].isin(['NVD-CWE-noinfo', 'NVD-CWE-Other',None])]


In [51]:
df = df[~df['vectorString'].isin([None])]

In [52]:
df.head()

Unnamed: 0,pubYear,Status,Description,CWE-ID,vectorString,attackVector,attackComplexity,privilegesRequired,userInteraction,scope,confidentialityImpact,integrityImpact,availabilityImpact
0,2010,Modified,Cross-site scripting (XSS) vulnerability in ma...,CWE-79,,,,,,,,,
1,2010,Modified,SQL injection vulnerability in page.php in Min...,CWE-89,,,,,,,,,
2,2010,Modified,Multiple PHP remote file inclusion vulnerabili...,CWE-94,,,,,,,,,
3,2010,Modified,Cross-site scripting (XSS) vulnerability in ne...,CWE-79,,,,,,,,,
4,2010,Modified,PHP remote file inclusion vulnerability in ind...,CWE-94,,,,,,,,,


In [53]:
df = df.dropna()

In [54]:
df

Unnamed: 0,pubYear,Status,Description,CWE-ID,vectorString,attackVector,attackComplexity,privilegesRequired,userInteraction,scope,confidentialityImpact,integrityImpact,availabilityImpact
4297,2010,Modified,"MIT Kerberos 5 (aka krb5) 1.3.x, 1.4.x, 1.5.x,...",CWE-310,CVSS:3.0/AV:N/AC:H/PR:N/UI:N/S:U/C:N/I:L/A:N,NETWORK,HIGH,NONE,NONE,UNCHANGED,NONE,LOW,NONE
4298,2010,Modified,MIT Kerberos 5 (aka krb5) 1.7.x and 1.8.x thro...,CWE-310,CVSS:3.0/AV:N/AC:H/PR:N/UI:N/S:U/C:N/I:L/A:N,NETWORK,HIGH,NONE,NONE,UNCHANGED,NONE,LOW,NONE
4302,2010,Modified,MIT Kerberos 5 (aka krb5) 1.8.x through 1.8.3 ...,CWE-310,CVSS:3.0/AV:N/AC:L/PR:L/UI:N/S:U/C:L/I:L/A:L,NETWORK,LOW,LOW,NONE,UNCHANGED,LOW,LOW,LOW
6635,2011,Analyzed,contrib/pdfmark/pdfroff.sh in GNU troff (aka g...,CWE-254,CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:L/A:L,NETWORK,LOW,NONE,NONE,UNCHANGED,NONE,LOW,LOW
7682,2011,Analyzed,Memory leak in the NAT implementation in Cisco...,CWE-399,CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H,NETWORK,LOW,NONE,NONE,UNCHANGED,NONE,NONE,HIGH
...,...,...,...,...,...,...,...,...,...,...,...,...,...
180144,2023,Analyzed,A malicious user could use this issue to acces...,CWE-918,CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:C/C:H/I:N/A:N,NETWORK,LOW,NONE,NONE,CHANGED,HIGH,NONE,NONE
180145,2023,Modified,A malicious user could use this issue to get c...,CWE-29,CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H,NETWORK,LOW,NONE,NONE,UNCHANGED,HIGH,HIGH,HIGH
180146,2023,Analyzed,This vulnerability is capable of writing arbit...,CWE-434,CVSS:3.0/AV:N/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H,NETWORK,LOW,LOW,NONE,UNCHANGED,HIGH,HIGH,HIGH
180147,2023,Analyzed,This vulnerability enables malicious users to ...,CWE-29,CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:C/C:H/I:H/A:L,NETWORK,LOW,NONE,NONE,CHANGED,HIGH,HIGH,LOW


In [55]:
df.to_csv("cleaned_for_ft.csv",index=False)

In [56]:
df = pd.read_csv("cleaned_for_ft.csv")

In [58]:
df["CWE-ID"].value_counts().to_dict()

{'CWE-79': 6243,
 'CWE-119': 4375,
 'CWE-200': 3289,
 'CWE-20': 3272,
 'CWE-125': 2153,
 'CWE-787': 2071,
 'CWE-89': 1570,
 'CWE-352': 1321,
 'CWE-416': 1294,
 'CWE-22': 1201,
 'CWE-190': 982,
 'CWE-264': 973,
 'CWE-78': 910,
 'CWE-476': 846,
 'CWE-284': 754,
 'CWE-287': 737,
 'CWE-732': 442,
 'CWE-434': 440,
 'CWE-611': 431,
 'CWE-400': 373,
 'CWE-798': 361,
 'CWE-94': 358,
 'CWE-295': 347,
 'CWE-601': 314,
 'CWE-269': 302,
 'CWE-502': 298,
 'CWE-77': 275,
 'CWE-862': 270,
 'CWE-254': 262,
 'CWE-772': 256,
 'CWE-426': 254,
 'CWE-918': 251,
 'CWE-310': 248,
 'CWE-362': 246,
 'CWE-399': 245,
 'CWE-522': 235,
 'CWE-74': 226,
 'CWE-120': 222,
 'CWE-863': 216,
 'CWE-306': 208,
 'CWE-835': 189,
 'CWE-532': 175,
 'CWE-704': 144,
 'CWE-770': 143,
 'CWE-327': 136,
 'CWE-59': 135,
 'CWE-19': 134,
 'CWE-384': 134,
 'CWE-415': 130,
 'CWE-319': 129,
 'CWE-255': 128,
 'CWE-326': 125,
 'CWE-129': 124,
 'CWE-122': 117,
 'CWE-427': 114,
 'CWE-121': 113,
 'CWE-209': 106,
 'CWE-276': 99,
 'CWE-369': 97,

In [None]:


# Function to format each entry as a fine-tuning conversation
def format_for_finetuning(row):
    return [
        {
            "from": "system",
            "value": (
                "Analyze the following CVE description and map it to the appropriate CWE, "
                "and generate the resulting vector string (for metrics). Provide a brief "
                "justification for your choice. Ensure the last line of your response "
                "contains a dictionary with the mapped results."
            )
        },
        {
            "from": "human",
            "value": row["Description"]
        },
        {
            "from": "gpt",
            "value": (
                f"Explanation about the CVE, mapped CWE ({row['CWE-ID'][0]}) and the CVSS vector "
                f"string: {row['vectorString'][0]}. The mapped data: "
                f"{json.dumps({'CWE-ID': row['CWE-ID'][0], 'vectorString': row['vectorString'][0]})}"
            )
        }
    ]

# Apply the function to your dataframe rows
formatted_data = [format_for_finetuning(row) for _, row in df.iterrows()]

# Save to a JSON file
with open("finetuning_data.json", "w") as f:
    json.dump(formatted_data, f, indent=4)


In [None]:
import pandas as pd
import json


def chatML_for_rcm_vsp_training_formatter(
    data_directory: str, output_directory: str = "./formatted_data.json"
) -> str:
    # Load the dataset from a tab-separated file
    df = pd.read_csv(data_directory, encoding="utf-8")

    # Prepare the result dictionary with the conversation format
    result = {
        "conversations": [
            [
                {
                    "from": "system",
                    "value": "Analyze the following CVE description and map it to the appropriate CWE, "
                    "and generate the resulting vector string (for metrics). "
                  
                   " Your response should be in JSON (dictionary) format "
                   "with 'CWE-ID' and 'vectorString' as keys with their respective values.",
                },
                {
                    "from": "human",
                    "value": df.loc[
                        index, "Description"
                    ],  # The CVE description from the dataframe
                },
                {
                    "from": "gpt",
                    "value": f"CWE-ID: {df.loc[index, 'CWE-ID']}\nVector string: {df.loc[index, 'vectorString']}\n\n
                    "Details about vector string:\nAttack Vector: {df.loc[index, 'attackVector']}\n
                    "Attack Complexity: {df.loc[index, 'attackComplexity']}\n
                    "Privileges Required: {df.loc[index, 'privilegesRequired']}\n
                    "User Interaction: {df.loc[index, 'userInteraction']}\n
                    "Scope: {df.loc[index, 'scope']}\n
                    "Confidentiality Impact: {df.loc[index, 'confidentialityImpact']}\n
                    "Integrity Impact: {df.loc[index, 'integrityImpact']}\n
                    "Availability Impact: {df.loc[index, 'availabilityImpact']}\n\n
                    "{json.dumps({'CWE-ID': df.loc[index, 'CWE-ID'],'vectorString': df.loc[index, "vectorString"]})}"
                },
            ]
            for index in range(len(df))  # Iterate over each row in the dataframe
        ]
    }

    # Write the formatted result to a JSON file
    with open(output_directory, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)

    return f"Formatted {data_directory} and saved into {output_directory}"

In [69]:
chatML_for_rcm_vsp_training_formatter("cleaned_for_ft.csv")

'Formatted cleaned_for_ft.csv and saved into ./formatted_data.json'

In [62]:
from huggingface_hub import HfApi

# Define your dataset repository name (e.g., "username/dataset-name")
repo_name = "Ghaythfd/cve_rcm_ft"

# Initialize the API
api = HfApi()

# Create a new dataset repository if it doesn't exist
api.create_repo(repo_id=repo_name, repo_type="dataset", exist_ok=True)

# Upload the JSON file to the dataset repository
api.upload_file(
    path_or_fileobj="formatted_dataset_with_gpt.json",
    path_in_repo="formatted_dataset_with_gpt.json",
    repo_id=repo_name,
    repo_type="dataset"
)

print(f"Dataset uploaded successfully to https://huggingface.co/datasets/{repo_name}")


formatted_dataset_with_gpt.json: 100%|██████████| 39.7M/39.7M [10:59<00:00, 60.3kB/s]  


Dataset uploaded successfully to https://huggingface.co/datasets/Ghaythfd/cve_rcm_ft
