In [None]:
import pandas as pd
import os
import glob
import re

def strip_comments_and_cwe(code):
    """Remove C-style comments and CWE-tagged identifiers from source text.

    The cleaning steps run in this order:
      1. delete ``/* ... */`` block comments (they may span several lines);
      2. delete ``// ...`` line comments, leaving the line break in place;
      3. replace every identifier of the form ``CWE<digits>_<word chars>``
         with the literal ``var``;
      4. collapse blank / whitespace-only lines and trim both ends.

    Args:
        code: Source text to clean (a ``str``).

    Returns:
        The cleaned source text.

    Note:
        The comment patterns are purely textual, so comment markers that
        appear inside string literals (e.g. ``"http://x"``) are removed too.
    """
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    # Match up to (not including) the newline so that a comment on the very
    # last line is stripped even when the text has no trailing newline.
    code = re.sub(r'//[^\n]*', '', code)
    # CWE ids are not always three digits (e.g. CWE78), so accept any count.
    code = re.sub(r'\bCWE\d+_\w+', 'var', code)
    code = re.sub(r'\n\s*\n', '\n', code).strip()
    return code

def extract_none_samples_from_juliet(juliet_dir):
    """Collect 'good' (non-vulnerable) C files from a Juliet checkout, labelled 'none'.

    For each target CWE, every directory named ``<CWE>_*`` anywhere below
    ``juliet_dir`` is located, and every ``*good*.c`` file anywhere below
    those directories is read.

    Args:
        juliet_dir: Root folder to search (``str`` or path-like).

    Returns:
        pandas.DataFrame with columns ``cwe`` (always ``'none'``), ``code``
        (file contents) and ``file`` (base name). The columns are present
        even when nothing is found. Unreadable files are reported on stdout
        and skipped.

    NOTE(review): selection is by file name only. If the dataset keeps good
    and bad variants inside the same file, few or no files will match
    ``*good*.c`` -- confirm against the actual dataset layout.
    """
    cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
    columns = ['cwe', 'code', 'file']
    good_samples = []
    seen_paths = set()
    # Escape glob metacharacters ('[', '*', '?') that may occur in the root path.
    root = glob.escape(os.fspath(juliet_dir))

    for cwe in cwes:
        # '<cwe>_*' (with the underscore) keeps CWE78 from matching CWE780;
        # the trailing empty component makes glob return directories only,
        # so stray files such as 'cwe121_results.txt' are never treated as one.
        dir_pattern = os.path.join(root, '**', f'{cwe}_*', '')
        cwe_dirs = sorted(
            os.path.normpath(match)
            for match in glob.glob(dir_pattern, recursive=True)
        )
        for dir_path in cwe_dirs:
            file_pattern = os.path.join(glob.escape(dir_path), '**', '*good*.c')
            for file_path in sorted(glob.glob(file_pattern, recursive=True)):
                # Nested matching directories can yield the same file twice.
                key = os.path.normcase(file_path)
                if key in seen_paths or not os.path.isfile(file_path):
                    continue
                seen_paths.add(key)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        code = f.read()
                except OSError as e:
                    # Best effort: report and carry on with the remaining files.
                    print(f"Error reading {file_path}: {e}")
                    continue
                good_samples.append({
                    'cwe': 'none',
                    'code': code,
                    'file': os.path.basename(file_path)
                })

    return pd.DataFrame(good_samples, columns=columns)

In [None]:
import os
import glob

# Diagnostic cell: confirm the Juliet root exists and show where (if anywhere)
# the per-CWE testcase directories and their '*good*.c' files live below it.
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"

# Check if directory exists
print(f"Juliet directory exists: {os.path.exists(juliet_dir)}")

# Check for CWE directories
cwes = ['CWE121', 'CWE78', 'CWE122', 'CWE190', 'CWE191']
for cwe in cwes:
    # '<cwe>_*' (with the underscore) keeps CWE78 from matching CWE780; the
    # trailing empty component restricts matches to directories, so files such
    # as 'cwe121_results.txt' are no longer reported as directories. '**' also
    # finds directories nested below the root.
    cwe_dir = os.path.join(glob.escape(juliet_dir), '**', f'{cwe}_*', '')
    cwe_dirs = sorted(os.path.normpath(p) for p in glob.glob(cwe_dir, recursive=True))
    print(f"\nLooking for {cwe} directories: {cwe_dirs}")
    for dir_path in cwe_dirs:
        good_files = sorted(
            glob.glob(os.path.join(glob.escape(dir_path), '**', '*good*.c'), recursive=True)
        )
        # Print a count and a short preview instead of the full (possibly huge) list.
        print(f"Good files in {dir_path}: {len(good_files)} found; first few: {good_files[:5]}")

Juliet directory exists: True

Looking for CWE121 directories: ['C:\\Users\\MartyNattakit\\Desktop\\Datasets\\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\\cwe121_results.txt']
Good files in C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support\cwe121_results.txt: []

Looking for CWE78 directories: []

Looking for CWE122 directories: []

Looking for CWE190 directories: []

Looking for CWE191 directories: []


In [None]:
# Build the augmented dataset (original CWE samples plus a 'none' class) and save it.

# Input / output locations.
original_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled.csv"
juliet_dir = r"C:\Users\MartyNattakit\Desktop\Datasets\2022-08-11-juliet-c-cplusplus-v1-3-1-with-extra-support"
output_csv_path = r"C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled_with_juliet_none.csv"

# Read the labelled samples and report the starting class balance.
full_df = pd.read_csv(original_csv_path)
print(f"Loaded original dataset with {len(full_df)} samples.")
print("Original CWE Distribution:\n", full_df['cwe'].value_counts())

# Try to harvest non-vulnerable samples from the Juliet checkout.
none_df = extract_none_samples_from_juliet(juliet_dir)
print(f"Extracted {len(none_df)} 'none' samples from Juliet.")

# Nothing harvested: substitute hand-written snippets so the 'none' class exists.
# NOTE(review): the fallback holds only five distinct snippets, each repeated
# 80 times, so the resulting 'none' class is heavily duplicated.
if not len(none_df):
    print("No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...")
    none_samples = pd.DataFrame({
        'cwe': ['none' for _ in range(400)],
        'code': 80 * [
            'int main() { printf("Hello, World!"); return 0; }',
            'void func() { int x = 5; printf("%d", x); }',
            'int add(int a, int b) { return a + b; }',
            'void loop() { for(int i = 0; i < 10; i++) { printf("."); } }',
            'int main() { char str[] = "test"; puts(str); return 0; }'
        ],
        'file': [f'synthetic_none_{idx}.c' for idx in range(400)]
    })
    none_df = none_samples

# Append the 'none' rows after the original rows with a fresh 0..n-1 index.
full_df = pd.concat([full_df, none_df], ignore_index=True)

# Strip comments and CWE-tagged identifiers from every code sample.
full_df['code'] = full_df['code'].map(strip_comments_and_cwe)

# Persist the result and report the final class balance.
full_df.to_csv(output_csv_path, index=False)
print(f"Updated dataset saved as {output_csv_path} with {len(full_df)} samples.")
print("Final CWE Distribution:\n", full_df['cwe'].value_counts())
print("Unique CWE labels:", full_df['cwe'].unique())

Loaded original dataset with 2000 samples.
Original CWE Distribution:
 cwe
CWE121    400
CWE78     400
CWE190    400
CWE191    400
CWE122    400
Name: count, dtype: int64
Extracted 0 'none' samples from Juliet.
No 'none' samples found in Juliet. Adding synthetic 'none' samples instead...
Updated dataset saved as C:\Users\MartyNattakit\Desktop\CodeSentinel\cwe_top5_sampled_with_juliet_none.csv with 2400 samples.
Final CWE Distribution:
 cwe
CWE121    400
CWE78     400
CWE190    400
CWE191    400
CWE122    400
none      400
Name: count, dtype: int64
Unique CWE labels: ['CWE121' 'CWE78' 'CWE190' 'CWE191' 'CWE122' 'none']
