In [None]:
import os
import json
import re
from tqdm import tqdm

# Configuration
# Directory that is scanned (non-recursively) for JavaScript ``.js`` files.
INPUT_DIR = "../SourceCode/Test"
# JSON report that receives the gadgets detected in each scanned file.
OUTPUT_FILE = "processed_gadgets.json"

def get_line_number(code, index):
    """Return the 1-based line number on which character offset ``index`` falls.

    The text before ``index`` is split on newlines; the number of resulting
    pieces is the line the offset belongs to.
    """
    text_before = code[:index]
    return len(text_before.split("\n"))

def _find_call_end(code, open_index):
    """Return the index of the ')' that closes the '(' at ``open_index``.

    Nested parentheses are balanced, and parentheses inside JavaScript string
    or template literals are ignored. Returns -1 if no balanced ')' exists.
    """
    depth = 0
    quote = None  # quote character of the literal currently being skipped
    i = open_index
    while i < len(code):
        ch = code[i]
        if quote is not None:
            if ch == "\\":
                i += 1  # skip the escaped character as well
            elif ch == quote or (ch == "\n" and quote != "`"):
                # '...' and "..." cannot span lines, so a newline also ends
                # them; this stops a stray apostrophe (e.g. in a comment)
                # from swallowing the rest of the file.
                quote = None
        elif ch in "\"'`":
            quote = ch
        elif ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def extract_code_gadgets(code):
    """Extract security-relevant code gadgets from JavaScript code.

    Two kinds of gadgets are detected with regular expressions:

    * ``path.join(...)`` / ``path.resolve(...)`` calls, typed
      ``"path_traversal_with_user_input"`` when the argument list mentions
      ``req.query.*``, ``req.params.*`` or ``req.body.*`` and
      ``"path_traversal"`` otherwise.
    * ``moment().locale(x)`` calls (CVE-2022-24785) where ``x`` is either such
      a request value directly or a variable that is assigned one somewhere in
      ``code``.

    Args:
        code: JavaScript source text.

    Returns:
        A list of dicts with the keys ``"type"``, ``"source"`` (the matched
        call text) and ``"line_number"`` (1-based). Path gadgets come first,
        then locale gadgets, each group in source order.
    """
    gadgets = []
    user_input_pattern = r"(req\.query\.\w+|req\.params\.\w+|req\.body\.\w+)"

    # === Path Traversal ===
    # Only the call opener is matched by regex; the closing parenthesis is
    # located by balancing so that nested calls such as
    # ``path.join(getRoot(), req.query.file)`` are captured in full.
    call_open_pattern = r"path\.(?:join|resolve)\s*\("
    scanned_to = 0
    for opener in re.finditer(call_open_pattern, code):
        if opener.start() < scanned_to:
            continue  # nested inside a call that was already reported

        open_index = opener.end() - 1
        close_index = _find_call_end(code, open_index)
        if close_index == -1:
            # Unbalanced source: fall back to the first ')' after the opener.
            close_index = code.find(")", open_index)
        if close_index == -1:
            break  # no ')' left, so no later call can be complete either
        scanned_to = close_index + 1

        arguments = code[open_index + 1:close_index]
        if re.search(user_input_pattern, arguments):
            gadget_type = "path_traversal_with_user_input"
        else:
            gadget_type = "path_traversal"

        gadgets.append({
            "type": gadget_type,
            "source": code[opener.start():close_index + 1],
            "line_number": get_line_number(code, opener.start())
        })

    # === CVE-2022-24785: moment().locale() ===
    # 1. Collect variables assigned from request data, e.g. `locale = req.query.locale`.
    #    The scan is flow-insensitive: an assignment anywhere in the file counts.
    assignment_pattern = r"(\w+)\s*=\s*" + user_input_pattern
    tainted_vars = {m.group(1) for m in re.finditer(assignment_pattern, code)}

    # 2. Detect `moment().locale(<arg>)` where <arg> is request data passed
    #    directly (group 1) or a plain identifier (group 2).
    locale_call_pattern = (
        r"moment\(\)\.locale\s*\(\s*(?:"
        + user_input_pattern
        + r"|([a-zA-Z0-9_]+))\s*\)"
    )
    for call in re.finditer(locale_call_pattern, code):
        direct_input, var_name = call.groups()
        if direct_input or var_name in tainted_vars:
            gadgets.append({
                "type": "insecure_locale_loading (CVE-2022-24785)",
                "source": call.group(),
                "line_number": get_line_number(code, call.start())
            })

    return gadgets

def process_dataset():
    """Scan every ``.js`` file in ``INPUT_DIR`` and write the findings to ``OUTPUT_FILE``.

    Each file is read as UTF-8 and passed to ``extract_code_gadgets``. The
    report is a JSON list with one ``{"file": ..., "gadgets": [...]}`` entry
    per successfully processed file. A file that cannot be read or analysed
    is reported on stdout and left out of the report.
    """
    # os.listdir() returns entries in arbitrary order; sort so that the report
    # is reproducible across runs and platforms.
    js_files = sorted(
        os.path.join(INPUT_DIR, name)
        for name in os.listdir(INPUT_DIR)
        if name.endswith(".js")
    )

    processed_data = []
    for js_file in tqdm(js_files):
        try:
            with open(js_file, "r", encoding="utf-8") as f:
                code = f.read()
            gadgets = extract_code_gadgets(code)
            processed_data.append({
                "file": js_file,
                "gadgets": gadgets
            })
        except Exception as e:
            # Best effort: one bad file must not abort the whole run.
            print(f"Error processing {js_file}: {e}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(processed_data, f, indent=2)

# Run the dataset processing (writes the JSON report to OUTPUT_FILE)
process_dataset()

# Load and display results: the report is re-read from disk so that what is
# printed is exactly what was written, and `results` stays available afterwards.
with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
    results = json.load(f)

print(json.dumps(results, indent=2))


In [None]:
import csv
import json
import re

# CSV file containing JavaScript code snippets
INPUT_CSV = "../Datasets/output_cleaned.csv"
# Output JSON file
OUTPUT_JSON = "processed_gadgets_2.json"

def get_line_number(code, index):
    """Map a character offset in ``code`` to its 1-based line number."""
    preceding_newlines = code[:index].count("\n")
    return 1 + preceding_newlines

def _find_call_end(code, open_index):
    """Return the index of the ')' that closes the '(' at ``open_index``.

    Nested parentheses are balanced, and parentheses inside JavaScript string
    or template literals are ignored. Returns -1 if no balanced ')' exists.
    """
    depth = 0
    quote = None  # quote character of the literal currently being skipped
    i = open_index
    while i < len(code):
        ch = code[i]
        if quote is not None:
            if ch == "\\":
                i += 1  # skip the escaped character as well
            elif ch == quote or (ch == "\n" and quote != "`"):
                # '...' and "..." cannot span lines, so a newline also ends
                # them; this stops a stray apostrophe (e.g. in a comment)
                # from swallowing the rest of the snippet.
                quote = None
        elif ch in "\"'`":
            quote = ch
        elif ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def extract_code_gadgets(code):
    """Extract security-relevant code gadgets from JavaScript code.

    Detects ``path.join(...)`` / ``path.resolve(...)`` calls. A call is typed
    ``"path_traversal_with_user_input"`` when its argument list mentions
    ``req.query.*``, ``req.params.*`` or ``req.body.*`` and
    ``"path_traversal"`` otherwise.

    Args:
        code: JavaScript source text.

    Returns:
        A list of dicts with the keys ``"type"``, ``"source"`` (the matched
        call text) and ``"line_number"`` (1-based), in source order.
    """
    gadgets = []

    # Path Traversal Detection
    # Only the call opener is matched by regex; the closing parenthesis is
    # located by balancing so that nested calls such as
    # ``path.join(getRoot(), req.query.file)`` are captured in full.
    call_open_pattern = r"path\.(?:join|resolve)\s*\("
    user_input_pattern = r"(req\.query\.\w+|req\.params\.\w+|req\.body\.\w+)"

    scanned_to = 0
    for opener in re.finditer(call_open_pattern, code):
        if opener.start() < scanned_to:
            continue  # nested inside a call that was already reported

        open_index = opener.end() - 1
        close_index = _find_call_end(code, open_index)
        if close_index == -1:
            # Unbalanced source: fall back to the first ')' after the opener.
            close_index = code.find(")", open_index)
        if close_index == -1:
            break  # no ')' left, so no later call can be complete either
        scanned_to = close_index + 1

        arguments = code[open_index + 1:close_index]
        if re.search(user_input_pattern, arguments):
            gadget_type = "path_traversal_with_user_input"
        else:
            gadget_type = "path_traversal"

        gadgets.append({
            "type": gadget_type,
            "source": code[opener.start():close_index + 1],
            "line_number": get_line_number(code, opener.start())
        })

    return gadgets

def process_csv():
    """Process the CSV file and detect vulnerabilities in JavaScript code snippets.

    Reads ``INPUT_CSV`` (header row skipped; column 0 is the code snippet,
    column 1 an integer label), runs ``extract_code_gadgets`` on every
    snippet, writes the list of ``{"code", "label", "gadgets"}`` records to
    ``OUTPUT_JSON`` and prints the same records as indented JSON.

    Raises:
        ValueError: if a label cannot be converted to ``int``.
        IndexError: if a non-empty row has fewer than two columns.
    """
    processed_data = []

    # newline="" hands line-ending handling to the csv module, which is
    # required for quoted fields with embedded newlines (multi-line snippets).
    with open(INPUT_CSV, "r", encoding="utf-8", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip header; tolerate a completely empty file

        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            code, label = row[0], int(row[1])
            gadgets = extract_code_gadgets(code)
            processed_data.append({
                "code": code,
                "label": label,
                "gadgets": gadgets
            })

    with open(OUTPUT_JSON, "w", encoding="utf-8") as jsonfile:
        json.dump(processed_data, jsonfile, indent=2)

    print(json.dumps(processed_data, indent=2))

# Run the script: reads INPUT_CSV, writes OUTPUT_JSON and prints the records
process_csv()
