In [1]:
import os
import json
import esprima
from tqdm import tqdm
import pandas as pd

In [2]:
def extract_code_gadgets(code):
    gadgets = []
    
    try:
        ast = esprima.parseScript(code, {
            'range': True,
            'tolerant': True,
            'jsx': False
        })
    except Exception as e:
        print(f"🔥 Parsing failed: {str(e)}")
        return gadgets

    sensitive_apis = [
        "fs.readFile", "fs.readFileSync", "fs.createReadStream",
        "fs.writeFile", "fs.writeFileSync", "fs.appendFile", "fs.appendFileSync",
        "fs.unlink", "fs.unlinkSync", "fs.rm", "fs.rmSync",
        "fs.readdir", "fs.readdirSync",
        "fs.existsSync", "fs.access", "fs.accessSync",
        "path.join", "path.resolve", "path.normalize",
        "child_process.exec", "child_process.execSync",
        "child_process.spawn", "child_process.spawnSync",
        "child_process.fork",
        "eval", "Function", "require", "import",
        "moment.locale"
    ]


    class GadgetExtractor(esprima.NodeVisitor):
        def __init__(self):
            self.sensitive_operations = []
            self.user_inputs = set()
            self.debug_log = []

        def get_source_code(self, node):
            """Safely extract source code with range validation"""
            if not hasattr(node, 'range') or node.range is None:
                return None
            start, end = node.range
            if start is None or end is None:
                return None
            return code[start:end]

        def visit_VariableDeclarator(self, node):
            try:
                if node.init:
                    source = self.get_source_code(node.init)
                    if source and any(s in source for s in ['req.query', 'req.params', 'req.body']):
                        var_name = node.id.name
                        self.user_inputs.add(var_name)
                        self.sensitive_operations.append({
                            'node': node,
                            'type': 'user_input',
                            'source': self.get_source_code(node)
                        })
            except Exception as e:
                print(f"VariableDeclarator error: {str(e)}")
            self.generic_visit(node)

        def visit_CallExpression(self, node):
            try:
                # Detect require() with user input
                if hasattr(node.callee, 'name') and node.callee.name == 'require':
                    arg_source = self.get_source_code(node.arguments[0])
                    if arg_source and any(var in arg_source for var in self.user_inputs):
                        self.sensitive_operations.append({
                            'node': node,
                            'type': 'dynamic_require',
                            'source': self.get_source_code(node)
                        })

                # Detect sensitive API calls
                if node.callee.type == 'MemberExpression':
                    obj = self.unwind_member_expression(node.callee)
                    method_call = f"{obj}.{node.callee.property.name}"
                    
                    if method_call in sensitive_apis:
                        self.sensitive_operations.append({
                            'node': node,
                            'type': method_call,
                            'source': self.get_source_code(node)
                        })
            except Exception as e:
                print(f"CallExpression error: {str(e)}")
            self.generic_visit(node)

        def unwind_member_expression(self, node):
            """Robust member expression unwinding"""
            parts = []
            try:
                while node.type == 'MemberExpression':
                    parts.append(node.property.name)
                    node = node.object
                base = node.name if hasattr(node, 'name') else self.get_source_code(node)
                return f"{base}.{'.'.join(reversed(parts))}" if parts else base
            except Exception:
                return "UnknownExpression"

    extractor = GadgetExtractor()
    extractor.visit(ast)

    # Safely create gadgets
    for op in extractor.sensitive_operations:
        # Validate node and range
        if not op['source']:
            continue
            
        node = op['node']
        if not hasattr(node, 'range') or node.range is None:
            continue
            
        start, end = node.range
        if start is None or end is None:
            continue

        # Calculate line numbers safely
        try:
            start_line = code[:start].count('\n') + 1
            end_line = code[:end].count('\n') + 1
            context_start = max(0, start_line - 3)  # 0-based index
            context_end = end_line + 1  # +2 lines after
            lines = code.split('\n')
            gadget = {
                'type': op['type'],
                'source': op['source'],
                'context': lines[context_start:context_end],
                'line_numbers': (start_line, end_line)
            }
            gadgets.append(gadget)
        except Exception as e:
            print(f"Skipping gadget due to error: {str(e)}")

    return gadgets

In [3]:
# Configuration
INPUT_CSV = "../Datasets/output_cleaned.csv"  # Path to your CSV file
OUTPUT_FILE = "processed_gadgets_3.csv"  # Output file for results

def process_csv_dataset():
    # Load the CSV dataset
    try:
        df = pd.read_csv(INPUT_CSV)
    except Exception as e:
        print(f"❌ Error loading CSV file: {str(e)}")
        return

    # Process each row in the dataset
    results = []
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing dataset"):
        try:
            code = row['code']
            label = row['label']
            
            # Extract gadgets
            gadgets = extract_code_gadgets(code)
            
            # Flatten gadgets into individual rows
            for gadget in gadgets:
                # Join the context lines into a single string
                gadget_context = "\n".join(gadget['context'])
                results.append({
                    'gadget': gadget_context,
                    'label': label
                })
        except Exception as e:
            print(f"❌ Error processing row {index}: {str(e)}")

    # Save results to a new CSV file
    try:
        results_df = pd.DataFrame(results)
        results_df.to_csv(OUTPUT_FILE, index=False)
        print(f"\n✅ Processing complete! Results saved to {OUTPUT_FILE}")
    except Exception as e:
        print(f"❌ Error saving results: {str(e)}")

# Run the dataset processing
process_csv_dataset()

Processing dataset:  35%|███▍      | 63/181 [00:00<00:00, 308.24it/s]

🔥 Parsing failed: Line 4: Unexpected token ILLEGAL
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: Line 21: Unexpected token ILLEGAL
🔥 Parsing failed: Line 5: Unexpected token ILLEGAL
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: Line 8: Unexpected token ILLEGAL
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: Line 1: Unexpected token ILLEGAL
🔥 Parsing failed: Line 1: Unexpected token ILLEGAL
🔥 Parsing failed: Line 1: Unexpected token ILLEGAL
🔥 Parsing failed: Line 1: Unexpected token ILLEGAL
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: object of type 'float' has no len

Processing dataset:  78%|███████▊  | 142/181 [00:00<00:00, 308.62it/s]

🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: Line 6: Unexpected token }
🔥 Parsing failed: Line 6: Unexpected token ]
🔥 Parsing failed: Line 1: Missing initializer in const declaration
🔥 Parsing failed: Line 1: Unexpected identifier


Processing dataset: 100%|██████████| 181/181 [00:00<00:00, 317.49it/s]

🔥 Parsing failed: Line 1: Unexpected token :
🔥 Parsing failed: Line 1: Unexpected token (
🔥 Parsing failed: Line 1: Unexpected identifier
🔥 Parsing failed: object of type 'float' has no len()
🔥 Parsing failed: Line 1: Unexpected identifier
🔥 Parsing failed: Line 1: Unexpected token {

✅ Processing complete! Results saved to processed_gadgets_3.csv





In [4]:
import re

# JavaScript keywords; immutable set, REMOVED 'function' and 'const' as they are not considered keywords
'''
=========================== REMOVED 'function' and 'const' ===========================
'''
keywords = frozenset({'abstract', 'await', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class', 'const',
                      'continue', 'debugger', 'default', 'delete', 'do', 'double', 'else', 'enum', 'export', 'extends',
                      'false', 'final', 'finally', 'float', 'for', 'goto', 'if', 'implements', 'import',
                      'in', 'instanceof', 'int', 'interface', 'let', 'long', 'native', 'new', 'null', 'package',
                      'private', 'protected', 'public', 'return', 'short', 'static', 'super', 'switch', 'synchronized',
                      'this', 'throw', 'throws', 'transient', 'true', 'try', 'typeof', 'var', 'void', 'volatile', 'while',
                      'with', 'yield'})


# Holds known common function names; immutable set (in place of 'main' in C++)
main_set = frozenset({'init', 'start'})

# JavaScript/TypeScript doesn’t have argc/argv, but you may add similar terms as needed.
main_args = frozenset({})

# Compile regex patterns
rx_comment = re.compile(r'\*/\s*$')
rx_fun = re.compile(r'\b([_A-Za-z]\w*)\b(?=\s*\()')
rx_var = re.compile(r'\b([_A-Za-z]\w*)\b(?:(?=\s*\w+\()|(?!\s*\w+))(?!\s*\()')
rx_str_lit = re.compile(r'"(?:\\.|[^"\\])*"|\'(?:\\.|[^\'\\])*\'|`(?:\\.|[^`\\])*`')
rx_non_ascii = re.compile(r'[^\x00-\x7f]')

def clean_string_literals_and_non_ascii(line):
    # Remove all string literals
    line = rx_str_lit.sub('""', line)
    # Replace any non-ASCII characters with empty string
    return rx_non_ascii.sub('', line)

def clean_gadget(gadget):
    fun_symbols = {}
    var_symbols = {}

    fun_count = 1
    var_count = 1

    cleaned_gadget = []

    for line in gadget:
        if not rx_comment.search(line):
            line = clean_string_literals_and_non_ascii(line)
            user_fun = rx_fun.findall(line)
            user_var = rx_var.findall(line)

            for fun_name in user_fun:
                if fun_name not in main_set | keywords:
                    if fun_name not in fun_symbols:
                        fun_symbols[fun_name] = f'FUN{fun_count}'
                        fun_count += 1
                    line = re.sub(rf'\b{fun_name}\b(?=\s*\()', fun_symbols[fun_name], line)

            for var_name in user_var:
                if var_name not in keywords | main_args:
                    if var_name not in var_symbols:
                        var_symbols[var_name] = f'VAR{var_count}'
                        var_count += 1
                    line = re.sub(rf'\b{var_name}\b(?:(?=\s*\w+\()|(?!\s*\w+))(?!\s*\()', var_symbols[var_name], line)

            cleaned_gadget.append(line)
    return cleaned_gadget

In [5]:
# clean the gadgets and save them to a new CSV file
def clean_gadgets():
    try:
        df = pd.read_csv(OUTPUT_FILE)
    except Exception as e:
        print(f"❌ Error loading CSV file: {str(e)}")
        return

    cleaned_gadgets = []
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Cleaning gadgets"):
        try:
            gadget = row['gadget']
            label = row['label']
            cleaned_gadget = clean_gadget(gadget.split('\n'))
            cleaned_gadgets.append({
                'gadget': '\n'.join(cleaned_gadget),
                'label': label
            })
        except Exception as e:
            print(f"❌ Error cleaning gadget {index}: {str(e)}")

    try:
        cleaned_df = pd.DataFrame(cleaned_gadgets)
        cleaned_df.to_csv("cleaned_gadgets.csv", index=False)
        print(f"\n✅ Cleaning complete! Cleaned gadgets saved to cleaned_gadgets.csv")
    except Exception as e:
        print(f"❌ Error saving cleaned gadgets: {str(e)}")
        
clean_gadgets()

Cleaning gadgets: 100%|██████████| 71/71 [00:00<00:00, 1831.97it/s]


✅ Cleaning complete! Cleaned gadgets saved to cleaned_gadgets.csv





ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject