In [1]:
import os
import json
import esprima
from tqdm import tqdm

In [None]:
def extract_code_gadgets(code):
    gadgets = []
    
    try:
        # Parse the JavaScript code into an AST
        ast = esprima.parseScript(code, {
            'range': True,
            'tolerant': True,
            'jsx': False
        })
    except Exception as e:
        print(f"🔥 Parsing failed: {str(e)}")
        return gadgets

    # Define sensitive APIs to look for
    sensitive_apis = [
        'fs.readFile', 'fs.readFileSync', 'path.join', 'path.resolve',
        'moment.locale', 'require'  # Added new sensitive targets
    ]

    class GadgetExtractor(esprima.NodeVisitor):
        def __init__(self):
            self.sensitive_operations = []
            self.user_inputs = set()
            self.debug_log = []

        def visit_VariableDeclarator(self, node):
            """Detect user inputs in complex expressions"""
            try:
                if node.init:
                    # Recursive check for user input patterns
                    def check_expression(n):
                        if n.type == 'MemberExpression':
                            source = self.get_source_code(n)
                            if any(s in source for s in ['req.query', 'req.params', 'req.body']):
                                return True
                        elif n.type == 'LogicalExpression':
                            return check_expression(n.left) or check_expression(n.right)
                        elif n.type == 'ConditionalExpression':
                            return check_expression(n.test) or check_expression(n.consequent) or check_expression(n.alternate)
                        return False
                    
                    if check_expression(node.init):
                        var_name = node.id.name
                        self.user_inputs.add(var_name)
                        self.debug_log.append(f"🎯 Found user input: {var_name} = {self.get_source_code(node.init)}")
                        self.sensitive_operations.append({
                            'node': node,
                            'type': 'user_input',
                            'source': self.get_source_code(node)
                        })
            except Exception as e:
                print(f"Error in VariableDeclarator: {str(e)}")
            self.generic_visit(node)

        def visit_CallExpression(self, node):
            """Detect both API calls and require() usage"""
            try:
                # Detect direct require() calls
                if node.callee.name == 'require' and node.arguments:
                    arg_source = self.get_source_code(node.arguments[0])
                    if any(var in arg_source for var in self.user_inputs):
                        self.debug_log.append(f"❗ Detected dynamic require: {arg_source}")
                        self.sensitive_operations.append({
                            'node': node,
                            'type': 'dynamic_require',
                            'source': self.get_source_code(node)
                        })

                # Detect method calls with user-controlled arguments
                if node.callee.type == 'MemberExpression':
                    obj = self.unwind_member_expression(node.callee)
                    method_call = f"{obj}.{node.callee.property.name}"
                    
                    # Check if any arguments are user-controlled
                    user_args = [
                        arg for arg in node.arguments
                        if any(var in self.get_source_code(arg) for var in self.user_inputs)
                    ]
                    
                    if user_args and method_call in sensitive_apis:
                        self.debug_log.append(f"🔧 Detected tainted API call: {method_call}")
                        self.sensitive_operations.append({
                            'node': node,
                            'type': method_call,
                            'source': self.get_source_code(node)
                        })
                        
            except AttributeError:
                pass
            self.generic_visit(node)

        def unwind_member_expression(self, node):
            """Handle complex member expressions"""
            parts = []
            while node.type == 'MemberExpression':
                parts.append(node.property.name)
                node = node.object
            return node.name + '.' + '.'.join(reversed(parts)) if parts else node.name

        def get_source_code(self, node):
            return code[node.range[0]:node.range[1]] if hasattr(node, 'range') else ''

    extractor = GadgetExtractor()
    extractor.visit(ast)

    # Create code gadgets with context
    for op in extractor.sensitive_operations:
        lines = code.split('\n')
        start_line = code[:op['node'].range[0]].count('\n')
        end_line = code[:op['node'].range[1]].count('\n')
        
        gadget = {
            'type': op['type'],
            'source': op['source'],
            'context': lines[max(0, start_line-2):end_line+3],
            'line_numbers': (start_line+1, end_line+1)
        }
        gadgets.append(gadget)

    return gadgets

In [4]:
# Configuration
INPUT_DIR = "../SourceCode/Test"  # Folder containing your JS files
OUTPUT_FILE = "processed_gadgets.json"  # Output file for results

def process_dataset():
    # Get list of JS files
    js_files = [os.path.join(INPUT_DIR, f) for f in os.listdir(INPUT_DIR) if f.endswith(".js")]
    
    # Process each file
    processed_data = []
    for js_file in tqdm(js_files, desc="Processing files"):
        try:
            with open(js_file, "r") as f:
                code = f.read()
            
            # Extract gadgets
            gadgets = extract_code_gadgets(code)
            
            # Save results
            processed_data.append({
                "file": js_file,
                "gadgets": gadgets
            })
        except Exception as e:
            print(f"❌ Error processing {js_file}: {str(e)}")
    
    # Save all results to a JSON file
    with open(OUTPUT_FILE, "w") as f:
        json.dump(processed_data, f, indent=2)
    
    print(f"\n✅ Processing complete! Results saved to {OUTPUT_FILE}")

# Run the dataset processing
process_dataset()

# %% [markdown]
# ## Step 5: View Results
# The results are saved in `processed_gadgets.json`. You can load and view them here:

# %%
# Load and display results
with open(OUTPUT_FILE, "r") as f:
    results = json.load(f)

print(json.dumps(results, indent=2))

Processing files: 100%|██████████| 3/3 [00:00<00:00, 339.86it/s]

❌ Error processing ../SourceCode/Test/vuln4.js: unsupported operand type(s) for +: 'NoneType' and 'str'
❌ Error processing ../SourceCode/Test/vuln2.js: unsupported operand type(s) for +: 'NoneType' and 'str'

✅ Processing complete! Results saved to processed_gadgets.json
[
  {
    "file": "../SourceCode/Test/vuln.js",
    "gadgets": []
  }
]



