In [6]:
import os
import json
import esprima
from tqdm import tqdm
import pandas as pd
import pandas as pd
import networkx as nx
import graph2vec
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
import esprima

def code_to_ast_with_nodes(code):
    """Extract AST with detailed node information"""
    try:
        # Parse code with full syntax details
        ast = esprima.parseScript(code, {
            'range': True,       # Get character ranges
            'loc': True,         # Get line/column positions
            'tokens': True,      # Include tokens
            'comment': True      # Include comments
        })
        
        # Extract nodes with type and source code
        nodes = []
        def _traverse(node, parent=None):
            if not isinstance(node, dict):
                return
                
            node_info = {
                'type': node['type'],
                'range': node.get('range', None),
                'loc': node.get('loc', None),
                'source': code[node['range'][0]:node['range'][1]] if 'range' in node else None
            }
            nodes.append(node_info)
            
            # Recursively process child nodes
            for key in node:
                if key in ['type', 'range', 'loc']:
                    continue
                value = node[key]
                if isinstance(value, dict):
                    _traverse(value, node_info)
                elif isinstance(value, list):
                    for item in value:
                        if isinstance(item, dict):
                            _traverse(item, node_info)

        _traverse(ast.toDict())
        return nodes
        
    except Exception as e:
        print(f"AST extraction failed: {str(e)}")
        return []

In [12]:
import networkx as nx

def ast_to_graph(nodes):
    """Convert AST nodes to a graph structure"""
    G = nx.DiGraph()
    
    # Add nodes with properties
    for idx, node in enumerate(nodes):
        G.add_node(idx, **node)
        
    # Add edges based on parent-child relationships
    parent_stack = []
    current_parent = None
    
    for idx, node in enumerate(nodes):
        while parent_stack and node['range'][0] > parent_stack[-1]['range'][1]:
            parent_stack.pop()
            
        if parent_stack:
            current_parent = parent_stack[-1]
            G.add_edge(parent_stack.index(current_parent), idx)
            
        if 'children' in node:  # For composite nodes
            parent_stack.append(node)
            
    return G

In [13]:
# Example code from your dataset
sample_code = """
const express = require('express');
const app = express();
app.get('/file/:filePath', (req, res) => {
  const filePath = req.params.filePath;
  const newFilePath = filePath + '/etc/passwd';
  res.sendFile(newFilePath);
});
"""

# Extract AST nodes
ast_nodes = code_to_ast_with_nodes(sample_code)

# Convert to graph
ast_graph = ast_to_graph(ast_nodes)

# Show sample output
print(f"Extracted {len(ast_nodes)} AST nodes")
print("\nFirst 5 nodes:")
for node in ast_nodes[:5]:
    print(f"{node['type']}: {node['source'][:50]}...")

Extracted 96 AST nodes

First 5 nodes:
Program: const express = require('express');
const app = ex...
VariableDeclaration: const express = require('express');...
VariableDeclarator: express = require('express')...
Identifier: express...
CallExpression: require('express')...


In [None]:
test = """
const fs = require('fs');
const filePath = fs.readFileSync('path/to/file.txt', 'utf8');
const newFilePath = filePath + '/etc/passwd';
fs.chmodSync(newFilePath, 0o777);
"""

# AST 
ast = esprima.parseScript(test)

# convert ast to dictionary and save as json
ast_dict = ast.toDict()
with open('ast.json', 'w') as f:
    json.dump(ast_dict, f, indent=4)

In [17]:
import esprima
import torch
import numpy as np

class ASTNode:
    def __init__(self, node, identifier):
        self.node = node
        self.identifier = identifier
        self.children = []

    def add_child(self, child):
        self.children.append(child)

def traverse_ast(node, counter=[1]):
    if not isinstance(node, dict):
        return None

    current_node = ASTNode(node, counter[0])
    counter[0] += 1

    for key, value in node.items():
        if isinstance(value, dict):
            child_node = traverse_ast(value, counter)
            if child_node:
                current_node.add_child(child_node)
        elif isinstance(value, list):
            for item in value:
                child_node = traverse_ast(item, counter)
                if child_node:
                    current_node.add_child(child_node)

    return current_node

def generate_edgelist(ast_root):
    edges = [[], []]

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges[0].append(node.identifier - 1)
            edges[1].append(child.identifier - 1)
            walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)
    return torch.tensor(edges, dtype=torch.long)

def generate_features(ast_root):
    features = []

    def walk_tree_and_set_features(node):
        degree = len(node.children)
        features.append([node.identifier, degree])

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    features_array = np.asarray(features)
    return torch.tensor(features_array, dtype=torch.float)

def process_js_code(js_code, bug_label):
    parsed_ast = esprima.parseScript(js_code, { "range": True })
    ast_root = traverse_ast(parsed_ast.toDict())

    graphs_embedding = generate_edgelist(ast_root)
    nodes_embedding = generate_features(ast_root)
    y = torch.tensor([bug_label], dtype=torch.int64)

    return torch.utils.data.DataLoader([(nodes_embedding, graphs_embedding, y)])

# Example JavaScript Code
js_code = """
function add(a, b) {
    return a + b;
}
"""

data_loader = process_js_code(js_code, bug_label=1)  # 1 for vulnerable

In [20]:
import esprima

def extract_ast(js_code):
    return esprima.parseScript(js_code, tolerant=True)


In [21]:
import torch
from torch_geometric.data import Data

def traverse_ast(node, edges, node_features, parent_id=None, node_id=0):
    """
    Recursively traverse AST, store node features and edges.
    """
    current_id = node_id
    node_features.append([current_id, len(node.body) if hasattr(node, 'body') else 1])

    if parent_id is not None:
        edges[0].append(parent_id)
        edges[1].append(current_id)

    if hasattr(node, 'body'):
        for child in node.body:
            node_id = traverse_ast(child, edges, node_features, current_id, node_id + 1)

    return node_id

def generate_graph(js_code):
    ast_root = extract_ast(js_code)
    edges = [[], []]
    node_features = []

    traverse_ast(ast_root, edges, node_features)

    x = torch.tensor(node_features, dtype=torch.float)
    edge_index = torch.tensor(edges, dtype=torch.long)

    return Data(x=x, edge_index=edge_index)


  from .autonotebook import tqdm as notebook_tqdm


In [22]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)


In [23]:
js_code = """
function test(x) {
    if (x < 0) {
        console.log("Negative");
    } else {
        console.log("Positive");
    }
}
"""
graph_data = generate_graph(js_code)

model = GNN(in_channels=2, hidden_channels=16, out_channels=2)
output = model(graph_data)
print(output)

TypeError: object of type 'BlockStatement' has no len()