In [14]:
import os
import json
import esprima
from tqdm import tqdm
import pandas as pd
import networkx as nx
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [15]:
data = pd.read_csv("../Datasets/output_cleaned.csv")
data

Unnamed: 0,code,label
0,const fs = require('fs');\nconst path = requir...,1
1,const fs = require('fs'); const path = require...,0
2,var fs = require('fs');\nvar filePath = fs.rea...,1
3,const express = require('express');\nconst app...,1
4,const childProcess = require('child_process');...,1
...,...,...
176,(function() {\n var http = require('http');...,1
177,if (fs.existsSync(this.pluginsDir + na...,1
178,get url() {\n return `http://${this[incom...,1
179,getFilenameFromUrl(url) {\n const publicPath ...,1


In [16]:
# count label frequency
data['label'].value_counts()

label
1    98
0    83
Name: count, dtype: int64

In [None]:
def create_ast_graph(code):
    """Convert JavaScript code to a NetworkX graph representation of the AST"""
    try:
        # Parse the code into an AST
        ast = esprima.parseScript(code)
        
        # Create a directed graph
        G = nx.DiGraph()
        
        def traverse_ast(node, parent_id=None, edge_name=None):
            # Create unique node ID
            node_id = len(G.nodes)
            
            # Add node with its type as attribute
            G.add_node(node_id, type=node.type)
            
            # Add edge from parent if it exists
            if parent_id is not None:
                G.add_edge(parent_id, node_id, type=edge_name)
            
            # Recursively process all child nodes
            for key, value in node.items():
                if isinstance(value, esprima.nodes.Node):
                    traverse_ast(value, node_id, key)
                elif isinstance(value, list):
                    for item in value:
                        if isinstance(item, esprima.nodes.Node):
                            traverse_ast(item, node_id, key)
                            
        traverse_ast(ast.body[0] if ast.body else ast)
        return G
    except:
        # Return empty graph if parsing fails
        return nx.DiGraph()

# Convert all code samples to graphs
graphs = []
labels = []

print("Converting code to AST graphs...")
for idx, row in tqdm(data.iterrows(), total=len(data)):
    graph = create_ast_graph(row['code'])
    if graph.number_of_nodes() > 0:  # Only include valid graphs
        graphs.append(graph)
        labels.append(row['label'])

# Convert graphs to feature vectors using graph2vec
print("Creating graph embeddings...")

Converting code to AST graphs...


100%|██████████| 181/181 [00:00<00:00, 2710.87it/s]

Creating graph embeddings...





In [11]:
import esprima
import torch
import numpy as np
from torch_geometric.data import Data

def save_ast(node):
    """Recursively save children of each AST node."""
    if not hasattr(node, 'type'):
        return
    
    node.children = []
    for key, value in node.items():
        if key == 'type':
            continue
        if isinstance(value, dict):
            node.children.append(value)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    node.children.append(item)
    
    for child in node.children:
        save_ast(child)

def numbering_ast_nodes(node, counter=1):
    """Assign unique identifiers to each AST node."""
    if not hasattr(node, 'type'):
        return counter
    
    node.identifier = counter
    counter += 1
    
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)
    
    return counter

def generate_edgelist(ast_root):
    """Generate edge list for the AST graph."""
    edges = [[], []]

    def walk_tree_and_add_edges(node):
        for child in node.children:
            if hasattr(child, 'identifier'):
                edges[0].append(node.identifier - 1)
                edges[1].append(child.identifier - 1)
                walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)
    return torch.tensor(edges, dtype=torch.long)

def generate_features(ast_root):
    """Generate node features for the AST graph."""
    features = []

    def walk_tree_and_set_features(node):
        if not hasattr(node, 'type'):
            return
        
        out_degree = len(node.children)
        node_id = node.identifier
        features.append([node_id, out_degree])

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)
    return torch.tensor(features, dtype=torch.float)

def js_process(code, label):
    """Process JavaScript code into a graph representation."""
    try:
        # Parse JavaScript code into AST
        ast = esprima.parseScript(code, {'range': True, 'tolerant': True})
        ast_dict = ast.toDict()
        
        # Save and number AST nodes
        save_ast(ast_dict)
        numbering_ast_nodes(ast_dict)
        
        # Generate graph edges and features
        edges = generate_edgelist(ast_dict)
        features = generate_features(ast_dict)
        
        # Create PyTorch Geometric Data object
        y = torch.tensor([label], dtype=torch.int64)
        return Data(x=features, edge_index=edges, y=y)
    except Exception as e:
        print(f"Error processing code: {str(e)}")
        return None

# Example usage
code = """
const fs = require('fs');
const path = require('path');
fs.readFileSync(path.join(__dirname, 'file.txt'), 'utf8');
"""
label = 1  # Vulnerable

data = js_process(code, label)
print(data)

Error processing code: 'dict' object has no attribute 'children'
None
