In [1]:
# Extract AST from C source code using clang

import clang.cindex
import sys
import json
import os
import dask.dataframe as dd # for parallel computing 
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from torch.utils.data import Dataset
from tqdm import tqdm
from typing import Optional, List, Dict, Any, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
vdisc = pd.read_csv('Datasets/Normalized_CWE-469.csv.gz')

In [None]:
def save_ast(node: clang.cindex.Cursor) -> None:
    """
    Recursively saves the children of a node in the AST.
    
    Args:
        node (clang.cindex.Cursor): The root node of the AST.
    """
    node.children = list(node.get_children())
    for child in node.children:
        save_ast(child)


def number_ast_nodes(node: clang.cindex.Cursor, counter: int = 1) -> int:
    """
    Assigns unique identifiers to AST nodes in a depth-first manner.
    
    Args:
        node (clang.cindex.Cursor): The root node of the AST.
        counter (int): The starting identifier for numbering.
    
    Returns:
        int: The next available identifier after numbering all nodes.
    """
    node.identifier = counter
    counter += 1
    
    node.children = list(node.get_children())
    for child in node.children:
        counter = number_ast_nodes(child, counter)
    
    return counter


def generate_edge_list(ast_root: clang.cindex.Cursor) -> torch.Tensor:
    """
    Generates an edge list tensor from the AST.
    
    Args:
        ast_root (clang.cindex.Cursor): The root node of the AST.
    
    Returns:
        torch.Tensor: A tensor representing the edges in the AST.
    """
    edges = [[], []]

    def walk_tree_and_add_edges(node: clang.cindex.Cursor) -> None:
        for child in node.children:
            edges[0].append(node.identifier - 1)  # Source node
            edges[1].append(child.identifier - 1)  # Target node
            walk_tree_and_add_edges(child)
    
    walk_tree_and_add_edges(ast_root)
    return torch.tensor(edges, dtype=torch.long)


def generate_node_features(ast_root: clang.cindex.Cursor) -> torch.Tensor:
    """
    Generates node feature tensors from the AST.
    
    Args:
        ast_root (clang.cindex.Cursor): The root node of the AST.
    
    Returns:
        torch.Tensor: A tensor representing the node features.
    """
    features = []

    def walk_tree_and_set_features(node: clang.cindex.Cursor) -> None:
        degree = len(node.children)  # Out-degree
        features.append([node.identifier, degree])
        
        for child in node.children:
            walk_tree_and_set_features(child)
    
    walk_tree_and_set_features(ast_root)
    return torch.tensor(np.array(features), dtype=torch.float)


def process_test_case(test_case: pd.Series) -> Optional[Data]:
    """
    Processes a test case and extracts the AST representation.
    
    Args:
        test_case (pd.Series): A row from the DataFrame containing filename, code, and vulnerability label.
    
    Returns:
        Optional[Data]: A PyTorch Geometric Data object representing the AST graph.
    """
    try:
        index = clang.cindex.Index.create()
        translation_unit = index.parse(
            path=test_case.filename,
            unsaved_files=[(test_case.filename, test_case.code)],
        )
        
        ast_root = translation_unit.cursor
        save_ast(ast_root)
        number_ast_nodes(ast_root)
        
        edge_index = generate_edge_list(ast_root)
        node_features = generate_node_features(ast_root)
        y = torch.tensor([test_case.vuln], dtype=torch.int64)  # Use 'vuln' instead of 'bug'
        
        return Data(x=node_features, edge_index=edge_index, y=y)
    
    except Exception as e:
        print(f"Error processing test case {test_case.filename}: {e}")
        return None

def process_dataset(csv_location: str, output_location: str, num_partitions: int = 20) -> str:
    """
    Processes the source code dataset and extracts graph representations.
    
    Args:
        csv_location (str): Path to the CSV file containing the dataset.
        output_location (str): Directory to save the processed graph representations.
        num_partitions (int): Number of partitions for parallel processing.
    
    Returns:
        str: Path to the directory containing the processed graph files.
    """
    print("Preprocessing source code files and extracting ASTs")
    
    # Load and partition the dataset
    data = pd.read_csv(csv_location)
    dask_data = dd.from_pandas(data, npartitions=num_partitions)
    
    # Process each test case in parallel
    graphs = dask_data.map_partitions(
        lambda df: df.apply(process_test_case, axis=1)
    ).compute()
    
    # Save processed graphs to disk
    graph2vec_input_dir = os.path.join(output_location, "graph2vec_input")
    os.makedirs(graph2vec_input_dir, exist_ok=True)
    
    for index, graph in enumerate(graphs):
        if graph is not None:
            file_path = os.path.join(graph2vec_input_dir, f"{index}.pt")
            torch.save(graph, file_path)
            print(f"Saved graph {index} to {file_path}")
    
    print("-> Done.")
    return graph2vec_input_dir

: 

In [None]:
csv_location = 'Datasets/Normalized_CWE-469.csv.gz'
output_location = 'Datasets/Graph2Vec_Input'
process_dataset(csv_location, output_location, num_partitions=20)

Preprocessing source code files and extracting ASTs
Error processing test case a: Error parsing translation unit.
Error processing test case <NA>: 'NAType' object has no attribute 'encode'


Exception ignored in: <function TranslationUnit.__del__ at 0x0000029060A849A0>
Traceback (most recent call last):
  File "c:\Users\ireen\AppData\Local\Programs\Python\Python311\Lib\site-packages\clang\cindex.py", line 3148, in __del__
    conf.lib.clang_disposeTranslationUnit(self)
OSError: exception: access violation reading 0xFFFFFFFF95C85308


In [None]:
import pyarrow
print(pyarrow.__version__)

19.0.1


In [3]:
import torch
import numpy as np
import os
import pandas as pd
import dask.dataframe as dd
import clang.cindex
from torch_geometric.data import Data


def save_ast(node):
    """Recursively saves the children of a node."""
    node.children = list(node.get_children())
    for child in node.children:
        save_ast(child)


def numbering_ast_nodes(node, counter: int = 1) -> int:
    """Assigns unique identifiers to AST nodes."""
    node.identifier = counter
    counter += 1
    
    node.children = list(node.get_children())
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)
    
    return counter


def generate_edgelist(ast_root) -> torch.Tensor:
    """Generates an edge list tensor from the AST."""
    edges = [[], []]

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges[0].append(node.identifier - 1)
            edges[1].append(child.identifier - 1)
            walk_tree_and_add_edges(child)
    
    walk_tree_and_add_edges(ast_root)
    return torch.tensor(edges, dtype=torch.long)


def generate_features(ast_root) -> torch.Tensor:
    """Generates node feature tensors from the AST."""
    features = []

    def walk_tree_and_set_features(node):
        degree = len(node.children)  # Out-degree
        features.append([node.identifier, degree])
        
        for child in node.children:
            walk_tree_and_set_features(child)
    
    walk_tree_and_set_features(ast_root)
    return torch.tensor(np.array(features), dtype=torch.float)


def clang_process(testcase) -> Data:
    """Processes a test case and extracts the AST representation."""
    parse_list = [(testcase.filename, testcase.code)]
    
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=testcase.filename,
        unsaved_files=parse_list,
    )
    
    ast_root = translation_unit.cursor
    save_ast(ast_root)
    numbering_ast_nodes(ast_root)
    
    graph_embedding = generate_edgelist(ast_root)
    node_embedding = generate_features(ast_root)
    y = torch.tensor([testcase.bug], dtype=torch.int64)
    
    # Cleanup clang objects
    del translation_unit, ast_root, index
    
    return Data(x=node_embedding, edge_index=graph_embedding, y=y)


def graph_representation_process(csv_location: str, output_location: str, num_partitions: int = 20) -> str:
    """Processes the source code dataset and extracts graph representations."""
    print("Preprocessing source code files and extracting ASTs")
    
    data = pd.read_csv(csv_location)
    data = dd.from_pandas(data, npartitions=num_partitions)
    
    graphs = data.groupby(['testID']).apply(
        clang_process, axis='columns', meta=('processed_for_graph2vec', 'unicode')
    )
    
    graph2vec_input_dir = os.path.join(output_location, "graph2vec_input")
    os.makedirs(graph2vec_input_dir, exist_ok=True)
    
    for index, row in graphs.iteritems():
        print(f"Current Iteration: {index}")
        with open(os.path.join(graph2vec_input_dir, f"{index}.json"), 'w') as f:
            f.write(row)
    
    print("-> Done.")
    return graph2vec_input_dir

In [4]:
csv_location = 'Datasets/Normalized_CWE-469.csv.gz'
output_location = 'Datasets/Graph2Vec_Input'
graph_representation_process(csv_location, output_location, num_partitions=20)

Preprocessing source code files and extracting ASTs


AttributeError: 'Series' object has no attribute 'iteritems'