In [1]:
# Extract AST from C source code using clang
import clang.cindex
import sys
import json
import os
import dask.dataframe as dd # for parallel computing 
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from torch.utils.data import Dataset
from tqdm import tqdm
from typing import Optional, List, Dict, Any, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configure libclang path
# Each platform has a default libclang location; set the LIBCLANG_PATH
# environment variable to use another build without editing this cell.
# `sys.platform` is used because only `sys` is imported; a bare `platform`
# name is undefined on a fresh kernel.
if sys.platform.startswith("linux"):  # also covers the legacy "linux2" value
    print("Linux")
    default_libclang = '/usr/lib/llvm-14/lib/libclang.so'
elif sys.platform == "darwin":
    print("OS X")
    default_libclang = '/Library/Developer/CommandLineTools/usr/lib/libclang.dylib'
elif sys.platform == "win32":
    print("Windows")
    default_libclang = 'D:/Project/LLVM/bin/libclang.dll'
else:
    default_libclang = None

libclang_file = os.environ.get("LIBCLANG_PATH", default_libclang)

# set_library_file() raises once libclang has been loaded, so skip it when the
# cell is re-run in a kernel that already uses the library.
if libclang_file and not clang.cindex.Config.loaded:
    clang.cindex.Config.set_library_file(libclang_file)

# Verify if libclang is loaded.
# Config.loaded only becomes True when the shared library is actually opened,
# which happens lazily on first use, so force a load before reporting it.
try:
    clang.cindex.Index.create()
except clang.cindex.LibclangError as exc:
    # Typical causes: wrong path, or Python bindings newer/older than libclang.
    print(f"libclang could not be loaded: {exc}")
print(clang.cindex.Config.loaded)  # Should print `True`

Linux/Mac
None
False


## Load dataset

In [3]:
# vdisc = pd.read_csv('Datasets/Normalized_CWE-469.csv.gz')
# vdisc.info()

In [4]:
def save_ast(node):
    """Materialise the child list of every node below (and including) ``node``.

    Each visited node receives a ``children`` attribute holding the list built
    from its ``get_children()`` iterator. Nothing is returned.
    """
    kids = list(node.get_children())
    node.children = kids

    # Depth-first descent so the whole subtree ends up with cached children.
    for kid in kids:
        save_ast(kid)
        
def numbering_ast_nodes(node, counter=1):
    """Label the subtree rooted at ``node`` with consecutive integers, pre-order.

    ``node`` itself gets ``counter`` as its ``identifier``; its descendants get
    the following integers in depth-first order. Each visited node also gets a
    ``children`` list built from ``get_children()``.

    Returns the first integer that was not assigned to any node.
    """
    node.identifier = counter
    node.children = list(node.get_children())

    next_free = counter + 1
    for kid in node.children:
        # Each child reports where its own subtree stopped numbering.
        next_free = numbering_ast_nodes(kid, next_free)
    return next_free

def generate_edgelist(ast_root):
    """Collect the parent -> child edges of an already numbered AST.

    Every node reachable from ``ast_root`` must expose ``children`` and
    ``identifier``. The result is a list of ``[parent_id, child_id]`` pairs in
    depth-first order (an edge is emitted before the child's own edges).
    """

    def _edges_below(parent):
        for kid in parent.children:
            yield [parent.identifier, kid.identifier]
            yield from _edges_below(kid)

    return list(_edges_below(ast_root))

def generate_features(ast_root):
    """Map each node identifier of the AST to its number of children.

    Every node reachable from ``ast_root`` must expose ``children`` and
    ``identifier``. Entries are inserted in depth-first pre-order.
    """
    features = {}

    pending = [ast_root]
    while pending:
        current = pending.pop()
        # The only feature recorded per node is its out-degree.
        features[current.identifier] = len(current.children)
        # Push right-to-left so the leftmost child is handled next (pre-order).
        pending.extend(reversed(current.children))

    return features

def get_source_file(datapoints):
    """Pick the source file of a test case.

    Returns the first (and only) row, via ``.iloc[0]``, when ``datapoints``
    holds exactly one entry; for any other size the result is ``None``.
    """
    if len(datapoints) != 1:
        return None
    return datapoints.iloc[0]

def clang_process(testcase, **kwargs):
    """Parse one test case with Clang and return its AST as a JSON graph.

    Args:
        testcase: the rows of a single test case. Rows are read through
            ``itertuples()`` and must expose ``filename`` and ``code``.
        **kwargs: accepted and ignored, so extra keyword arguments forwarded
            by the caller do not break the call.

    Returns:
        A JSON string with two keys: ``"edges"`` (list of
        ``[parent_id, child_id]`` pairs) and ``"features"`` (node id ->
        number of children).

    Raises:
        ValueError: if ``get_source_file`` cannot pick a file to parse for
            this test case.
    """
    source_file = get_source_file(testcase)
    if source_file is None:
        # Fail with a clear message instead of an AttributeError on
        # `None.filename` further down.
        raise ValueError(
            "get_source_file() returned no entry point for a test case "
            f"with {len(testcase)} row(s)"
        )

    # Every file of the test case is handed to Clang in memory, so nothing
    # has to exist on disk.
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    # Parsing the source code and extracting AST using clang
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=source_file.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # numbering_ast_nodes() both numbers the nodes and builds every node's
    # `children` list, so a separate save_ast() pass is not needed.
    numbering_ast_nodes(ast_root)

    graph_representation = {
        "edges": generate_edgelist(ast_root),
        "features": generate_features(ast_root),
    }

    # Integer feature keys become strings in the JSON output.
    return json.dumps(graph_representation)

def process_dataset(csv_location, output_location):
    """Extract one AST graph per test case and write it as graph2vec input.

    The CSV is read with Dask in 25MB blocks. Each partition is converted to
    pandas on its own, grouped by ``testID`` and passed to ``clang_process``.
    Each resulting JSON string is written to
    ``<output_location>/graph2vec_input/<testID>.json``.

    Args:
        csv_location: path of the CSV file; it must contain a ``testID``
            column plus the columns ``clang_process`` reads.
        output_location: directory under which ``graph2vec_input/`` is
            created if missing.

    Returns:
        The path of the ``graph2vec_input/`` directory.
    """
    print("Preprocessing source code files and extracting ASTs")

    # NOTE(review): partitions are processed independently, so a testID whose
    # rows straddle a partition boundary is grouped twice and the later JSON
    # overwrites the earlier one. Confirm the dataset keeps each test case
    # (and each quoted multi-line `code` field) inside one 25MB block.
    data = dd.read_csv(csv_location, blocksize="25MB")  # Read as Dask DF
    num_partitions = data.npartitions  # Dynamically get number of partitions

    graph2vec_input_dir = os.path.join(output_location, "graph2vec_input/")
    os.makedirs(graph2vec_input_dir, exist_ok=True)

    print(f"Total Partitions: {num_partitions}")  # Debugging

    for partition_index in range(num_partitions):  # Process only available partitions
        print(f"Processing Partition {partition_index}...")  # Debugging
        # Convert only this partition to Pandas
        batch = data.partitions[partition_index].compute()

        if batch.empty:
            # groupby().apply() on an empty frame does not yield a Series of
            # JSON strings, so there is nothing to write for this partition.
            continue

        # No `axis` argument: GroupBy.apply forwards unknown keywords to the
        # applied function, which would just ignore it.
        graphs = batch.groupby(['testID']).apply(clang_process)

        for test_id, graph_json in graphs.items():
            with open(os.path.join(graph2vec_input_dir, f"{test_id}.json"), 'w') as f:
                f.write(graph_json)

    print("✅ Done.")
    return graph2vec_input_dir

In [5]:
# Output root first, then the input CSV; both are handed to process_dataset by name.
output_location = 'graph2vec/dataset/'
csv_location = 'Datasets/Normalized_CWE-469.csv'
# Last expression of the cell, so the returned directory path is shown as output.
process_dataset(csv_location=csv_location, output_location=output_location)

Preprocessing source code files and extracting ASTs
Total Partitions: 1
Processing Partition 0...


LibclangError: dlsym(0x7ed8fb60, clang_getOffsetOfBase): symbol not found. Please ensure that your python bindings are compatible with your libclang.so version.

## Run this once the data has been extracted from the input file.

```bash
python graph2vec.py --input-path dataset/graph2vec_input/ --output-path features/<file_name>.csv
```