In [1]:
# Extract AST from C source code using clang
import clang.cindex
import sys
import json
import os
import dask.dataframe as dd # for parallel computing 
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from torch.utils.data import Dataset
from tqdm import tqdm
from typing import Optional, List, Dict, Any, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure libclang path
if os.name == 'nt':  # Windows
    print('Windows')
    clang.cindex.Config.set_library_file('D:/Project/LLVM/bin/libclang.dll')
    #clang.cindex.Config.set_library_file('D:\Project\LLVM\bin\libclang.dll')
elif os.name == 'posix':  # Linux/Mac
    print('Linux/Mac')
    clang.cindex.Config.set_library_file('/usr/lib/llvm-14/lib/libclang.so.1')

# Verify if libclang is loaded
print(clang.cindex.Config.loaded)  # Should print `True`

Windows
False


In [3]:
vdisc = pd.read_csv('Datasets/Normalized_CWE-469.csv.gz')
vdisc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5250 entries, 0 to 5249
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5250 non-null   int64 
 1   testID      5250 non-null   int64 
 2   filename    5250 non-null   object
 3   code        5250 non-null   object
 4   vuln        5250 non-null   bool  
 5   type        5250 non-null   object
dtypes: bool(1), int64(2), object(3)
memory usage: 210.3+ KB


In [4]:
vdisctemp = pd.read_csv('Datasets/Normalized_CWE-120.csv.gz')
vdisctemp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95320 entries, 0 to 95319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  95320 non-null  int64 
 1   testID      95320 non-null  int64 
 2   filename    95320 non-null  object
 3   code        95320 non-null  object
 4   vuln        95320 non-null  bool  
 5   type        95320 non-null  object
dtypes: bool(1), int64(2), object(3)
memory usage: 3.7+ MB


In [7]:
import os
import json
import pandas as pd
import dask.dataframe as dd
import clang.cindex

def save_ast(node):
    node.children = list(node.get_children())

    for child in node.children:
        save_ast(child)
        
def numbering_ast_nodes(node, counter=1):
    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)

    return counter

def generate_edgelist(ast_root):
    edges = []

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges.append([node.identifier, child.identifier])
            walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)

    return edges

def generate_features(ast_root):
    features = {}

    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        degree = out_degree

        features[node.identifier] = degree

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features

def get_source_file(datapoints):
    if len(datapoints) == 1:
        return datapoints.iloc[0]

def clang_process(testcase, **kwargs):
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    source_file = get_source_file(testcase)

    # Parsing the source code and extracting AST using clang
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=source_file.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    save_ast(ast_root)
    numbering_ast_nodes(ast_root)

    edgelist = generate_edgelist(ast_root)
    features = generate_features(ast_root)

    graph_representation = {
        "edges": edgelist,
        "features": features,
    }

    # delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(graph_representation)

def process_dataset(csv_location, output_location):
    print("Preprocessing source code files and extracting ASTs")

    data = dd.read_csv(csv_location, blocksize="25MB")  # Read as Dask DF
    num_partitions = data.npartitions  # Dynamically get number of partitions

    graph2vec_input_dir = os.path.join(output_location, "graph2vec_input/")
    os.makedirs(graph2vec_input_dir, exist_ok=True)

    print(f"Total Partitions: {num_partitions}")  # Debugging

    for partition_index in range(num_partitions):  # Process only available partitions
        print(f"Processing Partition {partition_index}...")  # Debugging
        batch = data.partitions[partition_index]  
        batch = batch.compute()  # Convert only this partition to Pandas

        graphs = batch.groupby(['testID']).apply(
            clang_process,
            axis=1
        )

        for index, row in graphs.items():
            with open(os.path.join(graph2vec_input_dir, f"{index}.json"), 'w') as f:
                f.write(row)

    print("✅ Done.")
    return graph2vec_input_dir

In [8]:
csv_location = 'Datasets/Normalized_CWE-469.csv'
output_location = 'graph2vec/dataset/'
process_dataset(csv_location, output_location)

Preprocessing source code files and extracting ASTs
Total Partitions: 1
Processing Partition 0...


  graphs = batch.groupby(['testID']).apply(


✅ Done.


'graph2vec/dataset/graph2vec_input/'

# Run this after when it extracts the data from the input file.

```bash
python graph2vec.py --input-path dataset/graph2vec_input/ --output-path features/<file_name>.csv
```