In [1]:
# Extract AST from C source code using clang
import clang.cindex
import sys
import json
import os
import dask.dataframe as dd # for parallel computing 
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from torch.utils.data import Dataset
from tqdm import tqdm
from typing import Optional, List, Dict, Any, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure libclang path
if os.name == 'nt':  # Windows
    print('Windows')
    clang.cindex.Config.set_library_file('D:/Project/LLVM/bin/libclang.dll')
    #clang.cindex.Config.set_library_file('D:\Project\LLVM\bin\libclang.dll')
elif os.name == 'posix':  # Linux/Mac
    print('Linux/Mac')
    clang.cindex.Config.set_library_file('/usr/lib/llvm-14/lib/libclang.so.1')

# Verify if libclang is loaded
print(clang.cindex.Config.loaded)  # Should print `True`

Windows
False


In [3]:
vdisc = pd.read_csv('Datasets/Normalized_CWE-469.csv.gz')
vdisc.head()

Unnamed: 0.1,Unnamed: 0,testID,filename,code,vuln,type
0,0,0,cwe469_0.c,"gretl_list_build (const char *s, const DATASET...",False,CWE-469
1,1,1,cwe469_1.c,rd_meta_is_broken(FILE *fp)\n{\n char buf[M...,True,CWE-469
2,2,2,cwe469_2.c,"load( f_ck_query query_func, t_CKBOOL lazy )\n...",False,CWE-469
3,3,3,cwe469_3.c,checkSupGroups (LDAP * ld)\n{\n LDAPMessage *...,True,CWE-469
4,4,4,cwe469_4.c,"dht_getxattr_unwind (call_frame_t *frame,\n ...",False,CWE-469


In [4]:
import os
import json
import pandas as pd
import dask.dataframe as dd
import clang.cindex

def save_ast(node):
    node.children = list(node.get_children())

    for child in node.children:
        save_ast(child)
        
def numbering_ast_nodes(node, counter=1):
    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)

    return counter

def generate_edgelist(ast_root):
    edges = []

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges.append([node.identifier, child.identifier])
            walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)

    return edges

def generate_features(ast_root):
    features = {}

    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        degree = out_degree

        features[node.identifier] = degree

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features

def get_source_file(datapoints):
    if len(datapoints) == 1:
        return datapoints.iloc[0]

def clang_process(testcase, **kwargs):
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    source_file = get_source_file(testcase)

    # Parsing the source code and extracting AST using clang
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=source_file.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    save_ast(ast_root)
    numbering_ast_nodes(ast_root)

    edgelist = generate_edgelist(ast_root)
    features = generate_features(ast_root)

    graph_representation = {
        "edges": edgelist,
        "features": features,
    }

    # delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(graph_representation)

def process_dataset(csv_location, output_location, num_partitions=20):
    print("Preprocess source code files and extracting AST's")

    data = pd.read_csv(csv_location)
    data = dd.from_pandas(data, npartitions=num_partitions)

    graphs = data.groupby(['testID']).apply(
        clang_process,
        axis='columns',
        meta=('processed_for_graph2vec', 'unicode'),
    )

    graph2vec_input_dir = output_location + "/graph2vec_input/"
    os.makedirs(graph2vec_input_dir, exist_ok=True)
    
    # Process partition by partition to avoid memory issues
    for partition in graphs.to_delayed():
        print("Processing partition")
        # Process each partition
        partition_result = partition.compute()
        for index, row in partition_result.items():
            print(f"Processing {index}")
            with open(graph2vec_input_dir + str(index) + ".json", 'w') as f:
                f.write(row)

    print("`-> Done.")
    return graph2vec_input_dir

In [None]:
csv_location = 'Datasets/Normalized_CWE-120.csv.gz'
output_location = 'Datasets/'
process_dataset(csv_location, output_location)

Preprocess source code files and extracting AST's


Processing partition


In [None]:
example = vdisc.iloc[0]
print(example.code)

gretl_list_build (const char *s, const DATASET *dset, int *err)
{
    char test[32];
    int *list = NULL;
    int *nlist;
    int i, v, len, nf;

    list = gretl_null_list();
    if (list == NULL) {
	*err = E_ALLOC;
	return NULL;
    }

    nf = count_fields(s, NULL);
    
    for (i=0; i<nf && !*err; i++) {
	s += strspn(s, " ");
	len = strcspn(s, " ");
	if (len > 31) {
	    *err = E_PARSE;
	} else {
	    *test = 0;
	    strncat(test, s, len);

	    /* valid elements: integers, varnames, named lists */

	    if (isdigit(*test)) {
		v = positive_int_from_string(test);
		if (v >= 0) {
		    list = gretl_list_append_term(&list, v);
		} else {
		    *err = E_PARSE;
		}
	    } else {
		v = series_index(dset, test);
		if (v < dset->v) {
		    list = gretl_list_append_term(&list, v);
		} else {
		    nlist = get_list_by_name(test);
		    if (nlist != NULL) {
			*err = gretl_list_add_list(&list, nlist);
		    } else {
			*err = E_UNKVAR;
		    }
		}
	    }

	    if (list == NULL) {
		*err = 

In [None]:
# Print the AST of the first datapoint
index = clang.cindex.Index.create()
translation_unit = index.parse(
    path=example.filename,
    unsaved_files=[(example.filename, example.code)],
)
ast_root = translation_unit.cursor

save_ast(ast_root)
numbering_ast_nodes(ast_root)

edgelist = generate_edgelist(ast_root)
features = generate_features(ast_root)

graph_representation = {
    "edges": edgelist,
    "features": features,
}

print(json.dumps(graph_representation))

# delete clang objects
del translation_unit
del ast_root
del index

{"edges": [[1, 2], [2, 3], [2, 4], [2, 5], [2, 6], [6, 7], [7, 8], [8, 9], [6, 10], [10, 11], [6, 12], [12, 13], [6, 14], [14, 15], [14, 16], [14, 17], [14, 18], [6, 19], [19, 20], [19, 21], [21, 22], [22, 23], [23, 24], [6, 25], [25, 26], [25, 27], [6, 28], [28, 29], [29, 30], [29, 31], [28, 32], [32, 33], [33, 34], [34, 35], [33, 36], [36, 37], [32, 38], [38, 39], [39, 40], [40, 41], [41, 42], [28, 43], [43, 44], [28, 45], [45, 46], [46, 47], [46, 48], [48, 49], [49, 50], [48, 51], [51, 52], [48, 53], [53, 54], [54, 55], [45, 56], [56, 57], [56, 58], [58, 59], [59, 60], [60, 61], [59, 62], [62, 63], [59, 64], [64, 65], [65, 66], [45, 67], [67, 68], [68, 69], [69, 70], [68, 71], [67, 72], [67, 73], [73, 74], [74, 75], [75, 76], [76, 77], [74, 78], [78, 79], [73, 80], [80, 81], [81, 82], [80, 83], [83, 84], [80, 85], [85, 86], [80, 87], [87, 88], [88, 89], [73, 90], [90, 91], [91, 92], [92, 93], [91, 94], [94, 95], [95, 96], [96, 97], [97, 98], [90, 99], [99, 100], [100, 101], [100, 10

In [None]:
# check python 32-bit or 64-bit
import struct
print( 8 * struct.calcsize("P"))


64
