This notebook shows the steps for extracting ASTs from source code using Clang. It prepares the graph representation for the first approach, which is based on graph2vec.

Some functions in this notebook were written with help from the following documentation and tutorials:

Clang documentation:

https://libclang.readthedocs.io/en/latest/index.html

An example of parsing C++ code using libclang in Python:

https://sudonull.com/post/907-An-example-of-parsing-C-code-using-libclang-in-Python

Clang tutorial:

https://jonasdevlieghere.com/understanding-the-clang-ast/#cursors


In [1]:
import clang.cindex
import json
import os
import dask.dataframe as dd # for parallel computing 
import numpy as np
import pandas as pd
import ctypes

Configure libclang and load the processed dataset for one CWE type (CWE-469):

In [2]:
# Tell the clang Python bindings which libclang shared library to load.
# Set the LIBCLANG_LIBRARY_FILE environment variable to use a different
# build; otherwise the original local path is used.
# The guard keeps this cell re-runnable: set_library_file() raises once
# libclang has already been loaded in the current kernel.
if not clang.cindex.Config.loaded:
    clang.cindex.Config.set_library_file(
        os.environ.get('LIBCLANG_LIBRARY_FILE', 'D:/Project/LLVM/bin/libclang.dll')
    )

In [3]:
# Load the normalized CWE-469 dataset into a pandas DataFrame.
# NOTE(review): `vdisc` is not referenced by the other cells visible here;
# graph_Representation_process() reads the CSV again from its own argument.
vdisc = pd.read_csv("Datasets/Normalized_CWE-469.csv")

In [4]:
def save_ast(node):
    """Cache the children of every node in the subtree rooted at ``node``.

    ``get_children()`` is called once per node and its result is stored as a
    list in ``node.children``, so later traversals can read the attribute
    instead of querying the node again.

    Parameters
    ----------
    node : object
        A node exposing ``get_children()`` (e.g. a clang cursor).

    Returns
    -------
    None
        The tree is annotated in place.
    """
    node.children = list(node.get_children())

    for child in node.children:
        save_ast(child)

In [5]:
def numbering_ast_nodes(node, counter=1):
    """Assign consecutive integer ids to the subtree rooted at ``node``.

    Nodes are numbered in pre-order (parent first, then each child subtree
    from first to last). The id is stored in ``node.identifier`` and the
    children list is (re)stored in ``node.children``.

    Parameters
    ----------
    node : object
        A node exposing ``get_children()``.
    counter : int, optional
        Id given to ``node`` itself; defaults to 1.

    Returns
    -------
    int
        The next unused id, i.e. ``counter`` plus the number of nodes visited.
    """
    node.identifier = counter
    node.children = list(node.get_children())

    next_id = counter + 1
    for subtree in node.children:
        next_id = numbering_ast_nodes(subtree, next_id)
    return next_id

Generate edges

In [6]:
def generate_edgelist(ast_root):
    """Return the parent-to-child edges of the tree rooted at ``ast_root``.

    Every node must already carry ``identifier`` and ``children``
    attributes. Edges are emitted depth-first: the edge to a child comes
    first, followed by all edges inside that child's subtree.

    Returns
    -------
    list of [int, int]
        ``[parent_identifier, child_identifier]`` pairs.
    """
    def iter_edges(parent):
        for kid in parent.children:
            yield [parent.identifier, kid.identifier]
            yield from iter_edges(kid)

    return list(iter_edges(ast_root))

Build features that include node index and degree

In [7]:
def generate_features(ast_root):
    """Map every node identifier to that node's out-degree.

    The out-degree is the number of entries in ``node.children``; the
    incoming edge from the parent is not counted. Keys are inserted in
    pre-order, starting with ``ast_root``.

    Returns
    -------
    dict
        ``{identifier: number_of_children}`` for each node in the tree.
    """
    def iter_degrees(current):
        yield current.identifier, len(current.children)
        for kid in current.children:
            yield from iter_degrees(kid)

    return dict(iter_degrees(ast_root))

In [8]:
def get_source_file(datapoints):
    """Return the only row of ``datapoints``, or ``None`` otherwise.

    When ``datapoints`` holds exactly one entry, that entry is returned via
    ``.iloc[0]``. For any other length (empty or several rows) the result
    is ``None``.
    """
    has_single_row = len(datapoints) == 1
    return datapoints.iloc[0] if has_single_row else None

In [9]:
def clang_process(testcase, **kwargs):
    """Parse one test case with libclang and serialise its AST for graph2vec.

    Parameters
    ----------
    testcase : pandas.DataFrame
        Rows of a single test case. Each row must expose ``filename`` and
        ``code``; the code is handed to clang as an in-memory (unsaved) file.
    **kwargs
        Accepted and ignored, so extra keyword arguments forwarded by a
        groupby-apply call do not break the function.

    Returns
    -------
    str
        JSON document with two entries: ``"edges"`` (list of
        ``[parent_id, child_id]`` pairs) and ``"features"`` (node id mapped
        to its number of children; JSON encoding turns the ids into strings).

    Raises
    ------
    ValueError
        If ``get_source_file`` cannot pick a file, which happens whenever the
        test case does not consist of exactly one row.
    clang.cindex.TranslationUnitLoadError
        If clang cannot parse the file; the message names the file.
    """
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    source_file = get_source_file(testcase)
    if source_file is None:
        # Fail with a clear message instead of an AttributeError on `.filename`.
        raise ValueError(
            "clang_process expects exactly one source file per test case, "
            "got {} rows".format(len(testcase))
        )

    # Parsing the source code and extracting AST using clang
    index = clang.cindex.Index.create()
    try:
        translation_unit = index.parse(
            path=source_file.filename,
            unsaved_files=parse_list,
        )
    except clang.cindex.TranslationUnitLoadError as err:
        # Same exception type, but say which file failed so it can be found
        # among the many test cases processed in one run.
        raise clang.cindex.TranslationUnitLoadError(
            "Error parsing translation unit for {!r}".format(source_file.filename)
        ) from err
    ast_root = translation_unit.cursor

    save_ast(ast_root)
    numbering_ast_nodes(ast_root)

    graph_representation = {
        "edges": generate_edgelist(ast_root),
        "features": generate_features(ast_root),
    }

    # delete clang objects
    del translation_unit
    del ast_root
    del index

    return json.dumps(graph_representation)

In [None]:
def graph_Representation_process(csv_location, output_location, num_partitions=20):
    """Build one graph2vec input file per test case listed in a CSV.

    The CSV is read with pandas, split into ``num_partitions`` dask
    partitions, grouped by ``testID`` and each group is converted to a JSON
    graph by ``clang_process``. Every result is written to
    ``<output_location>/graph2vec_input/<testID>.json``.

    Parameters
    ----------
    csv_location : str
        Path of the CSV file; it must contain a ``testID`` column plus the
        columns ``clang_process`` reads.
    output_location : str
        Base directory; the ``graph2vec_input`` sub-directory is created
        inside it if missing.
    num_partitions : int, optional
        Number of dask partitions (default 20).

    Returns
    -------
    str
        The directory the JSON files were written to.
    """
    print("Preprocess source code files and extracting AST's")

    source_frame = pd.read_csv(csv_location)
    dask_frame = dd.from_pandas(source_frame, npartitions=num_partitions)

    graphs = dask_frame.groupby(['testID']).apply(
        clang_process,
        # Forwarded to clang_process as a keyword argument, which ignores it.
        axis='columns',
        meta=('processed_for_graph2vec', 'unicode'),
    )

    graph2vec_input_dir = output_location + "/graph2vec_input/"
    os.makedirs(graph2vec_input_dir, exist_ok=True)

    # Series.iteritems() was removed in pandas 2.0. Compute one partition at a
    # time and walk it with Series.items(), which keeps memory use bounded to
    # a single partition of results.
    for partition_number in range(graphs.npartitions):
        partition = graphs.get_partition(partition_number).compute()
        for index, row in partition.items():
            print("Current Iteration: "+str(index))
            with open(graph2vec_input_dir + str(index) + ".json", 'w') as f:
                f.write(row)

    print("`-> Done.")

    return graph2vec_input_dir

Process all source code using dask

In [13]:
# Input CSV and the base folder under which the graph2vec JSON files go.
csvLocation = "Datasets/Normalized_CWE-469.csv"
outputLocation = "graph2vec/dataset/"

# Kept as the last expression so the returned directory path is displayed.
graph_Representation_process(
    csv_location=csvLocation,
    output_location=outputLocation,
)

Preprocess source code files and extracting AST's


TranslationUnitLoadError: Error parsing translation unit.