In [1]:
# Extract AST from C source code using clang
import clang.cindex
import sys
import json
import os
import dask.dataframe as dd # for parallel computing 
import numpy as np
import pandas as pd
import torch
from torch_geometric.data import Data
from torch.utils.data import Dataset
from tqdm import tqdm
from typing import Optional, List, Dict, Any, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
vdisc = pd.read_csv('Datasets/Normalized_CWE-469.csv.gz')
vdisc.head()

Unnamed: 0,testID,filename,code,vuln,type
0,0,cwe469_0.c,"gretl_list_build (const char *s, const DATASET...",False,CWE-469
1,1,cwe469_1.c,rd_meta_is_broken(FILE *fp)\n{\n char buf[M...,True,CWE-469
2,2,cwe469_2.c,"load( f_ck_query query_func, t_CKBOOL lazy )\n...",False,CWE-469
3,3,cwe469_3.c,checkSupGroups (LDAP * ld)\n{\n LDAPMessage *...,True,CWE-469
4,4,cwe469_4.c,"dht_getxattr_unwind (call_frame_t *frame,\n ...",False,CWE-469


In [3]:
def save_ast(node):
    node.children = list(node.get_children())

    for child in node.children:
        counter = save_ast(child)
        
def numbering_ast_nodes(node, counter=1):
    node.identifier = counter
    counter += 1

    node.children = list(node.get_children())
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)

    return counter

def generate_edgelist(ast_root):
    edges = []

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges.append([node.identifier, child.identifier])
            walk_tree_and_add_edges(child)

    walk_tree_and_add_edges(ast_root)

    return edges

def generate_features(ast_root):
    
    features = {}

    def walk_tree_and_set_features(node):
        out_degree = len(node.children)
        #in_degree = 1
        #degree = out_degree + in_degree
        degree = out_degree

        features[node.identifier] = degree

        for child in node.children:
            walk_tree_and_set_features(child)

    walk_tree_and_set_features(ast_root)

    return features

def get_source_file(datapoints):
    if len(datapoints) == 1:
        return datapoints.iloc[0]

def clang_process(testcase, **kwargs):
    
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    source_file= get_source_file(testcase)

    # Parsing the source code and extracting AST using clang
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=source_file.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    save_ast(ast_root)
    numbering_ast_nodes(ast_root)

    edgelist = generate_edgelist(ast_root)

    features = generate_features(ast_root)

    graph_representation = {
        "edges": edgelist,
        "features": features,
    }

    # delete clang objects
    del translation_unit
    del ast_root
    del index

    # Writing to sample.json
    # with open("sample.json", "w") as outfile:
    #     json.dump(graph2vec_representation,outfile)
    return json.dumps(graph_representation)

def process_dataset(csv_location, output_location, num_partitions=20):
    print("Preprocess source code files and extracting AST's")

    data = pd.read_csv(csv_location)
    data = dd.from_pandas(data, npartitions=num_partitions)

    graphs = data.groupby(['testID']).apply(
        clang_process,
        axis='columns',
        meta=('processed_for_graph2vec', 'unicode'),
    )

    # Compute the Dask DataFrame to get a pandas Series
    computed_graphs = graphs.compute()

    graph2vec_input_dir = output_location + "/graph2vec_input/"
    os.makedirs(graph2vec_input_dir, exist_ok=True)

    # Use items() instead of iteritems() on the computed pandas Series
    for index, row in computed_graphs.items():
        print("Current Iteration: " + str(index))
        with open(graph2vec_input_dir + str(index) + ".json", 'w') as f:
            f.write(row)

    print("-> Done.")
    return graph2vec_input_dir

csv_location = 'Datasets/Normalized_CWE-469.csv.gz'
output_location = 'Datasets/'
process_dataset(csv_location, output_location, num_partitions=20)

Preprocess source code files and extracting AST's


LibclangError: Could not find module 'libclang.dll' (or one of its dependencies). Try using the full path with constructor syntax.. To provide a path to libclang use Config.set_library_path() or Config.set_library_file().

In [None]:
csv_location = 'Datasets/Normalized_CWE-469.csv.gz'
output_location = 'Datasets/'
process_dataset(csv_location, output_location, num_partitions=20)

Preprocess source code files and extracting AST's


: 

In [None]:
import pyarrow
print(pyarrow.__version__)

19.0.1


In [3]:
import torch
import numpy as np
import os
import pandas as pd
import dask.dataframe as dd
import clang.cindex
from torch_geometric.data import Data


def save_ast(node):
    """Recursively saves the children of a node."""
    node.children = list(node.get_children())
    for child in node.children:
        save_ast(child)


def numbering_ast_nodes(node, counter: int = 1) -> int:
    """Assigns unique identifiers to AST nodes."""
    node.identifier = counter
    counter += 1
    
    node.children = list(node.get_children())
    for child in node.children:
        counter = numbering_ast_nodes(child, counter)
    
    return counter


def generate_edgelist(ast_root) -> torch.Tensor:
    """Generates an edge list tensor from the AST."""
    edges = [[], []]

    def walk_tree_and_add_edges(node):
        for child in node.children:
            edges[0].append(node.identifier - 1)
            edges[1].append(child.identifier - 1)
            walk_tree_and_add_edges(child)
    
    walk_tree_and_add_edges(ast_root)
    return torch.tensor(edges, dtype=torch.long)


def generate_features(ast_root) -> torch.Tensor:
    """Generates node feature tensors from the AST."""
    features = []

    def walk_tree_and_set_features(node):
        degree = len(node.children)  # Out-degree
        features.append([node.identifier, degree])
        
        for child in node.children:
            walk_tree_and_set_features(child)
    
    walk_tree_and_set_features(ast_root)
    return torch.tensor(np.array(features), dtype=torch.float)


def clang_process(testcase) -> Data:
    """Processes a test case and extracts the AST representation."""
    parse_list = [(testcase.filename, testcase.code)]
    
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=testcase.filename,
        unsaved_files=parse_list,
    )
    
    ast_root = translation_unit.cursor
    save_ast(ast_root)
    numbering_ast_nodes(ast_root)
    
    graph_embedding = generate_edgelist(ast_root)
    node_embedding = generate_features(ast_root)
    y = torch.tensor([testcase.bug], dtype=torch.int64)
    
    # Cleanup clang objects
    del translation_unit, ast_root, index
    
    return Data(x=node_embedding, edge_index=graph_embedding, y=y)


def graph_representation_process(csv_location: str, output_location: str, num_partitions: int = 20) -> str:
    """Processes the source code dataset and extracts graph representations."""
    print("Preprocessing source code files and extracting ASTs")
    
    data = pd.read_csv(csv_location)
    data = dd.from_pandas(data, npartitions=num_partitions)
    
    graphs = data.groupby(['testID']).apply(
        clang_process, axis='columns', meta=('processed_for_graph2vec', 'unicode')
    )
    
    graph2vec_input_dir = os.path.join(output_location, "graph2vec_input")
    os.makedirs(graph2vec_input_dir, exist_ok=True)
    
    for index, row in graphs.iteritems():
        print(f"Current Iteration: {index}")
        with open(os.path.join(graph2vec_input_dir, f"{index}.json"), 'w') as f:
            f.write(row)
    
    print("-> Done.")
    return graph2vec_input_dir

In [4]:
csv_location = 'Datasets/Normalized_CWE-469.csv.gz'
output_location = 'Datasets/Graph2Vec_Input'
graph_representation_process(csv_location, output_location, num_partitions=20)

Preprocessing source code files and extracting ASTs


AttributeError: 'Series' object has no attribute 'iteritems'