In [1]:
import networkx as nx
import numpy as np
import os
import pickle
import glob
from multiprocessing import Pool
from functools import partial
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm import tqdm

In [2]:
def graph_extraction(dot):
    """Parse a Graphviz DOT file into a NetworkX graph.

    Args:
        dot: Location of the ``.dot`` file, passed straight through to
            ``nx.drawing.nx_pydot.read_dot``.

    Returns:
        Whatever graph object ``read_dot`` builds from the file.
    """
    return nx.drawing.nx_pydot.read_dot(dot)

In [3]:
# Directory layout used by the driver cells at the bottom of the notebook.
# The *_input_dir folders are searched for PDG ``*.dot`` files and the matching
# *_output_dir folders receive one ``.pkl`` per processed graph. Paths are
# relative to the notebook's working directory.
vul_input_dir = "./data/pdgs/Vul"
no_vul_input_dir = "./data/pdgs/No-Vul"
vul_output_dir = "./data/outputs/Vul"
no_vul_output_dir = "./data/outputs/No-Vul"

In [4]:
# Device used for the embedding model and its inputs. Prefer the GPU, but fall
# back to the CPU so the notebook still runs (slowly) on machines without CUDA
# instead of failing at the first ``.to(device)`` call.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
# Load the tokenizer and model for the checkpoint named below, then move the
# model onto `device`. Both objects are module-level globals read by
# sentence_embedding().
# NOTE(review): trust_remote_code=True lets the checkpoint repository run its
# own Python code at load time; consider pinning a `revision` so the code that
# runs cannot change between executions.
checkpoint = "Salesforce/codet5p-110m-embedding"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

In [6]:
def sentence_embedding(sentence):
    """Embed one piece of text with the module-level tokenizer and model.

    Args:
        sentence: Text to encode; image_generation() passes one code snippet
            per graph node.

    Returns:
        Element ``[0]`` of the model's output for the encoded text. The result
        is not moved off ``device`` here, and no gradient context is set up;
        callers that need either must handle it themselves.
    """
    token_ids = tokenizer.encode(sentence, return_tensors="pt")
    return model(token_ids.to(device))[0]

In [7]:
def image_generation(dot):
    """Build three centrality-weighted embedding channels for one PDG file.

    The graph is read from ``dot``, a code snippet is recovered from every
    node's ``label`` attribute, each snippet is embedded with
    sentence_embedding(), and the embedding is scaled by the node's degree,
    closeness and Katz centrality respectively.

    Args:
        dot: Path to the PDG ``.dot`` file.

    Returns:
        A tuple ``(degree_channel, closeness_channel, katz_channel)`` of
        lists with one NumPy array per labelled node, or ``None`` when the
        file could not be processed (malformed label, centrality failure,
        model error, ...). Failures are reported on stdout and skipped on
        purpose so one bad graph does not abort a long batch run.

    Raises:
        KeyboardInterrupt, SystemExit: These are deliberately *not* swallowed,
        so the batch loop can still be interrupted.
    """
    try:
        pdg = graph_extraction(dot)

        # Map node id -> code text recovered from the node's 'label' attribute.
        node_code = {}
        for node_id, raw_label in nx.get_node_attributes(pdg, 'label').items():
            # Keep what follows the first comma, drop the final two characters,
            # and cut at the first literal backslash-n sequence in the label.
            # A label without a comma raises ValueError, which rejects the file.
            code = raw_label[raw_label.index(",") + 1:-2].split('\\n')[0]
            node_code[node_id] = code.replace("static void", "void")

        degree_cen_dict = nx.degree_centrality(pdg)
        closeness_cen_dict = nx.closeness_centrality(pdg)

        # Katz centrality is computed on a plain DiGraph rebuilt from the PDG's
        # nodes and edges (presumably because read_dot yields a multigraph
        # type that katz_centrality does not accept -- confirm against the
        # NetworkX docs).
        simple_graph = nx.DiGraph()
        simple_graph.add_nodes_from(pdg.nodes())
        simple_graph.add_edges_from(pdg.edges())
        katz_cen_dict = nx.katz_centrality(simple_graph)

        degree_channel = []
        closeness_channel = []
        katz_channel = []
        # Inference only: no gradients are needed for the embeddings.
        with torch.no_grad():
            for node_id, code in node_code.items():
                # Move to the CPU first so the tensor can be converted to NumPy.
                line_vec = np.array(sentence_embedding(code).to("cpu"))

                degree_channel.append(degree_cen_dict[node_id] * line_vec)
                closeness_channel.append(closeness_cen_dict[node_id] * line_vec)
                katz_channel.append(katz_cen_dict[node_id] * line_vec)

        return (degree_channel, closeness_channel, katz_channel)
    except Exception as exc:
        # Best effort: skip the file, but say why. Catching Exception (rather
        # than using a bare except) lets Ctrl-C / kernel interrupts propagate.
        print(f"image_generation: skipping {dot}: {exc!r}")
        return None

In [8]:
def write_to_pkl(dot, out, existing_files):
    """Generate the channels for one ``.dot`` file and pickle them.

    Args:
        dot: Path to the input ``.dot`` file; both ``/`` and ``\\`` are
            treated as path separators when deriving the base name.
        out: Output directory, with or without a trailing separator.
        existing_files: Container of base names (no directory, no extension)
            that already have an output; matching inputs are skipped.

    Returns:
        Always ``None``. On success ``<out>/<base name>.pkl`` is written and
        holds the list ``[degree_channel, closeness_channel, katz_channel]``.
        Nothing is written when the input is skipped or when
        image_generation() returns ``None``.
    """
    file_name = dot.split('/')[-1].split('\\')[-1]
    # Strip only the trailing extension so names that merely contain ".dot"
    # (e.g. "a.dot1.dot") are not truncated into colliding base names.
    if file_name.endswith('.dot'):
        dot_name = file_name[:-len('.dot')]
    else:
        dot_name = file_name

    if dot_name in existing_files:
        return None

    print(dot_name)
    channels = image_generation(dot)
    if channels is None:
        return None

    degree_channel, closeness_channel, katz_channel = channels
    # os.path.join inserts the separator only when `out` lacks one, so the
    # file lands inside `out` either way.
    out_pkl = os.path.join(out, dot_name + '.pkl')
    with open(out_pkl, 'wb') as f:
        pickle.dump([degree_channel, closeness_channel, katz_channel], f)
    return None

In [9]:
def main(input_dir, output_dir):
    """Convert every ``*.dot`` file in ``input_dir`` into a ``.pkl`` in ``output_dir``.

    Inputs whose base name already has a ``.pkl`` in ``output_dir`` are
    skipped, so an interrupted run can be resumed by calling this again.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.dot`` files.
        output_dir: Directory receiving the pickles; created if missing.
            Either path may be given with or without a trailing separator.

    Returns:
        ``None``. Prints the number of input files and shows a progress bar.
    """
    # Joining with an empty final component appends a separator only when one
    # is missing (and copes with an empty string, unlike indexing [-1]).
    input_dir = os.path.join(input_dir, "")
    output_dir = os.path.join(output_dir, "")

    dotfiles = glob.glob(input_dir + '*.dot')
    os.makedirs(output_dir, exist_ok=True)

    # Base names (no directory, no ".pkl") of outputs that already exist.
    # write_to_pkl() compares an input's base name against this set.
    existing_files = {
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(output_dir + '*.pkl')
    }

    print("Number of files =", len(dotfiles))

    # Files are processed sequentially in this process.
    for dot in tqdm(dotfiles):
        write_to_pkl(dot, output_dir, existing_files)

In [None]:
# Convert the PDGs under vul_input_dir and write the pickles to vul_output_dir.
main(vul_input_dir, vul_output_dir)

Number of files = 12303


In [17]:
# Convert the PDGs under no_vul_input_dir and write the pickles to no_vul_output_dir.
main(no_vul_input_dir, no_vul_output_dir)

21057
