### Convert Small ACFGs

In [22]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import glob
import os
import pickle as pkl
import numpy as np
import scipy as sp
import pandas as pd
from networkx import number_of_nodes, adjacency_matrix, all_neighbors
from node_attributes import node_features


"""
Read files under input_dir, aggregate ACFGs into
the txt format defined in https://github.com/littlepretty/pytorch_DGCNN/tree/master/data.
"""
# Size tag that is embedded in both the input and output directory names.
MAX_GRAPH_SIZE = 480
input_dir = 'Small%dACFGs/'  % MAX_GRAPH_SIZE
output_dir = 'DGCNN_Small%dACFGs/' % MAX_GRAPH_SIZE

# A graph's integer label is the position of its class name in CFG_labels.
CFG_labels = ['Bifrose', 'Bagle', 'Benign']
# Hard-coded graph counts; their sum is used as the total graph count.
# NOTE(review): presumably one count per class -- confirm the ordering.
CFG_cnts = [139, 139, 132]

# Make sure the destination directory exists before anything is written.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# One "<label>/<graph id>" entry per graph to convert.
CFG_filepaths = np.loadtxt(os.path.join(input_dir, 'small_graph_filenames.csv'),
                           dtype=str)
print("Input: %s, Output: %s" % (input_dir, output_dir))

def list2str(l1, l2):
    """
    Concatenate two sequences and render the result as one string.

    The items of l1 come first, followed by the items of l2; every item is
    converted with str() and the pieces are joined by single spaces.
    Two empty inputs produce the empty string.
    """
    merged = list(l1)
    merged.extend(l2)
    return " ".join(map(str, merged))


# Write every graph listed in CFG_filepaths into one text file laid out as:
#   first line : total number of graphs
#   per graph  : "<number of nodes> <integer label>"
#   per node   : "1 <number of neighbors> <neighbor indices...> <features...>"
# (the node tag is always written as 1).

# A filename list with a single entry loads as a 0-d array, which cannot be
# iterated; atleast_1d makes that case behave like any other.
graph_paths = np.atleast_1d(CFG_filepaths)
test_cnt = 0

# Text mode ('w'), because every line written is a str: a handle opened with
# 'wb' rejects str on Python 3. The with-block also guarantees the file is
# closed when loading one of the graphs raises part-way through.
with open(output_dir + 'cfg.txt', 'w') as output:
    # The header must equal the number of graph records emitted below, so it
    # is taken from the list being iterated rather than from hard-coded counts.
    output.write("%d\n" % len(graph_paths))

    for pkl_filename in graph_paths:
        # Entries look like "<label name>/<graph id>".
        path_parts = pkl_filename.split('/')
        label = CFG_labels.index(path_parts[0])
        graph_id = path_parts[1]

        # ndmin=2 keeps single-node graphs as 2-D arrays.
        features = np.loadtxt(input_dir + graph_id + '.features.txt', dtype=int, ndmin=2)
        adjacent = np.loadtxt(input_dir + graph_id + '.adjacent.txt', dtype=int, ndmin=2)

        # One node line is written per adjacency row while the graph header
        # declares features.shape[0] nodes; a mismatch would corrupt the file.
        if adjacent.shape[0] != features.shape[0]:
            raise ValueError(
                "Graph %s: %d adjacency rows but %d feature rows"
                % (graph_id, adjacent.shape[0], features.shape[0]))

        test_cnt += 1
        output.write("%d %d\n" % (features.shape[0], label))

        for (i, row) in enumerate(adjacent):
            # Column indices of the nonzero entries are the node's neighbors.
            neighbors = np.flatnonzero(row)
            feature = features[i, :]
            output.write("1 %d %s\n" % (len(neighbors), list2str(neighbors, feature)))

print("[Finished] Convert %d ACFGs to DGCNN txt format" % test_cnt)

Input: Small480ACFGs/, Output: DGCNN_Small480ACFGs/


### Convert All ACFGs

In [31]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pickle as pkl
import numpy as np
import scipy as sp
import pandas as pd

"""
Read files under input_dir, aggregate ACFGs into
the txt format defined in https://github.com/littlepretty/pytorch_DGCNN/tree/master/data.
"""
input_dir = 'AllACFGs/'
output_dir = 'DGCNN_AllACFGs/'

# A graph's integer label is the position of its class name in CFG_labels.
CFG_labels = ['Bifrose', 'Bagle', 'Benign']

# Make sure the destination directory exists before anything is written.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# One "<label>/<graph id>" entry per graph to convert.
CFG_filepaths = np.loadtxt(os.path.join(input_dir, 'graph_filenames.csv'),
                           dtype=str)
CFG_sizes = pd.read_csv(os.path.join(input_dir, 'graph_sizes.csv'), header=0)
# count() gives the number of non-null entries in each column of the CSV.
CFG_cnts = CFG_sizes.count()
print("Input: %s, Output: %s\n%s" % (input_dir, output_dir, CFG_cnts))


Input: AllACFGs/, Output: DGCNN_AllACFGs/
Benign      518
Bagle       152
Bifrose    1019
dtype: int64


In [32]:
def list2str(l1, l2):
    """
    Concatenate two sequences and render the result as one string.

    The items of l1 come first, followed by the items of l2; every item is
    converted with str() and the pieces are joined by single spaces.
    Two empty inputs produce the empty string.
    """
    merged = list(l1)
    merged.extend(l2)
    return " ".join(map(str, merged))


# Write every graph listed in CFG_filepaths into one text file laid out as:
#   first line : total number of graphs
#   per graph  : "<number of nodes> <integer label>"
#   per node   : "1 <number of neighbors> <neighbor indices...> <features...>"
# (the node tag is always written as 1).

# `import scipy as sp` alone does not load the sparse submodule on older
# SciPy releases; importing it explicitly makes `sp.sparse` safe to use here.
import scipy.sparse

# A filename list with a single entry loads as a 0-d array, which cannot be
# iterated; atleast_1d makes that case behave like any other.
graph_paths = np.atleast_1d(CFG_filepaths)
test_cnt = 0

# Text mode ('w'), because every line written is a str: a handle opened with
# 'wb' rejects str on Python 3. The with-block also guarantees the file is
# closed when loading one of the graphs raises part-way through.
with open(output_dir + 'cfg.txt', 'w') as output:
    # The header must equal the number of graph records emitted below, so it
    # is taken from the list being iterated rather than from a separate CSV.
    output.write("%d\n" % len(graph_paths))

    for pkl_filename in graph_paths:
        # Entries look like "<label name>/<graph id>".
        path_parts = pkl_filename.split('/')
        label = CFG_labels.index(path_parts[0])
        graph_id = path_parts[1]

        # ndmin=2 keeps single-node graphs as 2-D arrays.
        features = np.loadtxt(input_dir + graph_id + '.features.txt', dtype=int, ndmin=2)
        sp_adjacent_mat = sp.sparse.load_npz(input_dir + graph_id + '.adjacent.npz')
        output.write("%d %d\n" % (features.shape[0], label))
        test_cnt += 1

        # find() returns parallel (row, column, value) arrays for the nonzero
        # entries; group the column indices by row to get each node's neighbors.
        rows, cols = sp.sparse.find(sp_adjacent_mat)[:2]
        indices = {}
        for (src, dst) in zip(rows.tolist(), cols.tolist()):
            indices.setdefault(src, []).append(dst)

        for (i, feature) in enumerate(features):
            # Nodes with no nonzero entry in their row have no neighbors.
            neighbors = indices.get(i, [])
            output.write("1 %d %s\n" % (len(neighbors), list2str(neighbors, feature)))

print("[Finished] Convert %d ACFGs to DGCNN txt format" % test_cnt)


[Finished] Convert 1689 ACFGs to DGCNN txt format
