In [None]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '24 paper:                                   #
#  Code is not Natural Language: Unlock the Power of Semantics-Oriented      #
#             Graph Representation for Binary Code Similarity Detection      #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2023 SJTU NSSL Lab                                     #
#                                                                            #
##############################################################################

In [2]:
### Collect Function Sizes.

import os
import json
import pickle
from os.path import join

from tqdm import tqdm
from collections import defaultdict
from multiprocessing import Pool

# Base directory for the dataset paths below, relative to the notebook's
# working directory (presumably the repository root -- confirm).
PROJ_DIR = "../.."
# Dataset directories: every entry inside them is opened and parsed as one
# JSON file by get_graph_sizes_one.
PCODE_RAW_TRAIN_DS = join(
    PROJ_DIR, "dbs/Dataset-1/features/training/pcode_raw_Dataset-1_training")
PCODE_RAW_TEST_DS = join(
    PROJ_DIR, "dbs/Dataset-1/features/testing/pcode_raw_Dataset-1_testing")
# Directory that receives the pickled size lists so later cells can reload them.
CACHE_DIR = "./__cache__"

# Number of worker processes passed to multiprocessing.Pool.
NPROC = 44


def pickle_dump(obj, fp):
    """Serialize ``obj`` with pickle into the file located at ``fp``.

    The file is opened in binary write mode, so any previous content at
    ``fp`` is replaced. Nothing is returned.
    """
    with open(fp, "wb") as sink:
        pickle.Pickler(sink).dump(obj)


def pickle_load(fp):
    """Read the file at ``fp`` and return the object pickled inside it.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files that come from a trusted source (e.g. caches written by
    ``pickle_dump`` in this notebook).
    """
    with open(fp, "rb") as source:
        restored = pickle.Unpickler(source).load()
    return restored


def get_graph_sizes_one(fp):
    """Collect node counts per graph type for every function in one JSON file.

    The file at ``fp`` must hold a JSON object. Only the value stored under
    its first key is inspected; that value maps function identifiers to
    records containing the graphs 'ACFG', 'TSCG', 'ISCG' and 'SOG', each of
    which carries a 'nodes' collection.

    Returns:
        defaultdict(list) mapping each graph type to the list of node counts,
        one entry per function, in the order the functions appear in the file.
    """
    with open(fp, "r") as handle:
        content = json.load(handle)

    # Additional top-level entries, if any, are ignored.
    first_key = [*content][0]
    functions = content[first_key]

    sizes = defaultdict(list)
    for record in functions.values():
        for kind in ('ACFG', 'TSCG', 'ISCG', 'SOG'):
            node_count = len(record[kind]['nodes'])
            sizes[kind].append(node_count)
    return sizes


def get_graph_sizes_in_node(dataset_path):
    """Merge the node counts of every file found directly in ``dataset_path``.

    Each directory entry is handed to ``get_graph_sizes_one`` inside a pool of
    ``NPROC`` worker processes. Results are consumed in completion order, so
    the order of values inside each list is not tied to the directory listing.

    Returns:
        defaultdict(list) mapping 'ACFG', 'TSCG', 'ISCG' and 'SOG' to the node
        counts gathered from all files.
    """
    gtypes = ('ACFG', 'TSCG', 'ISCG', 'SOG')

    inputs = []
    for entry in os.listdir(dataset_path):
        inputs.append(join(dataset_path, entry))

    merged = defaultdict(list)
    with Pool(NPROC) as workers:
        per_file_results = workers.imap_unordered(get_graph_sizes_one, inputs)
        for per_file in tqdm(per_file_results, total=len(inputs)):
            for gtype in gtypes:
                merged[gtype].extend(per_file[gtype])
    return merged


In [None]:
os.makedirs(CACHE_DIR, exist_ok=True)

# Pickle targets for the two splits.
train_cache_fp = join(CACHE_DIR, "train_node_sizes.pkl")
test_cache_fp = join(CACHE_DIR, "test_node_sizes.pkl")

# Walk both datasets first; nothing is written unless both scans succeed.
train_node_sizes = get_graph_sizes_in_node(PCODE_RAW_TRAIN_DS)
test_node_sizes = get_graph_sizes_in_node(PCODE_RAW_TEST_DS)

pickle_dump(train_node_sizes, train_cache_fp)
pickle_dump(test_node_sizes, test_cache_fp)

In [3]:
# Reload the pickled size lists from CACHE_DIR instead of rescanning the datasets.
train_cache_fp = join(CACHE_DIR, "train_node_sizes.pkl")
test_cache_fp = join(CACHE_DIR, "test_node_sizes.pkl")
train_node_sizes = pickle_load(train_cache_fp)
test_node_sizes = pickle_load(test_cache_fp)

In [4]:
import numpy as np
# Thresholds reported by stats(): 1%, 10%, 25%, 50%, 75%, 90%, 99%
def stats(arr):
    """Print a two-line size summary of ``arr``.

    Line 1: minimum, mean (formatted without decimals) and maximum.
    Line 2: for each fraction ``th`` listed above, the element found at index
    ``int(len * (1 - th))`` of the ascending-sorted data, i.e. the value at
    the lower edge of the largest ``th`` share of entries.
    """
    ordered = np.sort(arr)
    total = len(ordered)
    print(ordered.min(), '%.0f' % ordered.mean(), ordered.max())

    fractions = (0.01, 0.10, 0.25, 0.50, 0.75, 0.90, 0.99)
    picks = [ordered[int(total * (1 - th))] for th in fractions]
    for value in picks:
        print(value, end=", ")
    print("")

# Summarize the SOG node counts: training split first, then the testing split,
# separated by a dashed line.
stats(train_node_sizes['SOG'])
print("-----------------")
stats(test_node_sizes['SOG'])

25 490 308828
3579, 880, 447, 244, 152, 107, 60, 
-----------------
20 543 38241
3676, 1112, 582, 322, 194, 131, 71, 


In [1]:
import pandas as pd
from utils import get_groupped_dataframe, id_map, taskname_from_summary_fn, get_size_range

In [3]:
## Appendix C (Impact of function sizes) - Table

result_fns = [
    "summary_xm-1000-10000_Ds1_MRR_Recall_max.csv",
    "summary_xm-400-100-0_71_Ds1_MRR_Recall_max.csv",
    "summary_xm-400-100-3676_1000000_Ds1_MRR_Recall_max.csv",
    "summary_xm-400-100-q_0_71_Ds1_MRR_Recall_max.csv",
    "summary_xm-400-100-q_3676_1000000_Ds1_MRR_Recall_max.csv",
]

# Every helper result is indexed as a pair: item 0 becomes the top-level
# column key of the merged table, item 1 is the frame to concatenate.
dfs = [get_groupped_dataframe(summary_fn) for summary_fn in result_fns]
task_keys = [entry[0] for entry in dfs]
task_frames = [entry[1] for entry in dfs]
df = pd.concat(task_frames, axis="columns", keys=task_keys)

# Keep only the chosen rows and, per summary file, its 'MRR@P100' column.
selected = ['SAFE', 'Trex', 'GMN', 'HermesSim']
columns = []
for summary_fn in result_fns:
    columns.append((taskname_from_summary_fn(summary_fn), 'MRR@P100'))
df = df.loc[selected][columns]
df


Unnamed: 0_level_0,XM,XM-small,XM-large,XM-small-query-only,XM-large-query-only
Unnamed: 0_level_1,MRR@P100,MRR@P100,MRR@P100,MRR@P100,MRR@P100
SAFE,0.189492,0.216115,0.135827,0.47297,0.275252
Trex,0.343794,0.567052,0.393557,0.766973,0.477189
GMN,0.536901,0.590875,0.481508,0.875602,0.800612
HermesSim,0.802205,0.88068,0.915666,0.979929,0.975704


In [13]:
## Appendix C (Impact of function sizes) - Table

result_fns = [
    "summary_xc-1000-10000-arch_x-bit_64_Ds1_MRR_Recall_max.csv",
    "summary_xc-200-100-q_0_71-arch_x-bit_64_Ds1_MRR_Recall_max.csv",
    "summary_xc-200-100-q_3676_1000000-arch_x-bit_64_Ds1_MRR_Recall_max.csv",
    "summary_xc-200-100-0_71-arch_x-bit_64_Ds1_MRR_Recall_max.csv",
    "summary_xc-200-100-3676_1000000-arch_x-bit_64_Ds1_MRR_Recall_max.csv",
]

# Every helper result is indexed as a pair: item 0 becomes the top-level
# column key of the merged table, item 1 is the frame to concatenate.
dfs = [get_groupped_dataframe(summary_fn) for summary_fn in result_fns]
task_keys = [entry[0] for entry in dfs]
task_frames = [entry[1] for entry in dfs]
df = pd.concat(task_frames, axis="columns", keys=task_keys)

# Keep only the chosen rows and, per summary file, its 'MRR@P100' column.
selected = ['SAFE', 'Asm2Vec', 'Trex', 'GMN', 'jTrans', 'HermesSim']
columns = []
for summary_fn in result_fns:
    columns.append((taskname_from_summary_fn(summary_fn), 'MRR@P100'))
df = df.loc[selected][columns]
df


Unnamed: 0_level_0,x64-XC,x64-XC-small-query-only,x64-XC-large-query-only,x64-XC-small,x64-XC-large
Unnamed: 0_level_1,MRR@P100,MRR@P100,MRR@P100,MRR@P100,MRR@P100
SAFE,0.249275,0.454387,0.374411,0.278063,0.243675
Asm2Vec,0.350293,0.68956,0.767701,0.607392,0.561657
Trex,0.532166,0.91854,0.726125,0.818881,0.670186
GMN,0.561543,0.898743,0.863421,0.649775,0.5838
jTrans,0.738284,0.934165,0.8776,0.798803,0.824249
HermesSim,0.806502,0.978321,0.961772,0.85176,0.925552
