In [1]:
%%capture
from tqdm.notebook import tqdm

import json
import os
import pickle
import numpy as np

# Seed NumPy's global RNG so that np.random-based steps are repeatable across runs.
random_seed = 42
np.random.seed(random_seed)

# Directory layout. Every path keeps its trailing slash because file names are
# built by plain string concatenation throughout the notebook.
data_dir = 'data/'
prep_path = '{}preprocessed/'.format(data_dir)
graphs_dir = '{}graphs/'.format(prep_path)

# Sub-directories of prep_path that are processed, in this order.
prep_dirs = [
    'CuckooClean/',
    'CuckooCleanHippo/',
    'CuckooCleanPippo/',
    'CuckooVirusShare/',
]

Convert API call sequences to a bipartite graph representation

In [10]:
# Running totals of written traces and finished log files; process_log_file
# updates both through `global`, and the trace total also names the output files.
processed_traces_num = processed_files_num = 0


def process_log_file(prep_dir, filename, calls_encoding_map, args_encoding_map, args_counter):
    """Turn every trace of one preprocessed JSON log into a pickled bipartite graph.

    The file ``prep_path + prep_dir + filename`` is parsed as JSON and iterated.
    For each item, its ``"calls"`` list is folded into two adjacency maps:
    ``g_calls`` (call code -> set of argument codes) and ``g_args`` (argument
    code -> set of call codes). The pair is pickled to
    ``graphs_dir + prep_dir + <processed_traces_num>.pkl``.

    Args:
        prep_dir: Sub-directory name (with trailing slash) under ``prep_path``.
        filename: Name of the JSON file inside that sub-directory.
        calls_encoding_map: dict, call name -> integer code; extended in place
            with the next free code for every unseen call name.
        args_encoding_map: dict, ``str(argument)`` -> integer code; extended in
            place the same way.
        args_counter: dict, argument code -> number of occurrences; updated in place.

    Side effects:
        Increments the module-level ``processed_traces_num`` once per written
        graph and ``processed_files_num`` once per call.
    """
    global processed_traces_num
    global processed_files_num

    filepath = prep_path + prep_dir + filename
    with open(filepath) as json_file:
        sample_logs = json.load(json_file)

    for log_info in sample_logs:
        g_calls, g_args = {}, {}
        for call_info in log_info["calls"]:
            # Records that do not hold exactly one entry are reported and skipped.
            if len(call_info) != 1:
                print('Log record {} len not 1 in {}'.format(str(call_info), filepath))
                continue

            call_name = next(iter(call_info))
            call_args = call_info[call_name]

            # Unseen names receive the next free code (the map's current size).
            call_code = calls_encoding_map.setdefault(call_name, len(calls_encoding_map))
            linked_args = g_calls.setdefault(call_code, set())

            for call_arg in call_args:
                str_arg = str(call_arg)
                arg_code = args_encoding_map.setdefault(str_arg, len(args_encoding_map))

                # Record the edge in both directions.
                linked_args.add(arg_code)
                g_args.setdefault(arg_code, set()).add(call_code)

                args_counter[arg_code] = args_counter.get(arg_code, 0) + 1

        out_path = graphs_dir + prep_dir + str(processed_traces_num) + '.pkl'
        with open(out_path, 'wb') as out_file:
            pickle.dump({'g_calls': g_calls, 'g_args': g_args}, out_file, pickle.HIGHEST_PROTOCOL)

        processed_traces_num += 1

    processed_files_num += 1
        
        
def process_all_logs():
    """Convert every JSON log under each directory of ``prep_dirs`` into graphs.

    For each directory the matching output directory under ``graphs_dir`` is
    created if missing, then the ``.json`` files are handled in sorted order by
    ``process_log_file``. The encoding maps and the argument counter are shared
    across all directories and pickled into ``graphs_dir`` afterwards.
    """
    # String names are mapped to unique integers; all three dicts are shared
    # across directories so codes stay globally consistent.
    call_codes, arg_codes, arg_counts = {}, {}, {}

    for prep_dir in prep_dirs:
        print('Processing ' + prep_path + prep_dir)
        target_dir = graphs_dir + prep_dir
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        json_names = [name for name in os.listdir(prep_path + prep_dir) if name.endswith('.json')]
        json_names.sort()
        for filename in tqdm(json_names):
            process_log_file(prep_dir, filename, call_codes, arg_codes, arg_counts)

    def _dump(pkl_name, message, payload):
        # Pickle one artifact into graphs_dir and report it.
        with open(graphs_dir + pkl_name, 'wb') as out_file:
            pickle.dump(payload, out_file, pickle.HIGHEST_PROTOCOL)
        print(message)

    # Each local reference is dropped right after its dump, as the original did
    # by rebinding the name to an empty dict.
    _dump('calls_encoding_map.pkl', 'calls_encoding_map written', call_codes)
    del call_codes

    _dump('args_counter.pkl', 'args_counter written', arg_counts)
    del arg_counts

    _dump('args_encoding_map.pkl', 'args_encoding_map written', arg_codes)
    del arg_codes

In [None]:
# Writes one graph pickle per trace for every directory in prep_dirs, then the
# two encoding maps and the argument counter into graphs_dir.
process_all_logs()

In [2]:
# Only the number of distinct call names is needed here, so the loaded map is
# measured and not kept.
# NOTE(review): pickle.load can execute arbitrary code; only read files that
# this notebook produced itself.
with open(graphs_dir + 'calls_encoding_map.pkl', 'rb') as in_file:
    unique_calls_num = len(pickle.load(in_file))

Take the most frequent arguments

In [3]:
top_unique_args_num = 2**13 #8192

# Cached shortcut: reuse a previously written position map when one exists.
# The file is produced by the cell that ranks args_counter.pkl, so on a fresh
# run it may not exist yet; in that case skip the load instead of failing.
top_unique_args_position_map_path = (
    graphs_dir + 'top_unique_args_position_map_' + str(top_unique_args_num) + '.pkl'
)
if os.path.exists(top_unique_args_position_map_path):
    # NOTE(review): pickle.load can execute arbitrary code; only read files
    # that this notebook produced itself.
    with open(top_unique_args_position_map_path, 'rb') as in_file:
        top_unique_args_position_map = pickle.load(in_file)
else:
    print('No cached map at {}; build it from args_counter.pkl first'.format(
        top_unique_args_position_map_path))

In [3]:
# Load the per-argument occurrence counts written by process_all_logs.
with open(graphs_dir + 'args_counter.pkl', 'rb') as in_file:
    args_counter = pickle.load(in_file)

# Argument codes ordered from most to least frequent (ties keep dict order,
# because sorted() is stable).
sorted_args = sorted(args_counter, key=lambda arg_code: args_counter[arg_code], reverse=True)

# The kept arguments occupy consecutive positions starting right after the
# call positions, i.e. from unique_calls_num upwards. zip() stops early when
# fewer than top_unique_args_num arguments exist.
top_unique_args_position_map = dict(zip(
    sorted_args[:top_unique_args_num],
    range(unique_calls_num, unique_calls_num + top_unique_args_num),
))

with open(graphs_dir + 'top_unique_args_position_map_' + str(top_unique_args_num) + '.pkl', 'wb') as out_file:
    pickle.dump(top_unique_args_position_map, out_file)

Extract behavior patterns from graphs and convert to binary vectors

In [4]:
# One position per distinct call followed by one per retained argument.
binary_vectors_size = unique_calls_num + top_unique_args_num

# NOTE(review): '8477' is hard-coded here while binary_vectors_size is computed;
# presumably they matched when the notebook was written - confirm before
# reusing this path with different data.
binary_path = prep_path + '8477_vec_size/binary/'
train_path = binary_path + 'train/'
test_path = binary_path + 'test/'
benign_dir = 'benign/'
malicious_dir = 'malicious/'

# Create the four <split>/<label> output directories. exist_ok=True replaces
# the separate exists() check, so re-running the cell is safe and there is no
# gap between checking and creating.
for split_path in (train_path, test_path):
    for label_dir in (benign_dir, malicious_dir):
        os.makedirs(split_path + label_dir, exist_ok=True)

In [5]:
def get_num_of_graphs():
    """Count the ``.pkl`` files found under ``graphs_dir`` for every entry of ``prep_dirs``.

    Returns:
        int: total number of entries whose name ends with ``.pkl``.
    """
    total = 0
    for prep_dir in prep_dirs:
        for entry in os.listdir(graphs_dir + prep_dir):
            if entry.endswith('.pkl'):
                total += 1
    return total


def extract_behavior_patterns(g_calls, g_args):
    """Derive (calls, args) patterns from one bipartite call/argument graph.

    For every argument, visited in order of ascending degree, the pattern's
    calls are all calls linked to that argument. The pattern's arguments are
    those members of ``top_unique_args_position_map`` that are linked to every
    one of these calls. Duplicate patterns are emitted once. Calls without any
    argument each contribute a ``({call}, set())`` pattern at the end.

    Args:
        g_calls: dict, call code -> collection of argument codes linked to it.
        g_args: dict, argument code -> set of call codes linked to it.

    Returns:
        list of ``(set_of_call_codes, set_of_arg_codes)`` tuples.

    Reads the module-level ``top_unique_args_position_map`` (membership only).
    """
    global top_unique_args_position_map
    if len(g_args) == 0:
        return [(set([call_name]), set()) for call_name in g_calls.keys()]

    result = []
    # Hashable fingerprints of the patterns already emitted, so the duplicate
    # check is a set lookup instead of a scan over `result`.
    seen_patterns = set()

    # One stable sort yields the same visiting order as scanning g_args once
    # per degree 1..max: ascending degree, dict order within a degree.
    # Arguments with no linked call are skipped, as before.
    args_by_degree = sorted(
        (arg for arg, calls_set in g_args.items() if len(calls_set) > 0),
        key=lambda arg: len(g_args[arg]),
    )

    for arg in args_by_degree:
        pattern_calls = g_args[arg].copy()
        pattern_args = set()
        if arg in top_unique_args_position_map:
            pattern_args.add(arg)

        # An argument shared by every call of the pattern must in particular be
        # linked to any single one of them, so one call's arguments suffice as
        # the candidate pool.
        args_candidates = g_calls[next(iter(pattern_calls))]
        for arg_candidate in args_candidates:
            if arg_candidate in top_unique_args_position_map and all(
                    arg_candidate in g_calls[call] for call in pattern_calls):
                pattern_args.add(arg_candidate)

        pattern_key = (frozenset(pattern_calls), frozenset(pattern_args))
        if pattern_key not in seen_patterns:
            seen_patterns.add(pattern_key)
            result.append((pattern_calls, pattern_args))

    # Calls that never took an argument cannot appear in any pattern above.
    result.extend(
        (set([call_name]), set())
        for call_name, linked_args in g_calls.items()
        if len(linked_args) == 0
    )

    return result


def behavior_patterns_to_binary_vectors(nb_patterns):
    """Encode (calls, args) patterns as rows of a 0/1 matrix.

    Each pattern becomes one vector of length ``binary_vectors_size``. Call
    codes are used directly as indices; argument codes are translated to
    indices through ``top_unique_args_position_map``.

    Args:
        nb_patterns: iterable of ``(call_codes, arg_codes)`` pairs; every
            argument code must be a key of ``top_unique_args_position_map``.

    Returns:
        np.ndarray of dtype ``np.byte`` with shape
        ``(number_of_patterns, binary_vectors_size)``. An empty input yields
        shape ``(0, binary_vectors_size)`` instead of the ``ValueError`` that
        ``np.stack`` raises for an empty list.

    Raises:
        KeyError: if an argument code is missing from the position map.
    """
    global top_unique_args_position_map

    # Built once per call so the sanity check below is a set lookup rather
    # than a scan of all dict values for every call of every pattern.
    arg_positions = set(top_unique_args_position_map.values())

    result = []
    for pattern_calls, pattern_args in nb_patterns:
        binary_pattern = np.zeros((binary_vectors_size,), dtype=np.byte)

        for p_call in pattern_calls:
            binary_pattern[p_call] = 1
            # Sanity check kept from the original: report when a call code
            # lands on an index that is also assigned to an argument.
            if p_call in arg_positions:
                print('ggwp')

        for p_arg in pattern_args:
            binary_pattern[top_unique_args_position_map[p_arg]] = 1

        result.append(binary_pattern)

    if not result:
        return np.zeros((0, binary_vectors_size), dtype=np.byte)

    return np.stack(result)


def process_graphs_to_bin_vectors_dataset():
    """Convert every pickled graph into a binary pattern matrix and save it.

    Graphs are visited directory by directory (``prep_dirs`` order, file names
    sorted as strings) and numbered consecutively. A random permutation of
    these numbers puts the first 80% into the train split and the rest into
    the test split. Graphs from ``'CuckooVirusShare/'`` are stored under
    ``malicious_dir``, all others under ``benign_dir``. Each matrix is saved
    as ``<split>/<label>/<number>.npy``.

    Prints the number of processed graphs next to the number counted up front.
    """
    processed_count = 0
    num_of_graphs = get_num_of_graphs()

    indices = np.random.permutation(num_of_graphs)
    train_size = len(indices) * 8 // 10
    # A set makes the per-sample membership test constant-time; the NumPy
    # array it replaces was scanned in full for every sample.
    train_inds = set(indices[:train_size].tolist())

    for prep_dir in prep_dirs:
        print('Processing {}'.format(graphs_dir + prep_dir))
        filenames = sorted([x for x in os.listdir(graphs_dir + prep_dir) if x.endswith('.pkl')])
        for filename in tqdm(filenames):
            # NOTE(review): pickle.load can execute arbitrary code; only read
            # graph files produced by this notebook.
            with open(graphs_dir + prep_dir + filename, 'rb') as in_file:
                sample_graph = pickle.load(in_file)

            behavior_patterns = extract_behavior_patterns(sample_graph['g_calls'], sample_graph['g_args'])
            bin_patterns = behavior_patterns_to_binary_vectors(behavior_patterns)

            save_path = train_path if processed_count in train_inds else test_path
            save_path += malicious_dir if prep_dir == 'CuckooVirusShare/' else benign_dir
            save_path += str(processed_count) + '.npy'
            np.save(save_path, bin_patterns)
            processed_count += 1
    print(processed_count, num_of_graphs)

In [6]:
# Writes one .npy matrix per graph into the train/test and benign/malicious
# directories, then prints processed vs. expected graph counts.
process_graphs_to_bin_vectors_dataset()

Processing data/preprocessed/graphs/CuckooClean/


HBox(children=(FloatProgress(value=0.0, max=1105.0), HTML(value='')))


Processing data/preprocessed/graphs/CuckooCleanHippo/


HBox(children=(FloatProgress(value=0.0, max=1357.0), HTML(value='')))


Processing data/preprocessed/graphs/CuckooCleanPippo/


HBox(children=(FloatProgress(value=0.0, max=252.0), HTML(value='')))


Processing data/preprocessed/graphs/CuckooVirusShare/


HBox(children=(FloatProgress(value=0.0, max=2026.0), HTML(value='')))


4740 4740
