In [1]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

TRAIN_DIR = "train"

call_set = set([])

def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = ['sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_counter

## Feature extraction
def main():
    X_train, t_train, train_ids = create_data_matrix(0, 5, TRAIN_DIR)
    X_valid, t_valid, valid_ids = create_data_matrix(10, 15, TRAIN_DIR)

    print 'Data matrix (training set):'
    print X_train
    print 'Classes (training set):'
    print t_train

    # From here, you can train models (eg by importing sklearn and inputting X_train, t_train).

if __name__ == "__main__":
    main()
    

Data matrix (training set):
[[ {'process': 2, 'check_for_debugger': 1, 'sleep': 2, 'load_dll': 134, 'create_window': 10, 'com_get_class_object': 0, 'kill_process': 1, 'load_image': 2, 'create_process': 0, 'enum_window': 50, 'find_file': 5, 'open_file': 14, 'set_windows_hook': 1, 'query_value': 241, 'create_mutex': 6, 'create_file': 1, 'open_key': 95, 'get_file_attributes': 11, 'destroy_window': 41, 'find_window': 0, 'set_file_attributes': 1, 'set_file_time': 1, 'enum_keys': 3, 'create_thread': 1, 'enum_values': 0, 'processes': 0, 'vm_protect': 35, 'thread': 5, 'get_system_directory': 13, 'all_section': 5, 'com_create_instance': 1, 'show_window': 5, 'open_process': 1, 'get_windows_directory': 2, 'create_directory': 3}]
 [ {'dump_line': 3433, 'enum_processes': 196, 'process': 5, 'open_url': 5, 'recv_socket': 210, 'check_for_debugger': 3, 'send_socket': 16, 'sleep': 253, 'load_dll': 466, 'create_window': 47, 'com_get_class_object': 207, 'open_scmanager': 7, 'kill_process': 1, 'read_sectio

In [62]:
X_train, t_train, train_ids = create_data_matrix(0, 3086, TRAIN_DIR)

In [64]:
features = dict()
for i in call_set:
    features[i] = []
    
for i in X_train:
    for j in call_set:
        if j in i[0].keys():
            features[j].append(i[0][j])
        else:
            features[j].append(-1)
    

In [67]:
import pandas as pd

In [68]:
train_set = pd.DataFrame(features)
train_set['id'] = train_ids
train_set['class'] = t_train


In [69]:
train_set

Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,connect,...,trimmed_bytes,unload_driver,vm_allocate,vm_mapviewofsection,vm_protect,vm_read,vm_write,write_value,id,class
0,-1,-1,5,-1,-1,1,1,-1,0,-1,...,-1,-1,-1,-1,35,-1,-1,-1,00269ea50001a6c699d0222032d45b74b2e7e8be9,8
1,-1,-1,40,5,-1,3,338,-1,207,4,...,204,-1,-1,-1,364,-1,-1,2,00278ec420236020d6121dffe0cc20034422e7228,6
2,-1,-1,0,-1,-1,0,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,002d5615d19c851934dc481c607b6a74a6e9e536e,12
3,-1,-1,1,-1,-1,0,0,-1,-1,-1,...,-1,-1,-1,-1,53,-1,-1,-1,006be5Dc265600c19728c9747fb4c7bc9e8d6f106,8
4,-1,-1,6,-1,-1,1,4,-1,1,-1,...,37,-1,7,-1,24,-1,4,-1,0089453df77890cae95ce7d9130a4ef85eaea36e8,10
5,-1,-1,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,009b59ea3f1f2023f41ce6c7e9c1376e57008fc01,14
6,-1,-1,4,-1,-1,1,1,-1,0,-1,...,-1,-1,-1,-1,35,-1,-1,-1,009e91f9f01186d90cac584d950d6246700a48d35,8
7,-1,-1,7,9,-1,4,31,-1,-1,-1,...,47,-1,-1,-1,19,-1,-1,-1,00bee48acc9d1774e4edf96f9582fac06b2ec1f14,8
8,-1,-1,4,-1,-1,0,1,-1,0,-1,...,-1,-1,-1,-1,31,-1,-1,4,00bef0928a8a8e1001e99c0327898652b6f0bc75c,8
9,-1,-1,4,-1,-1,1,1,-1,0,-1,...,-1,-1,-1,-1,35,-1,-1,-1,00bfdb4aa7b18d3bc0e28d2eb0cec76900b29b8b8,8
