In [58]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

TRAIN_DIR = "train"

call_set = set([])

def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    mark_calls= set([])
    rows=[]
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row,mark_calls = call_feats(tree,mark_calls)
        if rows is None:
            rows=this_row
        else:
            rows.append(this_row)
    
    
    cfeats=list(mark_calls)

    for row in rows:
        rfeats=make_feats(row,cfeats)
        if X is None:
            X = rfeats 
        else:
            X = np.vstack((X, rfeats))

    return X, np.array(classes), ids, cfeats

def call_feats(tree,mark_calls):
#     mark_calls = ['sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1

    mark_calls.update(call_counter.keys())
        
    return call_counter,mark_calls

def make_feats(row,feats):
    call_feat_array = np.zeros(len(feats))
    for i in range(len(feats)):
        call = feats[i]
        call_feat_array[i] = 0
        if call in row:
            call_feat_array[i] = row[call]

    return call_feat_array
# ## Feature extraction
# def main():
#     X_train, t_train, train_ids = create_data_matrix(0, 5, TRAIN_DIR)
#     X_valid, t_valid, valid_ids = create_data_matrix(10, 15, TRAIN_DIR)

#     print 'Data matrix (training set):'
#     print X_train
#     print 'Classes (training set):'
#     print t_train

#     # From here, you can train models (eg by importing sklearn and inputting X_train, t_train).

# if __name__ == "__main__":
#     main()
    

In [60]:
X_train, t_train, train_ids, features = create_data_matrix(0, 10, TRAIN_DIR)

In [61]:
X_train


array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   2.00000000e+00,   1.34000000e+02,
          1.00000000e+01,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.00000000e+00,   0.00000000e+00,   5.00000000e+01,
          0.00000000e+00,   5.00000000e+00,   1.40000000e+01,
          0.00000000e+00,   1.00000000e+00,   2.41000000e+02,
          0.00000000e+00,   6.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          9.50000000e+01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.10000000e+01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.10000000e+01,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        