In [79]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

TRAIN_DIR = "train"

call_set = set([])

def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = ['sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 1
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_counter

## Feature extraction
def main():
    X_train, t_train, train_ids = create_data_matrix(0, 5, TRAIN_DIR)
    X_valid, t_valid, valid_ids = create_data_matrix(10, 15, TRAIN_DIR)

    print 'Data matrix (training set):'
    print X_train
    print 'Classes (training set):'
    print t_train

    # From here, you can train models (eg by importing sklearn and inputting X_train, t_train).

if __name__ == "__main__":
    main()
    

Data matrix (training set):
[[ {'process': 3, 'check_for_debugger': 2, 'sleep': 3, 'load_dll': 135, 'create_window': 11, 'com_get_class_object': 1, 'kill_process': 2, 'load_image': 3, 'create_process': 1, 'enum_window': 51, 'find_file': 6, 'open_file': 15, 'set_windows_hook': 2, 'query_value': 242, 'create_mutex': 7, 'create_file': 2, 'open_key': 96, 'get_file_attributes': 12, 'destroy_window': 42, 'find_window': 1, 'set_file_attributes': 2, 'set_file_time': 2, 'enum_keys': 4, 'create_thread': 2, 'enum_values': 1, 'processes': 1, 'vm_protect': 36, 'thread': 6, 'get_system_directory': 14, 'all_section': 6, 'com_create_instance': 2, 'show_window': 6, 'open_process': 2, 'get_windows_directory': 3, 'create_directory': 4}]
 [ {'dump_line': 3434, 'enum_processes': 197, 'process': 6, 'open_url': 6, 'recv_socket': 211, 'check_for_debugger': 4, 'send_socket': 17, 'sleep': 254, 'load_dll': 467, 'create_window': 48, 'com_get_class_object': 208, 'open_scmanager': 8, 'kill_process': 2, 'read_sectio

In [80]:
X_train, t_train, train_ids = create_data_matrix(0, 3086, TRAIN_DIR)

In [None]:
X_test, t_test, test_ids = create_data_matrix(0, 3728, 'test')

In [81]:
features = dict()
for i in call_set:
    features[i] = []
    
for i in X_train:
    for j in call_set:
        if j in i[0].keys():
            features[j].append(i[0][j])
        else:
            features[j].append(0)
    

In [82]:
import pandas as pd

In [83]:
train_set = pd.DataFrame(features)
train_set['id'] = train_ids
train_set['class'] = t_train

In [84]:
train_set

Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,connect,...,trimmed_bytes,unload_driver,vm_allocate,vm_mapviewofsection,vm_protect,vm_read,vm_write,write_value,id,class
0,0,0,6,0,0,2,2,0,1,0,...,0,0,0,0,36,0,0,0,00269ea50001a6c699d0222032d45b74b2e7e8be9,8
1,0,0,41,6,0,4,339,0,208,5,...,205,0,0,0,365,0,0,3,00278ec420236020d6121dffe0cc20034422e7228,6
2,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,002d5615d19c851934dc481c607b6a74a6e9e536e,12
3,0,0,2,0,0,1,1,0,0,0,...,0,0,0,0,54,0,0,0,006be5Dc265600c19728c9747fb4c7bc9e8d6f106,8
4,0,0,7,0,0,2,5,0,2,0,...,38,0,8,0,25,0,5,0,0089453df77890cae95ce7d9130a4ef85eaea36e8,10
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,009b59ea3f1f2023f41ce6c7e9c1376e57008fc01,14
6,0,0,5,0,0,2,2,0,1,0,...,0,0,0,0,36,0,0,0,009e91f9f01186d90cac584d950d6246700a48d35,8
7,0,0,8,10,0,5,32,0,0,0,...,48,0,0,0,20,0,0,0,00bee48acc9d1774e4edf96f9582fac06b2ec1f14,8
8,0,0,5,0,0,1,2,0,1,0,...,0,0,0,0,32,0,0,5,00bef0928a8a8e1001e99c0327898652b6f0bc75c,8
9,0,0,5,0,0,2,2,0,1,0,...,0,0,0,0,36,0,0,0,00bfdb4aa7b18d3bc0e28d2eb0cec76900b29b8b8,8


In [87]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
         "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

In [90]:
y_train = train_set['class']
X_train = train_set.drop(['id', 'class'], axis=1)
y_test = train_set['class']
X_test = train_set.drop(['id', 'class'], axis=1)

In [92]:
for name, clf in zip(names, classifiers):
#         ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print name, score


Nearest Neighbors 0.837016574586
Linear SVM 0.85635359116
RBF SVM 0.628913443831
Decision Tree 0.822283609576
Random Forest 0.792817679558
AdaBoost



 0.72467771639
Naive Bayes 0.33241252302
Linear Discriminant Analysis 0.830570902394
Quadratic Discriminant Analysis 0.791896869245


