In [1]:
# Example Feature Extraction from XML Files
# We count the number of specific system calls made by the programs, and use
# these as our features.

# This code requires that the unzipped training set is in a folder called "train". 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

TRAIN_DIR = "train"

call_set = set([])

def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        if direc=="train":
            add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = ['sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 1
        else:
            call_counter[call] += 1

    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_counter

# # Feature extraction
# def main():
#     X_train, t_train, train_ids = create_data_matrix(0, 5, TRAIN_DIR)
#     X_valid, t_valid, valid_ids = create_data_matrix(10, 15, TRAIN_DIR)

#     print 'Data matrix (training set):'
#     print X_train
#     print 'Classes (training set):'
#     print t_train

#     # From here, you can train models (eg by importing sklearn and inputting X_train, t_train).

# if __name__ == "__main__":
#     main()
    

In [2]:
X_train, t_train, train_ids = create_data_matrix(0, 3086, TRAIN_DIR)

In [3]:
X_test, t_test, test_ids = create_data_matrix(0, 3728, 'test')

In [4]:
def convertMatrix(X, call_set):
    features = dict()
    for i in call_set:
        features[i] = []
    
    for i in X:
        for j in call_set:
            if j in i[0].keys():
                features[j].append(i[0][j])
            else:
                features[j].append(0)
    
    return features

In [5]:
import pandas as pd

In [6]:
train_set = pd.DataFrame(convertMatrix(X_train,call_set))
test_set= pd.DataFrame(convertMatrix(X_test,call_set))
# train_set['id'] = train_ids
# train_set['class'] = t_train

In [7]:
train_set['id'] = train_ids
train_set['class'] = t_train

In [8]:
test_set['id'] = test_ids

In [10]:
#train_set

In [None]:
feature_eng_dir = 'train/'
id_list_feat_eng = []
hash_error_list_feat_eng = []
security_anony_list_feat_eng = []
class_list_feat_eng = []
for file_name in os.listdir('train/'):
    if file_name == '.DS_Store':
        continue
    content = open(feature_eng_dir + file_name, 'r').read()
    num_of_line = content.count('<')
    num_of_hash_error = content.count('hash_error')
    num_of_security_anony = content.count('SECURITY_ANONYMOUS')
    class_list_feat_eng.append(file_name.split('.')[1])
    
    id_list_feat_eng.append(file_name.split('.')[0])
    hash_error_list_feat_eng.append(1.0*num_of_hash_error/num_of_line)
    security_anony_list_feat_eng.append(1.0*num_of_security_anony/num_of_line)

In [None]:
import copy
train_set_2=copy.deepcopy(train_set)
train_set_2['id'] =train_ids
feat_eng_df = pd.DataFrame({'id':id_list_feat_eng, 'hash_error':hash_error_list_feat_eng, 'security_anonymous':security_anony_list_feat_eng})
new=pd.merge(train_set_2, feat_eng_df, on='id')
new=new.drop('id',axis=1)

In [None]:
from scipy.stats.stats import pearsonr   
b=new['security_anonymous'].values
type(t_train)
pearsonr(t_train,b)

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM","Poly SVM","Sigmoid SVM","RBF SVM", "Decision Tree",
         "Random Forest",  "Linear Discriminant Analysis",
         "Quadratic Discriminant Analysis"]
classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="linear", C=0.025),
    SVC(kernel="poly", C=0.025),
    SVC(kernel="sigmoid", C=0.025),
    SVC(kernel="rbf",C=0.025),
    #SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=10),
    RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1),
    #AdaBoostClassifier(),
    #GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

In [15]:
y_train = t_train
X_train = train_set.drop(['id', 'class'], axis = 1)
# y_valid = t_train
# X_valid = train_set[2000:].drop(['id', 'class'], axis = 1)
y_test = t_test
X_test = test_set

In [16]:
t_test

array([-1, -1, -1, ..., -1, -1, -1])

In [23]:
for name, clf in zip(names, classifiers):
#         ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_valid, y_valid)
        print name, score


Nearest Neighbors 0.837937384899
Linear SVM 0.85635359116
Poly SVM 0.82320441989
Sigmoid SVM 0.515653775322
RBF SVM 0.622467771639
Decision Tree 0.858195211786
Random Forest



 0.883057090239
Linear Discriminant Analysis 0.830570902394
Quadratic Discriminant Analysis 0.791896869245




In [17]:
#X_train

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=5, random_state=2).fit(X_train, y_train)
clf.score(X_valid, y_valid)                 

0.87476979742173111

In [41]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(GradientBoostingClassifier(),
                                                   {"learning_rate": [0.01,0.1,0.2],
                                                    "random_state":[1,2,3, 5]}, 
                                                   X_train,t_train,mask=mask)

using mask
BEST {'learning_rate': 0.1, 'random_state': 1} 0.876388888889 [mean: 0.87037, std: 0.01682, params: {'learning_rate': 0.01, 'random_state': 1}, mean: 0.87083, std: 0.01647, params: {'learning_rate': 0.01, 'random_state': 2}, mean: 0.87037, std: 0.01684, params: {'learning_rate': 0.01, 'random_state': 3}, mean: 0.87037, std: 0.01689, params: {'learning_rate': 0.01, 'random_state': 5}, mean: 0.87639, std: 0.01698, params: {'learning_rate': 0.1, 'random_state': 1}, mean: 0.87500, std: 0.01534, params: {'learning_rate': 0.1, 'random_state': 2}, mean: 0.87639, std: 0.01484, params: {'learning_rate': 0.1, 'random_state': 3}, mean: 0.87639, std: 0.01439, params: {'learning_rate': 0.1, 'random_state': 5}, mean: 0.87176, std: 0.01227, params: {'learning_rate': 0.2, 'random_state': 1}, mean: 0.86991, std: 0.01487, params: {'learning_rate': 0.2, 'random_state': 2}, mean: 0.87130, std: 0.01569, params: {'learning_rate': 0.2, 'random_state': 3}, mean: 0.87222, std: 0.01500, params: {'lea



In [44]:
t_test=clfrf.predict(X_test)
write_predictions(t_test,test_ids,"gradient.csv")

In [None]:
clf=RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1)
clf.fit(train_set, t_train)
t_test=clf.predict(X_test)

In [None]:
t_test

In [32]:
# these are the fifteen malware classes we're looking for
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
           "Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
           "VB", "Virut", "Zbot"]

# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for 
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        # write header
        f.write("Id,Prediction\n")
        for i, history_id in enumerate(ids):
            f.write("%s,%d\n" % (history_id, predictions[i]))

In [None]:
write_predictions(t_test,test_ids,"randomforrest.csv")

# Exploration

In [None]:
%matplotlib inline

In [None]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


sns.set(style="white")

# Generate a large random dataset

# Compute the correlation matrix
corr = train_set.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=5, yticklabels=5,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
plt.show()

In [8]:
from sklearn.decomposition import PCA

pca=PCA(n_components=20)
pca.fit_transform(train_set,t_train)
pca.explained_variance_ratio_

array([  3.91315276e-01,   2.63259786e-01,   1.45155855e-01,
         9.41070436e-02,   4.22026597e-02,   1.50369668e-02,
         1.35901802e-02,   1.08299298e-02,   9.02474844e-03,
         4.79322777e-03,   3.51131656e-03,   1.46616478e-03,
         1.30781114e-03,   1.09285478e-03,   8.78713852e-04,
         4.77860237e-04,   4.44813602e-04,   3.01532820e-04,
         2.17952089e-04,   1.66558047e-04])

In [9]:
pca_train=pca.fit_transform(train_set,t_train)
pca_test=pca.transform(test_set)

# Systemized Model Selection

In [36]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(train_set.shape[0]), train_size=0.7,random_state=123)
mask=np.ones(train_set.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

In [37]:
from sklearn.grid_search import GridSearchCV

"""
Function
--------
cv_optimize

Inputs
------
clf : an instance of a scikit-learn classifier
parameters: a parameter grid dictionary thats passed to GridSearchCV (see above)
X: a samples-features matrix in the scikit-learn style
y: the response vectors of 1s and 0s (+ives and -ives)
n_folds: the number of cross-validation folds (default 5)
score_func: a score function we might want to pass (default python None)
   
Returns
-------
The best estimator from the GridSearchCV, after the GridSearchCV has been used to
fit the model.
     
Notes
-----
see do_classify and the code below for an example of how this is used
"""
#your code here

def cv_optimize(clf,parameters,X,y,n_folds,score_func=None):
    if score_func:
        fitmodel = GridSearchCV(clf, param_grid=parameters, cv=n_folds, scoring=score_func)
    else:
        fitmodel = GridSearchCV(clf, param_grid=parameters, cv=n_folds)
    fitmodel.fit(X,y)
    print "BEST", fitmodel.best_params_, fitmodel.best_score_, fitmodel.grid_scores_
    best = fitmodel.best_estimator_
    return best

In [38]:
from sklearn.metrics import confusion_matrix
def do_classify(clf, parameters, train, test, mask=None, reuse_split=None, score_func=None, n_folds=5):
    
    X=train
    y=test
    if mask !=None:
        print "using mask"
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split !=None:
        print "using reuse split"
        Xtrain, Xtest, ytrain, ytest = reuse_split['Xtrain'], reuse_split['Xtest'], reuse_split['ytrain'], reuse_split['ytest']
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_folds=n_folds, score_func=score_func)
    clf=clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print "############# based on standard predict ################"
    print "Accuracy on training data: %0.2f" % (training_accuracy)
    print "Accuracy on test data:     %0.2f" % (test_accuracy)
    print confusion_matrix(ytest, clf.predict(Xtest))
    print "########################################################"
    return clf, Xtrain, ytrain, Xtest, ytest

In [None]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(RandomForestClassifier(),
                                                   {"max_depth": [10,20,50,100,200,500]}, 
                                                   train_set,t_train,mask=mask)

In [None]:
t_test=clfrf.predict(X_test)
write_predictions(t_test,test_ids,"randomforrest2.csv")

In [16]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(RandomForestClassifier(random_state=123),
                                                   {"max_depth": [10,20,50],
                                                    "n_estimators":[1,5,10,20,50]}, 
                                                   train_set,t_train,mask=mask)

using mask
BEST {'n_estimators': 50, 'max_depth': 20} 0.883333333333 [mean: 0.83380, std: 0.01164, params: {'n_estimators': 1, 'max_depth': 10}, mean: 0.87083, std: 0.01103, params: {'n_estimators': 5, 'max_depth': 10}, mean: 0.87454, std: 0.01100, params: {'n_estimators': 10, 'max_depth': 10}, mean: 0.87593, std: 0.01088, params: {'n_estimators': 20, 'max_depth': 10}, mean: 0.87917, std: 0.01683, params: {'n_estimators': 50, 'max_depth': 10}, mean: 0.84074, std: 0.00589, params: {'n_estimators': 1, 'max_depth': 20}, mean: 0.86620, std: 0.00570, params: {'n_estimators': 5, 'max_depth': 20}, mean: 0.87824, std: 0.01279, params: {'n_estimators': 10, 'max_depth': 20}, mean: 0.87546, std: 0.00940, params: {'n_estimators': 20, 'max_depth': 20}, mean: 0.88333, std: 0.01391, params: {'n_estimators': 50, 'max_depth': 20}, mean: 0.82870, std: 0.00936, params: {'n_estimators': 1, 'max_depth': 50}, mean: 0.86713, std: 0.00693, params: {'n_estimators': 5, 'max_depth': 50}, mean: 0.87639, std: 0.01



In [34]:
reuse_split=dict(Xtrain=Xtrain, Xtest=Xtest, ytrain=ytrain, ytest=ytest)

NameError: name 'Xtrain' is not defined

In [50]:
%%time
from sklearn.linear_model import LogisticRegression 

clflr, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="multinomial",class_weight='balanced'), \
                                                   {"C": [0.01, 0.1, 1.0, 10.0],"solver":["newton-cg"]},
                                                   train_set,t_train,mask=mask)


using mask




ValueError: could not convert string to float: ffdba6079b981688512353cF89ca7e1b8f4868263

In [23]:
t_test=clflr.predict(test_set)
write_predictions(t_test,test_ids,"logistic2.csv")

In [None]:
%%time
# from sklearn.neural_network import MLPClassifier

# clfnn, Xtrain, ytrain, Xtest, ytest = do_classify(MLPClassifier(algorithm="sgd",activation="logistic",
#                                                                random_state=1),
#                                                    {"alpha": [1e-5,1e-4,1e-3,1e-2,1],
#                                                     "n_estimators":[1,5,10]}, 
#                                                    train_set,t_train,reuse_split=reuse_split)

# PCA transformation

In [None]:
from sklearn.cross_validation import train_test_split
itrain, itest = train_test_split(xrange(pca_train.shape[0]), train_size=0.7)
mask=np.ones(pca_train.shape[0], dtype='int')
mask[itrain]=1
mask[itest]=0
mask = (mask==1)

In [25]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(RandomForestClassifier(random_state=123),
                                                   {"max_depth": [10,20,50],
                                                    "n_estimators":[1,5,10,20,50]}, 
                                                   pca_train,t_train,mask=mask)

using mask
BEST {'n_estimators': 50, 'max_depth': 20} 0.869907407407 [mean: 0.81898, std: 0.01532, params: {'n_estimators': 1, 'max_depth': 10}, mean: 0.85324, std: 0.00982, params: {'n_estimators': 5, 'max_depth': 10}, mean: 0.85972, std: 0.00693, params: {'n_estimators': 10, 'max_depth': 10}, mean: 0.86157, std: 0.00596, params: {'n_estimators': 20, 'max_depth': 10}, mean: 0.86944, std: 0.01028, params: {'n_estimators': 50, 'max_depth': 10}, mean: 0.82269, std: 0.02251, params: {'n_estimators': 1, 'max_depth': 20}, mean: 0.84769, std: 0.01220, params: {'n_estimators': 5, 'max_depth': 20}, mean: 0.85741, std: 0.01079, params: {'n_estimators': 10, 'max_depth': 20}, mean: 0.86435, std: 0.01150, params: {'n_estimators': 20, 'max_depth': 20}, mean: 0.86991, std: 0.01455, params: {'n_estimators': 50, 'max_depth': 20}, mean: 0.82269, std: 0.02251, params: {'n_estimators': 1, 'max_depth': 50}, mean: 0.84722, std: 0.01164, params: {'n_estimators': 5, 'max_depth': 50}, mean: 0.85648, std: 0.01



In [26]:
%%time
from sklearn.linear_model import LogisticRegression 

clflr, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="ovr"), \
                                                   {"C": [0.01, 0.1, 1.0, 10.0],"solver":["sag","newton-cg"]},
                                                   pca_train,t_train,reuse_split=reuse_split)

using reuse split
BEST {'C': 1.0, 'solver': 'newton-cg'} 0.855555555556 [mean: 0.66389, std: 0.00949, params: {'C': 0.01, 'solver': 'sag'}, mean: 0.84954, std: 0.01880, params: {'C': 0.01, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 0.1, 'solver': 'sag'}, mean: 0.85370, std: 0.01900, params: {'C': 0.1, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 1.0, 'solver': 'sag'}, mean: 0.85556, std: 0.01823, params: {'C': 1.0, 'solver': 'newton-cg'}, mean: 0.66389, std: 0.00949, params: {'C': 10.0, 'solver': 'sag'}, mean: 0.85324, std: 0.02000, params: {'C': 10.0, 'solver': 'newton-cg'}]
############# based on standard predict ################
Accuracy on training data: 0.92
Accuracy on test data:     0.86
[[ 14   0   0   0   0   2   0   1  13   1   0   0   3   0   1]
 [  0   9   0   0   0   0   0   0   0   2   0   0   5   0   0]
 [  0   0   4   3   0   2   0   0   2   0   0   0   0   0   0]
 [  0   0   2   5   0   4   0   0   2   0   0   0   0   0   0

In [None]:
len(call_set)

In [49]:
from GaussianGenerativeModel import GaussianGenerativeModel

nb1 = GaussianGenerativeModel(isSharedCovariance=False)
nb1.fit(train_set.values,t_train)
call_list=list(call_set)
t_test=nb1.predict(test_set.values,call_list)

TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
write_predictions(t_test,test_ids,"GMM2.csv")

In [None]:
nb1 = GaussianGenerativeModel(isSharedCovariance=False)
nb1.fit(pca_train,t_train)
call_list=list(call_set)
t_test=nb1.predict(pca_test,call_list)

In [11]:
# Kaggle score 0.65158

In [None]:
%%time

from sklearn.mixture import GMM

# clflr, Xtrain, ytrain, Xtest, ytest = do_classify(GMM(covariance_type='diag',random_state=None,
#                                                       n_components=15,n_iter=100), \
#                                                   {},pca_train,t_train,reuse_split=reuse_split)

clfgmm=GMM(n_components=15)
clfgmm.fit(pca_train[:2000],y=t_train[:2000])
print np.average(clfgmm.score(pca_train[2000:],y=t_train[2000:]))

clfgmm.fit(pca_train,y=t_train)
t_test=clfgmm.predict(pca_test)
write_predictions(t_test,test_ids,"GMM.csv")

# SVM

In [48]:
%%time
clflsvm, Xtrain, ytrain, Xtest, ytest = do_classify(SVC(random_state=123), \
                                                   {"C": [0.01, 0.1, 1.0, 10.0],"kernel":["sigmoid","poly","rbf","linear"]},
                                                   train_set,t_train,reuse_split=reuse_split)

using reuse split
BEST {'kernel': 'linear', 'C': 0.01} 0.859259259259 [mean: 0.51157, std: 0.00456, params: {'kernel': 'sigmoid', 'C': 0.01}, mean: 0.82269, std: 0.01483, params: {'kernel': 'poly', 'C': 0.01}, mean: 0.55602, std: 0.03667, params: {'kernel': 'rbf', 'C': 0.01}, mean: 0.85926, std: 0.01660, params: {'kernel': 'linear', 'C': 0.01}, mean: 0.51157, std: 0.00456, params: {'kernel': 'sigmoid', 'C': 0.1}, mean: 0.81898, std: 0.01642, params: {'kernel': 'poly', 'C': 0.1}, mean: 0.67083, std: 0.01117, params: {'kernel': 'rbf', 'C': 0.1}, mean: 0.85463, std: 0.01729, params: {'kernel': 'linear', 'C': 0.1}, mean: 0.51157, std: 0.00456, params: {'kernel': 'sigmoid', 'C': 1.0}, mean: 0.81898, std: 0.01946, params: {'kernel': 'poly', 'C': 1.0}, mean: 0.77593, std: 0.00996, params: {'kernel': 'rbf', 'C': 1.0}, mean: 0.85833, std: 0.01218, params: {'kernel': 'linear', 'C': 1.0}, mean: 0.51157, std: 0.00456, params: {'kernel': 'sigmoid', 'C': 10.0}, mean: 0.81852, std: 0.01861, params: {

In [49]:
clflsvm, Xtrain, ytrain, Xtest, ytest = do_classify(SVC(random_state=123), \
                                                   {"C": [0.001],"kernel":["linear"]},
                                                   train_set,t_train,reuse_split=reuse_split)

using reuse split
BEST {'kernel': 'linear', 'C': 0.001} 0.856481481481 [mean: 0.85648, std: 0.01585, params: {'kernel': 'linear', 'C': 0.001}]
############# based on standard predict ################
Accuracy on training data: 0.90
Accuracy on test data:     0.85
[[ 12   0   1   0   0   1   1   1  15   0   2   0   1   0   1]
 [  0   7   0   1   0   0   0   0   1   0   2   0   5   0   0]
 [  1   0   6   0   0   1   0   0   3   0   0   0   0   0   0]
 [  0   0   0  10   0   1   0   0   2   0   0   0   0   0   0]
 [  1   0   0   0   2   0   0   0   7   0   0   0   3   0   0]
 [  0   0   0   0   0   5   0   0   2   0   0   0   0   0   1]
 [  0   0   0   0   0   0  10   0   2   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   4   6   0   0   0   0   0   0]
 [  7   2   0   0   0   3   2   1 467   0   4   1  17   0   0]
 [  0   0   0   0   0   0   0   0   2   0   3   0   1   0   0]
 [  0   0   0   0   0   0   0   0   0   0 146   0   0   0   0]
 [  0   0   1   0   0   0   0   0   2   0  

In [50]:
X_train.shape

(3086, 106)

# Feature engineering

In [9]:
len(call_set)

106

In [10]:
'cfceeea896da8c14eb2fc96791ee98b33e843db03.VB.xml'.split('.')[0]

'cfceeea896da8c14eb2fc96791ee98b33e843db03'

In [11]:
whole_file_flow = dict()

for datafile in os.listdir('train'):
    if datafile == '.DS_Store':
        continue
        
    file1_tag_flow = []
    file1 = open('train/'+datafile, 'r')
#     file1.readline()
    for line in file1:
        tag = line.split(' ')[0][1:]
        if tag in list(call_set):
            file1_tag_flow.append(tag)
    
    file1_tag_flow_unique = [file1_tag_flow[0]]
    for i in range(1, len(file1_tag_flow)):
        if file1_tag_flow[i] == file1_tag_flow_unique[-1]:
            continue
        else :
            file1_tag_flow_unique.append(file1_tag_flow[i])
        
    whole_file_flow[datafile.split('.')[0]] = file1_tag_flow_unique
# print file1_tag_flow

In [12]:
from scipy.stats import itemfreq

file_gram_pair = dict()

for k in whole_file_flow.keys():
    file1_tag_flow_unique = whole_file_flow[k]
#     N = 4 # bi-grams
    grams=[]
#     for n in range(2,5):
    n = 4
    grams += [file1_tag_flow_unique[i:i+n] for i in xrange(len(file1_tag_flow_unique)-n)]
    #print len(grams)
    # ?itemfreq

    single_key_grams = []
    for i in grams:
        single_key_grams.append(' '.join(i))
        
    grams = []
    for i in list(itemfreq(single_key_grams)):
        grams.append([i[0], int(i[1])])
    grams.sort(key = lambda x: x[1], reverse = True) 
    file_gram_pair[k] = grams
# print itemfreq(grams)

# for i in file1_tag_flow_unique:
#     for j in file1_tag_flow_unique

In [13]:
new_features = []
for v in file_gram_pair.values():
    new_features += [i[0] for i in v if i[1] >= 4]

In [14]:
itemfreq((new_features))
feature = []
for i in list(itemfreq((new_features))):
    feature.append([i[0], int(i[1])])

feature.sort(key = lambda x: x[1], reverse = True)

In [15]:
features_for_df =[]
for i in feature[:100]:
    features_for_df.append(i[0])
features_for_df

['open_key query_value open_key query_value',
 'query_value open_key query_value open_key',
 'vm_protect vm_write vm_allocate vm_protect',
 'process thread load_image load_dll',
 'dump_line trimmed_bytes recv_socket dump_line',
 'recv_socket dump_line trimmed_bytes recv_socket',
 'trimmed_bytes recv_socket dump_line trimmed_bytes',
 'dump_line trimmed_bytes send_socket dump_line',
 'open_key enum_keys open_key query_value',
 'send_socket dump_line trimmed_bytes send_socket',
 'open_key query_value load_dll create_window',
 'trimmed_bytes send_socket dump_line trimmed_bytes',
 'load_dll open_key query_value open_key',
 'open_file open_key query_value open_file',
 'open_file open_key query_value open_key',
 'open_key query_value open_file find_file',
 'query_value open_file find_file open_key',
 'get_file_attributes read_value get_file_attributes read_value',
 'open_key query_value load_dll open_key',
 'read_value get_file_attributes read_value get_file_attributes',
 'load_dll open_key l

In [16]:
add_dict = {}
l = len(train_set) 
add_dict['id']=[]
for i in features_for_df:
    add_dict[i] = [0]*l

In [17]:
count = 0
for k, v in file_gram_pair.items():
#     add_dict = {}
    add_dict['id'].append(k)
    
    for pair in v :
        if pair[0] in features_for_df:
            add_dict[pair[0]][count] = pair[1]
    count += 1

In [18]:
gram_df = pd.DataFrame(add_dict)

In [19]:
gram_df['id']

0       d7aa4c075833e15251aa5832c136fa979e80179da
1       a62317c20af9acd81921d0ac790423e4836b88b3e
2       0e1574318cb4a8bb36a356bb3f84413e219697583
3       c7d03956f0c65913289d0a2e52c74dc5cb9ee87ad
4       a3bccc7ef9fb4be9822c0eb8f7c43a7e6d86d0176
5       4c62f50fe5ce5358cba99bd05301fb129b668d450
6       165db65964196a14eeda3B9b6bbffd8e11fef939d
7       391bba3454669d4735c976975685c3e74edc86bf9
8       bc5e68c99d4665e73b3b7298d9b76184bcf292472
9       9b1c8048fc5bc70b6086ba49e5a350a19a61e0890
10      c6fb671992f5bb3821103ccd3f62b2fd68359b63a
11      5335235c25c40c5abadcf36c6aa39c99e38729f9c
12      6671ee3537f843eacbdf717cded16b4f39895f6a2
13      fc216b615c464fd87b19f7bb733549d0ec9168f08
14      a683fd7C4e41888e7697b20b6e8096b847ff7edb6
15      57fb3150a7ef7fd52e752f79e225a384b2f2aebf4
16      4Db4401f865ecf8a46ec239f2ed9e04eed9c6f6a3
17      65c274740e82fbed73541670458120d1a14695c7f
18      90b6413989fe0cb1b0abea4229206c2252ead3244
19      9057189b7919928a7185a42d7fa09f0c7b027beea


In [20]:
file_gram_pair[file_gram_pair.keys()[0]]

[['open_key query_value open_key query_value', 63],
 ['query_value open_key query_value open_key', 56],
 ['create_file open_file set_file_time set_file_attributes', 3],
 ['enum_keys open_key query_value open_key', 3],
 ['open_key enum_keys open_key query_value', 3],
 ['process thread load_image load_dll', 3],
 ['load_dll open_key query_value open_file', 2],
 ['load_dll open_key query_value open_key', 2],
 ['open_file set_file_time set_file_attributes create_file', 2],
 ['open_key query_value load_dll create_window', 2],
 ['open_key query_value open_key enum_keys', 2],
 ['query_value open_key enum_keys open_key', 2],
 ['set_file_attributes create_file open_file set_file_time', 2],
 ['set_file_time set_file_attributes create_file open_file', 2],
 ['thread load_image load_dll open_file', 2],
 ['check_for_debugger load_dll open_key load_dll', 1],
 ['check_for_debugger load_dll open_key query_value', 1],
 ['com_create_instance get_file_attributes open_file load_dll', 1],
 ['com_create_insta

In [21]:
gram_train_df = pd.merge(train_set, gram_df, on = 'id')

In [22]:
gram_train_df = gram_train_df.drop(['class'], axis=1)
gram_train_df = gram_train_df.drop(['id'], axis=1)


In [None]:
%%time
clfrf, Xtrain, ytrain, Xtest, ytest = do_classify(RandomForestClassifier(random_state=123),
                                                   {"max_depth": [50],
                                                    "n_estimators":[20]}, 
                                                   gram_train_df,t_train,mask=mask)

In [25]:
test_set.head()

Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,connect,...,thread,trimmed_bytes,unload_driver,vm_allocate,vm_mapviewofsection,vm_protect,vm_read,vm_write,write_value,id
0,0,0,5,0,0,2,2,0,1,0,...,5,0,0,0,0,36,0,0,0,0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f
1,0,0,6,2,0,2,1,0,0,0,...,6,1,0,8,0,255,0,5,0,001f298a534ae4b0db7f2707169250aa215c3b5f2
2,0,0,2,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,001f5fdaaa8bbe20303527198d09a30bb7ca3eb50
3,0,0,6,5,0,2,1,0,1,1,...,6,13,0,0,0,72,0,0,0,002ca2c41b649f85c05ae30013436781a932fecc6
4,0,0,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,003e109543b4ea22d2bcc1ec309bf2fd34e9a1a1d


In [23]:
whole_test_file = dict()

for datafile in os.listdir('test'):
    if datafile == '.DS_Store':
        continue
        
    file1_tag_flow = []
    file1 = open('test/'+datafile, 'r')
#     file1.readline()
    for line in file1:
        tag = line.split(' ')[0][1:]
        if tag in list(call_set):
            file1_tag_flow.append(tag)
    
    file1_tag_flow_unique = [file1_tag_flow[0]]
    for i in range(1, len(file1_tag_flow)):
        if file1_tag_flow[i] == file1_tag_flow_unique[-1]:
            continue
        else :
            file1_tag_flow_unique.append(file1_tag_flow[i])
        
    whole_test_file[datafile.split('.')[0]] = file1_tag_flow_unique
# print file1_tag_flow

In [24]:
from scipy.stats import itemfreq

file_gram_pair_test = dict()

for k in whole_test_file.keys():
    file1_tag_flow_unique = whole_test_file[k]
#     N = 4 # bi-grams
    grams=[]
#     for n in range(2,5):
    n = 4
    grams += [file1_tag_flow_unique[i:i+n] for i in xrange(len(file1_tag_flow_unique)-n)]
    #print len(grams)
    # ?itemfreq

    single_key_grams = []
    for i in grams:
        single_key_grams.append(' '.join(i))
        
    grams = []
    for i in list(itemfreq(single_key_grams)):
        grams.append([i[0], int(i[1])])
    grams.sort(key = lambda x: x[1], reverse = True) 
    file_gram_pair_test[k] = grams
# print itemfreq(grams)

# for i in file1_tag_flow_unique:
#     for j in file1_tag_flow_unique

In [25]:
file_gram_pair_test[file_gram_pair_test.keys()[0]]

[['open_key query_value open_key query_value', 63],
 ['query_value open_key query_value open_key', 56],
 ['enum_keys open_key query_value open_key', 3],
 ['open_key enum_keys open_key query_value', 3],
 ['process thread load_image load_dll', 3],
 ['load_dll open_key query_value open_file', 2],
 ['load_dll open_key query_value open_key', 2],
 ['open_key query_value load_dll create_window', 2],
 ['open_key query_value open_key enum_keys', 2],
 ['query_value open_key enum_keys open_key', 2],
 ['thread load_image load_dll open_file', 2],
 ['check_for_debugger load_dll open_key load_dll', 1],
 ['check_for_debugger load_dll open_key query_value', 1],
 ['com_create_instance get_file_attributes open_file load_dll', 1],
 ['com_create_instance vm_protect open_key enum_keys', 1],
 ['com_get_class_object load_dll open_file get_file_attributes', 1],
 ['create_file open_file set_file_time set_file_attributes', 1],
 ['create_mutex find_window open_process create_mutex', 1],
 ['create_mutex get_file_a

In [26]:
add_dict_test = {}
l_test = len(test_set) 
add_dict_test['id']=[]
for i in features_for_df:
    add_dict_test[i] = [0]*l_test

In [27]:
count = 0
for k, v in file_gram_pair_test.items():
#     add_dict = {}
    add_dict_test['id'].append(k)
    
    for pair in v :
        if pair[0] in features_for_df:
            add_dict_test[pair[0]][count] = pair[1]
    count += 1

In [28]:
gram_df_test = pd.DataFrame(add_dict_test)
gram_test_df = pd.merge(test_set, gram_df_test, on = 'id')
gram_test_df = gram_test_df.drop(['id'], axis=1)


In [29]:
gram_test_df.head()

Unnamed: 0,accept_socket,add_netjob,all_section,bind_socket,change_service_config,check_for_debugger,com_create_instance,com_createole_object,com_get_class_object,connect,...,trimmed_bytes recv_socket dump_line trimmed_bytes,trimmed_bytes send_socket dump_line trimmed_bytes,vm_allocate vm_protect create_thread_remote vm_allocate,vm_allocate vm_protect vm_write vm_allocate,vm_allocate vm_protect vm_write vm_protect,vm_protect create_thread_remote vm_allocate vm_protect,vm_protect vm_write vm_allocate vm_protect,vm_protect vm_write vm_protect vm_write,vm_write vm_allocate vm_protect create_thread_remote,vm_write vm_allocate vm_protect vm_write
0,0,0,5,0,0,2,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,6,2,0,2,1,0,0,0,...,0,0,1,3,1,1,4,1,1,3
2,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,6,5,0,2,1,0,1,1,...,9,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [145]:
t_test=clfrf.predict(gram_test_df)
write_predictions(t_test,test_ids,"new_features.csv")

In [140]:
gram_test_df.shape

(3724, 206)

In [141]:
gram_train_df.shape

(3086, 207)

In [146]:
%%time
clfxbf, Xtrain, ytrain, Xtest, ytest = do_classify(GradientBoostingClassifier(),
                                                   {"learning_rate": [0.1],
                                                    "random_state":[1]}, 
                                                   gram_train_df,t_train,mask=mask)

using mask
BEST {'learning_rate': 0.1, 'random_state': 1} 0.879166666667 [mean: 0.87917, std: 0.01674, params: {'learning_rate': 0.1, 'random_state': 1}]
############# based on standard predict ################
Accuracy on training data: 0.98
Accuracy on test data:     0.90
[[ 16   0   0   0   0   0   0   1  10   3   0   1   3   1   0]
 [  0   8   0   0   0   0   0   0   2   1   0   0   4   1   0]
 [  0   0  10   0   0   1   0   0   0   0   0   0   0   0   0]
 [  0   0   0  11   0   1   0   0   1   0   0   0   0   0   0]
 [  0   0   0   0   8   0   0   0   4   0   0   0   0   1   0]
 [  0   1   0   0   0   5   0   0   2   0   0   0   0   0   0]
 [  0   0   0   0   0   0  11   0   1   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   8   2   0   0   0   0   0   0]
 [  6   1   1   0   2   0   0   0 484   0   2   1   3   3   1]
 [  0   0   0   0   0   0   0   0   2   2   0   0   1   1   0]
 [  0   0   0   0   0   0   0   0   0   0 146   0   0   0   0]
 [  0   0   1   0   0   0   0   



In [147]:
t_test=clfxbf.predict(gram_test_df)
write_predictions(t_test,test_ids,"new_features_xbf.csv")

In [54]:
%%time
from sklearn.linear_model import LogisticRegression 

clflr, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="multinomial",class_weight='balanced'), \
                                                   {"C": [0.01, 0.1, 1.0, 10.0],"solver":["newton-cg"]},
                                                   gram_train_df,t_train,reuse_split=reuse_split)


using reuse split
BEST {'C': 1.0, 'solver': 'newton-cg'} 0.772685185185 [mean: 0.75000, std: 0.01523, params: {'C': 0.01, 'solver': 'newton-cg'}, mean: 0.76991, std: 0.00790, params: {'C': 0.1, 'solver': 'newton-cg'}, mean: 0.77269, std: 0.00859, params: {'C': 1.0, 'solver': 'newton-cg'}, mean: 0.76944, std: 0.00878, params: {'C': 10.0, 'solver': 'newton-cg'}]
############# based on standard predict ################
Accuracy on training data: 0.84
Accuracy on test data:     0.77
[[ 14   0   0   2   5   3   0   1   2   1   0   0   2   5   0]
 [  0   8   0   0   0   0   0   0   0   3   0   1   4   0   0]
 [  0   0   5   3   1   2   0   0   0   0   0   0   0   0   0]
 [  0   0   2   9   0   1   0   0   1   0   0   0   0   0   0]
 [  0   0   1   0   9   0   0   0   0   1   0   0   2   0   0]
 [  0   1   0   0   0   5   0   0   2   0   0   0   0   0   0]
 [  0   0   0   0   0   0  11   0   1   0   0   0   0   0   0]
 [  1   0   0   1   0   0   0   8   0   0   0   0   0   0   0]
 [  5   1   

In [55]:
clflr2, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="multinomial"), \
                                                   {"C": [1.0],"solver":["newton-cg"]},
                                                   gram_train_df,t_train,reuse_split=reuse_split)


using reuse split
BEST {'C': 1.0, 'solver': 'newton-cg'} 0.856944444444 [mean: 0.85694, std: 0.01795, params: {'C': 1.0, 'solver': 'newton-cg'}]
############# based on standard predict ################
Accuracy on training data: 0.91
Accuracy on test data:     0.84
[[ 14   0   0   0   0   1   0   2  15   0   1   0   2   0   0]
 [  0   8   0   0   0   0   0   0   1   2   0   0   5   0   0]
 [  0   0   4   3   0   1   0   1   2   0   0   0   0   0   0]
 [  0   0   1   6   0   4   0   0   2   0   0   0   0   0   0]
 [  0   0   0   0   6   0   1   0   3   0   0   0   3   0   0]
 [  0   0   0   0   0   5   0   0   2   0   0   0   1   0   0]
 [  0   0   0   0   0   0  10   0   2   0   0   0   0   0   0]
 [  2   0   0   0   0   0   0   4   4   0   0   0   0   0   0]
 [  6   0   2   0   0   1   3   1 466   1   6   1  17   0   0]
 [  1   1   0   0   0   0   0   0   2   0   1   0   1   0   0]
 [  0   0   0   0   0   0   0   0   0   0 146   0   0   0   0]
 [  1   0   0   0   0   0   0   0   1   0

In [33]:
#t_test=clflr.predict(gram_test_df)
write_predictions(t_test,test_ids,"ggm2.csv")

In [30]:
from GaussianGenerativeModel import GaussianGenerativeModel

nb1 = GaussianGenerativeModel(isSharedCovariance=False)
nb1.fit(gram_train_df.values,t_train)
call_list=list(call_set)
t_test=nb1.predict(gram_test_df.values,call_list)

In [48]:
def convertT(C,cls):
    num_cls=len(cls)
    nobs=len(C)
    classes=np.zeros((nobs,num_cls))
    for j in np.arange(nobs):
        for i in np.arange(num_cls):
            if C[j]==cls[i]:
                classes[j,i]=1
    return classes

clslist=np.arange(15)
classes = convertT(t_train, clslist)
freq=np.sum(classes,axis=0)/np.sum(classes)
clswgt = dict(zip(clslist,freq))

In [49]:
clswgt

{0: 0.036941023979261182,
 1: 0.016202203499675955,
 2: 0.011989630589760207,
 3: 0.010369410239792612,
 4: 0.013285806869734284,
 5: 0.012637718729747246,
 6: 0.017174335709656513,
 7: 0.013285806869734284,
 8: 0.52138690861957226,
 9: 0.0068049254698639011,
 10: 0.17563188593648738,
 11: 0.010369410239792612,
 12: 0.12184057031756319,
 13: 0.019118600129617629,
 14: 0.012961762799740765}

In [61]:
%%time
from sklearn.linear_model import LogisticRegression 

clflr, Xtrain, ytrain, Xtest, ytest = do_classify(LogisticRegression(penalty="l2",random_state=12,\
                                                                     multi_class="multinomial",class_weight=clswgt), \
                                                   {"C": [0.01, 0.1, 1.0],"solver":["newton-cg"]},
                                                   gram_train_df,t_train,mask=mask)




using mask
BEST {'C': 0.1, 'solver': 'newton-cg'} 0.770833333333 [mean: 0.75509, std: 0.01378, params: {'C': 0.01, 'solver': 'newton-cg'}, mean: 0.77083, std: 0.04428, params: {'C': 0.1, 'solver': 'newton-cg'}, mean: 0.76759, std: 0.04508, params: {'C': 1.0, 'solver': 'newton-cg'}]
############# based on standard predict ################
Accuracy on training data: 0.77
Accuracy on test data:     0.75
[[  4   0   1   0   0   1   0   0  28   0   0   0   0   1   0]
 [  0   1   0   0   0   0   0   0   8   0   2   0   5   0   0]
 [  1   0   3   1   0   0   0   0   6   0   0   0   0   0   0]
 [  0   0   1   7   0   1   0   0   2   0   1   0   1   0   0]
 [  0   1   0   0   0   0   0   0  12   0   0   0   0   0   0]
 [  0   0   0   0   0   3   0   0   5   0   0   0   0   0   0]
 [  0   0   0   0   0   0   9   0   2   0   0   0   1   0   0]
 [  0   0   0   0   0   0   0   3   7   0   0   0   0   0   0]
 [  5   1   1   1   0   1   1   0 484   0   6   1   3   0   0]
 [  0   0   0   0   0   0   0

In [41]:
%%time
from sklearn.ensemble import GradientBoostingClassifier
clfxbf, Xtrain, ytrain, Xtest, ytest = do_classify(GradientBoostingClassifier(),
                                                   {"learning_rate": [0.05 ,0.1, 0.2],
                                                    "max_depth":[3,5,10]}, 
                                                   gram_train_df,t_train,mask=mask)

using mask
BEST {'learning_rate': 0.05, 'max_depth': 5} 0.880092592593 [mean: 0.87963, std: 0.01939, params: {'learning_rate': 0.05, 'max_depth': 3}, mean: 0.88009, std: 0.01562, params: {'learning_rate': 0.05, 'max_depth': 5}, mean: 0.87269, std: 0.01289, params: {'learning_rate': 0.05, 'max_depth': 10}, mean: 0.87917, std: 0.01914, params: {'learning_rate': 0.1, 'max_depth': 3}, mean: 0.87731, std: 0.01143, params: {'learning_rate': 0.1, 'max_depth': 5}, mean: 0.87870, std: 0.01247, params: {'learning_rate': 0.1, 'max_depth': 10}, mean: 0.87315, std: 0.01231, params: {'learning_rate': 0.2, 'max_depth': 3}, mean: 0.87361, std: 0.01357, params: {'learning_rate': 0.2, 'max_depth': 5}, mean: 0.87593, std: 0.01306, params: {'learning_rate': 0.2, 'max_depth': 10}]
############# based on standard predict ################
Accuracy on training data: 0.99
Accuracy on test data:     0.90
[[ 16   0   0   0   0   0   0   1  11   3   0   1   2   1   0]
 [  0   8   0   1   0   0   0   0   1   1   0



In [42]:
t_test=clfxbf.predict(gram_test_df)
write_predictions(t_test,test_ids,"gbc.csv")