In [61]:
import os
import json
import numpy as np
from sklearn.model_selection import train_test_split
from pprint import pprint

In [29]:
syscalls = {"data": [], "cat": []}
syscalls

{'cat': [], 'data': []}

In [31]:
## add benign syscalls
filePath_b = "../data/benign"

for fileName in os.listdir(filePath_b):
    fileName = os.path.join(filePath_b, fileName)
    
    if os.path.isfile(fileName):
        data = json.load(open(fileName))
        syscall = " ".join(data["system_call"])
        syscalls["data"].append(syscall)
        syscalls["cat"].append(0)

In [37]:
## add malicious syscalls
filePath_m = "../data/malicious"

for fileName in os.listdir(filePath_m):
    fileName = os.path.join(filePath_m, fileName)
    
    if os.path.isfile(fileName):
        data = json.load(open(fileName))
        syscall = " ".join(data["system_call"])
        syscalls["data"].append(syscall)
        syscalls["cat"].append(1)

In [41]:
len(syscalls['data'])

[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

In [39]:
syscalls['cat'][95:105]

[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

In [82]:
## train_test_split 20%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(syscalls['data'], syscalls['cat'], test_size=0.2, random_state=10)
len(X_test)
#skf = cross_validation.StratifiedKFold(y, n_folds=10) #2-fold cross validation

31

In [51]:
X_test

['ioctl ioctl restart_syscall futex futex restart_syscall recvmsg rt_sigtimedwait restart_syscall ioctl ioctl sched_setscheduler ioctl getpriority mprotect clock_gettime ioctl sched_setscheduler ioctl ioctl ioctl ioctl ioctl ioctl ioctl dup close epoll_ctl ioctl ioctl ioctl ioctl ioctl ioctl clock_gettime clock_gettime clock_gettime recvfrom ioctl clock_gettime ioctl ioctl dup fcntl64 close epoll_ctl mprotect ioctl ioctl getpid getuid32 epoll_wait recvfrom recvfrom clock_gettime clock_gettime getuid32 writev ioctl mmap2 sched_setscheduler madvise getpriority mprotect mprotect clone clock_gettime sched_setscheduler sched_setscheduler getpriority ioctl clock_gettime ioctl sched_setscheduler getpid ioctl getuid32 epoll_wait clock_gettime getpid getuid32 epoll_wait clock_gettime clock_gettime clock_gettime recvfrom clock_gettime clock_gettime write mprotect mprotect mprotect mmap2 madvise mmap2 madvise mprotect clone futex set_thread_area mmap2 madvise sigaltstack prctl gettid futex futex 

In [52]:
## Tokenizing text N-gram
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(2, 2))
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(123, 2247)

In [54]:
## tf_idf
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(123, 2247)

In [77]:
## building a pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 5))),
                     ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=10,
                                           max_iter=5, tol=None)),
                    ])

In [78]:
## fit
text_clf.fit(X_train, y_train)  

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=10, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [79]:
## predict
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)    

0.74193548387096775

In [80]:
## F1-score
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.72      1.00      0.84        21
          1       1.00      0.20      0.33        10

avg / total       0.81      0.74      0.68        31



In [81]:
metrics.confusion_matrix(y_test, predicted)

array([[21,  0],
       [ 8,  2]])