In [6]:
import numpy as np
from time import time
import ast
import utils as lu
import logging

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [7]:
def FP_without_pua(y_pred, y_test):
    """Share of samples that are truly class 0 but were predicted as class 2.

    Only a class-2 prediction on a class-0 sample is counted; class-1
    predictions are ignored. Labels are presumably 0 = clean, 1 = PUA,
    2 = malware -- TODO confirm against the label loader.

    NOTE(review): the count is divided by the total number of samples, not by
    the number of class-0 samples, so this is not the textbook false-positive
    rate FP / (FP + TN) even though the experiment cells log it as "FPR".
    The denominator is kept unchanged so logged numbers stay comparable.

    Args:
        y_pred: sequence of predicted labels.
        y_test: sequence of true labels, same length as ``y_pred``.

    Returns:
        float: counted false positives divided by the number of samples;
        0.0 when the inputs are empty.

    Raises:
        ValueError: if ``y_pred`` and ``y_test`` differ in length.
    """
    n_samples = len(y_pred)
    if len(y_test) != n_samples:
        raise ValueError(
            'y_pred and y_test must have the same length, got {} and {}'.format(
                n_samples, len(y_test)))
    if n_samples == 0:
        # Nothing was predicted, so nothing can be a false positive.
        return 0.0

    # zip() pairs values by position; indexing with y_test[i] would do a
    # label lookup on containers such as a pandas Series with a shuffled index.
    false_positives = sum(
        1 for predicted, actual in zip(y_pred, y_test)
        if predicted == 2 and actual == 0
    )
    return false_positives / n_samples


def FP_with_pua(y_pred, y_test):
    """Share of samples that are truly class 0 but were predicted as class 1 or 2.

    Both class-1 and class-2 predictions on a class-0 sample are counted.
    Labels are presumably 0 = clean, 1 = PUA, 2 = malware -- TODO confirm
    against the label loader.

    NOTE(review): the count is divided by the total number of samples, not by
    the number of class-0 samples, so this is not the textbook false-positive
    rate FP / (FP + TN) even though the experiment cells log it as "FPR".
    The denominator is kept unchanged so logged numbers stay comparable.

    Args:
        y_pred: sequence of predicted labels.
        y_test: sequence of true labels, same length as ``y_pred``.

    Returns:
        float: counted false positives divided by the number of samples;
        0.0 when the inputs are empty.

    Raises:
        ValueError: if ``y_pred`` and ``y_test`` differ in length.
    """
    n_samples = len(y_pred)
    if len(y_test) != n_samples:
        raise ValueError(
            'y_pred and y_test must have the same length, got {} and {}'.format(
                n_samples, len(y_test)))
    if n_samples == 0:
        # Nothing was predicted, so nothing can be a false positive.
        return 0.0

    # zip() pairs values by position; indexing with y_test[i] would do a
    # label lookup on containers such as a pandas Series with a shuffled index.
    false_positives = sum(
        1 for predicted, actual in zip(y_pred, y_test)
        if (predicted == 1 or predicted == 2) and actual == 0
    )
    return false_positives / n_samples

In [8]:
# Inputs for every experiment below: a label listing and a precomputed
# embeddings file. The file name suggests 300-dim embeddings trained for
# 30 epochs at lr 0.3 -- presumably; confirm with whoever produced it.
# NOTE(review): `embeddings` is an absolute path on one user's machine, so this
# cell only runs there; consider a configurable data directory.
all_labels_file = 'all_labels.txt'
embeddings = '/Users/grigoriipogorelov/Desktop/KL_graph_embeddings/gexf_graphs_dims_300_epochs_30_lr_0.3_embeddings.txt'
# `lu` is the project-local `utils` module. X and Y are presumably the feature
# matrix and label vector (later cells pass them to train_test_split and read
# .shape on the splits) -- verify against utils.X_Y_from_embeddings.
X, Y = lu.X_Y_from_embeddings(all_labels_file, embeddings)
# Presumably reports the class proportions found in Y -- confirm in utils.
lu.test_proportions(Y)

clean 0.5, pua 0.3, malw 0.2


In [9]:
# Linear SVM experiment: grid-search C with cross-validation on a 75% training
# split, evaluate the refit best model on the held-out 25%, and write the
# metrics log plus raw predictions / true labels under `results_dir`.
# Relies on X and Y from the data-loading cell and on FP_without_pua /
# FP_with_pua defined above.

# logging.basicConfig() does nothing while the root logger still has handlers
# (which it does after any earlier experiment cell ran in this kernel), so the
# old handlers are detached first. Each one is also closed: a handler that is
# removed but left open keeps the previous log file open in the kernel.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()

# Single place for the output location of this experiment.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
results_dir = '/Users/grigoriipogorelov/Desktop/KL_graph_embeddings/results'
log_path = results_dir + '/svm_linear_not_equal_dim300.log'
logging.basicConfig(filename=log_path, filemode="w", level=logging.INFO)

# One seed for both the split and the classifier so a re-run reproduces the
# logged numbers.
seed = 12345

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=seed)
logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

logging.info('===================== SVM [linear] =====================')
params = [{'C':[0.01,0.1,1,10]}]

# LinearSVC draws random numbers while fitting (see its `random_state`
# parameter); without a fixed seed two runs on the same split can differ.
classifier = GridSearchCV(LinearSVC(random_state=seed), params, scoring='accuracy',verbose=1)
start = time()
classifier.fit(X_train,Y_train)
end = time()

logging.info('fit time is %.1f min' % ((end-start)/60))
logging.info('best classifier model\'s hyperparameters '+ str(classifier.best_params_))

# GridSearchCV.predict delegates to the best estimator found by the search.
Y_pred = classifier.predict(X_test)
acc = accuracy_score(Y_test, Y_pred)

logging.info('Linear SVM accuracy: {}'.format(acc))
logging.info(classification_report(Y_test, Y_pred))
logging.info('FPR without pua: {}'.format(FP_without_pua(Y_pred, Y_test)))
logging.info('FPR with pua: {}'.format(FP_with_pua(Y_pred, Y_test)))
# Keep the raw predictions and matching true labels for later analysis.
np.savetxt(results_dir + '/svm_lin_not_equal_dim300_pred.txt',Y_pred)
np.savetxt(results_dir + '/svm_lin_not_equal_dim300_test.txt',Y_test)
logging.info('======================================================')

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  6.7min finished


In [10]:
classifier.best_params_

{'C': 0.1}

In [5]:
# k-NN experiment: fit a k-nearest-neighbours classifier on a 75% training
# split, evaluate on the held-out 25%, and write the metrics log plus raw
# predictions / true labels under `results_dir`.
# Relies on X and Y from the data-loading cell and on FP_without_pua /
# FP_with_pua defined above.

# logging.basicConfig() does nothing while the root logger still has handlers
# (which it does after any earlier experiment cell ran in this kernel), so the
# old handlers are detached first. Each one is also closed: a handler that is
# removed but left open keeps the previous log file open in the kernel.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()

# Single place for the output location of this experiment.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
results_dir = '/Users/grigoriipogorelov/Desktop/KL_graph_embeddings/results'
log_path = results_dir + '/kNN_not_equal_dim300.log'
logging.basicConfig(filename=log_path, filemode="w", level=logging.INFO)

# Same test_size and random_state as the linear SVM cell.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=12345)
logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

logging.info('===================== k-NN =====================')
# NOTE(review): the grid holds a single candidate, so the search selects
# nothing and only adds the cross-validation fits before the final refit;
# widen the grid or fit KNeighborsClassifier directly if that cost matters.
params = [{'n_neighbors':[5]}]

classifier = GridSearchCV(KNeighborsClassifier(), params, scoring='accuracy', verbose=1)
start = time()
classifier.fit(X_train,Y_train)
end = time()

logging.info('fit time is %.1f min' % ((end-start)/60))
logging.info('best classifier model\'s hyperparameters '+ str(classifier.best_params_))

# GridSearchCV.predict delegates to the best estimator found by the search.
Y_pred = classifier.predict(X_test)
acc = accuracy_score(Y_test, Y_pred)

# The label used to read 'Linear SVM accuracy' (copied from the SVM cell),
# which mislabelled the metric inside the k-NN log file.
logging.info('k-NN accuracy: {}'.format(acc))
logging.info(classification_report(Y_test, Y_pred))
logging.info('FPR without pua: {}'.format(FP_without_pua(Y_pred, Y_test)))
logging.info('FPR with pua: {}'.format(FP_with_pua(Y_pred, Y_test)))
# Keep the raw predictions and matching true labels for later analysis.
np.savetxt(results_dir + '/kNN_not_equal_dim300_pred.txt',Y_pred)
np.savetxt(results_dir + '/kNN_not_equal_dim300_test.txt',Y_test)
logging.info('======================================================')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 