In [1]:
import pandas as pd

# Read the bag-of-words feature table, parsing the target column as bool,
# then re-encode that target as integers (False -> 0, True -> 1).
data = pd.read_csv(
    'Data/Ankidroid_bow_features.csv',
    dtype={'is_vulnerable': bool},
).assign(
    is_vulnerable=lambda frame: frame['is_vulnerable'].map({False: 0, True: 1})
)


In [2]:
# Target vector: the 0/1 `is_vulnerable` column.
y = data['is_vulnerable']
# Feature matrix: every column except the target.
x = data.drop(columns='is_vulnerable')
# Number of feature columns; later cells use it to size the network input.
SEQUENCE_LENGTH = len(x.columns)

In [3]:
from sklearn.model_selection import train_test_split
# Fraction of the rows held out for testing.
TEST_SPLIT = 0.2
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
split_parts = train_test_split(
    x,
    y,
    test_size=TEST_SPLIT,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import tensorflow as tf

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees; the fixed seed makes the fitted forest repeatable.
forest_settings = dict(n_estimators=1000, random_state=42)
random_forest_classifier = RandomForestClassifier(**forest_settings)
# Left as the last expression so the notebook displays the fitted estimator.
random_forest_classifier.fit(x_train, y_train)

In [None]:
print("TRAIN")

# Hard labels feed the threshold-based metrics; class probabilities feed the
# ranking metrics (ROC AUC, average precision).
predicted_rf = random_forest_classifier.predict(x_train)
predicted_prob_rf = random_forest_classifier.predict_proba(x_train)
# predict_proba orders its columns like classifier.classes_, so with 0/1 labels
# column 1 holds the probability of the positive class.
positive_score_rf = predicted_prob_rf[:, 1]

confusion = sklearn.metrics.confusion_matrix(y_true=y_train, y_pred=predicted_rf)
print(confusion)
# For binary labels ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=y_train, y_pred=predicted_rf)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=y_train, y_pred=predicted_rf)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=y_train, y_pred=predicted_rf)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=y_train, y_pred=predicted_rf)))
print("False Positive Rate:" + str(fp/(tn+fp)))
# AUC and average precision need continuous scores: argmax labels collapse the
# curve to a single operating point and misreport both values.
print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=y_train, y_score=positive_score_rf)))
print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=y_train, y_score=positive_score_rf)))
print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=y_train, y_pred=predicted_rf)))

In [None]:
print("TEST")

# Hard labels feed the threshold-based metrics; class probabilities feed the
# ranking metrics (ROC AUC, average precision). Later cells (ROC plot, summary)
# reuse predicted_rf / predicted_prob_rf as the TEST-set predictions.
predicted_rf = random_forest_classifier.predict(x_test)
predicted_prob_rf = random_forest_classifier.predict_proba(x_test)
# predict_proba orders its columns like classifier.classes_, so with 0/1 labels
# column 1 holds the probability of the positive class.
positive_score_rf = predicted_prob_rf[:, 1]

confusion = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=predicted_rf)
print(confusion)
# For binary labels ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predicted_rf)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=y_test, y_pred=predicted_rf)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=y_test, y_pred=predicted_rf)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=y_test, y_pred=predicted_rf)))
print("False Positive Rate:" + str(fp/(tn+fp)))
# AUC and average precision need continuous scores: argmax labels collapse the
# curve to a single operating point and misreport both values.
print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=y_test, y_score=positive_score_rf)))
print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=y_test, y_score=positive_score_rf)))
print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=y_test, y_pred=predicted_rf)))

# ResNet

In [None]:
def bn_relu(layer, dropout=0, **params):
    """Apply batch normalisation, an activation, and optionally dropout.

    Args:
        layer: Keras tensor to transform.
        dropout: Dropout rate. A Dropout layer is appended only when this is
            greater than zero.
        **params: Must contain ``conv_activation``, the value handed to
            ``tf.keras.layers.Activation``.

    Returns:
        The output tensor of the last layer that was applied.

    Raises:
        KeyError: If ``conv_activation`` is not supplied.
    """
    normalised = tf.keras.layers.BatchNormalization()(layer)
    activated = tf.keras.layers.Activation(params['conv_activation'])(normalised)

    # Guard clause: without a positive rate the activation output is the result.
    if not dropout > 0:
        return activated
    return tf.keras.layers.Dropout(dropout)(activated)

In [None]:
def resnet_block(layer, filters, kernels, dropout, activation,
                 cross_block=False, is_first=False, is_last=False, shrink=False):
    """Build one residual block and return its output tensor.

    Layout of the main branch, with the shortcut added at the end:

        -BN-Act-Conv-BN-Act-Conv--
        ↳-----------------------↑

    Args:
        layer: Input Keras tensor.
        filters: Number of filters for every convolution in the block.
        kernels: Kernel size of the two main-branch convolutions.
        dropout: Dropout rate forwarded to ``bn_relu``.
        activation: Activation forwarded to ``bn_relu`` as ``conv_activation``.
        cross_block: When true, the shortcut goes through its own convolution
            (kernel size and stride both equal to the block stride) instead of
            being the unchanged input.
        is_first: When true, the leading BN/activation step is skipped.
        is_last: When true, a trailing BN/activation step follows the addition.
        shrink: When true, the block stride is 2 instead of 1.

    Returns:
        The tensor produced by adding the shortcut to the main branch (after
        the trailing BN/activation when ``is_last`` is set).
    """
    stride = 2 if shrink else 1
    # Settings shared by every convolution created in this block.
    conv_common = dict(filters=filters,
                       kernel_initializer='random_uniform',
                       padding='same')

    # Shortcut path: projected by a strided convolution, or the raw input.
    if cross_block:
        skip = tf.keras.layers.Conv1D(kernel_size=stride,
                                      strides=stride,
                                      **conv_common)(layer)
    else:
        skip = layer

    # Main path: (BN-Act)-Conv-BN-Act-Conv; only the first conv is strided.
    branch = layer
    if not is_first:
        branch = bn_relu(branch, dropout=dropout, conv_activation=activation)
    branch = tf.keras.layers.Conv1D(kernel_size=kernels,
                                    strides=stride,
                                    **conv_common)(branch)
    branch = bn_relu(branch, dropout=dropout, conv_activation=activation)
    branch = tf.keras.layers.Conv1D(kernel_size=kernels,
                                    strides=1,
                                    **conv_common)(branch)

    merged = tf.keras.layers.add([skip, branch])
    if is_last:
        merged = bn_relu(merged, dropout=dropout, conv_activation=activation)
    return merged

In [None]:
# Build and compile the 1-D ResNet classifier.

OUTPUT_SHAPE = 2  # number of softmax outputs, one per class
# Each sample is fed as a (1, SEQUENCE_LENGTH) array.
# NOTE(review): `input` shadows the Python builtin; the name is kept unchanged
# in case other cells refer to it.
input = tf.keras.layers.Input(shape=(1, SEQUENCE_LENGTH))

# Stem convolution ahead of the residual blocks.
layer = tf.keras.layers.Conv1D(filters=32,
                               kernel_size=3,
                               kernel_initializer='random_uniform',
                               strides=1,
                               padding='same')(input)

# One 32-filter block, then three pairs (64, 128, 256 filters); the first block
# of each pair is strided and uses a convolutional shortcut.
layer = resnet_block(layer=layer, filters=32, kernels=3, dropout=0, activation='relu')

layer = resnet_block(layer, 64, 3, 0, 'relu', cross_block=True, shrink=True)
layer = resnet_block(layer, 64, 3, 0, 'relu')

layer = resnet_block(layer, 128, 3, 0, 'relu', cross_block=True, shrink=True)
layer = resnet_block(layer, 128, 3, 0, 'relu')

layer = resnet_block(layer, 256, 3, 0, 'relu', cross_block=True, shrink=True)
layer = resnet_block(layer, 256, 3, 0, 'relu')
layer = tf.keras.layers.Flatten()(layer)
output = tf.keras.layers.Dense(units=OUTPUT_SHAPE, activation='softmax')(layer)

model = tf.keras.Model(inputs=[input], outputs=[output])

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
# With metrics=['accuracy'] TF2 logs the validation metric as `val_accuracy`.
# Monitoring the non-existent `val_acc` only emits a warning, so training would
# never stop early and the best weights would never be restored.
monitor = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=1e-3, patience=25, mode='auto', restore_best_weights=True)

In [None]:
BATCH_SIZE = 150  # passed as batch_size to model.fit and model.evaluate
EPOCHS = 100  # passed as epochs to model.fit
VALIDATION_SPLIT = 0.1  # passed as validation_split to model.fit

In [None]:
# Show the training labels as this cell's output.
y_train

In [None]:
# Reshape every feature row to (1, SEQUENCE_LENGTH) and one-hot encode the
# 0/1 labels for the two-unit softmax output.
one_hot_rows = np.eye(2)  # row i is the one-hot vector for class i

x_train_reshaped = np.reshape(np.array(x_train), (-1, 1, SEQUENCE_LENGTH))
y_train_reshaped = one_hot_rows[y_train]

x_test_reshaped = np.reshape(np.array(x_test), (-1, 1, SEQUENCE_LENGTH))
y_test_reshaped = one_hot_rows[y_test]

In [None]:
# Train the network. Part of the training data is held out for validation via
# validation_split, and the `monitor` callback is attached to the run.
fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
    callbacks=[monitor],
)
# Left as the last expression so the notebook shows the returned History object.
model.fit(x_train_reshaped, y_train_reshaped, **fit_options)

In [None]:
print("TRAIN")

# Loss and compiled metrics as reported by Keras itself.
results = model.evaluate(x_train_reshaped, y_train_reshaped, batch_size=BATCH_SIZE)

for metric_name, metric_value in zip(model.metrics_names, results):
    print(metric_name+': '+str(metric_value))

# Softmax output has one column per class; argmax gives the hard label and
# column 1 is the score of the positive class (label 1).
predicted_prob = model.predict(x_train_reshaped)
predicted = np.argmax(predicted_prob,axis = 1)
positive_score = predicted_prob[:, 1]

confusion = sklearn.metrics.confusion_matrix(y_true=y_train, y_pred=predicted)
print(confusion)
# For binary labels ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=y_train, y_pred=predicted)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=y_train, y_pred=predicted)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=y_train, y_pred=predicted)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=y_train, y_pred=predicted)))
print("False Positive Rate:" + str(fp/(tn+fp)))
# AUC and average precision need continuous scores: argmax labels collapse the
# curve to a single operating point and misreport both values.
print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=y_train, y_score=positive_score)))
print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=y_train, y_score=positive_score)))
print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=y_train, y_pred=predicted)))

In [None]:
print("TEST")

# Loss and compiled metrics as reported by Keras itself.
results = model.evaluate(x_test_reshaped, y_test_reshaped, batch_size=BATCH_SIZE)

for metric_name, metric_value in zip(model.metrics_names, results):
    print(metric_name+': '+str(metric_value))

# Softmax output has one column per class; argmax gives the hard label and
# column 1 is the score of the positive class (label 1). The summary cell
# reuses predicted / predicted_prob as the TEST-set predictions.
predicted_prob = model.predict(x_test_reshaped)
predicted = np.argmax(predicted_prob,axis = 1)
positive_score = predicted_prob[:, 1]

confusion = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=predicted)
print(confusion)
# For binary labels ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predicted)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=y_test, y_pred=predicted)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=y_test, y_pred=predicted)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=y_test, y_pred=predicted)))
print("False Positive Rate:" + str(fp/(tn+fp)))
# AUC and average precision need continuous scores: argmax labels collapse the
# curve to a single operating point and misreport both values.
print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=y_test, y_score=positive_score)))
print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=y_test, y_score=positive_score)))
print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=y_test, y_pred=predicted)))

# SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM. probability=True enables predict_proba, which the metric
# and ROC cells rely on. The probability calibration shuffles the data
# internally, so random_state is fixed to keep predict_proba reproducible
# across runs, matching the seeded split and random forest.
print("SVM (kernel = linear)")
support_vector_classifier = SVC(kernel='linear', probability=True, random_state=42)
# Left as the last expression so the notebook displays the fitted estimator.
support_vector_classifier.fit(x_train, y_train)

In [None]:
print("TRAIN")
# Hard labels feed the threshold-based metrics; class probabilities feed the
# ranking metrics (ROC AUC, average precision).
predicted_svm = support_vector_classifier.predict(x_train)
predicted_prob_svm = support_vector_classifier.predict_proba(x_train)
# predict_proba orders its columns like classifier.classes_, so with 0/1 labels
# column 1 holds the probability of the positive class.
positive_score_svm = predicted_prob_svm[:, 1]

confusion = sklearn.metrics.confusion_matrix(y_true=y_train, y_pred=predicted_svm)
print(confusion)
# For binary labels ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=y_train, y_pred=predicted_svm)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=y_train, y_pred=predicted_svm)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=y_train, y_pred=predicted_svm)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=y_train, y_pred=predicted_svm)))
print("False Positive Rate:" + str(fp/(tn+fp)))
# AUC and average precision need continuous scores: argmax labels collapse the
# curve to a single operating point and misreport both values.
print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=y_train, y_score=positive_score_svm)))
print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=y_train, y_score=positive_score_svm)))
print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=y_train, y_pred=predicted_svm)))

In [None]:
print("TEST")

# Hard labels feed the threshold-based metrics; class probabilities feed the
# ranking metrics (ROC AUC, average precision). Later cells (ROC plot, summary)
# reuse predicted_svm / predicted_prob_svm as the TEST-set predictions.
predicted_svm = support_vector_classifier.predict(x_test)
predicted_prob_svm = support_vector_classifier.predict_proba(x_test)
# predict_proba orders its columns like classifier.classes_, so with 0/1 labels
# column 1 holds the probability of the positive class.
positive_score_svm = predicted_prob_svm[:, 1]

confusion = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=predicted_svm)
print(confusion)
# For binary labels ravel() yields the cells in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion.ravel()
print('\nTP:',tp)
print('FP:',fp)
print('TN:',tn)
print('FN:',fn)

## Performance measure
print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predicted_svm)))
print('Precision: '+ str(sklearn.metrics.precision_score(y_true=y_test, y_pred=predicted_svm)))
print('Recall: '+ str(sklearn.metrics.recall_score(y_true=y_test, y_pred=predicted_svm)))
print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=y_test, y_pred=predicted_svm)))
print("False Positive Rate:" + str(fp/(tn+fp)))
# AUC and average precision need continuous scores: argmax labels collapse the
# curve to a single operating point and misreport both values.
print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=y_test, y_score=positive_score_svm)))
print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=y_test, y_score=positive_score_svm)))
print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=y_test, y_pred=predicted_svm)))

In [None]:
# ROC curves for the SVM and the random forest on the test split.
# predicted_prob_svm / predicted_prob_rf must hold TEST-set probabilities, i.e.
# the "TEST" cell of each model must be the last one that assigned them.
fpr_svm, tpr_svm, _ = sklearn.metrics.roc_curve(y_test, predicted_prob_svm[:, 1])
roc_auc_svm = sklearn.metrics.auc(fpr_svm, tpr_svm)

fpr_rf, tpr_rf, _ = sklearn.metrics.roc_curve(y_test, predicted_prob_rf[:, 1])
roc_auc_rf = sklearn.metrics.auc(fpr_rf, tpr_rf)

plt.figure()
# Each legend label now names the model whose curve and AUC it accompanies
# (the two label texts were previously swapped).
plt.plot(fpr_svm, tpr_svm, color='darkorange',
         lw=2, label='svm(area = %0.2f)' % roc_auc_svm)
plt.plot(fpr_rf, tpr_rf, color='darkgreen',
         lw=2, label='Randomforest (area = %0.2f)' % roc_auc_rf)
# Diagonal reference line: the ROC curve of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Summary

In [None]:
# Test-set summary for the three models, printed in one shared format.
# The predicted_* / predicted_prob_* variables must hold TEST-set predictions,
# i.e. the "TEST" cell of each model must be the last one that assigned them.
summary_inputs = (
    ("RANDOM FOREST", predicted_rf, predicted_prob_rf),
    ("SVM", predicted_svm, predicted_prob_svm),
    ("RESNET", predicted, predicted_prob),
)

for position, (model_name, y_pred, y_prob) in enumerate(summary_inputs):
    # Blank separator between sections (none before the first one).
    if position:
        print("\n")

    print(model_name)
    print('\nAccuracy: '+ str(sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_pred)))
    print('Precision: '+ str(sklearn.metrics.precision_score(y_true=y_test, y_pred=y_pred)))
    print('Recall: '+ str(sklearn.metrics.recall_score(y_true=y_test, y_pred=y_pred)))
    print('F-measure: '+ str(sklearn.metrics.f1_score(y_true=y_test, y_pred=y_pred)))
    confusion = sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
    # For binary labels ravel() yields the cells in the order tn, fp, fn, tp.
    tn, fp, fn, tp = confusion.ravel()
    print("False Positive Rate:" + str(fp/(tn+fp)))
    # AUC and average precision need continuous scores, so use the
    # positive-class column (column 1) rather than argmax labels, which
    # collapse the curve to a single operating point.
    positive_score = y_prob[:, 1]
    print('AUC: '+ str(sklearn.metrics.roc_auc_score(y_true=y_test, y_score=positive_score)))
    print('Precision-Recall AUC: '+ str(sklearn.metrics.average_precision_score(y_true=y_test, y_score=positive_score)))
    print('MCC: '+ str(sklearn.metrics.matthews_corrcoef(y_true=y_test, y_pred=y_pred)))