Training and testing the model

In [1]:
#import libraries
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import joblib

In [2]:
#Load the data
#
#Each split lives in its own .npz archive that is read for an 'inputs' and a
#'targets' array. The "scaled_*" archives feed the "*_pred" variables.

def _load_split(npz_path):
    """Read one dataset split from an ``.npz`` archive.

    Parameters
    ----------
    npz_path : str
        Path to an archive containing an 'inputs' and a 'targets' array.

    Returns
    -------
    tuple of numpy.ndarray
        ``(inputs, targets)`` with inputs cast to float64 and targets cast
        to int64.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. It is kept because it cannot be seen
    # from here whether the archives need it; only load trusted files.
    # The context manager closes the archive's file handle, which a bare
    # np.load() call would leave open.
    with np.load(npz_path, allow_pickle=True) as archive:
        # astype() returns fresh arrays, so they stay valid after the close.
        return archive['inputs'].astype(np.float64), archive['targets'].astype(np.int64)

#Train data
train_inputs, train_targets = _load_split('../../../../../../data/processed/urls/phishing/train_data.npz')

#Validation data
validation_inputs, validation_targets = _load_split('../../../../../../data/processed/urls/phishing/validation_data.npz')

#Test data
test_inputs, test_targets = _load_split('../../../../../../data/processed/urls/phishing/test_data.npz')

#Train data scaled
train_inputs_pred, train_targets_pred = _load_split('../../../../../../data/processed/urls/phishing/scaled_train_data.npz')

#Validation data scaled
validation_inputs_pred, validation_targets_pred = _load_split('../../../../../../data/processed/urls/phishing/scaled_validation_data.npz')

#Test data scaled
test_inputs_pred, test_targets_pred = _load_split('../../../../../../data/processed/urls/phishing/scaled_test_data.npz')

In [10]:
#Train the model on the scaled data

#RBF-kernel SVM fitted on the scaled training split
rbf_kernel_svm_clf_s = SVC(C=1000, gamma=0.05, kernel="rbf")
rbf_kernel_svm_clf_s.fit(X=train_inputs_pred, y=train_targets_pred)

In [11]:
#Train the model on the regular data

#Same RBF-kernel SVM settings, fitted on the unscaled training split
rbf_kernel_svm_clf = SVC(C=1000, gamma=0.05, kernel="rbf")
rbf_kernel_svm_clf.fit(X=train_inputs, y=train_targets)

In [12]:
#Evaluate the models
#
#Each classifier predicts on the variant of the data it was fitted on:
#rbf_kernel_svm_clf on the regular arrays, rbf_kernel_svm_clf_s on the
#scaled ("*_pred") arrays.

#Train data
y_pred_train = rbf_kernel_svm_clf.predict(train_inputs)
#Train data scaled
y_pred_train_s = rbf_kernel_svm_clf_s.predict(train_inputs_pred)

#Validation data
y_pred_val = rbf_kernel_svm_clf.predict(validation_inputs)
#Validation data scaled
y_pred_val_s = rbf_kernel_svm_clf_s.predict(validation_inputs_pred)

#Test data
y_pred_test = rbf_kernel_svm_clf.predict(test_inputs)
#Test data scaled
y_pred_test_s = rbf_kernel_svm_clf_s.predict(test_inputs_pred)

#f1_score takes (y_true, y_pred), the same order the confusion_matrix calls
#in this notebook use. Binary F1 is symmetric in its two arguments, so the
#printed values are the same as with the arguments reversed.
print("F1 Score train inputs:", f1_score(train_targets, y_pred_train, pos_label=1))
print("F1 Score train scaled inputs:", f1_score(train_targets_pred, y_pred_train_s, pos_label=1))

print("F1 Score validation inputs:", f1_score(validation_targets, y_pred_val, pos_label=1))
print("F1 Score validation scaled inputs:", f1_score(validation_targets_pred, y_pred_val_s, pos_label=1))

#NOTE(review): the recorded run shows a test F1 of about 0.04 against about
#0.94 on validation for both models. That gap looks like a problem with the
#test split (labels or preprocessing) rather than with the models - TODO
#confirm before relying on these numbers.
print("F1 Score test inputs:", f1_score(test_targets, y_pred_test, pos_label=1))
print("F1 Score test scaled inputs:", f1_score(test_targets_pred, y_pred_test_s, pos_label=1))

F1 Score train inputs: 0.9830696027442736
F1 Score train scaled inputs: 0.9518126548074758
F1 Score validation inputs: 0.9438351611831173
F1 Score validation scaled inputs: 0.9377958079783638
F1 Score test inputs: 0.04489930670188181
F1 Score test scaled inputs: 0.03347840642785403


In [16]:
#Confusion matrix of the unscaled model on the validation split
#(rows: true labels, columns: predicted labels)
confusion_matrix(y_true=validation_targets, y_pred=y_pred_val)

array([[1484,   77],
       [  92, 1420]])

In [17]:
#Confusion matrix of the scaled-data model on the scaled validation split
#(rows: true labels, columns: predicted labels)
confusion_matrix(y_true=validation_targets_pred, y_pred=y_pred_val_s)

array([[1502,   59],
       [ 125, 1387]])

In [18]:
#Persist both fitted classifiers with joblib

#Destination of the model trained on the regular data
unscaled_model_path = '../../../../../../models/malicius_url/phishing/svm/fishing_url_detection.pkl'
#Destination of the model trained on the scaled data
scaled_model_path = '../../../../../../models/malicius_url/phishing/svm/fishing_url_detection_scaled.pkl'

joblib.dump(rbf_kernel_svm_clf, unscaled_model_path)
#Kept as the last expression so the cell still displays the written file list
joblib.dump(rbf_kernel_svm_clf_s, scaled_model_path)

['../../../../../../models/malicius_url/phishing/svm/fishing_url_detection_scaled.pkl']