Training and testing the model

In [1]:
#import libraries

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import joblib

In [2]:
#Load the data
#Train data
npz = np.load('../../../../../../data/processed/urls/phishing/train_data.npz',  allow_pickle=True)
train_inputs, train_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

#Validation data
npz = np.load('../../../../../../data/processed/urls/phishing/validation_data.npz',  allow_pickle=True)
validation_inputs, validation_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

#Test data
npz = np.load('../../../../../../data/processed/urls/phishing/test_data.npz',  allow_pickle=True)
test_inputs, test_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

In [3]:
#Define and train the model
MAX_DEPTH = 10

clf_tree = DecisionTreeClassifier(max_depth=MAX_DEPTH, random_state=42)
clf_tree.fit(train_inputs, train_targets)

In [4]:
#Evaluate the model

#Train data
y_pred_train = clf_tree.predict(train_inputs)
#Validation data
y_pred_val = clf_tree.predict(validation_inputs)
#Test data
y_pred_test = clf_tree.predict(test_inputs)

print("F1 Score train inputs:", f1_score(y_pred_train, train_targets, pos_label=1))
print("F1 Score validation inputs:", f1_score(y_pred_val, validation_targets, pos_label=1))
print("F1 Score test inputs:", f1_score(y_pred_test, test_targets, pos_label=1))

F1 Score train inputs: 0.9673717371737174
F1 Score validation inputs: 0.941455160744501
F1 Score test inputs: 0.9434604904632152


In [5]:
confusion_matrix(validation_targets, y_pred_val)

array([[1509,   52],
       [ 121, 1391]])

In [6]:
#Model selection
param_distribs = {
    'min_samples_split': randint(low=1, high=8),
    'max_depth': randint(low=8, high=50),
}

clf_tree_test = DecisionTreeClassifier()

rnd_search = RandomizedSearchCV(clf_tree_test, param_distributions=param_distribs,
n_iter=8, cv=4, scoring='f1_weighted')
rnd_search.fit(train_inputs, train_targets)

In [7]:
rnd_search.best_params_

{'max_depth': 25, 'min_samples_split': 6}

In [8]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("F1 score:", mean_score, "-", "Params:", params)

F1 score: 0.9535692103368238 - Params: {'max_depth': 25, 'min_samples_split': 5}
F1 score: 0.9534575986039844 - Params: {'max_depth': 16, 'min_samples_split': 3}
F1 score: 0.9535702517196215 - Params: {'max_depth': 48, 'min_samples_split': 5}
F1 score: 0.9528134713109804 - Params: {'max_depth': 29, 'min_samples_split': 4}
F1 score: 0.9535693775257781 - Params: {'max_depth': 20, 'min_samples_split': 6}
F1 score: 0.9541111728782423 - Params: {'max_depth': 25, 'min_samples_split': 6}
F1 score: 0.9531364396090851 - Params: {'max_depth': 29, 'min_samples_split': 5}
F1 score: 0.953758985493113 - Params: {'max_depth': 11, 'min_samples_split': 6}


In [9]:
#Chose the best model
clf_tree_optimized = rnd_search.best_estimator_

In [10]:
#Evaluate the model

#Train data
y_pred_train = clf_tree_optimized.predict(train_inputs)
#Validation data
y_pred_val = clf_tree_optimized.predict(validation_inputs)
#Test data
y_pred_test = clf_tree_optimized.predict(test_inputs)

print("F1 Score train inputs:", f1_score(y_pred_train, train_targets, pos_label=1))
print("F1 Score validation inputs:", f1_score(y_pred_val, validation_targets, pos_label=1))
print("F1 Score test inputs:", f1_score(y_pred_test, test_targets, pos_label=1))

F1 Score train inputs: 0.9905014358294677
F1 Score validation inputs: 0.9503498833722093
F1 Score test inputs: 0.9515212303577399


In [11]:
#Save the models

#Regular model
joblib.dump(clf_tree, '../../../../../../models/malicius_url/phishing/decision_tree/phishing_url_detection.pkl')
#Optimize model
joblib.dump(clf_tree_optimized, '../../../../../../models/malicius_url/phishing/decision_tree/phishing_url_detection_optimized.pkl')

['../../../../../../models/malicius_url/phishing/decision_tree/phishing_url_detection_optimized.pkl']