Training and testing the model

In [15]:
#import libraries

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import joblib

In [2]:
#Load the data
#Train data
npz = np.load('../../../../../../data/processed/urls/phishing/train_data.npz',  allow_pickle=True)
train_inputs, train_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

#Validation data
npz = np.load('../../../../../../data/processed/urls/phishing/validation_data.npz',  allow_pickle=True)
validation_inputs, validation_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

#Test data
npz = np.load('../../../../../../data/processed/urls/phishing/test_data.npz',  allow_pickle=True)
test_inputs, test_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

In [3]:
#Define and train the model
MAX_DEPTH = 10

clf_tree = DecisionTreeClassifier(max_depth=MAX_DEPTH, random_state=42)
clf_tree.fit(train_inputs, train_targets)

In [4]:
#Evaluate the model

#Train data
y_pred_train = clf_tree.predict(train_inputs)
#Validation data
y_pred_val = clf_tree.predict(validation_inputs)
#Test data
y_pred_test = clf_tree.predict(test_inputs)

print("F1 Score train inputs:", f1_score(y_pred_train, train_targets, pos_label=1))
print("F1 Score validation inputs:", f1_score(y_pred_val, validation_targets, pos_label=1))
print("F1 Score test inputs:", f1_score(y_pred_test, test_targets, pos_label=1))

F1 Score train inputs: 0.9677780264041173
F1 Score validation inputs: 0.9453072248480756
F1 Score test inputs: 0.028791429527954472


In [5]:
confusion_matrix(validation_targets, y_pred_val)

array([[1511,   50],
       [ 112, 1400]])

In [8]:
#Model selection
param_distribs = {
    'min_samples_split': randint(low=1, high=8),
    'max_depth': randint(low=8, high=50),
}

clf_tree_test = DecisionTreeClassifier()

rnd_search = RandomizedSearchCV(clf_tree_test, param_distributions=param_distribs,
n_iter=8, cv=4, scoring='f1_weighted')
rnd_search.fit(train_inputs, train_targets)

In [9]:
rnd_search.best_params_

{'max_depth': 42, 'min_samples_split': 3}

In [10]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("F1 score:", mean_score, "-", "Params:", params)

F1 score: 0.9455389912535846 - Params: {'max_depth': 26, 'min_samples_split': 5}
F1 score: 0.9469479253816662 - Params: {'max_depth': 16, 'min_samples_split': 6}
F1 score: 0.9464088245634581 - Params: {'max_depth': 35, 'min_samples_split': 6}
F1 score: 0.9438610791299571 - Params: {'max_depth': 10, 'min_samples_split': 6}
F1 score: 0.9454283182017946 - Params: {'max_depth': 44, 'min_samples_split': 5}
F1 score: 0.945404903448661 - Params: {'max_depth': 12, 'min_samples_split': 6}
F1 score: 0.948579981332337 - Params: {'max_depth': 42, 'min_samples_split': 3}
F1 score: 0.9477100947274524 - Params: {'max_depth': 29, 'min_samples_split': 4}


In [11]:
#Chose the best model
clf_tree_optimize = rnd_search.best_estimator_

In [12]:
#Evaluate the model

#Train data
y_pred_train = clf_tree_optimize.predict(train_inputs)
#Validation data
y_pred_val = clf_tree_optimize.predict(validation_inputs)
#Test data
y_pred_test = clf_tree_optimize.predict(test_inputs)

print("F1 Score train inputs:", f1_score(y_pred_train, train_targets, pos_label=1))
print("F1 Score validation inputs:", f1_score(y_pred_val, validation_targets, pos_label=1))
print("F1 Score test inputs:", f1_score(y_pred_test, test_targets, pos_label=1))

F1 Score train inputs: 0.99104081406924
F1 Score validation inputs: 0.93812375249501
F1 Score test inputs: 0.037000330360092495


In [16]:
#Save the models

#Regular model
joblib.dump(clf_tree, '../../../../../../models/malicius_url/phishing/decision_tree/fishing_url_detection.pkl')
#Optimize model
joblib.dump(clf_tree_optimize, '../../../../../../models/malicius_url/phishing/decision_tree/fishing_url_detection_optimize.pkl')

['../../../../../../models/malicius_url/phishing/decision_tree/fishing_url_detection_optimize.pkl']