Training and testing the model

In [13]:
#import libraries

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import joblib

In [14]:
#Load the data
#Train data
npz = np.load('../../../../../../data/processed/urls/malware/train_data.npz',  allow_pickle=True)
train_inputs, train_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

#Validation data
npz = np.load('../../../../../../data/processed/urls/malware/validation_data.npz',  allow_pickle=True)
validation_inputs, validation_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

#Test data
npz = np.load('../../../../../../data/processed/urls/malware/test_data.npz',  allow_pickle=True)
test_inputs, test_targets = npz['inputs'].astype(np.float64), npz['targets'].astype(np.int64)

In [15]:
#Define and train the model
MAX_DEPTH = 10

clf_tree = DecisionTreeClassifier(max_depth=MAX_DEPTH, random_state=42)
clf_tree.fit(train_inputs, train_targets)

In [16]:
#Evaluate the model

#Train data
y_pred_train = clf_tree.predict(train_inputs)
#Validation data
y_pred_val = clf_tree.predict(validation_inputs)
#Test data
y_pred_test = clf_tree.predict(test_inputs)

print("F1 Score train inputs:", f1_score(y_pred_train, train_targets, pos_label=1))
print("F1 Score validation inputs:", f1_score(y_pred_val, validation_targets, pos_label=1))
print("F1 Score test inputs:", f1_score(y_pred_test, test_targets, pos_label=1))

F1 Score train inputs: 0.9630918354666336
F1 Score validation inputs: 0.9498181818181818
F1 Score test inputs: 0.9515482695810564


In [17]:
confusion_matrix(validation_targets, y_pred_val)

array([[1455,   90],
       [  48, 1306]])

In [18]:
#Model selection
param_distribs = {
    'min_samples_split': randint(low=1, high=8),
    'max_depth': randint(low=8, high=50),
}

clf_tree_test = DecisionTreeClassifier()

rnd_search = RandomizedSearchCV(clf_tree_test, param_distributions=param_distribs,
n_iter=8, cv=4, scoring='f1_weighted')
rnd_search.fit(train_inputs, train_targets)

8 fits failed out of a total of 32.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/josecamacho/Desktop/Projects/security/ml/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/josecamacho/Desktop/Projects/security/ml/venv/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 889, in fit
    super().fit(
  File "/Users/josecamacho/Desktop/Projects/security/ml/venv/lib/python3.11/site-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/Users/josecamacho/Desktop/Projects/security/ml/venv/lib/p

In [19]:
rnd_search.best_params_

{'max_depth': 45, 'min_samples_split': 2}

In [20]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("F1 score:", mean_score, "-", "Params:", params)

F1 score: nan - Params: {'max_depth': 33, 'min_samples_split': 1}
F1 score: 0.9707940560166366 - Params: {'max_depth': 18, 'min_samples_split': 7}
F1 score: 0.9724070677534444 - Params: {'max_depth': 22, 'min_samples_split': 6}
F1 score: 0.9747130805746845 - Params: {'max_depth': 45, 'min_samples_split': 2}
F1 score: nan - Params: {'max_depth': 8, 'min_samples_split': 1}
F1 score: 0.9718290642158902 - Params: {'max_depth': 16, 'min_samples_split': 7}
F1 score: 0.9727548501394078 - Params: {'max_depth': 44, 'min_samples_split': 6}
F1 score: 0.9711412051417951 - Params: {'max_depth': 49, 'min_samples_split': 7}


In [21]:
#Chose the best model
clf_tree_optimized = rnd_search.best_estimator_

In [22]:
#Evaluate the model

#Train data
y_pred_train = clf_tree_optimized.predict(train_inputs)
#Validation data
y_pred_val = clf_tree_optimized.predict(validation_inputs)
#Test data
y_pred_test = clf_tree_optimized.predict(test_inputs)

print("F1 Score train inputs:", f1_score(y_pred_train, train_targets, pos_label=1))
print("F1 Score validation inputs:", f1_score(y_pred_val, validation_targets, pos_label=1))
print("F1 Score test inputs:", f1_score(y_pred_test, test_targets, pos_label=1))

F1 Score train inputs: 0.9997497497497498
F1 Score validation inputs: 0.9700511322132943
F1 Score test inputs: 0.985390796201607


In [23]:
#Save the models

#Regular model
joblib.dump(clf_tree, '../../../../../../models/malicius_url/malware/decision_tree/malware_url_detection.pkl')
#Optimize model
joblib.dump(clf_tree_optimized, '../../../../../../models/malicius_url/malware/decision_tree/malware_url_detection_optimized.pkl')

['../../../../../../models/malicius_url/malware/decision_tree/malware_url_detection_optimized.pkl']