# Multi-Class Classification with Machine Learning
In this notebook, we will explore various machine learning models to solve a multi-class classification problem. We will evaluate and compare the performance of different algorithms on the dataset.


In [34]:
import ast
import json
import random
import re
import string
from collections import defaultdict, OrderedDict
from pathlib import Path
from typing import Union, Any

import numpy
import pandas
from bayes_opt import BayesianOptimization
from imblearn.over_sampling import SMOTE
from numpy import asarray
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


In [3]:
INIT_POINTS = 3
N_ITER = 5
RANDOM_STATE = 42
TEST_SIZE = 2e-1

COLAB_PATH = Path('/content/drive/MyDrive')
KAGGLE_PATH = Path('/kaggle/input')
LOCAL_PATH = Path('./')

# Step 1: Check if running in Google Colab
try:
    import google.colab

    DATA_PATH = COLAB_PATH / Path('data')
    MODELS_PATH = COLAB_PATH / Path('models')
except ImportError:
    # Step 2: Check if running in Kaggle
    try:
        import kaggle_secrets

        DATA_PATH = KAGGLE_PATH
        MODELS_PATH = KAGGLE_PATH
    except ImportError:
        # Step 3: Default to local Jupyter Notebook
        DATA_PATH = LOCAL_PATH / Path('data')
        MODELS_PATH = LOCAL_PATH / Path('models')

GLOVE_6B_PATH = MODELS_PATH / Path('glove-embeddings')
THREAT_TWEETS_PATH = DATA_PATH / Path('tweets-dataset-for-cyberattack-detection')

GLOVE_6B_300D_TXT = GLOVE_6B_PATH / Path('glove.6B.300d.txt')
THREAT_TWEETS_CSV = THREAT_TWEETS_PATH / Path('tweets_final.csv')


## Functions


### Preprocessing


In [4]:
def extract_keys(d, path=None):
    """
    Recursively extract keys from a dictionary, building paths as a list.

    Parameters
    ----------
    d : dict or any
        The dictionary to extract keys from.
    path : list, optional
        A list to accumulate the path, default is None.

    Returns
    -------
    list
        A list of paths representing keys in the dictionary.
    """
    if path is None:
        path = []

    if isinstance(d, dict):
        for key, value in d.items():
            path = extract_keys(value, path + [key])
    else:
        path = [d]

    return path


def build_tree(categories):
    """
    Build a tree-like structure (nested dictionary) from category labels.

    Parameters
    ----------
    categories : list of dict
        A list of categories, where each category has a 'label' key that contains a path-like string.

    Returns
    -------
    dict
        A nested dictionary representing the tree structure.
    """
    tree = {}
    for category in categories:
        current = tree
        for part in category['label'].strip('/').split('/'):
            current = current.setdefault(part, {})
    return tree


def merge_trees_with_counts(tree1, tree2, visit_count):
    """
    Merge two trees recursively and count the visits to each node.

    Parameters
    ----------
    tree1 : dict
        The first tree to be merged.
    tree2 : dict
        The second tree to be merged.
    visit_count : defaultdict(int)
        A dictionary that tracks the visit count for each node.

    Returns
    -------
    dict
        The merged tree after processing both input trees.
    """
    for key, value in tree2.items():
        if key not in tree1:
            tree1[key] = value
        elif isinstance(value, dict) and isinstance(tree1[key], dict):
            merge_trees_with_counts(tree1[key], value, visit_count)

        # Count visits for the node
        visit_count[key] += 1
    return tree1


def merge_all_trees_with_counts(trees):
    """
    Merge all trees into one general tree and count the visits to each node.

    Parameters
    ----------
    trees : list of dict
        A list of trees (dictionaries) to be merged.

    Returns
    -------
    (dict, defaultdict)
        The merged tree with all nodes, and a dictionary mapping each node to its visit count.

    """
    visit_count = defaultdict(int)
    unique_trees = [json.loads(json.dumps(tree, sort_keys=True)) for tree in trees]
    general_tree = {}

    for tree in unique_trees:
        general_tree = merge_trees_with_counts(tree1=general_tree, tree2=tree, visit_count=visit_count)

    return general_tree, visit_count


### Training


In [5]:
def load_word2vec_dict(model_path: Path, embedding_dim: int) -> dict[
    Union[str, list[str]], numpy.ndarray[Any, numpy.dtype]]:
    embeddings_dict = {}

    f = open(model_path, 'r', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[:-embedding_dim]

        if type(word) is list:
            word = ' '.join(word)

        vector = asarray([float(val) for val in values[-embedding_dim:]])
        embeddings_dict[word] = vector
    f.close()

    return embeddings_dict


def preprocess_texts(list_str, model_path, embedding_dim):
    word2vec_dict = load_word2vec_dict(
        model_path=model_path,
        embedding_dim=embedding_dim
    )
    list_embedded_str = numpy.zeros((len(list_str), embedding_dim))
    for i, text in enumerate(list_str):
        try:
            tokens = re.findall(r'\w+|[{}]'.format(re.escape(string.punctuation)), text)
            for token in tokens:
                try:
                    list_embedded_str[i] += word2vec_dict[token.lower()]
                except KeyError:
                    continue
        except:
            print(text)
            return
    return list_embedded_str


#### Linear Regression Optimization


In [6]:
def lr_optimization(C, max_iter, penalty):
    # Convert penalty type
    penalty = round(penalty)  # 0: 'l1', 1: 'l2'
    penalty_mapping = {0: 'l1', 1: 'l2'}
    penalty_type = penalty_mapping[penalty]

    # Apply SMOTE to handle class imbalance
    smote = SMOTE(random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Define the LogisticRegression model with OneVsRestClassifier
    model = OneVsRestClassifier(LogisticRegression(
        C=C,
        max_iter=round(max_iter),
        penalty=penalty_type,
        solver='liblinear',
        random_state=RANDOM_STATE
    ))

    # Fit the model on the resampled data
    model.fit(X_resampled, y_resampled)

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Calculate the metrics
    accuracy = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred, average='weighted')

    # Return the objective function value
    return (accuracy + auc) / 2


## Table of Contents
1. [Introduction](#1-introduction)
2. [Data Loading & Preprocessing](#2-data-loading-and-preprocessing)
3. [Exploratory Data Analysis (EDA)](#3-exploratory-data-analysis-eda)
4. [Model Training](#4-model-training)
    1. [Logistic Regression](#41-logistic-regression)
    2. [Gaussian Naïve Bayes](#42-gaussian-naïve-bayes)
    3. [Decision Tree Classifier](#43-decision-tree-classifier)
    4. [Support Vector Classifier](#44-support-vector-classifier)
    5. [Random Forest Classifier](#45-random-forest-classifier)
    6. [XGBoost Classifier](#46-xgboost-classifier)
5. [Model Evaluation](#5-model-evaluation)
    1. [Accuracy](#Accuracy)
    2. [Confusion Matrix](#Confusion-Matrix)
    3. [Classification Report](#Classification-Report)
6. [Conclusion](#Conclusion)


## 1. Introduction

In this notebook, we are going to solve a multi-class classification problem using different machine learning models. Our goal is to predict the class of each sample based on the input features.


## 2. Data Loading and Preprocessing
We will load the dataset, inspect its structure, and preprocess it for machine learning models.


In [7]:
# Read the CSV file and process columns in one step
threat_tweets = (
    pandas.read_csv(filepath_or_buffer=THREAT_TWEETS_CSV)
    .assign(
        tweet=lambda df: df['tweet'].apply(func=ast.literal_eval),
        watson=lambda df: df['watson'].apply(func=ast.literal_eval)
        .apply(func=lambda x: x.get('categories', []))
        .apply(func=build_tree),
        watson_list=lambda df: df['watson'].apply(func=extract_keys),
    )
    .query(expr='relevant == True')
    .drop(labels=['relevant'], axis=1)
    .dropna(subset=['text'], ignore_index=True)
)

threat_tweets.head()


Unnamed: 0,_id,date,id,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list
0,b'5b8876f9bb325e65fa7e78e4',2018-08-30 23:00:08+00:00,1035301167952211969,Protect your customers access Prestashop Ant...,{'created_at': 'Thu Aug 30 23:00:08 +0000 2018...,ddos,{'technology and computing': {'internet techno...,threat,['http://addons.prestashop.com/en/23513-anti-d...,https://addons.prestashop.com/en/23513-anti-dd...,True,"[technology and computing, internet technology..."
1,b'5b8876f9bb325e65fa7e78e5',2018-08-30 23:00:09+00:00,1035301173178249217,Data leak from Huazhu Hotels may affect 130 mi...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,leak,"{'travel': {'hotels': {}}, 'home and garden': ...",threat,['http://www.hotelmanagement.net/tech/data-lea...,http://www.hotelmanagement.net/tech/data-leak-...,True,"[travel, hotels, home and garden, home improve..."
2,b'5b8876fabb325e65fa7e78e6',2018-08-30 23:00:09+00:00,1035301174583353344,Instagram App 41.1788.50991.0 #Denial Of #Serv...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,general,{'science': {'weather': {'meteorological disas...,threat,['https://packetstormsecurity.com/files/149120...,https://packetstormsecurity.com/files/149120/i...,True,"[science, weather, meteorological disaster, hu..."
3,b'5b88770abb325e65fa7e78e7',2018-08-30 23:00:25+00:00,1035301242271096832,(good slides): \n\nThe Advanced Exploitation o...,{'created_at': 'Thu Aug 30 23:00:25 +0000 2018...,vulnerability,{'business and industrial': {'business operati...,threat,['https://twitter.com/i/web/status/10353012422...,https://twitter.com/i/web/status/1035301242271...,True,"[business and industrial, business operations,..."
4,b'5b887713bb325e65fa7e78e8',2018-08-30 23:00:35+00:00,1035301282095853569,CVE-2018-1000532 (beep)\nhttps://t.co/CaKbo38U...,{'created_at': 'Thu Aug 30 23:00:35 +0000 2018...,vulnerability,{'technology and computing': {'computer securi...,threat,['https://web.nvd.nist.gov/view/vuln/detail?vu...,https://nvd.nist.gov/vuln/detail/CVE-2018-1000532,True,"[technology and computing, computer security, ..."


In [8]:
print(f"Number of CS related tweets:\t{len(threat_tweets)}")


Number of CS related tweets:	11112


In [9]:
general_tree, visit_count = merge_all_trees_with_counts(threat_tweets['watson'])


In [10]:
print("The subcategories in 'technology and computing' are:")
for category in list(general_tree['technology and computing'].keys()):
    print(f'· {category}')


The subcategories in 'technology and computing' are:
· computer security
· internet technology
· software
· hardware
· operating systems
· data centers
· mp3 and midi
· computer reviews
· programming languages
· consumer electronics
· tech news
· networking
· electronic components
· computer crime
· enterprise technology
· computer certification
· technological innovation
· technical support


In [11]:
sorted_visit_count = dict(sorted(visit_count.items(), key=lambda item: item[1], reverse=True))

with open('general_tree.json', 'w') as file:
    file.write(json.dumps(general_tree, indent=4))

with open('general_tree_visit_counts.json', 'w') as file:
    file.write(json.dumps(sorted_visit_count, indent=4))


## 3. Exploratory Data Analysis (EDA)
Let's analyze the dataset and gain insights into its distribution.


In [12]:
print('At macro categories are:')
for category in list(general_tree.keys()):
    print(f'· {category}')


At macro categories are:
· technology and computing
· health and fitness
· home and garden
· travel
· art and entertainment
· science
· business and industrial
· sports
· finance
· law, govt and politics
· society
· real estate
· pets
· style and fashion
· news
· hobbies and interests
· food and drink
· education
· shopping
· family and parenting
· religion and spirituality
· automotive and vehicles
· careers


For the goal of the project, the categories of interest are:
1. computer security/network security
2. computer security/antivirus and malware
3. operating systems/mac os
4. operating systems/windows
5. operating systems/unix
6. operating systems/linux
7. software
8. programming languages, included in software
9. software/databases
10. hardware
11. electronic components, included in hardware
12. hardware/computer/servers
13. hardware/computer/portable computer
14. hardware/computer/desktop computer
15. hardware/computer components
16. hardware/computer networking/router
17. hardware/computer networking/wireless technology
18. networking
19. internet technology, included in networking


In [13]:
FIX_TARGETS = {
    'computer security': 'computer security',
    'operating systems': 'operating systems',
    'software': 'software',
    'programming languages': 'software',
    'hardware': 'hardware',
    'electronic components': 'hardware',
    'networking': 'networking',
    'internet technology': 'networking'
}

chosen_categories = [
    list(set(FIX_TARGETS.keys()) & set(s))
    for s in threat_tweets['watson_list']
]

for i, watson_list in enumerate(chosen_categories):
    temp = list(set([FIX_TARGETS[c] for c in watson_list]))
    if len(temp) < 1:
        temp = ['other']
    chosen_categories[i] = temp

threat_tweets['target'] = chosen_categories
threat_tweets.head()


Unnamed: 0,_id,date,id,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list,target
0,b'5b8876f9bb325e65fa7e78e4',2018-08-30 23:00:08+00:00,1035301167952211969,Protect your customers access Prestashop Ant...,{'created_at': 'Thu Aug 30 23:00:08 +0000 2018...,ddos,{'technology and computing': {'internet techno...,threat,['http://addons.prestashop.com/en/23513-anti-d...,https://addons.prestashop.com/en/23513-anti-dd...,True,"[technology and computing, internet technology...","[software, computer security, networking]"
1,b'5b8876f9bb325e65fa7e78e5',2018-08-30 23:00:09+00:00,1035301173178249217,Data leak from Huazhu Hotels may affect 130 mi...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,leak,"{'travel': {'hotels': {}}, 'home and garden': ...",threat,['http://www.hotelmanagement.net/tech/data-lea...,http://www.hotelmanagement.net/tech/data-leak-...,True,"[travel, hotels, home and garden, home improve...",[other]
2,b'5b8876fabb325e65fa7e78e6',2018-08-30 23:00:09+00:00,1035301174583353344,Instagram App 41.1788.50991.0 #Denial Of #Serv...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,general,{'science': {'weather': {'meteorological disas...,threat,['https://packetstormsecurity.com/files/149120...,https://packetstormsecurity.com/files/149120/i...,True,"[science, weather, meteorological disaster, hu...",[hardware]
3,b'5b88770abb325e65fa7e78e7',2018-08-30 23:00:25+00:00,1035301242271096832,(good slides): \n\nThe Advanced Exploitation o...,{'created_at': 'Thu Aug 30 23:00:25 +0000 2018...,vulnerability,{'business and industrial': {'business operati...,threat,['https://twitter.com/i/web/status/10353012422...,https://twitter.com/i/web/status/1035301242271...,True,"[business and industrial, business operations,...",[operating systems]
4,b'5b887713bb325e65fa7e78e8',2018-08-30 23:00:35+00:00,1035301282095853569,CVE-2018-1000532 (beep)\nhttps://t.co/CaKbo38U...,{'created_at': 'Thu Aug 30 23:00:35 +0000 2018...,vulnerability,{'technology and computing': {'computer securi...,threat,['https://web.nvd.nist.gov/view/vuln/detail?vu...,https://nvd.nist.gov/vuln/detail/CVE-2018-1000532,True,"[technology and computing, computer security, ...","[software, computer security, hardware]"


## 4. Model Training

We will now train different models and evaluate their performance.


In [14]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(threat_tweets['target'])

X = preprocess_texts(
    list_str=threat_tweets['text'],
    model_path=GLOVE_6B_300D_TXT,
    embedding_dim=300
)


In [23]:
X.shape, y.shape

((11112, 300), (11112, 6))

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True
)


### 4.1. Logistic Regression


In [19]:
# Define the hyperparameter space
pbounds = {
    'C': (0.01, 10),  # Regularization strength
    'max_iter': (100, 1000),  # Max iterations
    'penalty': (0, 1)  # Penalty type
}

optimizer = BayesianOptimization(
    f=lr_optimization,  # The objective function
    pbounds=OrderedDict(sorted(pbounds.items())),  # The bounds of the hyperparameters
    random_state=RANDOM_STATE
)

# Perform the optimization
optimizer.maximize(init_points=INIT_POINTS, n_iter=N_ITER)


|   iter    |  target   |     C     | max_iter  |  penalty  |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.8337   [39m | [39m3.752    [39m | [39m955.6    [39m | [39m0.732    [39m |
| [39m2        [39m | [39m0.8337   [39m | [39m5.991    [39m | [39m240.4    [39m | [39m0.156    [39m |
| [35m3        [39m | [35m0.8352   [39m | [35m0.5903   [39m | [35m879.6    [39m | [35m0.6011   [39m |
| [39m4        [39m | [39m0.8343   [39m | [39m0.8968   [39m | [39m878.5    [39m | [39m0.3206   [39m |
| [39m5        [39m | [39m0.8328   [39m | [39m0.4566   [39m | [39m879.1    [39m | [39m0.8655   [39m |
| [39m6        [39m | [39m0.8337   [39m | [39m3.259    [39m | [39m531.2    [39m | [39m0.1428   [39m |
| [39m7        [39m | [39m0.8337   [39m | [39m9.43     [39m | [39m431.1    [39m | [39m0.2213   [39m |
| [39m8        [39m | [39m0.8337   [39m | [39m8.942    [39m | [39m875.4    [39m | [39m0.4132   [39m |


In [20]:
# Get the best hyperparameters found by the optimizer
best_params = optimizer.max['params']
print(f"Best hyperparameters: {best_params}")

# Use the best parameters to train the final model
penalty_mapping = {0: 'l1', 1: 'l2'}
classifier_lr = OneVsRestClassifier(LogisticRegression(
    C=best_params['C'],
    max_iter=round(best_params['max_iter']),
    penalty=penalty_mapping[round(best_params['penalty'])],
    solver='liblinear',
    random_state=RANDOM_STATE
))

# Fit the model with the best parameters
classifier_lr.fit(X_train, y_train)


Best hyperparameters: {'C': 0.5902552855603126, 'max_iter': 879.5585311974417, 'penalty': 0.6011150117432088}


### 4.2. Gaussian Naïve Bayes


In [17]:
def gnb_optimization(var_smoothing):
    # Create and fit the OneVsRestClassifier with GaussianNB
    gnb = GaussianNB(var_smoothing=var_smoothing)
    model = OneVsRestClassifier(estimator=gnb)
    model.fit(X=X_train, y=y_train)

    # Make predictions
    y_pred = model.predict(X=X_val)

    # Calculate accuracy and AUC
    accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
    auc = roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted')

    # Return the average of accuracy and AUC as the objective to maximize
    return (accuracy + auc) / 2


In [18]:
# Set the bounds for the hyperparameters
pbounds = {
    'var_smoothing': (1e-10, 1e-1)  # The range for the regularization parameter (log scale)
}
pbounds = dict(sorted(pbounds.items()))

# Create the Bayesian Optimization object
optimizer = BayesianOptimization(
    f=gnb_optimization,
    pbounds=pbounds,
    random_state=RANDOM_STATE
)

# Perform the optimization process
optimizer.maximize()


|   iter    |  target   | var_sm... |
-------------------------------------
| [39m1        [39m | [39m0.3526   [39m | [39m0.03745  [39m |


  warn(msg, stacklevel=1)


| [39m2        [39m | [39m0.344    [39m | [39m0.09507  [39m |
| [39m3        [39m | [39m0.3464   [39m | [39m0.0732   [39m |
| [39m4        [39m | [39m0.3473   [39m | [39m0.05987  [39m |
| [35m5        [39m | [35m0.3572   [39m | [35m0.0156   [39m |
| [35m6        [39m | [35m0.3736   [39m | [35m1.164e-06[39m |
| [39m7        [39m | [39m0.3696   [39m | [39m0.003087 [39m |
| [39m8        [39m | [39m0.3736   [39m | [39m1.102e-05[39m |
| [39m9        [39m | [39m0.3736   [39m | [39m5.538e-07[39m |
| [39m10       [39m | [39m0.3736   [39m | [39m1.674e-06[39m |
| [39m11       [39m | [39m0.3736   [39m | [39m8.433e-07[39m |
| [39m12       [39m | [39m0.3736   [39m | [39m2.701e-05[39m |
| [39m13       [39m | [39m0.3736   [39m | [39m1.3e-05  [39m |
| [39m14       [39m | [39m0.3736   [39m | [39m1.288e-05[39m |
| [39m15       [39m | [39m0.3736   [39m | [39m2.935e-06[39m |
| [39m16       [39m | [39m0.3736   [39m | [

In [19]:
# Get the best hyperparameters found by the optimizer
best_params = optimizer.max['params']
print(f"Best hyperparameters: {best_params}")

# Use the best parameters to train the final model
gnb_model = GaussianNB(var_smoothing=best_params['var_smoothing'])
classifier_gnb = OneVsRestClassifier(estimator=gnb_model)

# Fit the model with the best parameters
classifier_gnb.fit(X=X_train, y=y_train)


Best hyperparameters: {'var_smoothing': 1.163575535450636e-06}


### 4.3. Decision Tree Classifier


In [20]:
def dt_optimization(max_depth, min_samples_split, min_samples_leaf):
    # Create and fit the OneVsRestClassifier with DecisionTreeClassifier
    model = OneVsRestClassifier(DecisionTreeClassifier(
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=RANDOM_STATE
    ))
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_val)

    # Calculate accuracy and AUC
    accuracy = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred, average='weighted')

    # Return the average of accuracy and AUC as the objective to maximize
    return (accuracy + auc) / 2


In [21]:
# Set the bounds for the hyperparameters
pbounds = {
    'max_depth': (3, 20),  # The depth of the tree
    'min_samples_leaf': (1, 20),  # Minimum samples required at a leaf node
    'min_samples_split': (2, 20),  # Minimum samples required to split an internal node
}

# Create the Bayesian Optimization object
optimizer = BayesianOptimization(
    f=dt_optimization,  # The objective function
    pbounds=pbounds,  # The bounds of the hyperparameters
    random_state=RANDOM_STATE
)

# Perform the optimization process
optimizer.maximize(init_points=INIT_POINTS, n_iter=N_ITER)


|   iter    |  target   | max_depth | min_sa... | min_sa... |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.5513   [39m | [39m9.367    [39m | [39m19.06    [39m | [39m15.18    [39m |
| [35m2        [39m | [35m0.5994   [39m | [35m13.18    [39m | [35m3.964    [39m | [35m4.808    [39m |
| [39m3        [39m | [39m0.4813   [39m | [39m3.987    [39m | [39m17.46    [39m | [39m12.82    [39m |
| [39m4        [39m | [39m0.5577   [39m | [39m8.76     [39m | [39m19.09    [39m | [39m15.28    [39m |
| [39m5        [39m | [39m0.5814   [39m | [39m13.4     [39m | [39m6.175    [39m | [39m2.405    [39m |
| [39m6        [39m | [39m0.5962   [39m | [39m12.42    [39m | [39m1.139    [39m | [39m2.158    [39m |
| [39m7        [39m | [39m0.5679   [39m | [39m8.624    [39m | [39m2.81     [39m | [39m4.785    [39m |
| [35m8        [39m | [35m0.6013   [39m | [35m16.55    [39m | [35m1.234    [39m | [35m4.681    [39m |


In [22]:
# Get the best hyperparameters found by the optimizer
best_params = optimizer.max['params']
print(f"Best hyperparameters: {best_params}")

# Use the best parameters to train the final model
classifier_dt = OneVsRestClassifier(DecisionTreeClassifier(
    max_depth=int(best_params['max_depth']),
    min_samples_split=int(best_params['min_samples_split']),
    min_samples_leaf=int(best_params['min_samples_leaf']),
    random_state=RANDOM_STATE
))

# Fit the model with the best parameters
classifier_dt.fit(X_train, y_train)


Best hyperparameters: {'max_depth': 16.55256436327114, 'min_samples_leaf': 1.2342835049898957, 'min_samples_split': 4.681283555919033}


### 4.4. Support Vector Classifier


In [23]:
def svc_optimization(C, gamma, kernel, degree):
    # Convert kernel string into the correct type
    kernel = round(kernel)  # 0: 'linear', 1: 'poly', 2: 'rbf', 3: 'sigmoid'
    kernel_mapping = {0: 'linear', 1: 'poly', 2: 'rbf', 3: 'sigmoid'}
    kernel_type = kernel_mapping[kernel]

    # Create and fit the OneVsRestClassifier with SVC
    model = OneVsRestClassifier(SVC(
        C=C,
        kernel=kernel_type,
        gamma=gamma,
        degree=round(degree),
        random_state=RANDOM_STATE
    ))
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_val)

    # Calculate accuracy and AUC
    accuracy = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_pred, average='weighted')

    # Return the average of accuracy and AUC as the objective to maximize
    return (accuracy + auc) / 2


In [24]:
# Set the bounds for the hyperparameters
pbounds = {
    'C': (0.1, 10),  # Regularization parameter (log scale)
    'gamma': (0.001, 1),  # Kernel coefficient (for 'rbf', 'poly', and 'sigmoid')
    'kernel': (0, 3),  # Kernel type (0: 'linear', 1: 'poly', 2: 'rbf', 3: 'sigmoid')
    'degree': (2, 5),  # Degree of the polynomial kernel function
}

# Create the Bayesian Optimization object
optimizer = BayesianOptimization(
    f=svc_optimization,  # The objective function
    pbounds=pbounds,  # The bounds of the hyperparameters
    random_state=RANDOM_STATE
)

# Perform the optimization process
optimizer.maximize(init_points=INIT_POINTS, n_iter=N_ITER)


|   iter    |  target   |     C     |  degree   |   gamma   |  kernel   |
-------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.5639   [39m | [39m3.808    [39m | [39m4.852    [39m | [39m0.7323   [39m | [39m1.796    [39m |
| [39m2        [39m | [39m0.363    [39m | [39m1.645    [39m | [39m2.468    [39m | [39m0.05903  [39m | [39m2.599    [39m |
| [39m3        [39m | [39m0.3489   [39m | [39m6.051    [39m | [39m4.124    [39m | [39m0.02156  [39m | [39m2.91     [39m |
| [35m4        [39m | [35m0.5663   [39m | [35m3.875    [39m | [35m4.788    [39m | [35m0.6847   [39m | [35m1.864    [39m |
| [35m5        [39m | [35m0.6608   [39m | [35m4.1      [39m | [35m4.265    [39m | [35m0.01774  [39m | [35m0.4408   [39m |
| [35m6        [39m | [35m0.6609   [39m | [35m4.455    [39m | [35m2.763    [39m | [35m0.1241   [39m | [35m0.03997  [39m |
| [35m7        [39m | [35m0.6611   [39m | [35m5.963    [39m | [35m3.867    [39m | [35m0.9854   [39m | [35m0.05732  [39m |
| [39m8        [39m | [39m0.6607   [39m | [39m7.961    [

In [25]:
# Get the best hyperparameters found by the optimizer
best_params = optimizer.max['params']
print(f"Best hyperparameters: {best_params}")

# Use the best parameters to train the final model
kernel_mapping = {0: 'linear', 1: 'poly', 2: 'rbf', 3: 'sigmoid'}
classifier_svm = OneVsRestClassifier(SVC(
    C=best_params['C'],
    kernel=kernel_mapping[round(best_params['kernel'])],
    gamma=best_params['gamma'],
    degree=round(best_params['degree']),
    random_state=RANDOM_STATE
))

# Fit the model with the best parameters
classifier_svm.fit(X_train, y_train)


Best hyperparameters: {'C': 5.962994065206512, 'degree': 3.8673241773011173, 'gamma': 0.9854416426080342, 'kernel': 0.057316188641134636}


### 4.5. Random Forest Classifier


In [26]:
def rf_optimization(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Ensure parameters are integers where required
    n_estimators = round(n_estimators)
    max_depth = round(max_depth)
    min_samples_split = round(min_samples_split)
    min_samples_leaf = round(min_samples_leaf)

    # Initialize the RandomForestClassifier within the OneVsRestClassifier
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=RANDOM_STATE
    )
    clf = OneVsRestClassifier(rf)

    # Fit the model
    clf.fit(X_train, y_train)

    # Predict on the validation set
    y_pred = clf.predict(X_val)

    # Calculate the accuracy score (you could also use AUC, F1-score, etc.)
    accuracy = accuracy_score(y_val, y_pred)

    # Return the negative of accuracy (because we want to maximize it in Bayesian Optimization)
    return accuracy


In [27]:
# Define the hyperparameter space
pbounds = {
    'n_estimators': (10, 200),  # Number of trees in the forest
    'max_depth': (5, 50),  # Maximum depth of the trees
    'min_samples_split': (2, 20),  # Minimum samples required to split a node
    'min_samples_leaf': (1, 20)  # Minimum samples required at a leaf node
}

# Initialize the BayesianOptimizer
optimizer = BayesianOptimization(
    f=rf_optimization,  # The objective function
    pbounds=pbounds,  # The bounds of the hyperparameters
    verbose=2,  # Print status of optimization
    random_state=RANDOM_STATE
)

# Perform the optimization
optimizer.maximize(init_points=INIT_POINTS, n_iter=N_ITER)


|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.4606   [39m | [39m21.85    [39m | [39m19.06    [39m | [39m15.18    [39m | [39m123.7    [39m |
| [35m2        [39m | [35m0.5158   [39m | [35m12.02    [39m | [35m3.964    [39m | [35m3.046    [39m | [35m174.6    [39m |
| [39m3        [39m | [39m0.4741   [39m | [39m32.05    [39m | [39m14.45    [39m | [39m2.371    [39m | [39m194.3    [39m |
| [39m4        [39m | [39m0.5158   [39m | [39m11.49    [39m | [39m2.778    [39m | [39m2.3      [39m | [39m176.0    [39m |
| [39m5        [39m | [39m0.3953   [39m | [39m6.26     [39m | [39m3.822    [39m | [39m19.42    [39m | [39m188.8    [39m |
| [39m6        [39m | [39m0.4662   [39m | [39m8.097    [39m | [39m3.574    [39m | [39m2.777    [39m | [39m157.6    [39m |
| [35m7        [39m | [35m0.5636   [39m | [35m24.61    [39m | [35m1.135    [39m | [35m5.936    [39m | [35m175.4    [39m |
| [39m8        [39m | [39m0.5535   [39m | [39m35.82    [

In [28]:
# Get the best hyperparameters
best_params = optimizer.max['params']
print(f"Best Hyperparameters: {best_params}")

# Train the model with the best parameters found
rf_best = RandomForestClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    min_samples_split=int(best_params['min_samples_split']),
    min_samples_leaf=int(best_params['min_samples_leaf']),
    random_state=RANDOM_STATE
)
classifier_rf = OneVsRestClassifier(rf_best)
classifier_rf.fit(X_train, y_train)


Best Hyperparameters: {'max_depth': 24.61318358542884, 'min_samples_leaf': 1.1346868623391282, 'min_samples_split': 5.936251566612096, 'n_estimators': 175.42967753116375}


### 4.6. XGBoost Classifier


In [29]:
classifier_xgb = OneVsRestClassifier(XGBClassifier())
classifier_xgb.fit(X_train, y_train)


## 5. Model Evaluation

Now that we've trained the models, let's evaluate them in more detail.


In [40]:
y_pred = classifier_lr.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(f"AUC:\t{roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted'):.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.51
AUC:	0.79
                   precision    recall  f1-score   support

computer security       0.82      0.85      0.83       931
         hardware       0.68      0.44      0.53       355
       networking       0.57      0.37      0.45       146
operating systems       0.76      0.63      0.69       171
            other       0.79      0.65      0.71       361
         software       0.78      0.73      0.76       644

        micro avg       0.78      0.69      0.73      2608
        macro avg       0.73      0.61      0.66      2608
     weighted avg       0.77      0.69      0.73      2608
      samples avg       0.73      0.70      0.69      2608



In [41]:
y_pred = classifier_gnb.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(f"AUC:\t{roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted'):.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.12
AUC:	0.62
                   precision    recall  f1-score   support

computer security       0.60      0.81      0.69       931
         hardware       0.24      0.72      0.36       355
       networking       0.14      0.56      0.22       146
operating systems       0.21      0.70      0.32       171
            other       0.48      0.27      0.35       361
         software       0.48      0.83      0.61       644

        micro avg       0.38      0.71      0.50      2608
        macro avg       0.36      0.65      0.42      2608
     weighted avg       0.46      0.71      0.53      2608
      samples avg       0.42      0.68      0.48      2608



In [42]:
y_pred = classifier_dt.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(f"AUC:\t{roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted'):.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.46
AUC:	0.75
                   precision    recall  f1-score   support

computer security       0.74      0.78      0.76       931
         hardware       0.64      0.49      0.55       355
       networking       0.48      0.38      0.42       146
operating systems       0.64      0.55      0.59       171
            other       0.64      0.54      0.59       361
         software       0.72      0.76      0.74       644

        micro avg       0.69      0.67      0.68      2608
        macro avg       0.64      0.58      0.61      2608
     weighted avg       0.69      0.67      0.67      2608
      samples avg       0.65      0.66      0.64      2608



In [43]:
y_pred = classifier_svm.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(f"AUC:\t{roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted'):.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.53
AUC:	0.79
                   precision    recall  f1-score   support

computer security       0.82      0.84      0.83       931
         hardware       0.70      0.47      0.56       355
       networking       0.66      0.39      0.49       146
operating systems       0.73      0.65      0.69       171
            other       0.78      0.68      0.73       361
         software       0.78      0.74      0.76       644

        micro avg       0.78      0.71      0.74      2608
        macro avg       0.75      0.63      0.68      2608
     weighted avg       0.77      0.71      0.73      2608
      samples avg       0.73      0.72      0.70      2608



In [44]:
y_pred = classifier_rf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(f"AUC:\t{roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted'):.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.56
AUC:	0.78
                   precision    recall  f1-score   support

computer security       0.81      0.89      0.84       931
         hardware       0.97      0.41      0.58       355
       networking       0.96      0.36      0.53       146
operating systems       0.92      0.42      0.57       171
            other       0.96      0.42      0.59       361
         software       0.90      0.73      0.81       644

        micro avg       0.86      0.66      0.75      2608
        macro avg       0.92      0.54      0.65      2608
     weighted avg       0.89      0.66      0.73      2608
      samples avg       0.72      0.66      0.67      2608



In [45]:
y_pred = classifier_xgb.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(f"AUC:\t{roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted'):.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.61
AUC:	0.82
                   precision    recall  f1-score   support

computer security       0.85      0.86      0.85       931
         hardware       0.88      0.52      0.65       355
       networking       0.94      0.42      0.58       146
operating systems       0.95      0.57      0.71       171
            other       0.91      0.60      0.72       361
         software       0.85      0.80      0.82       644

        micro avg       0.87      0.72      0.78      2608
        macro avg       0.90      0.63      0.72      2608
     weighted avg       0.87      0.72      0.78      2608
      samples avg       0.75      0.72      0.72      2608



In [36]:
def xgboost_hyper_param(learning_rate, n_estimators, max_depth, subsample, colsample, gamma):
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = OneVsRestClassifier(XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        gamma=gamma
    ))
    return numpy.mean(cross_val_score(clf, X_train_val, y_train_val, cv=3, scoring='roc_auc'))


In [37]:
# Define the hyperparameter space
pbounds = {
    'learning_rate': (0.01, 1.0),
    'n_estimators': (100, 1000),
    'max_depth': (3, 10),
    'subsample': (1.0, 1.0),  # Change for big datasets
    'colsample': (1.0, 1.0),  # Change for datasets with lots of features
    'gamma': (0, 5)
}

optimizer = BayesianOptimization(
    f=xgboost_hyper_param,
    pbounds=pbounds,
    random_state=RANDOM_STATE
)
optimizer.maximize(n_iter=N_ITER)


|   iter    |  target   | colsample |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.8035   [39m | [39m1.0      [39m | [39m4.754    [39m | [39m0.7347   [39m | [39m7.191    [39m | [39m240.4    [39m | [39m1.0      [39m |
| [35m2        [39m | [35m0.8081   [39m | [35m1.0      [39m | [35m4.331    [39m | [35m0.6051   [39m | [35m7.957    [39m | [35m118.5    [39m | [35m1.0      [39m |
| [35m3        [39m | [35m0.8659   [39m | [35m1.0      [39m | [35m1.062    [39m | [35m0.19     [39m | [35m4.284    [39m | [35m373.8    [39m | [35m1.0      [39m |
| [39m4        [39m | [39m0.8422   [39m | [39m1.0      [39m | [39m1.456    [39m | [39m0.6157   [39m | [39m3.976    [39m | [39m362.9    [39m | [39m1.0      [39m |
| [39m5        [39m | [39m0.8467   [39m | [39m1.0      [39m | [39m3.926    [39m | [39m0.2077   [39m | [39m6.6      [39m | [39m633.2    [39m | [39m1.0      [39m |
| [39m6        [39m | [39m0.8396   [39m | [39m1.0      [39m | [39m4.486    [39m | [39m0.2996   [39m 

In [38]:
print("Best Parameter Setting : {}".format(optimizer.max["params"]))
print("Best Target Value      : {}".format(optimizer.max["target"]))
classification_report

Best Parameter Setting : {'colsample': 1.0, 'gamma': 0.07267070826693423, 'learning_rate': 0.4045241064311625, 'max_depth': 3.6755463919447022, 'n_estimators': 374.8045944405341, 'subsample': 1.0}
Best Target Value      : 0.8667194562181343


<function sklearn.metrics._classification.classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division='warn')>

In [46]:
max_depth = int(optimizer.max["params"]["max_depth"])
n_estimators = int(optimizer.max["params"]["n_estimators"])

optimal_classifier_xgb = XGBClassifier(
    max_depth=max_depth,
    learning_rate=optimizer.max["params"]["learning_rate"],
    n_estimators=n_estimators,
    gamma=optimizer.max["params"]["gamma"]
)
optimal_classifier_xgb.fit(X_train, y_train)

y_pred = optimal_classifier_xgb.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(f"AUC:\t{roc_auc_score(y_true=y_val, y_score=y_pred, average='weighted'):.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.61
AUC:	0.82
                   precision    recall  f1-score   support

computer security       0.85      0.87      0.86       931
         hardware       0.82      0.57      0.68       355
       networking       0.88      0.44      0.58       146
operating systems       0.88      0.63      0.73       171
            other       0.88      0.63      0.74       361
         software       0.81      0.80      0.81       644

        micro avg       0.84      0.74      0.79      2608
        macro avg       0.85      0.66      0.73      2608
     weighted avg       0.85      0.74      0.78      2608
      samples avg       0.76      0.74      0.74      2608

