# Multi-Label Classification with Machine Learning
In this notebook, we explore several machine learning models for a multi-label classification problem: each tweet can belong to more than one target category. We evaluate and compare the performance of the different algorithms on the dataset.


In [1]:
import ast
import json
import re
import string
from collections import defaultdict
from pathlib import Path
from typing import Union, Any

import numpy
import pandas
from imblearn.over_sampling import SMOTE
from numpy import asarray
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.model_selection import iterative_train_test_split

from utils import MultiLabelClassifier


In [2]:
# Tunable settings.
# NOTE(review): INIT_POINTS / N_ITER are not referenced in the visible cells —
# presumably the optimiser budget consumed inside `utils`; confirm.
INIT_POINTS = 3
N_ITER = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2

# Root folder for each supported runtime.
COLAB_PATH = Path('/content/drive/MyDrive')
KAGGLE_PATH = Path('/kaggle/input')
LOCAL_PATH = Path('./')

# Choose the data/model roots by probing for runtime-specific packages:
# Google Colab first, then Kaggle, otherwise a local Jupyter session.
try:
    import google.colab
except ImportError:
    try:
        import kaggle_secrets
    except ImportError:
        DATA_PATH = LOCAL_PATH / 'data'
        MODELS_PATH = LOCAL_PATH / 'models'
    else:
        DATA_PATH = KAGGLE_PATH
        MODELS_PATH = KAGGLE_PATH
else:
    DATA_PATH = COLAB_PATH / 'data'
    MODELS_PATH = COLAB_PATH / 'models'

# Dataset / embedding folders and the concrete files read by this notebook.
GLOVE_6B_PATH = MODELS_PATH / 'glove-embeddings'
THREAT_TWEETS_PATH = DATA_PATH / 'tweets-dataset-for-cyberattack-detection'

GLOVE_6B_300D_TXT = GLOVE_6B_PATH / 'glove.6B.300d.txt'
THREAT_TWEETS_CSV = THREAT_TWEETS_PATH / 'tweets_final.csv'


## Functions


### Preprocessing


In [3]:
def extract_keys(d, path=None):
    """
    Flatten a nested dictionary into a single list of its keys.

    Keys are collected depth-first, in dictionary iteration order, into one
    flat list (not one list per branch).

    Parameters
    ----------
    d : dict or any
        The (possibly nested) dictionary to walk.
    path : list, optional
        Keys collected so far; a fresh empty list is used when None.
        The list passed in is never mutated.

    Returns
    -------
    list
        Every key met during the walk, in visiting order. When `d` is not a
        dictionary the result is `[d]` and anything accumulated in `path`
        is discarded.
    """
    collected = [] if path is None else path

    # A non-dict value replaces the accumulator with the value itself.
    if not isinstance(d, dict):
        return [d]

    for name, child in d.items():
        # Extend a copy with the current key, then continue into the child;
        # the child call hands back the updated accumulator.
        collected = extract_keys(child, collected + [name])

    return collected


def build_tree(categories):
    """
    Turn slash-separated category labels into a nested dictionary.

    Parameters
    ----------
    categories : list of dict
        Items carrying a 'label' entry such as '/a/b/c'. Leading and trailing
        slashes are stripped before the label is split on '/'.

    Returns
    -------
    dict
        A tree in which every label segment is a key mapping to the dictionary
        of its children; leaves map to empty dictionaries.
    """
    root = {}
    for entry in categories:
        segments = entry['label'].strip('/').split('/')
        node = root
        for segment in segments:
            # Create the child on first sight, then descend into it.
            if segment not in node:
                node[segment] = {}
            node = node[segment]
    return root


def merge_trees_with_counts(tree1, tree2, visit_count):
    """
    Merge `tree2` into `tree1` recursively and count every node of `tree2`.

    Parameters
    ----------
    tree1 : dict
        The tree that receives the merge; it is modified in place.
    tree2 : dict
        The tree whose nodes are merged in. It is only read, never modified
        or aliased.
    visit_count : defaultdict(int)
        Counter keyed by node name (not by full path), so equally named nodes
        in different branches share one entry. Each key of `tree2`, at any
        depth, adds 1.

    Returns
    -------
    dict
        `tree1`, after the merge.
    """
    for key, value in tree2.items():
        if key not in tree1:
            # Start from a fresh container instead of grafting tree2's own
            # sub-dictionary: grafting would alias the two trees and skip the
            # recursion below, leaving first-seen nested nodes uncounted.
            tree1[key] = {} if isinstance(value, dict) else value

        if isinstance(value, dict) and isinstance(tree1[key], dict):
            merge_trees_with_counts(tree1[key], value, visit_count)

        # Count visits for the node
        visit_count[key] += 1
    return tree1


def merge_all_trees_with_counts(trees):
    """
    Fold a collection of trees into one tree and tally node visits.

    Parameters
    ----------
    trees : iterable of dict
        The trees (nested dictionaries) to combine.

    Returns
    -------
    (dict, defaultdict)
        The combined tree and the visit counter filled in by
        `merge_trees_with_counts` while merging.

    """
    counts = defaultdict(int)
    merged = {}

    for original in trees:
        # The JSON round-trip yields an independent copy with sorted keys,
        # so the caller's trees are never touched by the merge.
        snapshot = json.loads(json.dumps(original, sort_keys=True))
        merged = merge_trees_with_counts(tree1=merged, tree2=snapshot, visit_count=counts)

    return merged, counts


### Training


In [4]:
def load_word2vec_dict(model_path: Path, embedding_dim: int) -> dict[str, numpy.ndarray]:
    """
    Load a whitespace-separated text embedding file into a dictionary.

    Each line is expected to hold a word (possibly several space-separated
    tokens) followed by `embedding_dim` numeric components.

    Parameters
    ----------
    model_path : Path
        Location of the UTF-8 encoded embedding file.
    embedding_dim : int
        Number of trailing values on each line that make up the vector;
        expected to be positive.

    Returns
    -------
    dict of str to numpy.ndarray
        Maps each word (multi-token words are re-joined with single spaces)
        to its vector of `embedding_dim` floats.

    Raises
    ------
    ValueError
        If one of the trailing values cannot be converted to float.
    """
    embeddings_dict = {}

    # The context manager closes the handle even when a line fails to parse.
    with open(model_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()

            # Blank or truncated lines carry no word in front of the vector;
            # skip them rather than storing an empty-string key.
            if len(values) <= embedding_dim:
                continue

            # Everything before the last `embedding_dim` fields is the word.
            word = ' '.join(values[:-embedding_dim])
            embeddings_dict[word] = asarray([float(val) for val in values[-embedding_dim:]])

    return embeddings_dict


def preprocess_texts(list_str, model_path, embedding_dim, word2vec_dict=None):
    """
    Embed each text as the sum of the vectors of its known tokens.

    Texts are split into runs of word characters and single punctuation
    marks; tokens are lower-cased before lookup and unknown tokens are
    ignored, so a text without any known token stays an all-zero row.

    Parameters
    ----------
    list_str : sequence of str
        The texts to embed (anything supporting `len` and iteration).
    model_path : Path
        Embedding file handed to `load_word2vec_dict`; unused when
        `word2vec_dict` is supplied.
    embedding_dim : int
        Dimensionality of the embedding vectors.
    word2vec_dict : dict, optional
        Pre-loaded token -> vector mapping. Passing it avoids re-reading the
        embedding file on every call.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(list_str), embedding_dim).

    Raises
    ------
    TypeError
        If an entry of `list_str` is not a string (e.g. a NaN from a missing
        value). Earlier versions printed the entry and returned None, which
        only surfaced as an unrelated failure further downstream.
    """
    if word2vec_dict is None:
        word2vec_dict = load_word2vec_dict(
            model_path=model_path,
            embedding_dim=embedding_dim
        )

    # Compile once: the pattern does not depend on the text being processed.
    token_pattern = re.compile(r'\w+|[{}]'.format(re.escape(string.punctuation)))

    list_embedded_str = numpy.zeros((len(list_str), embedding_dim))
    for i, text in enumerate(list_str):
        if not isinstance(text, str):
            raise TypeError(f'list_str[{i}] is not a string: {text!r}')

        for token in token_pattern.findall(text):
            vector = word2vec_dict.get(token.lower())
            if vector is not None:
                list_embedded_str[i] += vector
    return list_embedded_str


#### Logistic Regression Optimization


In [5]:
def lr_optimization(C, max_iter, penalty):
    """
    Objective function for tuning a one-vs-rest logistic regression.

    Reads the notebook-level `X_train`, `y_train`, `X_val`, `y_val` and
    `RANDOM_STATE`, which must exist when the function is called.

    Parameters
    ----------
    C : float
        Inverse regularisation strength passed to LogisticRegression.
    max_iter : float
        Iteration cap; rounded to the nearest integer.
    penalty : float
        Continuous encoding of the penalty; rounded, then 0 -> 'l1', 1 -> 'l2'.

    Returns
    -------
    float
        Mean of the validation accuracy and the weighted ROC AUC.

    Raises
    ------
    KeyError
        If `penalty` does not round to 0 or 1.
    """
    # Convert penalty type
    penalty_mapping = {0: 'l1', 1: 'l2'}
    penalty_type = penalty_mapping[round(penalty)]

    # Apply SMOTE to handle class imbalance
    # NOTE(review): y_train is passed as-is; if it is a multi-label indicator
    # matrix, confirm imbalanced-learn's SMOTE accepts that target type.
    smote = SMOTE(random_state=RANDOM_STATE)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

    # Define the LogisticRegression model with OneVsRestClassifier
    model = OneVsRestClassifier(LogisticRegression(
        C=C,
        max_iter=round(max_iter),
        penalty=penalty_type,
        solver='liblinear',
        random_state=RANDOM_STATE
    ))

    # Fit the model on the resampled data
    model.fit(X_resampled, y_resampled)

    # Hard labels feed the accuracy; ROC AUC is a ranking metric and needs
    # continuous scores, so it is computed from the predicted probabilities.
    y_pred = model.predict(X_val)
    y_score = model.predict_proba(X_val)

    # Calculate the metrics
    accuracy = accuracy_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_score, average='weighted')

    # Return the objective function value
    return (accuracy + auc) / 2


## Table of Contents
1. [Introduction](#1-introduction)
2. [Data Loading & Preprocessing](#2-data-loading-and-preprocessing)
3. [Exploratory Data Analysis (EDA)](#3-exploratory-data-analysis-eda)
4. [Model Training](#4-model-training)
    1. [Logistic Regression](#41-logistic-regression)
    2. [Gaussian Naïve Bayes](#42-gaussian-naïve-bayes)
    3. [Decision Tree Classifier](#43-decision-tree-classifier)
    4. [Support Vector Classifier](#44-support-vector-classifier)
    5. [Random Forest Classifier](#45-random-forest-classifier)
    6. [XGBoost Classifier](#46-xgboost-classifier)
5. [Model Evaluation](#5-model-evaluation)
    1. [Accuracy](#Accuracy)
    2. [Confusion Matrix](#Confusion-Matrix)
    3. [Classification Report](#Classification-Report)
6. [Conclusion](#Conclusion)


## 1. Introduction

In this notebook, we solve a multi-label classification problem using different machine learning models. Our goal is to predict, for each sample, the set of classes it belongs to based on the input features.


## 2. Data Loading and Preprocessing
We will load the dataset, inspect its structure, and preprocess it for machine learning models.


In [6]:
# Read the CSV file and process columns in one step
threat_tweets = (
    pandas.read_csv(filepath_or_buffer=THREAT_TWEETS_CSV)
    .assign(
        tweet=lambda df: df['tweet'].apply(func=ast.literal_eval),
        watson=lambda df: df['watson'].apply(func=ast.literal_eval)
        .apply(func=lambda x: x.get('categories', []))
        .apply(func=build_tree),
        watson_list=lambda df: df['watson'].apply(func=extract_keys),
    )
    .query(expr='relevant == True')
    .drop(labels=['relevant'], axis=1)
    .dropna(subset=['text'], ignore_index=True)
)

print(f'{threat_tweets.shape}')
threat_tweets.head()


(11112, 12)


Unnamed: 0,_id,date,id,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list
0,b'5b8876f9bb325e65fa7e78e4',2018-08-30 23:00:08+00:00,1035301167952211969,Protect your customers access Prestashop Ant...,{'created_at': 'Thu Aug 30 23:00:08 +0000 2018...,ddos,{'technology and computing': {'internet techno...,threat,['http://addons.prestashop.com/en/23513-anti-d...,https://addons.prestashop.com/en/23513-anti-dd...,True,"[technology and computing, internet technology..."
1,b'5b8876f9bb325e65fa7e78e5',2018-08-30 23:00:09+00:00,1035301173178249217,Data leak from Huazhu Hotels may affect 130 mi...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,leak,"{'travel': {'hotels': {}}, 'home and garden': ...",threat,['http://www.hotelmanagement.net/tech/data-lea...,http://www.hotelmanagement.net/tech/data-leak-...,True,"[travel, hotels, home and garden, home improve..."
2,b'5b8876fabb325e65fa7e78e6',2018-08-30 23:00:09+00:00,1035301174583353344,Instagram App 41.1788.50991.0 #Denial Of #Serv...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,general,{'science': {'weather': {'meteorological disas...,threat,['https://packetstormsecurity.com/files/149120...,https://packetstormsecurity.com/files/149120/i...,True,"[science, weather, meteorological disaster, hu..."
3,b'5b88770abb325e65fa7e78e7',2018-08-30 23:00:25+00:00,1035301242271096832,(good slides): \n\nThe Advanced Exploitation o...,{'created_at': 'Thu Aug 30 23:00:25 +0000 2018...,vulnerability,{'business and industrial': {'business operati...,threat,['https://twitter.com/i/web/status/10353012422...,https://twitter.com/i/web/status/1035301242271...,True,"[business and industrial, business operations,..."
4,b'5b887713bb325e65fa7e78e8',2018-08-30 23:00:35+00:00,1035301282095853569,CVE-2018-1000532 (beep)\nhttps://t.co/CaKbo38U...,{'created_at': 'Thu Aug 30 23:00:35 +0000 2018...,vulnerability,{'technology and computing': {'computer securi...,threat,['https://web.nvd.nist.gov/view/vuln/detail?vu...,https://nvd.nist.gov/vuln/detail/CVE-2018-1000532,True,"[technology and computing, computer security, ..."


In [7]:
# Report how many tweets are left after the filtering above
print(f"Number of CS related tweets:\t{len(threat_tweets)}")


Number of CS related tweets:	11112


In [8]:
# Combine the per-tweet category trees into one tree and tally node visits
general_tree, visit_count = merge_all_trees_with_counts(threat_tweets['watson'])


In [9]:
# List the direct children of the 'technology and computing' node
print("The subcategories in 'technology and computing' are:")
for category in general_tree['technology and computing']:
    print(f'· {category}')


The subcategories in 'technology and computing' are:
· computer security
· internet technology
· software
· hardware
· operating systems
· data centers
· mp3 and midi
· computer reviews
· programming languages
· consumer electronics
· tech news
· networking
· electronic components
· computer crime
· enterprise technology
· computer certification
· technological innovation
· technical support


In [10]:
# Order the node names by descending visit count
sorted_visit_count = dict(sorted(visit_count.items(), key=lambda pair: pair[1], reverse=True))

# Save both artefacts as pretty-printed JSON in the working directory
with open('general_tree.json', 'w') as file:
    json.dump(general_tree, file, indent=4)

with open('general_tree_visit_counts.json', 'w') as file:
    json.dump(sorted_visit_count, file, indent=4)


## 3. Exploratory Data Analysis (EDA)
Let's analyze the dataset and gain insights into its distribution.


In [11]:
# List the top-level nodes of the merged category tree
print('At macro categories are:')
for category in general_tree:
    print(f'· {category}')


At macro categories are:
· technology and computing
· health and fitness
· home and garden
· travel
· art and entertainment
· science
· business and industrial
· sports
· finance
· law, govt and politics
· society
· real estate
· pets
· style and fashion
· news
· hobbies and interests
· food and drink
· education
· shopping
· family and parenting
· religion and spirituality
· automotive and vehicles
· careers


For the goal of the project, the categories of interest are:
1. computer security/network security
2. computer security/antivirus and malware
3. operating systems/mac os
4. operating systems/windows
5. operating systems/unix
6. operating systems/linux
7. software
8. programming languages, included in software
9. software/databases
10. hardware
11. electronic components, included in hardware
12. hardware/computer/servers
13. hardware/computer/portable computer
14. hardware/computer/desktop computer
15. hardware/computer components
16. hardware/computer networking/router
17. hardware/computer networking/wireless technology
18. networking
19. internet technology, included in networking


In [12]:
# Maps Watson node names onto the coarse target labels used for training;
# several node names fold into the same target.
FIX_TARGETS = {
    'computer security': 'computer security',
    'operating systems': 'operating systems',
    'software': 'software',
    'programming languages': 'software',
    'hardware': 'hardware',
    'electronic components': 'hardware',
    'networking': 'networking',
    'internet technology': 'networking'
}

# For every tweet collect the distinct targets reached by its Watson node
# names; tweets matching none of them are labelled 'other'. Sorting gives each
# label list a stable order (plain set iteration order can differ between
# interpreter runs).
chosen_categories = [
    sorted({FIX_TARGETS[name] for name in watson_list if name in FIX_TARGETS}) or ['other']
    for watson_list in threat_tweets['watson_list']
]

threat_tweets['target'] = chosen_categories

print(f'{threat_tweets.shape}')
threat_tweets.head()


(11112, 13)


Unnamed: 0,_id,date,id,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list,target
0,b'5b8876f9bb325e65fa7e78e4',2018-08-30 23:00:08+00:00,1035301167952211969,Protect your customers access Prestashop Ant...,{'created_at': 'Thu Aug 30 23:00:08 +0000 2018...,ddos,{'technology and computing': {'internet techno...,threat,['http://addons.prestashop.com/en/23513-anti-d...,https://addons.prestashop.com/en/23513-anti-dd...,True,"[technology and computing, internet technology...","[software, computer security, networking]"
1,b'5b8876f9bb325e65fa7e78e5',2018-08-30 23:00:09+00:00,1035301173178249217,Data leak from Huazhu Hotels may affect 130 mi...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,leak,"{'travel': {'hotels': {}}, 'home and garden': ...",threat,['http://www.hotelmanagement.net/tech/data-lea...,http://www.hotelmanagement.net/tech/data-leak-...,True,"[travel, hotels, home and garden, home improve...",[other]
2,b'5b8876fabb325e65fa7e78e6',2018-08-30 23:00:09+00:00,1035301174583353344,Instagram App 41.1788.50991.0 #Denial Of #Serv...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,general,{'science': {'weather': {'meteorological disas...,threat,['https://packetstormsecurity.com/files/149120...,https://packetstormsecurity.com/files/149120/i...,True,"[science, weather, meteorological disaster, hu...",[hardware]
3,b'5b88770abb325e65fa7e78e7',2018-08-30 23:00:25+00:00,1035301242271096832,(good slides): \n\nThe Advanced Exploitation o...,{'created_at': 'Thu Aug 30 23:00:25 +0000 2018...,vulnerability,{'business and industrial': {'business operati...,threat,['https://twitter.com/i/web/status/10353012422...,https://twitter.com/i/web/status/1035301242271...,True,"[business and industrial, business operations,...",[operating systems]
4,b'5b887713bb325e65fa7e78e8',2018-08-30 23:00:35+00:00,1035301282095853569,CVE-2018-1000532 (beep)\nhttps://t.co/CaKbo38U...,{'created_at': 'Thu Aug 30 23:00:35 +0000 2018...,vulnerability,{'technology and computing': {'computer securi...,threat,['https://web.nvd.nist.gov/view/vuln/detail?vu...,https://nvd.nist.gov/vuln/detail/CVE-2018-1000532,True,"[technology and computing, computer security, ...","[software, computer security, hardware]"


## 4. Model Training

We will now train different models and evaluate their performance.


In [13]:
# Encode the per-tweet label lists as a binary indicator matrix
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(threat_tweets['target'])

# Embed every tweet text; embedding_dim must match the 300-d GloVe file
X = preprocess_texts(
    list_str=threat_tweets['text'],
    model_path=GLOVE_6B_300D_TXT,
    embedding_dim=300
)


In [14]:
# iterative_train_test_split takes no random_state argument, so seed NumPy's
# global generator to make the split repeatable across runs.
# NOTE(review): presumably the stratifier draws from numpy.random for its
# tie-breaking — confirm against the installed scikit-multilearn version.
numpy.random.seed(RANDOM_STATE)

# First cut: hold out TEST_SIZE of all samples as the test set
X_train_val, y_train_val, X_test, y_test = iterative_train_test_split(
    X, y,
    test_size=TEST_SIZE
)

# Second cut: hold out TEST_SIZE of the remainder as the validation set
X_train, y_train, X_val, y_val = iterative_train_test_split(
    X_train_val, y_train_val,
    test_size=TEST_SIZE
)


### 4.1. Logistic Regression


In [15]:
# Logistic-regression wrapper from the project-local `utils` module,
# created with optimisation enabled
ml_lr_classifier = MultiLabelClassifier(
    classifier='logistic_regression',
    classes=mlb.classes_,
    to_optimize=True,
    random_state=RANDOM_STATE
)

# Tune on the training split and score on the validation split; the name is
# rebound to whatever optimize() returns.
# NOTE(review): presumably the fitted wrapper itself — confirm in utils.
ml_lr_classifier = ml_lr_classifier.optimize(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val
)


|   iter    |  target   |     C     |  penalty  |  solver   |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.8208   [39m | [39m37.46    [39m | [39m0.9507   [39m | [39m0.183    [39m |
| [39m2        [39m | [39m0.8208   [39m | [39m15.61    [39m | [39m0.05808  [39m | [39m0.2165   [39m |
| [35m3        [39m | [35m0.8216   [39m | [35m38.46    [39m | [35m0.1058   [39m | [35m0.226    [39m |
| [39m4        [39m | [39m0.8216   [39m | [39m38.45    [39m | [39m0.2388   [39m | [39m0.2413   [39m |
| [39m5        [39m | [39m0.8216   [39m | [39m39.5     [39m | [39m0.2255   [39m | [39m0.1106   [39m |
| [39m6        [39m | [39m0.8208   [39m | [39m41.19    [39m | [39m0.9527   [39m | [39m0.1614   [39m |
|   iter    |  target   |     C     |  penalty  |  solver   |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.5488   [39m | [39m37.46    [39m | [39m0.9507   [39m | [39m0.183    [39m |
| [35m2        [39m | [35m0.5529   [39m | [35m15.61    [39m | [35m0.05808  [39m | [35m0.2165   [39m |
| [39m3        [39m | [39m0.5523   [39m | [39m16.91    [39m | [39m0.203    [39m | [39m0.106    [39m |
| [39m4        [39m | [39m0.5523   [39m | [39m9.04     [39m | [39m0.115    [39m | [39m0.2222   [39m |
| [39m5        [39m | [39m0.5495   [39m | [39m100.0    [39m | [39m0.794    [39m | [39m0.2184   [39m |




| [35m6        [39m | [35m0.5535   [39m | [35m69.18    [39m | [35m0.07014  [39m | [35m0.07995  [39m |
|   iter    |  target   |     C     |  penalty  |  solver   |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.4524   [39m | [39m37.46    [39m | [39m0.9507   [39m | [39m0.183    [39m |
| [39m2        [39m | [39m0.4505   [39m | [39m15.61    [39m | [39m0.05808  [39m | [39m0.2165   [39m |




| [39m3        [39m | [39m0.4505   [39m | [39m38.46    [39m | [39m0.1058   [39m | [39m0.226    [39m |
| [39m4        [39m | [39m0.4515   [39m | [39m37.41    [39m | [39m0.9513   [39m | [39m0.09812  [39m |
| [39m5        [39m | [39m0.4505   [39m | [39m37.51    [39m | [39m0.9735   [39m | [39m0.2426   [39m |
| [39m6        [39m | [39m0.4505   [39m | [39m89.41    [39m | [39m0.8615   [39m | [39m0.1033   [39m |
|   iter    |  target   |     C     |  penalty  |  solver   |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.6057   [39m | [39m37.46    [39m | [39m0.9507   [39m | [39m0.183    [39m |
| [39m2        [39m | [39m0.6043   [39m | [39m15.61    [39m | [39m0.05808  [39m | [39m0.2165   [39m |
| [35m3        [39m | [35m0.6074   [39m | [35m38.46    [39m | [35m0.1058   [39m | [35m0.226    [39m |
| [39m4        [39m | [39m0.6074   [39m | [39m38.45    [39m | [39m0.2388   [39m | [39m0.2413   [39m |
| [39m5        [39m | [39m0.6057   [39m | [39m40.1     [39m | [39m0.4319   [39m | [39m0.1723   [39m |
| [39m6        [39m | [39m0.6057   [39m | [39m68.52    [39m | [39m0.9574   [39m | [39m0.2324   [39m |




|   iter    |  target   |     C     |  penalty  |  solver   |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.7125   [39m | [39m37.46    [39m | [39m0.9507   [39m | [39m0.183    [39m |
| [39m2        [39m | [39m0.7107   [39m | [39m15.61    [39m | [39m0.05808  [39m | [39m0.2165   [39m |
| [35m3        [39m | [35m0.7134   [39m | [35m38.46    [39m | [35m0.1058   [39m | [35m0.226    [39m |
| [39m4        [39m | [39m0.7107   [39m | [39m40.39    [39m | [39m0.05508  [39m | [39m0.1581   [39m |
| [39m5        [39m | [39m0.7125   [39m | [39m38.65    [39m | [39m0.9145   [39m | [39m0.03207  [39m |




| [39m6        [39m | [39m0.7107   [39m | [39m37.7     [39m | [39m0.04179  [39m | [39m0.1235   [39m |
|   iter    |  target   |     C     |  penalty  |  solver   |
-------------------------------------------------------------


  warn(msg, stacklevel=1)


| [39m1        [39m | [39m0.7557   [39m | [39m37.46    [39m | [39m0.9507   [39m | [39m0.183    [39m |
| [35m2        [39m | [35m0.7558   [39m | [35m15.61    [39m | [35m0.05808  [39m | [35m0.2165   [39m |
| [39m3        [39m | [39m0.7555   [39m | [39m16.91    [39m | [39m0.203    [39m | [39m0.106    [39m |
| [39m4        [39m | [39m0.7557   [39m | [39m37.41    [39m | [39m0.9513   [39m | [39m0.09812  [39m |
| [39m5        [39m | [39m0.7555   [39m | [39m14.91    [39m | [39m0.0259   [39m | [39m0.1663   [39m |
| [39m6        [39m | [39m0.7555   [39m | [39m15.92    [39m | [39m0.1292   [39m | [39m0.2364   [39m |


### 4.2. Gaussian Naïve Bayes


In [None]:
# Gaussian naive Bayes wrapper from the project-local `utils` module
ml_gnb_classifier = MultiLabelClassifier(
    classifier='gaussian_naive_bayes',
    classes=mlb.classes_,
    random_state=RANDOM_STATE
)

# Fit on the training split; the name is rebound to the value fit() returns
ml_gnb_classifier = ml_gnb_classifier.fit(X_train, y_train)


### 4.3. Decision Tree Classifier


In [None]:
# Decision-tree wrapper from the project-local `utils` module
ml_dt_classifier = MultiLabelClassifier(
    classifier='decision_tree',
    classes=mlb.classes_,
    random_state=RANDOM_STATE
)

# Fit on the training split; the name is rebound to the value fit() returns
ml_dt_classifier = ml_dt_classifier.fit(X_train, y_train)


### 4.4. Support Vector Classifier


In [None]:
# Support-vector wrapper from the project-local `utils` module
ml_svm_classifier = MultiLabelClassifier(
    classifier='svm',
    classes=mlb.classes_,
    random_state=RANDOM_STATE
)

# Fit on the training split; the name is rebound to the value fit() returns
ml_svm_classifier = ml_svm_classifier.fit(X_train, y_train)


### 4.5. Random Forest Classifier


In [None]:
# Random-forest wrapper from the project-local `utils` module
ml_rf_classifier = MultiLabelClassifier(
    classifier='random_forest',
    classes=mlb.classes_,
    random_state=RANDOM_STATE
)

# Fit on the training split; the name is rebound to the value fit() returns
ml_rf_classifier = ml_rf_classifier.fit(X_train, y_train)


### 4.6. XGBoost Classifier


In [None]:
# XGBoost wrapper from the project-local `utils` module
ml_xgb_classifier = MultiLabelClassifier(
    classifier='xgb',
    classes=mlb.classes_,
    random_state=RANDOM_STATE
)

# Fit on the training split; the name is rebound to the value fit() returns
ml_xgb_classifier = ml_xgb_classifier.fit(X_train, y_train)


## 5. Model Evaluation

Now that we've trained the models, let's evaluate them in more detail.


In [16]:
# Score the logistic-regression model on the validation split; the recorded
# output shows accuracy, AUC and a per-class classification report
ml_lr_metrics = ml_lr_classifier.evaluate(X_val, y_val)


Accuracy:	0.43
AUC:	0.81
Classification report:
                   precision    recall  f1-score   support

computer security       0.82      0.82      0.82       931
         hardware       0.45      0.71      0.55       355
       networking       0.33      0.73      0.45       146
operating systems       0.48      0.82      0.61       171
            other       0.65      0.79      0.71       361
         software       0.71      0.80      0.76       644

        micro avg       0.63      0.79      0.70      2608
        macro avg       0.57      0.78      0.65      2608
     weighted avg       0.67      0.79      0.72      2608
      samples avg       0.67      0.79      0.70      2608



In [None]:
# Score the Gaussian naive Bayes model on the validation split
ml_gnb_metrics = ml_gnb_classifier.evaluate(X_val, y_val)


In [None]:
# Score the decision-tree model on the validation split
ml_dt_metrics = ml_dt_classifier.evaluate(X_val, y_val)


In [None]:
# Score the support-vector model on the validation split
ml_svm_metrics = ml_svm_classifier.evaluate(X_val, y_val)


In [None]:
# Score the random-forest model on the validation split
ml_rf_metrics = ml_rf_classifier.evaluate(X_val, y_val)


In [None]:
# Score the XGBoost model on the validation split
ml_xgb_metrics = ml_xgb_classifier.evaluate(X_val, y_val)
