# Multi-Class Classification with Machine Learning
In this notebook, we will explore various machine learning models to solve a multi-class classification problem. We will evaluate and compare the performance of different algorithms on the dataset.


In [1]:
import ast
import json
import os
import re
import string
from collections import defaultdict
from pathlib import Path
from typing import Union, Any

import numpy
import pandas
import scipy.stats as stats
from bayes_opt import BayesianOptimization
from numpy import asarray
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


In [2]:
RANDOM_STATE = 42
TEST_SIZE = 2e-1
KAGGLE_PATH = Path('/kaggle/input')

if os.environ.get('KAGGLE_URL_BASE',''):
    DATA_PATH = KAGGLE_PATH
    MODELS_PATH = KAGGLE_PATH
else:
    DATA_PATH = Path('./data')
    MODELS_PATH = Path('./models')

GLOVE_6B_PATH = MODELS_PATH / Path('glove-embeddings')
THREAT_TWEETS_PATH = DATA_PATH / Path('tweets-dataset-for-cyberattack-detection')

GLOVE_6B_300D_TXT = GLOVE_6B_PATH / Path('glove.6B.300d.txt')
THREAT_TWEETS_CSV = THREAT_TWEETS_PATH / Path('tweets_final.csv')


## Table of Contents
1. [Introduction](#Introduction)
2. [Data Loading & Preprocessing](#Data-Loading-&-Preprocessing)
3. [Exploratory Data Analysis (EDA)](#Exploratory-Data-Analysis-(EDA))
4. [Model Training](#Model-Training)
    1. [Logistic Regression](#Logistic-Regression)
    2. [Random Forest Classifier](#Random-Forest-Classifier)
    3. [Support Vector Machine (SVM)](#Support-Vector-Machine-(SVM))
    4. [K-Nearest Neighbors (KNN)](#K-Nearest-Neighbors-(KNN))
    5. [XGBoost Classifier](#XGBoost-Classifier)
5. [Model Evaluation](#Model-Evaluation)
    1. [Accuracy](#Accuracy)
    2. [Confusion Matrix](#Confusion-Matrix)
    3. [Classification Report](#Classification-Report)
6. [Conclusion](#Conclusion)


## 1. Introduction

In this notebook, we are going to solve a multi-class classification problem using different machine learning models. Our goal is to predict the class of each sample based on the input features.


## 2. Data Loading & Preprocessing
We will load the dataset, inspect its structure, and preprocess it for machine learning models.


In [3]:
def extract_keys(d, path=None):
    """
    Define a recursive function to extract keys

    Parameters
    ----------
    d
    path

    Returns
    -------

    """
    if path is None:
        path = []

    if isinstance(d, dict):
        for key, value in d.items():
            path = extract_keys(value, path + [key])
    else:
        path = [d]

    return path


def process_row(row):
    """
    Function to convert string to dict and extract keys

    Parameters
    ----------
    row

    Returns
    -------

    """
    # Extract keys by traversing the dictionary
    return extract_keys(row)


def build_tree(categories):
    """
    Function to build a tree-like structure (nested dictionary)

    Parameters
    ----------
    categories

    Returns
    -------

    """
    tree = {}

    for category in categories:
        path = category['label'].strip('/').split('/')  # Split the label by '/'
        current = tree

        for part in path:
            if part not in current:
                current[part] = {}  # Add a new dictionary for the subcategory
            current = current[part]  # Move to the next level in the tree
    return tree


def merge_trees_with_counts(tree1, tree2, visit_count):
    """
    Merge two trees recursively, and count the visits to each node.

    Parameters
    ----------
    tree1
    tree2
    visit_count

    Returns
    -------

    """
    for key, value in tree2.items():
        if key not in tree1:
            tree1[key] = value
        else:
            if isinstance(value, dict) and isinstance(tree1[key], dict):
                merge_trees_with_counts(tree1[key], value, visit_count)

        # Count visits for the node
        visit_count[key] += 1
    return tree1


def merge_all_trees_with_counts(trees):
    """
    Merge all trees into one general tree and count the visits to each node.

    Parameters
    ----------
    trees

    Returns
    -------

    """
    # Initialize a visit count dictionary
    visit_count = defaultdict(int)

    # Step 1: Remove duplicate trees by serializing and converting to set
    serialized_trees = {json.dumps(tree, sort_keys=True) for tree in trees}
    unique_trees = [json.loads(tree) for tree in serialized_trees]

    # Step 2: Merge all unique trees into one tree while counting visits
    general_tree = {}
    for tree in unique_trees:
        general_tree = merge_trees_with_counts(general_tree, tree, visit_count)

    return general_tree, visit_count


In [4]:
# Read the CSV file
threat_tweets = pandas.read_csv(filepath_or_buffer=THREAT_TWEETS_CSV)

# Convert the 'watson' column from string representation of dict to actual dict
threat_tweets['watson'] = threat_tweets['watson'].apply(ast.literal_eval)
threat_tweets['tweet'] = threat_tweets['tweet'].apply(ast.literal_eval)

# Extract 'categories' field (list of dictionaries)
categories = threat_tweets['watson'].apply(lambda x: x.get('categories', []))

# Build a tree for each row's categories
category_tree = categories.apply(build_tree)

# Remove duplicates and merge all trees into one
general_tree, visit_count = merge_all_trees_with_counts(category_tree)

threat_tweets['watson'] = category_tree.apply(process_row)

threat_tweets.head()


Unnamed: 0,_id,date,id,relevant,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate
0,b'5b88752abb325e65390af240',2018-08-30 22:52:25+00:00,1035299228984041472,False,Best way to build empathy is through honesty a...,{'created_at': 'Thu Aug 30 22:52:25 +0000 2018...,vulnerability,"[science, social science, philosophy, ethics, ...",irrelevant,[],,
1,b'5b8875d5bb325e65a22cf81b',2018-08-30 22:55:15+00:00,1035299941932036096,False,Cryptocurrency Scams Replacing Ransomware as A...,{'created_at': 'Thu Aug 30 22:55:15 +0000 2018...,ransomware,"[technology and computing, computer security, ...",business,['https://ubm.io/2MCQwfg'],https://www.darkreading.com/endpoint-security,False
2,b'5b88768dbb325e65fa7e78e3',2018-08-30 22:58:20+00:00,1035300715470757889,False,Cryptocurrency Scams Replacing Ransomware as A...,{'created_at': 'Thu Aug 30 22:58:20 +0000 2018...,ransomware,"[technology and computing, computer security, ...",business,['http://quantus.biz/security/2018/08/30/crypt...,http://www.quantusintel.group/,True
3,b'5b8876f9bb325e65fa7e78e4',2018-08-30 23:00:08+00:00,1035301167952211969,True,Protect your customers access Prestashop Ant...,{'created_at': 'Thu Aug 30 23:00:08 +0000 2018...,ddos,"[technology and computing, internet technology...",threat,['http://addons.prestashop.com/en/23513-anti-d...,https://addons.prestashop.com/en/23513-anti-dd...,True
4,b'5b8876f9bb325e65fa7e78e5',2018-08-30 23:00:09+00:00,1035301173178249217,True,Data leak from Huazhu Hotels may affect 130 mi...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,leak,"[travel, hotels, home and garden, home improve...",threat,['http://www.hotelmanagement.net/tech/data-lea...,http://www.hotelmanagement.net/tech/data-leak-...,True


## 3. Exploratory Data Analysis (EDA)
Let's analyze the dataset and gain insights into its distribution.


In [5]:
print('At the top of the hierarchical structure there are:')
for category in list(general_tree.keys()):
    print(f'· {category}')


At the top of the hierarchical structure there are:
· art and entertainment
· technology and computing
· science
· society
· business and industrial
· law, govt and politics
· pets
· news
· education
· hobbies and interests
· automotive and vehicles
· finance
· home and garden
· family and parenting
· travel
· food and drink
· sports
· shopping
· health and fitness
· religion and spirituality
· style and fashion
· careers
· real estate


In [6]:
sorted_visit_count = dict(sorted(visit_count.items(), key=lambda item: item[1], reverse=True))

with open('general_tree.json', 'w') as file:
    file.write(json.dumps(general_tree, indent=4))

with open('general_tree_visit_counts.json', 'w') as file:
    file.write(json.dumps(sorted_visit_count, indent=4))


In [7]:
print('For the goal of the project, the category "technology and computing" is the only one of interest.')
print(json.dumps(general_tree['technology and computing'], indent=4))


For the goal of the project, the category "technology and computing" is the only one of interest.
{
    "computer security": {
        "network security": {},
        "antivirus and malware": {}
    },
    "networking": {
        "network monitoring and management": {},
        "vpn and remote access": {}
    },
    "hardware": {
        "computer components": {
            "sound cards": {},
            "chips and processors": {},
            "motherboards": {},
            "disks": {},
            "memory": {
                "portable": {}
            },
            "graphics cards": {}
        },
        "computer": {
            "portable computer": {
                "laptop": {},
                "palmtops and pdas": {},
                "tablet": {}
            },
            "servers": {},
            "desktop computer": {}
        },
        "computer networking": {
            "router": {},
            "wireless technology": {}
        },
        "computer peripherals": {
      

## 4. Model Training

We will now train different models and evaluate their performance.


In [8]:
threat_tweets_technology_and_computing_categories = [
    list(set(general_tree['technology and computing'].keys()) & set(s))
    for s in threat_tweets['watson']
]

for i, t_list in enumerate(threat_tweets_technology_and_computing_categories):
    temp = [t for t in t_list if visit_count[t] > 1]
    if len(t_list) != len(temp):
        print(f'{t_list} -> {temp}')
    threat_tweets_technology_and_computing_categories[i] = temp

threat_tweets['target'] = threat_tweets_technology_and_computing_categories


In [9]:
def load_word2vec_dict(model_path: Path, embedding_dim: int) -> dict[
    Union[str, list[str]], numpy.ndarray[Any, numpy.dtype]]:
    embeddings_dict = {}

    f = open(model_path, 'r', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[:-embedding_dim]

        if type(word) is list:
            word = ' '.join(word)

        vector = asarray([float(val) for val in values[-embedding_dim:]])
        embeddings_dict[word] = vector
    f.close()

    return embeddings_dict


def preprocess_texts(list_str, model_path, embedding_dim):
    word2vec_dict = load_word2vec_dict(
        model_path=model_path,
        embedding_dim=embedding_dim
    )
    list_embedded_str = numpy.zeros((len(list_str), embedding_dim))
    for i, text in enumerate(list_str):
        tokens = re.findall(r'\w+|[{}]'.format(re.escape(string.punctuation)), text)
        for token in tokens:
            try:
                list_embedded_str[i] += word2vec_dict[token.lower()]
            except KeyError:
                continue
    return list_embedded_str


In [10]:
mlb = MultiLabelBinarizer()

X = preprocess_texts(
    list_str=threat_tweets['text'],
    model_path=GLOVE_6B_300D_TXT,
    embedding_dim=300
)
y = mlb.fit_transform(threat_tweets_technology_and_computing_categories)


In [11]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True
)


In [12]:
classifier_lr = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
classifier_lr.fit(X_train, y_train)


In [13]:
classifier_gnb = OneVsRestClassifier(GaussianNB())
classifier_gnb.fit(X_train, y_train)


In [14]:
classifier_dt = OneVsRestClassifier(DecisionTreeClassifier())
classifier_dt.fit(X_train, y_train)


In [15]:
classifier_svm = OneVsRestClassifier(SVC())
classifier_svm.fit(X_train, y_train)


In [16]:
classifier_rf = OneVsRestClassifier(RandomForestClassifier())
classifier_rf.fit(X_train, y_train)


In [17]:
classifier_xgb = OneVsRestClassifier(XGBClassifier())
classifier_xgb.fit(X_train, y_train)


## 5. Model Evaluation

Now that we've trained the models, let's evaluate them in more detail.


In [18]:
y_pred = classifier_lr.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.55
                          precision    recall  f1-score   support

  computer certification       0.36      0.53      0.43        15
          computer crime       0.41      0.42      0.42        40
        computer reviews       0.46      0.29      0.35        21
       computer security       0.79      0.77      0.78      1466
    consumer electronics       0.89      0.65      0.75       320
            data centers       0.15      0.50      0.24         4
   electronic components       0.90      0.90      0.90        10
   enterprise technology       0.58      0.44      0.50        32
                hardware       0.71      0.41      0.52       485
     internet technology       0.60      0.34      0.44       102
            mp3 and midi       0.35      0.31      0.33        26
              networking       0.68      0.35      0.46       118
       operating systems       0.68      0.57      0.62       187
   programming languages       0.81      0.57      0.67     

In [19]:
y_pred = classifier_gnb.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.14
                          precision    recall  f1-score   support

  computer certification       0.02      0.67      0.03        15
          computer crime       0.04      0.72      0.07        40
        computer reviews       0.02      0.81      0.04        21
       computer security       0.54      0.69      0.60      1466
    consumer electronics       0.38      0.47      0.42       320
            data centers       0.00      0.75      0.01         4
   electronic components       0.02      0.90      0.04        10
   enterprise technology       0.03      0.59      0.05        32
                hardware       0.21      0.73      0.33       485
     internet technology       0.07      0.49      0.12       102
            mp3 and midi       0.04      0.38      0.06        26
              networking       0.08      0.58      0.14       118
       operating systems       0.12      0.63      0.21       187
   programming languages       0.12      0.61      0.20     

In [20]:
y_pred = classifier_dt.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.49
                          precision    recall  f1-score   support

  computer certification       0.21      0.40      0.27        15
          computer crime       0.27      0.30      0.28        40
        computer reviews       0.21      0.29      0.24        21
       computer security       0.69      0.72      0.70      1466
    consumer electronics       0.64      0.72      0.67       320
            data centers       0.20      0.50      0.29         4
   electronic components       0.35      0.70      0.47        10
   enterprise technology       0.43      0.47      0.45        32
                hardware       0.53      0.55      0.54       485
     internet technology       0.32      0.52      0.40       102
            mp3 and midi       0.20      0.31      0.24        26
              networking       0.44      0.42      0.43       118
       operating systems       0.44      0.51      0.48       187
   programming languages       0.47      0.52      0.49     

In [21]:
y_pred = classifier_svm.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.53
                          precision    recall  f1-score   support

  computer certification       0.00      0.00      0.00        15
          computer crime       0.00      0.00      0.00        40
        computer reviews       0.00      0.00      0.00        21
       computer security       0.78      0.78      0.78      1466
    consumer electronics       0.98      0.55      0.70       320
            data centers       0.00      0.00      0.00         4
   electronic components       0.80      0.40      0.53        10
   enterprise technology       1.00      0.38      0.55        32
                hardware       0.99      0.15      0.26       485
     internet technology       1.00      0.19      0.31       102
            mp3 and midi       0.00      0.00      0.00        26
              networking       0.00      0.00      0.00       118
       operating systems       0.81      0.32      0.45       187
   programming languages       0.98      0.27      0.42     

In [22]:
y_pred = classifier_rf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.65
                          precision    recall  f1-score   support

  computer certification       1.00      0.27      0.42        15
          computer crime       1.00      0.25      0.40        40
        computer reviews       1.00      0.24      0.38        21
       computer security       0.87      0.75      0.80      1466
    consumer electronics       0.99      0.68      0.81       320
            data centers       1.00      0.50      0.67         4
   electronic components       1.00      0.70      0.82        10
   enterprise technology       1.00      0.44      0.61        32
                hardware       0.98      0.41      0.57       485
     internet technology       0.98      0.48      0.64       102
            mp3 and midi       0.86      0.23      0.36        26
              networking       0.95      0.30      0.45       118
       operating systems       0.95      0.40      0.56       187
   programming languages       0.99      0.45      0.62     

In [23]:
y_pred = classifier_xgb.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.66
                          precision    recall  f1-score   support

  computer certification       1.00      0.53      0.70        15
          computer crime       0.92      0.30      0.45        40
        computer reviews       1.00      0.29      0.44        21
       computer security       0.83      0.81      0.82      1466
    consumer electronics       0.98      0.72      0.83       320
            data centers       1.00      0.50      0.67         4
   electronic components       1.00      0.70      0.82        10
   enterprise technology       1.00      0.44      0.61        32
                hardware       0.88      0.53      0.66       485
     internet technology       0.98      0.50      0.66       102
            mp3 and midi       0.86      0.23      0.36        26
              networking       0.92      0.37      0.53       118
       operating systems       0.90      0.56      0.69       187
   programming languages       0.98      0.57      0.72     

In [24]:
def xgboost_hyper_param(learning_rate, n_estimators, max_depth, subsample, colsample, gamma):
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = OneVsRestClassifier(XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        gamma=gamma
    ))
    return numpy.mean(cross_val_score(clf, X_train_val, y_train_val, cv=3, scoring='roc_auc'))


In [25]:
# Define the hyperparameter space
pbounds = {
    'learning_rate': (0.01, 1.0),
    'n_estimators': (100, 1000),
    'max_depth': (3,10),
    'subsample': (1.0, 1.0),  # Change for big datasets
    'colsample': (1.0, 1.0),  # Change for datasets with lots of features
    'gamma': (0, 5)
}

optimizer = BayesianOptimization(
    f=xgboost_hyper_param,
    pbounds=pbounds,
    random_state=RANDOM_STATE
)
optimizer.maximize(n_iter=15)


|   iter    |  target   | colsample |   gamma   | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.8434   [39m | [39m1.0      [39m | [39m4.754    [39m | [39m0.7347   [39m | [39m7.191    [39m | [39m240.4    [39m | [39m1.0      [39m |
| [35m2        [39m | [35m0.8471   [39m | [35m1.0      [39m | [35m4.331    [39m | [35m0.6051   [39m | [35m7.957    [39m | [35m118.5    [39m | [35m1.0      [39m |
| [35m3        [39m | [35m0.8906   [39m | [35m1.0      [39m | [35m1.062    [39m | [35m0.19     [39m | [35m4.284    [39m | [35m373.8    [39m | [35m1.0      [39m |
| [39m4        [39m | [39m0.865    [39m | [39m1.0      [39m | [39m1.456    [39m | [39m0.6157   [39m | [39m3.976    [39m | [39m362.9    [39m | [39m1.0      [39m |
| [39m5        [39m | [39m0.8719   [39m | [39m1.0      [39m | [39m3.926    [39m | [

In [26]:
print("Best Parameter Setting : {}".format(optimizer.max["params"]))
print("Best Target Value      : {}".format(optimizer.max["target"]))
classification_report

Best Parameter Setting : {'colsample': 1.0, 'gamma': 0.8071542285606459, 'learning_rate': 0.148857389184163, 'max_depth': 3.010612993602497, 'n_estimators': 998.1918782470218, 'subsample': 1.0}
Best Target Value      : 0.8926889512788782


<function sklearn.metrics._classification.classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division='warn')>

In [27]:
max_depth = int(optimizer.max["params"]["max_depth"])
n_estimators = int(optimizer.max["params"]["n_estimators"])

optimal_classifier_xgb = XGBClassifier(
    max_depth=max_depth,
    learning_rate=optimizer.max["params"]["learning_rate"],
    n_estimators=n_estimators,
    gamma=optimizer.max["params"]["gamma"]
)
optimal_classifier_xgb.fit(X_train, y_train)

y_pred = optimal_classifier_xgb.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print(f"Accuracy:\t{accuracy:.2f}")
print(classification_report(y_val, y_pred, target_names=mlb.classes_, zero_division=0))


Accuracy:	0.67
                          precision    recall  f1-score   support

  computer certification       0.88      0.47      0.61        15
          computer crime       1.00      0.33      0.49        40
        computer reviews       1.00      0.29      0.44        21
       computer security       0.84      0.83      0.84      1466
    consumer electronics       0.97      0.71      0.82       320
            data centers       1.00      0.50      0.67         4
   electronic components       1.00      0.70      0.82        10
   enterprise technology       1.00      0.41      0.58        32
                hardware       0.86      0.54      0.66       485
     internet technology       0.96      0.49      0.65       102
            mp3 and midi       0.75      0.23      0.35        26
              networking       0.89      0.41      0.56       118
       operating systems       0.92      0.58      0.71       187
   programming languages       0.96      0.56      0.71     