# Multi-Class Classification with Machine Learning
In this notebook, we will explore various machine learning models to solve a multi-class classification problem. We will evaluate and compare the performance of different algorithms on the dataset.


In [None]:
import ast
import random
from collections import Counter
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from skmultilearn.model_selection import iterative_train_test_split
from tqdm import tqdm

from preprocess_functions import build_tree, extract_keys, preprocess_texts
from utils import CalibratedLabelRankClassifier, ChainOfClassifiers, LabelPowersetClassifier, \
    assess_models, prune_and_subsample, ConditionalDependencyNetwork, MetaBinaryRelevance


In [None]:
OVERWRITE = False
RANDOM_STATE = 42

np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


In [None]:
INIT_POINTS = 1
N_ITER = 5
TEST_SIZE = 2e-1

BASE_CLASSIFIERS = {
    'logistic_regression': LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
    'gaussian_nb': GaussianNB(),
    'decision_tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'random_forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'xgb': xgb.XGBClassifier(random_state=RANDOM_STATE),
}

COLAB_PATH = Path('/content/drive/MyDrive')
KAGGLE_PATH = Path('/kaggle/input')
LOCAL_PATH = Path('./')

# Step 1: Check if running in Google Colab
try:
    import google.colab

    DATA_PATH = COLAB_PATH / Path('data')
    MODELS_PATH = COLAB_PATH / Path('models')
except ImportError:
    # Step 2: Check if running in Kaggle
    try:
        import kaggle_secrets

        DATA_PATH = KAGGLE_PATH
        MODELS_PATH = KAGGLE_PATH
    except ImportError:
        # Step 3: Default to local Jupyter Notebook
        DATA_PATH = LOCAL_PATH / Path('data')
        MODELS_PATH = LOCAL_PATH / Path('models')

GLOVE_6B_PATH = MODELS_PATH / Path('glove-embeddings')
THREAT_TWEETS_PATH = DATA_PATH / Path('tweets-dataset-for-cyberattack-detection')

GLOVE_6B_300D_TXT = GLOVE_6B_PATH / Path('glove.6B.300d.txt')
THREAT_TWEETS_CSV = THREAT_TWEETS_PATH / Path('tweets_final.csv')


## 1. Introduction

In this notebook, we are going to solve a multi-class classification problem using different machine learning models. Our goal is to predict the class of each sample based on the input features.


## 2. Data Loading and Preprocessing
We will load the dataset, inspect its structure, and preprocess it for machine learning models.


In [None]:
# Read the CSV file and process columns in one step
threat_tweets = (
    pd.read_csv(filepath_or_buffer=THREAT_TWEETS_CSV)
    .assign(
        tweet=lambda df: df['tweet'].apply(func=ast.literal_eval),
        watson=lambda df: df['watson'].apply(func=ast.literal_eval)
        .apply(func=lambda x: x.get('categories', []))
        .apply(func=build_tree),
        watson_list=lambda df: df['watson'].apply(func=extract_keys),
    )
    .query(expr='relevant == True')
    .drop(labels=['relevant'], axis=1)
    .dropna(subset=['text'], ignore_index=True)
)

threat_tweets.head()


For the goal of the project, the categories of interest are:
1. computer security/network security
2. computer security/antivirus and malware
3. operating systems/mac os
4. operating systems/windows
5. operating systems/unix
6. operating systems/linux
7. software
8. programming languages, included in software
9. software/databases
10. hardware
11. electronic components, included in hardware
12. hardware/computer/servers
13. hardware/computer/portable computer
14. hardware/computer/desktop computer
15. hardware/computer components
16. hardware/computer networking/router
17. hardware/computer networking/wireless technology
18. networking
19. internet technology, included in networking


In [None]:
FIX_TARGETS = {
    'computer security': 'computer security',
    'operating systems': 'operating systems',
    'software': 'software',
    'programming languages': 'software',
    'hardware': 'hardware',
    'electronic components': 'hardware',
    'networking': 'networking',
    'internet technology': 'networking'
}

chosen_categories = [
    list(set(FIX_TARGETS.keys()) & set(s))
    for s in threat_tweets['watson_list']
]

for i, watson_list in enumerate(chosen_categories):
    temp = list(set([FIX_TARGETS[c] for c in watson_list]))
    if len(temp) < 1:
        temp = ['other']
    chosen_categories[i] = temp

threat_tweets['target'] = chosen_categories

threat_tweets.head()


In [None]:
FIX_TARGETS_OPERATING_SYSTEMS = {
    'mac os': 'mac os',
    'windows': 'windows',
    'unix': 'unix',
    'linux': 'linux'
}

threat_tweets_operating_systems = threat_tweets[
    threat_tweets["watson_list"].apply(lambda x: "operating systems" in x)
].reset_index(drop=True)

chosen_categories = [
    list(set(FIX_TARGETS_OPERATING_SYSTEMS.keys()) & set(s))
    for s in threat_tweets_operating_systems['watson_list']
]

for i, watson_list in enumerate(chosen_categories):
    temp = list(set([FIX_TARGETS_OPERATING_SYSTEMS[c] for c in watson_list]))

    if len(temp) < 1:
        temp = ['other']

    chosen_categories[i] = temp

print(len(chosen_categories))
threat_tweets_operating_systems['target'] = chosen_categories

threat_tweets_operating_systems.head()


In [None]:
Counter([item[0] for item in chosen_categories])


In [None]:
Counter(tuple(item) for item in chosen_categories)


In [None]:
X = preprocess_texts(
    list_str=threat_tweets_operating_systems['text'],
    model_path=GLOVE_6B_300D_TXT,
    embedding_dim=300
)


## 4. Model Training

We will now train different models and evaluate their performance.


In [None]:
br = None
clr = None
cc = None
lp = None
pst = None
cdn = None
mbr = None
PATH_BR_OPERATING_SYSTEMS = Path('models/binary_problems/br_operating_systems.pkl')
PATH_CLR_OPERATING_SYSTEMS = Path('models/binary_problems/clr_operating_systems.pkl')
PATH_CC_OPERATING_SYSTEMS = Path('models/binary_problems/cc_operating_systems.pkl')
PATH_LP_OPERATING_SYSTEMS = Path('models/multiclass_problems/lp_operating_systems.pkl')
PATH_PST_OPERATING_SYSTEMS = Path('models/multiclass_problems/pst_operating_systems.pkl')
PATH_CDN_OPERATING_SYSTEMS = Path('models/ensembles/cdn_operating_systems.pkl')
PATH_MBR_OPERATING_SYSTEMS = Path('models/ensembles/mbr_operating_systems.pkl')

if PATH_BR_OPERATING_SYSTEMS.exists():
    br = joblib.load(PATH_BR_OPERATING_SYSTEMS)

if PATH_CLR_OPERATING_SYSTEMS.exists():
    clr = joblib.load(PATH_CLR_OPERATING_SYSTEMS)

if PATH_CC_OPERATING_SYSTEMS.exists():
    cc = joblib.load(PATH_CC_OPERATING_SYSTEMS)

if PATH_LP_OPERATING_SYSTEMS.exists():
    lp = joblib.load(PATH_LP_OPERATING_SYSTEMS)

if PATH_PST_OPERATING_SYSTEMS.exists():
    pst = joblib.load(PATH_PST_OPERATING_SYSTEMS)

if PATH_CDN_OPERATING_SYSTEMS.exists():
    cdn = joblib.load(PATH_CDN_OPERATING_SYSTEMS)

if PATH_MBR_OPERATING_SYSTEMS.exists():
    mbr = joblib.load(PATH_MBR_OPERATING_SYSTEMS)


In [47]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y=threat_tweets_operating_systems['target'])

unique_label_sets, threat_tweets_operating_systems['target_lp'] = np.unique(
    ar=y,
    axis=0,
    return_inverse=True
)

X_pst, y_pst, label_map_pst, _ = prune_and_subsample(
    x=X,
    y=y,
    pruning_threshold=10,
    max_sub_samples=3000
)

y_lp = threat_tweets_operating_systems['target_lp']

label_map_lp = {i: tuple(lbl_set) for i, lbl_set in enumerate(unique_label_sets)}


In [48]:
# BR, CDR, CC, CDN, MBR
X_train_val, y_train_val, X_test, y_test = iterative_train_test_split(
    X=X,
    y=y,
    test_size=TEST_SIZE
)

X_train, y_train, X_val, y_val = iterative_train_test_split(
    X=X_train_val,
    y=y_train_val,
    test_size=TEST_SIZE
)

# LP
X_train_val_lp, X_test_lp, y_train_val_lp, y_test_lp = train_test_split(
    X, y_lp,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=y_lp
)

X_train_lp, X_val_lp, y_train_lp, y_val_lp = train_test_split(
    X_train_val_lp, y_train_val_lp,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=y_train_val_lp
)

# PSt
X_train_val_pst, X_test_pst, y_train_val_pst, y_test_pst = train_test_split(
    X_pst, y_pst,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=y_pst
)

X_train_pst, X_val_pst, y_train_pst, y_val_pst = train_test_split(
    X_train_val_pst, y_train_val_pst,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=y_train_val_pst
)


### 4.1. Binary Problems


#### 4.1.1. BR (Binary Relevance)


In [49]:
if not br or OVERWRITE:
    br = {}

    for k in tqdm(BASE_CLASSIFIERS.keys()):
        br[k] = OneVsRestClassifier(estimator=BASE_CLASSIFIERS[k]).fit(
            X=X_train,
            y=y_train
        )

    joblib.dump(br, PATH_BR_OPERATING_SYSTEMS, compress=9)


100%|██████████| 5/5 [00:14<00:00,  2.83s/it]


#### 4.1.2. CLR (Calibrated Label Ranking)


In [50]:
if not clr or OVERWRITE:
    clr = {}

    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = CalibratedLabelRankClassifier(
            classifier=v,
            classes=mlb.classes_,
            random_state=RANDOM_STATE
        )

        clr[k] = model.fit(
            x=X_train,
            y=[list(mlb.classes_[np.where(row == 1)[0]]) for row in y_train]
        )

    joblib.dump(clr, PATH_CLR_OPERATING_SYSTEMS, compress=9)


100%|██████████| 5/5 [00:27<00:00,  5.47s/it]


#### 4.1.3. CC (Classifier Chains)


In [51]:
if not cc or OVERWRITE:
    cc = {}

    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = ChainOfClassifiers(
            classifier=v,
            classes=mlb.classes_,
            random_state=RANDOM_STATE
        )

        cc[k] = model.fit(
            x=X_train,
            y=y_train
        )

    joblib.dump(cc, PATH_CC_OPERATING_SYSTEMS, compress=9)


100%|██████████| 5/5 [00:11<00:00,  2.28s/it]


### 4.2. Multi-class Problems



#### 4.2.1. LP (Label Powerset)


In [52]:
if not lp or OVERWRITE:
    lp = {}

    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = LabelPowersetClassifier(
            classifier=v,
            label_map=label_map_lp,
            random_state=RANDOM_STATE
        )

        lp[k] = model.fit(
            x=X_train_lp,
            y=y_train_lp
        )

    joblib.dump(lp, PATH_LP_OPERATING_SYSTEMS, compress=9)


100%|██████████| 5/5 [00:13<00:00,  2.79s/it]


#### 4.2.2. PSt (Pruned Sets)


In [53]:
if not pst or OVERWRITE:
    pst = {}

    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = LabelPowersetClassifier(
            classifier=v,
            label_map=label_map_pst,
            random_state=RANDOM_STATE
        )

        pst[k] = model.fit(
            x=X_train_pst,
            y=y_train_pst
        )

    joblib.dump(pst, PATH_PST_OPERATING_SYSTEMS, compress=9)


100%|██████████| 5/5 [00:15<00:00,  3.06s/it]


### 4.3. Ensembles


#### 4.3.1. CDN (Conditional Dependency Network)


In [54]:
if not cdn or OVERWRITE:
    cdn = {}

    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = ConditionalDependencyNetwork(
            classifier=v,
            num_iterations=100,
            burn_in=10
        )

        cdn[k] = model.fit(
            x=X_train,
            y=y_train
        )

    joblib.dump(cdn, PATH_CDN_OPERATING_SYSTEMS, compress=9)


100%|██████████| 5/5 [00:12<00:00,  2.41s/it]


#### 4.3.2. MBR (Meta-Binary Relevance)


In [55]:
if not mbr or OVERWRITE:
    mbr = {}

    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = MetaBinaryRelevance(
            classifier=v,
            use_cross_val=True,
            n_splits=5
        )

        mbr[k] = model.fit(
            x=X_train,
            y=y_train
        )

    joblib.dump(mbr, PATH_MBR_OPERATING_SYSTEMS, compress=9)


100%|██████████| 5/5 [01:18<00:00, 15.74s/it]


## 5. Model Evaluation

Now that we've trained the models, let's evaluate them in more detail.


In [56]:
evaluation = {
    'BR': assess_models(
        x=X_val,
        y=y_val,
        technique=br
    ),
    'CLR': assess_models(
        x=X_val,
        y=y_val,
        technique=clr
    ),
    'CC': assess_models(
        x=X_val,
        y=y_val,
        technique=cc
    ),
    'LP': assess_models(
        x=X_val_lp,
        y=np.array([list(label_map_lp[yp]) for yp in y_val_lp]),
        technique=lp
    ),
    'PST': assess_models(
        x=X_val_pst,
        y=np.array([list(label_map_pst[yp]) for yp in y_val_pst]),
        technique=pst
    ),
    'CDN': assess_models(
        x=X_val,
        y=y_val,
        technique=cdn
    ),
    'MBR': assess_models(
        x=X_val,
        y=y_val,
        technique=mbr
    )
}


In [57]:
performances = pd.DataFrame(evaluation).T
performances


Unnamed: 0,Accuracy,Classifier,Model,Precision example-based,Recall example-based,F1 example-based,Hamming loss,Micro precision,Micro recall,Micro F1,Macro precision,Macro recall,Macro F1,Coverage
BR,0.783626,logistic_regression,OneVsRestClassifier(estimator=LogisticRegressi...,0.830409,0.853801,0.836257,0.063158,0.86413,0.845745,0.854839,0.901334,0.804221,0.840235,1.754386
CLR,0.77193,logistic_regression,CalibratedLabelRankClassifier(classes=array(['...,0.827485,0.856725,0.835283,0.066667,0.846561,0.851064,0.848806,0.87915,0.814747,0.837391,1.766082
CC,0.77193,logistic_regression,"ChainOfClassifiers(classes=array(['linux', 'ma...",0.827485,0.856725,0.835283,0.067836,0.842105,0.851064,0.846561,0.867385,0.814747,0.832012,1.766082
LP,0.853801,logistic_regression,LabelPowersetClassifier(classifier=LogisticReg...,0.897661,0.880117,0.88499,0.050292,0.895604,0.871658,0.883469,0.889528,0.844484,0.865601,1.619883
PST,0.742268,logistic_regression,LabelPowersetClassifier(classifier=LogisticReg...,0.809278,0.811856,0.803265,0.085567,0.802817,0.806604,0.804706,0.803769,0.810964,0.800519,1.938144
CDN,0.760234,logistic_regression,ConditionalDependencyNetwork(classifier=Logist...,0.81384,0.847953,0.822612,0.074854,0.822917,0.840426,0.831579,0.848304,0.800054,0.814944,1.807018
MBR,0.783626,logistic_regression,MetaBinaryRelevance(classifier=LogisticRegress...,0.830409,0.853801,0.836257,0.063158,0.86413,0.845745,0.854839,0.901334,0.804221,0.840235,1.754386


In [59]:
z = evaluation['LP']['Model'].predict(X_val_lp)

print(classification_report(
    y_true=np.array([list(label_map_lp[yp]) for yp in y_val_lp]),
    y_pred=z,
    target_names=mlb.classes_,
    zero_division=0
))


              precision    recall  f1-score   support

       linux       0.92      0.92      0.92        78
      mac os       0.88      0.74      0.80        19
       other       0.88      0.83      0.86        18
        unix       0.91      0.88      0.89        24
     windows       0.85      0.85      0.85        48

   micro avg       0.90      0.87      0.88       187
   macro avg       0.89      0.84      0.87       187
weighted avg       0.90      0.87      0.88       187
 samples avg       0.90      0.88      0.88       187

