# Multi-Label Classification with Machine Learning
In this notebook, we explore several machine learning models for a multi-label classification problem, in which each tweet may belong to more than one category. We evaluate and compare the performance of the different algorithms on the dataset.


In [None]:
import ast
import json
import random
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from skmultilearn.model_selection import iterative_train_test_split
from tqdm import tqdm
from xgboost import XGBClassifier

from preprocess_functions import build_tree, extract_keys, merge_all_trees_with_counts, preprocess_texts
from utils import CalibratedLabelRankClassifier, ChainOfClassifiers, LabelPowersetClassifier, \
    assess_models, prune_and_subsample, ConditionalDependencyNetwork, MetaBinaryRelevance


In [None]:
# Global run settings.
RANDOM_STATE = 42  # single seed shared by the estimators and splits in this notebook
OVERWRITE = False  # when True, the training cells retrain even if saved models were loaded

# Seed both global RNGs: the standard library one and NumPy's legacy global one.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)


In [None]:
# Tunable constants. INIT_POINTS and N_ITER are not referenced in the visible part of
# this notebook; TEST_SIZE is the held-out fraction used by every split further down.
INIT_POINTS = 1
N_ITER = 5
TEST_SIZE = 0.2

# Base estimators wrapped by each multi-label technique, keyed by a short name.
# Every estimator that accepts a seed receives RANDOM_STATE.
BASE_CLASSIFIERS = {
    'logistic_regression': LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
    'gaussian_nb': GaussianNB(),
    'decision_tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'random_forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'xgb': XGBClassifier(random_state=RANDOM_STATE)
}

# Root folders for the three supported execution environments.
COLAB_PATH = Path('/content/drive/MyDrive')
KAGGLE_PATH = Path('/kaggle/input')
LOCAL_PATH = Path('./')

# Environment detection by import probing: Colab first, then Kaggle, otherwise assume
# a local Jupyter session.
try:
    import google.colab
except ImportError:
    try:
        import kaggle_secrets
    except ImportError:
        # Neither hosted environment is available: use folders next to the notebook.
        DATA_PATH = LOCAL_PATH / 'data'
        MODELS_PATH = LOCAL_PATH / 'models'
    else:
        # On Kaggle both data and models are read from the input root.
        DATA_PATH = KAGGLE_PATH
        MODELS_PATH = KAGGLE_PATH
else:
    DATA_PATH = COLAB_PATH / 'data'
    MODELS_PATH = COLAB_PATH / 'models'

# Dataset / embedding folders and the concrete files read by this notebook.
GLOVE_6B_PATH = MODELS_PATH / 'glove-embeddings'
THREAT_TWEETS_PATH = DATA_PATH / 'tweets-dataset-for-cyberattack-detection'

GLOVE_6B_300D_TXT = GLOVE_6B_PATH / 'glove.6B.300d.txt'
THREAT_TWEETS_CSV = THREAT_TWEETS_PATH / 'tweets_final.csv'


## 1. Introduction

In this notebook, we solve a multi-label classification problem using different machine learning models. Our goal is to predict, from the text of each tweet, the set of categories it belongs to.


## 2. Data Loading and Preprocessing
We will load the dataset, inspect its structure, and preprocess it for machine learning models.


In [None]:
# Read the CSV file and process columns in one step
# Load the tweet CSV, parse its stringified columns, and keep only the relevant rows.
threat_tweets = (
    pd.read_csv(THREAT_TWEETS_CSV)
    .assign(
        # 'tweet' and 'watson' are stored as Python-literal strings in the CSV.
        tweet=lambda frame: frame['tweet'].apply(ast.literal_eval),
        # Parse each 'watson' cell, take its 'categories' entry ([] when absent)
        # and hand it to build_tree.
        watson=lambda frame: frame['watson'].apply(
            lambda raw: build_tree(ast.literal_eval(raw).get('categories', []))
        ),
        # Evaluated after the entry above, so 'watson' already holds the built trees.
        watson_list=lambda frame: frame['watson'].apply(extract_keys),
    )
    .query('relevant == True')
    .drop(columns=['relevant'])
    .dropna(subset=['text'], ignore_index=True)
)

threat_tweets.head()


In [None]:
# Row count of the filtered frame, i.e. tweets flagged relevant that have text.
print(f"Number of CS related tweets:\t{threat_tweets.shape[0]}")


In [None]:
# Merge the per-tweet category trees into a single tree. The helper also returns a
# mapping of counts (it is sorted by value and dumped to JSON in a later cell); the
# exact meaning of those counts is defined in preprocess_functions.
general_tree, visit_count = merge_all_trees_with_counts(trees=threat_tweets['watson'])


In [None]:
# List the direct children of the 'technology and computing' node of the merged tree.
print("The subcategories in 'technology and computing' are:")
for subcategory in general_tree['technology and computing']:
    print(f'· {subcategory}')


In [None]:
# Order the counts from most to least frequent (ties keep their original order).
sorted_visit_count = {
    name: count
    for name, count in sorted(visit_count.items(), key=lambda pair: pair[1], reverse=True)
}

# Persist the merged tree and the ordered counts as pretty-printed JSON in the
# current working directory.
for json_name, payload in (
    ('general_tree.json', general_tree),
    ('general_tree_visit_counts.json', sorted_visit_count),
):
    with open(json_name, 'w') as json_file:
        json_file.write(json.dumps(payload, indent=4))


## 3. Exploratory Data Analysis (EDA)
Let's analyze the dataset and gain insights into its distribution.


In [None]:
# Top-level (macro) categories of the merged category tree.
# The header previously read "At macro categories are:", a typo.
print('The macro categories are:')
for category in general_tree:
    print(f'· {category}')


For the goal of the project, the categories of interest are:
1. computer security/network security
2. computer security/antivirus and malware
3. operating systems/mac os
4. operating systems/windows
5. operating systems/unix
6. operating systems/linux
7. software
8. programming languages, included in software
9. software/databases
10. hardware
11. electronic components, included in hardware
12. hardware/computer/servers
13. hardware/computer/portable computer
14. hardware/computer/desktop computer
15. hardware/computer components
16. hardware/computer networking/router
17. hardware/computer networking/wireless technology
18. networking
19. internet technology, included in networking


In [None]:
# Maps a category name found in a tweet's 'watson_list' to the coarse target label
# used for classification. Several names collapse onto the same target
# (e.g. 'programming languages' -> 'software').
FIX_TARGETS = {
    'computer security': 'computer security',
    'operating systems': 'operating systems',
    'software': 'software',
    'programming languages': 'software',
    'hardware': 'hardware',
    'electronic components': 'hardware',
    'networking': 'networking',
    'internet technology': 'networking'
}

# For every tweet keep the categories listed in FIX_TARGETS and translate them to
# their target label. Labels are de-duplicated and sorted so the list order is the
# same on every run (iteration order of a set of strings can change between
# interpreter sessions). Tweets matching none of the categories get ['other'].
chosen_categories = []
for watson_keys in threat_tweets['watson_list']:
    targets = sorted({FIX_TARGETS[key] for key in watson_keys if key in FIX_TARGETS})
    chosen_categories.append(targets if targets else ['other'])

threat_tweets['target'] = chosen_categories

threat_tweets.head()


In [None]:
# Build the feature matrix from the raw tweet text with the project's preprocess_texts
# helper, pointing it at the 300-dimensional GloVe 6B vectors (embedding_dim matches
# the file name).
# NOTE(review): presumably returns one feature row per tweet, aligned with
# threat_tweets — confirm against preprocess_functions.
X = preprocess_texts(
    list_str=threat_tweets['text'],
    model_path=GLOVE_6B_300D_TXT,
    embedding_dim=300
)


## 4. Model Training

We will now train different models and evaluate their performance.


In [None]:
def _load_if_exists(pickle_path):
    """Return the object stored at ``pickle_path``, or ``None`` if the file is absent.

    joblib.load unpickles the file, so only point this at files you produced yourself.
    """
    if not Path(pickle_path).exists():
        return None
    return joblib.load(pickle_path)


# Previously trained model dictionaries, one per technique. A value of None means
# nothing was cached, and the matching training cell below will fit from scratch.
br = _load_if_exists('models/br.pkl')
clr = _load_if_exists('models/clr.pkl')
cc = _load_if_exists('models/cc.pkl')
lp = _load_if_exists('models/lp.pkl')
pst = _load_if_exists('models/pst.pkl')
cdn = _load_if_exists('models/cdn.pkl')
mbr = _load_if_exists('models/ensembles/meta_binary_relevance.pkl')


In [None]:
# Encode each tweet's list of targets as a binary indicator row (one column per label).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y=threat_tweets['target'])

# Label-powerset view: every distinct indicator row becomes one multi-class id.
# reshape(-1) keeps the ids one-dimensional on NumPy 2.0.0, which returned a 2-D
# inverse when `axis` is given; on other versions it is a no-op.
unique_label_sets, label_set_ids = np.unique(ar=y, axis=0, return_inverse=True)
threat_tweets['target mcc'] = label_set_ids.reshape(-1)

# Pruned variant of the multi-class target together with its id -> label-set map;
# the pruning / sub-sampling rules are implemented in utils.prune_and_subsample.
threat_tweets['target mcc pruned'], label_map_pst = prune_and_subsample(y, pruning_threshold=5, max_sub_samples=3)

y_lp = threat_tweets['target mcc']
y_pst = threat_tweets['target mcc pruned']

# Multi-class id -> indicator row (as a tuple), used to decode LP targets back to labels.
label_map_lp = {i: tuple(lbl_set) for i, lbl_set in enumerate(unique_label_sets)}


In [None]:
# Multi-label split on the indicator matrix, done in two stages: hold out the test
# set first, then split what remains into train and validation.
# Note the return order of iterative_train_test_split: X_a, y_a, X_b, y_b.
X_train_val, y_train_val, X_test, y_test = iterative_train_test_split(X, y, test_size=TEST_SIZE)

X_train, y_train, X_val, y_val = iterative_train_test_split(X_train_val, y_train_val, test_size=TEST_SIZE)


In [None]:
# Options shared by both stages of the label-powerset split.
_mcc_split_kwargs = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=True)

# Stage 1: hold out the test set, stratifying on the label-powerset ids.
X_train_val_mcc, X_test_mcc, y_train_val_mcc, y_test_mcc = train_test_split(
    X, y_lp, stratify=y_lp, **_mcc_split_kwargs
)

# Stage 2: split the remainder into train and validation, stratified the same way.
X_train_mcc, X_val_mcc, y_train_mcc, y_val_mcc = train_test_split(
    X_train_val_mcc, y_train_val_mcc, stratify=y_train_val_mcc, **_mcc_split_kwargs
)


In [None]:
# Options shared by both stages of the pruned-sets split.
_pruned_split_kwargs = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, shuffle=True)

# Stage 1: hold out the test set, stratifying on the pruned multi-class ids.
X_train_val_mcc_pruned, X_test_mcc_pruned, y_train_val_mcc_pruned, y_test_mcc_pruned = train_test_split(
    X, y_pst, stratify=y_pst, **_pruned_split_kwargs
)

# Stage 2: split the remainder into train and validation, stratified the same way.
X_train_mcc_pruned, X_val_mcc_pruned, y_train_mcc_pruned, y_val_mcc_pruned = train_test_split(
    X_train_val_mcc_pruned, y_train_val_mcc_pruned, stratify=y_train_val_mcc_pruned, **_pruned_split_kwargs
)


### 4.1. Binary Problems


#### 4.1.1. BR (Binary Relevance)


In [None]:
# Train Binary Relevance models unless a cached set was loaded (or OVERWRITE forces it).
if not br or OVERWRITE:
    # joblib.dump does not create missing folders; make sure the target exists up
    # front so a fresh checkout does not fail after all the fitting is done.
    Path('models').mkdir(parents=True, exist_ok=True)

    br = {}

    # One one-vs-rest wrapper per base estimator, each fitted on the indicator matrix.
    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        br[k] = OneVsRestClassifier(estimator=v).fit(
            X=X_train,
            y=y_train
        )

    joblib.dump(br, 'models/br.pkl', compress=9)


#### 4.1.2. CLR (Calibrated Label Ranking)


In [None]:
# Train Calibrated Label Ranking models unless a cached set was loaded (or OVERWRITE forces it).
if not clr or OVERWRITE:
    # joblib.dump does not create missing folders; make sure the target exists up
    # front so a fresh checkout does not fail after all the fitting is done.
    Path('models').mkdir(parents=True, exist_ok=True)

    clr = {}

    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        # NOTE(review): the same estimator instance `v` is handed to every technique;
        # confirm the wrapper clones it before fitting.
        model = CalibratedLabelRankClassifier(
            classifier=v,
            classes=mlb.classes_,
            random_state=RANDOM_STATE
        )

        # This wrapper is fed lists of label names, so each indicator row is decoded
        # back to the names of the labels that are set.
        clr[k] = model.fit(
            x=X_train,
            y=[list(mlb.classes_[np.where(row == 1)[0]]) for row in y_train]
        )

    joblib.dump(clr, 'models/clr.pkl', compress=9)


#### 4.1.3. CC (Classifier Chains)


In [None]:
# Train Classifier Chain models unless a cached set was loaded (or OVERWRITE forces it).
if not cc or OVERWRITE:
    # joblib.dump does not create missing folders; make sure the target exists up
    # front so a fresh checkout does not fail after all the fitting is done.
    Path('models').mkdir(parents=True, exist_ok=True)

    cc = {}

    # One chain per base estimator, fitted on the indicator matrix.
    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = ChainOfClassifiers(
            classifier=v,
            classes=mlb.classes_,
            random_state=RANDOM_STATE
        )

        cc[k] = model.fit(
            x=X_train,
            y=y_train
        )

    joblib.dump(cc, 'models/cc.pkl', compress=9)


### 4.2. Multi-class Problems



#### 4.2.1. LP (Label Powerset)


In [None]:
# Train Label Powerset models unless a cached set was loaded (or OVERWRITE forces it).
if not lp or OVERWRITE:
    # joblib.dump does not create missing folders; make sure the target exists up
    # front so a fresh checkout does not fail after all the fitting is done.
    Path('models').mkdir(parents=True, exist_ok=True)

    lp = {}

    # Fitted on the multi-class ids; label_map_lp lets the wrapper map an id back
    # to its label set.
    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = LabelPowersetClassifier(
            classifier=v,
            label_map=label_map_lp,
            random_state=RANDOM_STATE
        )

        lp[k] = model.fit(
            x=X_train_mcc,
            y=y_train_mcc
        )

    joblib.dump(lp, 'models/lp.pkl', compress=9)


#### 4.2.2. PSt (Pruned Sets)


In [None]:
# Train Pruned Sets models unless a cached set was loaded (or OVERWRITE forces it).
if not pst or OVERWRITE:
    # joblib.dump does not create missing folders; make sure the target exists up
    # front so a fresh checkout does not fail after all the fitting is done.
    Path('models').mkdir(parents=True, exist_ok=True)

    pst = {}

    # Same wrapper as Label Powerset, but trained on the pruned ids with their own map.
    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = LabelPowersetClassifier(
            classifier=v,
            label_map=label_map_pst,
            random_state=RANDOM_STATE
        )

        pst[k] = model.fit(
            x=X_train_mcc_pruned,
            y=y_train_mcc_pruned
        )

    joblib.dump(pst, 'models/pst.pkl', compress=9)


### 4.3. Ensembles


#### 4.3.1. CDN (Conditional Dependency Network)


In [None]:
# Train Conditional Dependency Network models unless a cached set was loaded
# (or OVERWRITE forces it).
if not cdn or OVERWRITE:
    # joblib.dump does not create missing folders; make sure the target exists up
    # front so a fresh checkout does not fail after all the fitting is done.
    Path('models').mkdir(parents=True, exist_ok=True)

    cdn = {}

    # num_iterations / burn_in are forwarded to the project's wrapper; see utils
    # for how they are used.
    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = ConditionalDependencyNetwork(
            classifier=v,
            num_iterations=100,
            burn_in=10
        )

        cdn[k] = model.fit(
            x=X_train,
            y=y_train
        )

    joblib.dump(cdn, 'models/cdn.pkl', compress=9)


#### 4.3.2. MBR (Meta-Binary Relevance)


In [None]:
# Train Meta-Binary Relevance models unless a cached set was loaded (or OVERWRITE forces it).
if not mbr or OVERWRITE:
    # This cell saves into a nested folder and joblib.dump does not create missing
    # folders; make sure it exists up front so the dump cannot fail after the
    # fitting is done.
    Path('models/ensembles').mkdir(parents=True, exist_ok=True)

    mbr = {}

    # use_cross_val / n_splits are forwarded to the project's wrapper; see utils
    # for how they are used.
    for k, v in tqdm(BASE_CLASSIFIERS.items()):
        model = MetaBinaryRelevance(
            classifier=v,
            use_cross_val=True,
            n_splits=5
        )

        mbr[k] = model.fit(
            x=X_train,
            y=y_train
        )

    joblib.dump(mbr, 'models/ensembles/meta_binary_relevance.pkl', compress=9)


## 5. Model Evaluation

Now that we've trained the models, let's evaluate them in more detail.


In [None]:
# LP and PSt are trained on multi-class ids, so their validation targets are decoded
# back to label sets through the matching id -> label-set map before scoring.
y_val_lp_decoded = np.array([list(label_map_lp[label_id]) for label_id in y_val_mcc])
y_val_pst_decoded = np.array([list(label_map_pst[label_id]) for label_id in y_val_mcc_pruned])

# technique name -> (validation features, validation targets, trained models)
_validation_inputs = {
    'BR': (X_val, y_val, br),
    'CLR': (X_val, y_val, clr),
    'CC': (X_val, y_val, cc),
    'LP': (X_val_mcc, y_val_lp_decoded, lp),
    'PST': (X_val_mcc_pruned, y_val_pst_decoded, pst),
    'CDN': (X_val, y_val, cdn),
    'MBR': (X_val, y_val, mbr),
}

# Score every technique on its own validation split, in the order listed above.
evaluation = {
    technique_name: assess_models(x=x_part, y=y_part, technique=trained_models)
    for technique_name, (x_part, y_part, trained_models) in _validation_inputs.items()
}


In [None]:
# One row per technique, one column per entry returned by assess_models.
performances = pd.DataFrame(data=evaluation).transpose()
performances


In [None]:
# Detailed per-label report for the model kept under evaluation['LP']['Model'].
# The validation targets are decoded once from multi-class ids to label sets and reused.
y_val_lp_true = np.array([list(label_map_lp[yp]) for yp in y_val_mcc])

z = evaluation['LP']['Model'].predict(X_val_mcc)
acc = accuracy_score(y_val_lp_true, z)

lp_report = classification_report(
    y_true=y_val_lp_true,
    y_pred=z,
    target_names=mlb.classes_,
    zero_division=0
)
print(lp_report)
print(acc)