# Multi-Label Classification

## Imports

In [None]:
import ast
import os
import pickle
import random

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from numpy.typing import ArrayLike
from sklearn import preprocessing
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    hamming_loss,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from tqdm.contrib.itertools import product
from tqdm.notebook import tqdm

# Load the settings from the dotenv file so they can be read from the process
# environment below.
load_dotenv()

# Seed and hold-out fraction shared by every split and estimator in the notebook.
RANDOM_STATE = int(os.environ.get("RANDOM_STATE"))
TEST_SIZE = float(os.environ.get("TEST_SIZE"))

# Switch to the configured project root before the project-local imports that
# follow (presumably so that they and the relative paths used later resolve).
os.chdir(os.environ.get("ROOT"))

# Seed both the standard-library and the NumPy global generators.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

import MLC
from notebooks.utils import (
    assess,
    extract_models,
    preprocess_texts,
    replace_text_components,
    translate_source_categories,
)

In [None]:
class ModuleSystem(BaseEstimator, ClassifierMixin):
    """Per-label ensemble in which every label column has its own chosen model.

    Parameters
    ----------
    models : dict
        Maps a label column index (any value ``int()`` accepts) to a dict that
        holds at least a ``"model"`` entry. That model's ``predict`` must
        return a 2-D array covering all label columns.
    mlb
        Label binarizer describing the label columns; stored as given.
    """

    def __init__(self, models, mlb):
        self.models = models
        self.mlb = mlb

    def predict(self, X_test: ArrayLike) -> ArrayLike:
        """
        Predict labels for each component using the chosen model in the dictionary.

        Parameters
        ----------
        X_test : ArrayLike
            Test data; must expose ``shape``.

        Returns
        -------
        Y_pred : ArrayLike
            Array of shape ``(n_samples, len(self.models))`` in which column
            ``i`` is column ``i`` of the prediction of the model chosen for
            label ``i``.
        """
        n_samples = X_test.shape[0]
        n_components = len(self.models)
        Y_pred = np.zeros((n_samples, n_components))

        # One fitted model can be selected for several labels. Run each
        # distinct model object once and reuse its prediction matrix instead
        # of recomputing it for every label it serves.
        predictions_by_model = {}
        for idx, model_info in self.models.items():
            model = model_info["model"]
            model_key = id(model)
            if model_key not in predictions_by_model:
                predictions_by_model[model_key] = model.predict(X_test)
            column = int(idx)
            Y_pred[:, column] = predictions_by_model[model_key][:, column]

        return Y_pred

## Model Training

In [None]:
# Multi-label strategies from the project-local MLC package, keyed by the short
# tag that is upper-cased into each trained model's display name. During
# training every class is instantiated with one base estimator taken from
# CLASSIFICATION_ALGORITHM.
CLASSIFICATION_METHODS = {
    "br": MLC.BRClassifier,
    "clr": MLC.CLRClassifier,
    "cc": MLC.CCClassifier,
    "lp": MLC.LPClassifier,
    "pst": MLC.PStClassifier,
    #'cdn': MLC.CDNClassifier,
    "mbr": MLC.MBRClassifier,
    "rakel": MLC.RAkELClassifier,
    "homer": MLC.HOMERClassifier,
}

# Base estimators combined with every strategy above, keyed by the short tag
# used in model names. The entries are estimator instances, not classes, so the
# same object is handed to every strategy it is paired with.
CLASSIFICATION_ALGORITHM = {
    "lr": LogisticRegression(
        solver="liblinear", max_iter=10000, random_state=RANDOM_STATE
    ),
    "gnb": GaussianNB(),
    #"dt": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "rf": RandomForestClassifier(random_state=RANDOM_STATE),
}

# Text-embedding back ends evaluated for every module. Each entry carries the
# display name (also used in model names and pickle file names), the model
# location read from the environment, and the embedding dimension handed to
# the preprocessing step (None where no explicit dimension is given).
EMBEDDING_SOURCES = [
    {"name": label, "model-path": os.getenv(env_key), "embedding-dim": dim}
    for label, env_key, dim in (
        ("GloVe.6B.50D", "GLOVE_6B_50D_PATH", 50),
        ("GloVe.6B.100D", "GLOVE_6B_100D_PATH", 100),
        ("GloVe.6B.200D", "GLOVE_6B_200D_PATH", 200),
        ("GloVe.6B.300D", "GLOVE_6B_300D_PATH", 300),
        ("DistilRoBERTa", "DISTILROBERTA_PATH", None),
        ("SBERT", "SBERT_PATH", None),
        ("ATT&CK-BERT", "ATTACK_BERT_PATH", None),
    )
]

# NOTE(review): this bare print() only emits an empty line in the cell output;
# it looks like a leftover and can probably be removed.
print()
# One entry per classifier module that is trained.
#   "level":   "general" trains on every tweet; any other value restricts the
#              training data to tweets whose `watson_list` contains that level.
#   "targets": mapping from a source category to the label used inside the
#              module (passed to `translate_source_categories`). Several source
#              categories may share one label, e.g. "programming languages"
#              is mapped to "software".
SYSTEM_MODULES = [
    {
        "level": "general",
        "targets": {
            "computer security": "computer security",
            "operating systems": "operating systems",
            "software": "software",
            "programming languages": "software",
            "hardware": "hardware",
            "electronic components": "hardware",
            "networking": "networking",
            "internet technology": "networking",
        }
    },
    {
        "level": "computer security",
        "targets": {
            "network security": "network security",
            "antivirus and malware": "antivirus and malware",
        },
    },
    {
        "level": "operating systems",
        "targets": {
            "mac os": "mac os",
            "windows": "windows",
            "unix": "unix",
            "linux": "linux",
        },
    },
    {"level": "software", "targets": {"databases": "databases"}},
    {
        "level": "hardware",
        "targets": {
            "computer": "computer",
            "computer components": "computer components",
            "computer networking": "computer networking",
        },
    },
]

In [None]:
# Read the training tweets from the CSV named by TEMP_CATEGORIES_TRAIN_CSV.
with open(os.getenv("TEMP_CATEGORIES_TRAIN_CSV"), "rb") as f:
    train = pd.read_csv(f)

# These columns hold Python literals serialized as text; parse them back.
for column in ("tweet", "watson", "urls", "watson_list", "target"):
    train[column] = train[column].apply(ast.literal_eval)

train["text"] = list(map(replace_text_components, train["text"]))

# Tweets flagged as not relevant are labelled "other" unless they already are.
for row, is_relevant in enumerate(train["relevant"]):
    if is_relevant or "other" in train.loc[row, "target"]:
        continue
    train.loc[row, "target"] = ["other"]

# Train every (method, algorithm) pair for every module and embedder, keep the
# per-embedder model dictionaries on disk, and remember for each module the
# selection with the best validation accuracy.
#
# NOTE(review): what is pickled per module at the end is the list of selected
# model names, whereas the evaluation cells load `models/<Level>.pickle`
# objects that expose `.mlb` and `.predict`. Confirm these files are meant to
# be overwritten by the follow-up cell before they are consumed.
best_module = None
for module_config in tqdm(SYSTEM_MODULES, desc="Modules", unit="module"):
    # Capitalized level name, shared by the report header and both pickle paths.
    level_title = module_config["level"].capitalize()
    # Best validation accuracy seen so far for this module and its selection.
    accuracy = 0
    best_module = None

    for embedder in tqdm(EMBEDDING_SOURCES, desc="Embedders", unit="embedder"):
        # Rewrite the targets with this module's category mapping.
        tweets = train.assign(
            target=lambda df: df["target"].apply(
                func=translate_source_categories, args=(module_config["targets"],)
            )
        )

        # Sub-modules only see tweets whose watson_list contains their level.
        if module_config["level"] != "general":
            tweets = tweets[
                tweets["watson_list"].apply(lambda x: module_config["level"] in x)
            ].reset_index(drop=True)

        tweets["text"] = [replace_text_components(t) for t in tweets["text"]]
        texts = tweets["text"]
        x = preprocess_texts(
            list_str=texts,
            model_path=embedder["model-path"],
            embedding_dim=embedder["embedding-dim"],
        )

        targets = tweets["target"]
        mlb = preprocessing.MultiLabelBinarizer()
        y = mlb.fit_transform(y=targets)
        # Give every distinct label combination an id so the split can be
        # stratified on whole label sets.
        _, y_mcp = np.unique(ar=y, axis=0, return_inverse=True)

        X_train, X_val, y_train, y_val = train_test_split(
            x,
            y,
            test_size=TEST_SIZE,
            random_state=RANDOM_STATE,
            shuffle=True,
            stratify=y_mcp,
        )

        models = {}
        for method_name, algorithm_name in product(
                CLASSIFICATION_METHODS,
                CLASSIFICATION_ALGORITHM,
                desc="Methods & Algorithms",
                unit="pair",
        ):
            # NOTE(review): the same estimator instance is handed to every
            # method wrapper; confirm the MLC classifiers copy it before
            # fitting, otherwise earlier models share state with later ones.
            model = CLASSIFICATION_METHODS[method_name](
                CLASSIFICATION_ALGORITHM[algorithm_name]
            )
            try:
                model.fit(X_train, y_train)
            except Exception as e:
                # Best-effort grid: report the combination that cannot be
                # fitted and move on to the next one.
                print(f"{model} failed to fit due to {e}")
                continue

            model_name = f"{method_name.upper()}[{algorithm_name.upper()}] - {embedder['name']}"
            models[model_name] = {
                "model": model,
                "assess": model.evaluate(X_val, y_val),
            }

        top_performing_models = extract_models(mlb.classes_, models, "f1-score")
        module = ModuleSystem(top_performing_models, mlb)
        y_val_pred = module.predict(X_val)
        validation_results = assess(y_val, y_val_pred)

        # Report and remember the selection whenever this embedder beats the
        # best validation accuracy seen so far for the module.
        if validation_results["accuracy"] > accuracy:
            accuracy = validation_results["accuracy"]
            best_module = [m["name"] for m in top_performing_models.values()]
            print("==========================================================")
            print(f"{level_title} Performance")
            print("==========================================================")
            print(f"Accuracy:\t{validation_results['accuracy'] * 100:.2f}%")
            print(f"Hamming Loss:\t{validation_results['hamming_loss']:.3f}")
            report = pd.DataFrame(validation_results["report"])
            report.columns = list(module.mlb.classes_) + [
                "micro avg",
                "macro avg",
                "weighted avg",
                "samples avg",
            ]
            report = (
                report.transpose()
                .map(lambda x: f"{x:.2f}" if isinstance(x, float) else x)
                .to_string()
            )
            print(f"{report}")
            print("==========================================================\n")

        # Keep every trained model of this (module, embedder) pair on disk.
        model_dictionary_filename = f"{os.getenv('MAPPINGS_DIR')}/{level_title}.{embedder['name']}"
        with open(f"{model_dictionary_filename}.pickle", "wb") as f:
            pickle.dump(models, f)

        # Release the per-embedder data before the next embedder is processed.
        del tweets, texts, x, targets, mlb, y, y_mcp, X_train, X_val, y_train, y_val, models, top_performing_models, module, y_val_pred, validation_results, model_dictionary_filename

    # The file name is the capitalized level with its spaces removed.
    module_filename = f"{os.getenv('MODELS_DIR')}/{level_title.replace(' ', '')}"
    with open(f"{module_filename}.pickle", "wb") as f:
        pickle.dump(best_module, f)

In [None]:
# Rebuild the ModuleSystem of one module from the model dictionary saved for a
# given embedder, and persist it under MODELS_DIR.
module_idx = 0
# Embedder whose saved model dictionary is used for the selected module.
embedder_name = "ATT&CK-BERT"
selected_module = SYSTEM_MODULES[module_idx]
selected_level_title = selected_module["level"].capitalize()

# The mapping file follows the "<Level>.<embedder>.pickle" naming used when the
# dictionaries were written, so it always matches the selected module rather
# than being pinned to the general one.
with open(
    f"{os.getenv('MAPPINGS_DIR')}/{selected_level_title}.{embedder_name}.pickle", "rb"
) as f:
    general_models = pickle.load(f)

with open(os.getenv("TEMP_CATEGORIES_TRAIN_CSV"), "rb") as f:
    train = pd.read_csv(f)
    train = train.assign(
        tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
        watson=lambda df: df["watson"].apply(func=ast.literal_eval),
        urls=lambda df: df["urls"].apply(func=ast.literal_eval),
        watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
        target=lambda df: df["target"].apply(func=ast.literal_eval),
    )
    train["text"] = [replace_text_components(t) for t in train["text"]]
    # Tweets flagged as not relevant are labelled "other" unless they already are.
    for idx, t in enumerate(train['relevant']):
        if not t and 'other' not in train.loc[idx, "target"]:
            train.loc[idx, "target"] = ["other"]

tweets = train.assign(
    target=lambda df: df["target"].apply(
        func=translate_source_categories, args=(selected_module["targets"],)
    )
)

# Sub-modules only use tweets whose watson_list contains their level.
if selected_module["level"] != "general":
    tweets = tweets[
        tweets["watson_list"].apply(lambda x: selected_module["level"] in x)
    ].reset_index(drop=True)

# Refit the binarizer on the module's targets to recover its label columns.
targets = tweets["target"]
mlb = preprocessing.MultiLabelBinarizer()
mlb.fit(y=targets)
general_models = extract_models(mlb.classes_, general_models, "f1-score")
module = ModuleSystem(general_models, mlb)

# The file name is the capitalized level with its spaces removed.
module_filename = f"{os.getenv('MODELS_DIR')}/{selected_level_title.replace(' ', '')}"
with open(f"{module_filename}.pickle", "wb") as f:
    pickle.dump(module, f)

del tweets, targets, mlb, general_models, train, module_filename, module
del selected_module, selected_level_title, embedder_name

## Model Evaluation

In [None]:
# Read the held-out tweets from the CSV named by TEMP_CATEGORIES_TEST_CSV.
with open(os.getenv("TEMP_CATEGORIES_TEST_CSV"), "rb") as f:
    test_tweets = pd.read_csv(f)

# These columns hold Python literals serialized as text; parse them back.
for column in ("tweet", "watson", "urls", "watson_list", "target"):
    test_tweets[column] = test_tweets[column].apply(ast.literal_eval)

test_tweets['text'] = list(map(replace_text_components, test_tweets['text']))

# A tweet marked relevant whose targets contain "other" is treated as not relevant.
relevant = test_tweets['relevant'].eq(True)
is_other = test_tweets['target'].apply(lambda x: 'other' in x)
test_tweets.loc[relevant & is_other, 'relevant'] = False

texts = test_tweets['text']
targets = test_tweets['target']
print(f"Threat Tweets: {len(test_tweets)}")
test_tweets.head()

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    # pickle.load can execute arbitrary code: only use it on files produced by
    # this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Previously saved modules, one per level.
general_mod = _load_pickle("models/General.pickle")
cs_mod = _load_pickle("models/Computersecurity.pickle")
os_mod = _load_pickle("models/Operatingsystems.pickle")
sw_mod = _load_pickle("models/Software.pickle")
hw_mod = _load_pickle("models/Hardware.pickle")

In [None]:
class System(BaseEstimator, ClassifierMixin):
    """Two-stage classifier in which a general module routes texts to sub-modules.

    The general module is applied to every text. A text predicted as "other",
    or given no general label at all, is reported as "other" only. Otherwise
    every predicted general label that has a sub-module triggers that
    sub-module on the same text.

    Output columns are the general labels followed by the labels of the
    computer-security, operating-systems, software and hardware modules, each
    without its own "other" column. The position of "other" is looked up in
    every module's ``mlb.classes_`` rather than assumed.

    Parameters
    ----------
    general, cs, os, sw, hw
        Modules exposing ``predict(X)`` and a fitted ``mlb`` label binarizer.
    """

    # Label that marks a text as belonging to none of the categories.
    _OTHER_LABEL = "other"

    def __init__(self, general, cs, os, sw, hw):
        # The trailing comments name the embedding each module is fed in `predict`.
        self.general = general  # ATT&CK-BERT
        self.cs = cs  # DistilRoBERTa
        self.os = os  # ATT&CK-BERT
        self.sw = sw  # DistilRoBERTa
        self.hw = hw  # ATT&CK-BERT

    def _other_index(self) -> int:
        """Return the column of the "other" label in the general module's output.

        Raises
        ------
        ValueError
            If the general module has no "other" label.
        """
        general_labels = list(self.general.mlb.classes_)
        if self._OTHER_LABEL not in general_labels:
            raise ValueError(
                f"the general module has no {self._OTHER_LABEL!r} label: {general_labels}"
            )
        return general_labels.index(self._OTHER_LABEL)

    def _sub_blocks(self):
        """Describe where each sub-module's labels live in the output.

        Returns
        -------
        blocks : list of (module, kept, start, stop)
            For the cs, os, sw and hw modules in that order: the module, the
            integer positions of its labels other than "other", and the
            half-open column range those labels occupy in the system output.
        width : int
            Total number of output columns.
        """
        start = len(self.general.mlb.classes_)
        blocks = []
        for module in (self.cs, self.os, self.sw, self.hw):
            kept = np.array(
                [i for i, label in enumerate(module.mlb.classes_) if label != self._OTHER_LABEL],
                dtype=int,
            )
            stop = start + len(kept)
            blocks.append((module, kept, start, stop))
            start = stop
        return blocks, start

    def predict(self, texts: pd.Series) -> ArrayLike:
        """Predict the binary label matrix for *texts*.

        Parameters
        ----------
        texts : pd.Series
            Texts to classify.

        Returns
        -------
        ArrayLike
            Array of shape ``(len(texts), len(self.classes()))``.

        Raises
        ------
        ValueError
            If the general module has no "other" label.
        """
        general_dim = len(self.general.mlb.classes_)
        other_col = self._other_index()
        blocks, width = self._sub_blocks()
        Y_pred = np.zeros((texts.shape[0], width))
        X_ATTACK_BERT = preprocess_texts(
            list_str=texts,
            model_path=os.getenv("ATTACK_BERT_PATH"),
            embedding_dim=None,
        )
        X_DistilRoBERTa = preprocess_texts(
            list_str=texts,
            model_path=os.getenv("DISTILROBERTA_PATH"),
            embedding_dim=None,
        )
        y_general = self.general.predict(X_ATTACK_BERT)
        Y_pred[:, :general_dim] = y_general

        # General label that triggers each sub-module and the features that
        # sub-module is fed, in the same order as `blocks`.
        routes = (
            ("computer security", X_DistilRoBERTa),
            ("operating systems", X_ATTACK_BERT),
            ("software", X_DistilRoBERTa),
            ("hardware", X_ATTACK_BERT),
        )
        for idx, t in enumerate(self.general.mlb.inverse_transform(y_general)):
            if self._OTHER_LABEL in t or not t:
                # "other", or no general label at all: report "other" only.
                Y_pred[idx, :] = 0.0
                Y_pred[idx, other_col] = 1.0
                continue
            for (trigger, features), (module, kept, start, stop) in zip(routes, blocks):
                if trigger in t:
                    y_sub = np.asarray(module.predict(features[idx:idx + 1]))
                    Y_pred[idx, start:stop] = y_sub[0, kept]
        return Y_pred

    def predict_targets(self, texts: pd.Series) -> pd.Series:
        """Predict label names for *texts*.

        Returns
        -------
        pd.Series
            One list of distinct label names per text, indexed like *texts*.
        """
        general_dim = len(self.general.mlb.classes_)
        blocks, _ = self._sub_blocks()
        Y_pred = self.predict(texts)
        label_groups = [self.general.mlb.inverse_transform(Y_pred[:, :general_dim])]
        for module, kept, start, stop in blocks:
            # Put the module's "other" column back, as zeros, so that its
            # binarizer can decode the block.
            full = np.zeros((Y_pred.shape[0], len(module.mlb.classes_)))
            full[:, kept] = Y_pred[:, start:stop]
            label_groups.append(module.mlb.inverse_transform(full))
        # Each row holds one tuple per module; concatenate and de-duplicate them.
        return pd.Series(
            data=[list(set(sum(row_labels, ()))) for row_labels in zip(*label_groups)],
            index=texts.index,
        )

    def evaluate(self, texts: pd.Series, targets: pd.Series) -> dict[str, float]:
        """Score the predictions for *texts* against *targets* with `assess`.

        The targets are binarized by every module and laid out like the output
        of `predict`.
        """
        blocks, _ = self._sub_blocks()
        Y_test = np.hstack(
            [self.general.mlb.transform(targets)]
            + [module.mlb.transform(targets)[:, kept] for module, kept, _, _ in blocks]
        )
        Y_pred = self.predict(texts)
        return assess(Y_test, Y_pred)

    def evaluate_filter(self, texts: pd.Series, targets: pd.Series) -> dict[str, float]:
        """Score the predicted "other" column against the truthiness of *targets*.

        NOTE(review): a truthy target is encoded as 1 and compared with the
        predicted "other" flag; confirm callers pass targets whose truthiness
        means "other".
        """
        Y_test = pd.Series([1 if t else 0 for t in targets])
        Y_pred = self.predict(texts)
        other_col = self._other_index()
        return assess(Y_test, Y_pred[:, other_col:other_col + 1])

    def classes(self):
        """Return the label of every output column of `predict`, in order."""
        blocks, _ = self._sub_blocks()
        labels = list(self.general.mlb.classes_)
        for module, kept, _, _ in blocks:
            labels += list(module.mlb.classes_[kept])
        return labels


solution = System(general_mod, cs_mod, os_mod, sw_mod, hw_mod)

In [None]:
# Side-by-side look at the first ten test tweets: predicted labels, true
# labels, text and relevance flag.
first_ten = slice(None, 10)
preview_rows = [
    solution.predict_targets(texts[first_ten]),
    targets[first_ten],
    texts[first_ten],
    test_tweets['relevant'][first_ten],
]
pd.DataFrame(preview_rows, index=["Predicted", "Actual", "Text", "Relevant"]).T

In [None]:
# Evaluate the whole system on the test set and print its per-label report.
system_performance = solution.evaluate(texts, targets)

separator = "=========================================================="
print(separator, "System Performance", separator, sep="\n")
print(f"Accuracy:\t{system_performance['accuracy'] * 100:.2f}%")
print(f"Hamming Loss:\t{system_performance['hamming_loss']:.3f}")

# One report column per system label, followed by the averaged rows.
report = pd.DataFrame(system_performance["report"])
report.columns = solution.classes() + ["micro avg", "macro avg", "weighted avg", "samples avg"]
report = report.T.map(lambda x: f"{x:.2f}" if isinstance(x, float) else x)
report = report.to_string()
print(report)
print(separator, end="\n\n")

In [None]:
# Ground truth: 1 for tweets flagged relevant, 0 otherwise.
Y_test = pd.Series([int(bool(t)) for t in test_tweets['relevant']])
# Prediction: a tweet counts as relevant when column 4 of the system output is
# not set.
# NOTE(review): column 4 is hard-coded here; it is the column System uses for
# its "other" flag — confirm it still is if the general labels change.
other_flags = solution.predict(texts)[:, 4:5]
Y_pred = pd.Series([not flag for flag in other_flags])

auc_score_macro, auc_score_weighted = (
    roc_auc_score(Y_test, Y_pred, average=mode) for mode in ("macro", "weighted")
)
report = classification_report(Y_test, Y_pred, output_dict=True, zero_division=0.0)
report["macro avg"]["auc"] = auc_score_macro
report["weighted avg"]["auc"] = auc_score_weighted
system_performance = dict(hamming_loss=hamming_loss(Y_test, Y_pred), report=report)

separator = "=========================================================="
print(separator, "System Performance", separator, sep="\n")
print(f"Hamming Loss:\t{system_performance['hamming_loss']:.3f}")
report = pd.DataFrame(system_performance["report"])
report.columns = ["not relevant", "relevant", "accuracy", "macro avg", "weighted avg"]
report = report.T.map(lambda x: f"{x:.2f}" if isinstance(x, float) else x)
report = report.to_string()
print(report)
print(separator, end="\n\n")

In [None]:
# Row-by-row comparison of the true and predicted relevance flags with the text.
pd.DataFrame(
    np.column_stack((Y_test.values, Y_pred.values, test_tweets["text"].values)),
    columns=["relevant", "predicted", "text"],
)

In [None]:
# Persist the assembled system next to the per-level module pickles.
with open("models/System.pickle", "wb") as system_file:
    pickle.dump(solution, system_file)

# The in-memory instance is not needed once it has been written.
del solution