# Multi-Label Classification

## 1. Imports

In [1]:
import ast
import os
import pickle
import random
from dotenv import load_dotenv

import numpy as np
import pandas as pd
from numpy.typing import ArrayLike
from sklearn import preprocessing
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from tqdm.contrib.itertools import product
from tqdm.notebook import tqdm

load_dotenv()

RANDOM_STATE = int(os.getenv("RANDOM_STATE"))
TEST_SIZE = float(os.getenv("TEST_SIZE"))

os.chdir(os.getenv("ROOT"))
np.random.seed(seed=RANDOM_STATE)
random.seed(a=RANDOM_STATE)

import MLC
from notebooks.utils import (
    assess,
    extract_models,
    preprocess_texts,
    replace_text_components,
    translate_source_categories,
)

In [2]:
class ModuleSystem(BaseEstimator, ClassifierMixin):
    def __init__(self, models, mlb):
        self.models = models
        self.mlb = mlb

    def predict(self, X_test: ArrayLike) -> ArrayLike:
        """
        Predict labels for each component using the chosen model in the dictionary.

        Parameters
        ----------
        X_test : ArrayLike
            Test data.

        Returns
        -------
        Y_pred : ArrayLike
            Predicted labels for each component.
        """
        n_samples = X_test.shape[0]
        n_components = len(self.models)
        Y_pred = np.zeros((n_samples, n_components))

        for idx, model_info in self.models.items():
            model = model_info["model"]
            Y_pred[:, int(idx)] = (model.predict(X_test))[:, int(idx)]

        return Y_pred

## 2. Configurations & Constants

In [3]:
GENERAL_MAPPINGS = {
    "computer security": "computer security",
    "operating systems": "operating systems",
    "software": "software",
    "programming languages": "software",
    "hardware": "hardware",
    "electronic components": "hardware",
    "networking": "networking",
    "internet technology": "networking",
}

## Model Training

In [4]:
CLASSIFICATION_METHODS = {
    "br": MLC.BRClassifier,
    "clr": MLC.CLRClassifier,
    "cc": MLC.CCClassifier,
    "lp": MLC.LPClassifier,
    "pst": MLC.PStClassifier,
    #'cdn': MLC.CDNClassifier,
    "mbr": MLC.MBRClassifier,
    "rakel": MLC.RAkELClassifier,
    "homer": MLC.HOMERClassifier,
}

CLASSIFICATION_ALGORITHM = {
    "lr": LogisticRegression(
        solver="liblinear", max_iter=10000, random_state=RANDOM_STATE
    ),
    "gnb": GaussianNB(),
    "dt": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "rf": RandomForestClassifier(random_state=RANDOM_STATE),
}

EMBEDDING_SOURCES = [
    {
        "name": "GloVe.6B.50D",
        "model-path": os.getenv("GLOVE_6B_50D_PATH"),
        "embedding-dim": 50,
    },
    {
        "name": "GloVe.6B.100D",
        "model-path": os.getenv("GLOVE_6B_100D_PATH"),
        "embedding-dim": 100,
    },
    {
        "name": "GloVe.6B.200D",
        "model-path": os.getenv("GLOVE_6B_200D_PATH"),
        "embedding-dim": 200,
    },
    {
        "name": "GloVe.6B.300D",
        "model-path": os.getenv("GLOVE_6B_300D_PATH"),
        "embedding-dim": 300,
    },
    {
        "name": "DistilRoBERTa",
        "model-path": os.getenv("DISTILROBERTA_PATH"),
        "embedding-dim": None,
    },
    {"name": "SBERT", "model-path": os.getenv("SBERT_PATH"), "embedding-dim": None},
    {
        "name": "ATT&CK-BERT",
        "model-path": os.getenv("ATTACK_BERT_PATH"),
        "embedding-dim": None,
    },
]

SYSTEM_MODULES = [
    {
        "level": "general",
        "targets": {
            "computer security": "computer security",
            "operating systems": "operating systems",
            "software": "software",
            "programming languages": "software",
            "hardware": "hardware",
            "electronic components": "hardware",
            "networking": "networking",
            "internet technology": "networking",
        },
    },
    {
        "level": "computer security",
        "targets": {
            "network security": "network security",
            "antivirus and malware": "antivirus and malware",
        },
    },
    {
        "level": "operating systems",
        "targets": {
            "mac os": "mac os",
            "windows": "windows",
            "unix": "unix",
            "linux": "linux",
        },
    },
    {"level": "software", "targets": {"databases": "databases"}},
    {
        "level": "hardware",
        "targets": {
            "computer": "computer",
            "computer components": "computer components",
            "computer networking": "computer networking",
        },
    },
]

In [None]:

module = ModuleSystem(top_performing_models, mlb)
y_val_pred = module.predict(X_val)
validation_results = assess(y_val, y_val_pred)

In [5]:
with open(os.getenv("TEMP_CATEGORIES_TRAIN_CSV"), "rb") as f:
    train = pd.read_csv(f)
    train = train.assign(
        tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
        watson=lambda df: df["watson"].apply(func=ast.literal_eval),
        urls=lambda df: df["urls"].apply(func=ast.literal_eval),
        watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
        target=lambda df: df["target"].apply(func=ast.literal_eval),
    )
    train["text"] = [replace_text_components(t) for t in train["text"]]

for module_config in tqdm(SYSTEM_MODULES, desc="Modules", unit="module"):
    accuracy = 0
    best_module = None

    for embedder in tqdm(EMBEDDING_SOURCES, desc="Embedders", unit="embedder"):
        tweets = train.assign(
            target=lambda df: df["target"].apply(
                func=translate_source_categories, args=(module_config["targets"],)
            )
        )

        if module_config["level"] != "general":
            tweets = tweets[
                tweets["watson_list"].apply(lambda x: module_config["level"] in x)
            ].reset_index(drop=True)

        tweets["text"] = [replace_text_components(t) for t in tweets["text"]]
        texts = tweets["text"]
        x = preprocess_texts(
            list_str=texts,
            model_path=embedder["model-path"],
            embedding_dim=embedder["embedding-dim"],
        )

        targets = tweets["target"]
        mlb = preprocessing.MultiLabelBinarizer()
        y = mlb.fit_transform(y=targets)
        _, y_mcp = np.unique(ar=y, axis=0, return_inverse=True)

        X_train, X_val, y_train, y_val = train_test_split(
            x,
            y,
            test_size=TEST_SIZE,
            random_state=RANDOM_STATE,
            shuffle=True,
            stratify=y_mcp,
        )

        models = {}
        trained_models = []
        for method_name, algorithm_name in product(
            CLASSIFICATION_METHODS,
            CLASSIFICATION_ALGORITHM,
            desc="Methods & Algorithms",
            unit="pair",
        ):
            model = CLASSIFICATION_METHODS[method_name](
                CLASSIFICATION_ALGORITHM[algorithm_name]
            )
            try:
                model.fit(X_train, y_train)
                model_name = f"{method_name.upper()}[{algorithm_name.upper()}] - {embedder['name']}"
            except Exception as e:
                print(f"{model} failed to fit due to {e}")
                model_name = None
                continue
            if model_name is not None:
                trained_models.append(model_name)
                models[model_name] = {
                    "model": model,
                    "assess": model.evaluate(X_val, y_val),
                }

        top_performing_models = extract_models(mlb.classes_, models, "f1-score")
        module = ModuleSystem(top_performing_models, mlb)
        y_val_pred = module.predict(X_val)
        validation_results = assess(y_val, y_val_pred)

        # Convert the predicted labels back to the original format using MultiLabelBinarizer
        if validation_results["accuracy"] > accuracy:
            accuracy = validation_results["accuracy"]
            best_module = module
            print("==========================================================")
            print(f"{module_config["level"].capitalize()} Performance")
            print("==========================================================")
            print(f"Accuracy:\t{validation_results['accuracy'] * 100:.2f}%")
            print(f"Hamming Loss:\t{validation_results['hamming_loss']:.3f}")
            report = pd.DataFrame(validation_results["report"])
            report.columns = list(module.mlb.classes_) + [
                "micro avg",
                "macro avg",
                "weighted avg",
                "samples avg",
            ]
            report = (
                report.transpose()
                .map(lambda x: f"{x:.2f}" if isinstance(x, float) else x)
                .to_string()
            )
            print(f"{report}")
            print("==========================================================\n")

        model_dictionary_filename = f"{os.getenv("MAPPINGS_DIR")}/{module_config["level"].capitalize()}.{embedder['name']}"
        with open(f"{model_dictionary_filename}.pkl", "wb") as f:
            pickle.dump(models, f)

        del x, X_train, X_val, y_train, y_val, trained_models

    module_filename = f"{os.getenv("MODELS_DIR")}/{module_config["level"].capitalize()}"
    with open(f"{module_filename}.pkl", "wb") as f:
        pickle.dump(best_module, f)

Modules:   0%|          | 0/5 [00:00<?, ?module/s]

Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
General Performance
Accuracy:	66.40%
Hamming Loss:	0.102
                  precision recall f1-score  support   auc
computer security      0.84   0.71     0.77  1389.00  0.80
hardware               0.94   0.47     0.63   369.00  0.73
networking             0.93   0.50     0.65   193.00  0.75
operating systems      0.93   0.60     0.73   123.00  0.80
other                  0.66   0.92     0.77  1282.00  0.80
software               0.85   0.69     0.77   761.00  0.83
micro avg              0.77   0.74     0.75  4117.00  0.84
macro avg              0.86   0.65     0.72  4117.00  0.79
weighted avg           0.80   0.74     0.75  4117.00  0.80
samples avg      

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
General Performance
Accuracy:	66.49%
Hamming Loss:	0.094
                  precision recall f1-score  support   auc
computer security      0.86   0.73     0.79  1389.00  0.82
hardware               0.94   0.47     0.63   369.00  0.74
networking             0.93   0.49     0.64   193.00  0.74
operating systems      0.77   0.72     0.74   123.00  0.85
other                  0.73   0.85     0.78  1282.00  0.82
software               0.90   0.66     0.76   761.00  0.82
micro avg              0.81   0.72     0.76  4117.00  0.84
macro avg              0.85   0.65     0.72  4117.00  0.80
weighted avg           0.83   0.72     0.76  4117.00  0.81
samples avg      

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
General Performance
Accuracy:	67.41%
Hamming Loss:	0.092
                  precision recall f1-score  support   auc
computer security      0.84   0.74     0.79  1389.00  0.82
hardware               0.96   0.47     0.63   369.00  0.74
networking             0.93   0.49     0.64   193.00  0.74
operating systems      0.96   0.61     0.75   123.00  0.80
other                  0.73   0.88     0.80  1282.00  0.83
software               0.90   0.67     0.77   761.00  0.82
micro avg              0.81   0.73     0.77  4117.00  0.84
macro avg              0.89   0.64     0.73  4117.00  0.79
weighted avg           0.83   0.73     0.77  4117.00  0.81
samples avg      

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
General Performance
Accuracy:	69.35%
Hamming Loss:	0.083
                  precision recall f1-score  support   auc
computer security      0.87   0.79     0.83  1389.00  0.85
hardware               0.96   0.46     0.63   369.00  0.73
networking             0.93   0.46     0.61   193.00  0.73
operating systems      0.92   0.59     0.72   123.00  0.80
other                  0.78   0.86     0.82  1282.00  0.85
software               0.92   0.66     0.77   761.00  0.82
micro avg              0.85   0.74     0.79  4117.00  0.85
macro avg              0.90   0.64     0.73  4117.00  0.80
weighted avg           0.86   0.74     0.78  4117.00  0.83
samples avg      

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
General Performance
Accuracy:	70.61%
Hamming Loss:	0.083
                  precision recall f1-score  support   auc
computer security      0.86   0.79     0.82  1389.00  0.85
hardware               0.92   0.48     0.63   369.00  0.74
networking             0.95   0.49     0.65   193.00  0.75
operating systems      0.75   0.72     0.73   123.00  0.85
other                  0.76   0.88     0.82  1282.00  0.85
software               0.90   0.71     0.79   761.00  0.84
micro avg              0.83   0.76     0.79  4117.00  0.86
macro avg              0.86   0.68     0.74  4117.00  0.81
weighted avg           0.84   0.76     0.79  4117.00  0.83
samples avg      

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


FileNotFoundError: [Errno 2] No such file or directory: 'None/General.pkl'

# 7. Model Evaluation

In [None]:
with open("../data/tweets-dataset/categories_test.csv", "rb") as f:
    test_tweets = pd.read_csv(f)

test_tweets = test_tweets.assign(
    tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
    watson=lambda df: df["watson"].apply(func=ast.literal_eval),
    urls=lambda df: df["urls"].apply(func=ast.literal_eval),
    watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
    target=lambda df: df["target"].apply(func=ast.literal_eval),
)
test_tweets['text'] = [replace_text_components(t) for t in test_tweets['text']]

test_tweets = test_tweets.assign(
    target=lambda df: df["target"].apply(func=map_targets, args=(
        {
            'computer security': 'computer security',
            'operating systems': 'operating systems',
            'software': 'software',
            'programming languages': 'software',
            'hardware': 'hardware',
            'electronic components': 'hardware',
            'networking': 'networking',
            'internet technology': 'networking',
            'network security': 'network security',
            'antivirus and malware': 'antivirus and malware',
            'mac os': 'mac os',
            'windows': 'windows',
            'unix': 'unix',
            'linux': 'linux',
            'databases': 'databases',
            'computer': 'computer',
            'computer components': 'computer components',
            'computer networking': 'computer networking'
        }, ))
)

print(f"Threat Tweets: {len(test_tweets)}")
test_tweets.head()

In [None]:
with open("../data/tweets-dataset/filter_test.csv", "rb") as f:
    test_filter = pd.read_csv(f)
test_filter.head()

In [None]:
test_filter = test_filter.assign(
    tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
    watson=lambda df: df["watson"].apply(func=ast.literal_eval),
    urls=lambda df: df["urls"].apply(func=ast.literal_eval),
    watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
    target=lambda df: df["target"].apply(func=ast.literal_eval),
)
test_filter["text"] = [replace_text_components(t) for t in test_filter["text"]]

test_filter = test_filter.assign(
    target=lambda df: df["target"].apply(
        func=translate_source_categories,
        args=(
            {
                "computer security": "computer security",
                "operating systems": "operating systems",
                "software": "software",
                "programming languages": "software",
                "hardware": "hardware",
                "electronic components": "hardware",
                "networking": "networking",
                "internet technology": "networking",
                "network security": "network security",
                "antivirus and malware": "antivirus and malware",
                "mac os": "mac os",
                "windows": "windows",
                "unix": "unix",
                "linux": "linux",
                "databases": "databases",
                "computer": "computer",
                "computer components": "computer components",
                "computer networking": "computer networking",
            },
        ),
    )
)

print(f"Threat Tweets: {len(test_filter)}")
test_filter.head()

In [None]:
texts = test_tweets['text']
targets = test_tweets['target']

In [None]:
import joblib

filter = joblib.load(filename="../models/filter.pkl")

In [None]:
with open("../models/GENERAL.pkl", "rb") as f:
    general = pickle.load(f)
    general.mlb.fit(targets)
    general.scaler_ = preprocessing.StandardScaler().fit(preprocess_texts(
        list_str=texts,
        model_path='basel/ATTACK-BERT',
        embedding_dim=None,
    ))

In [None]:
for idx in general.models:
    general.models[idx]["model"].scaler_ = general.scaler_

In [None]:
with open("../models/COMPUTER.SECURITY.pkl", "rb") as f:
    cs = pickle.load(f)

In [None]:
with open("../models/OPERATING.SYSTEMS.pkl", "rb") as f:
    os = pickle.load(f)

In [None]:
with open("../models/SOFTWARE.pkl", "rb") as f:
    sw = pickle.load(f)

In [None]:
with open("../models/HARDWARE.pkl", "rb") as f:
    hw = pickle.load(f)

In [None]:
#ATT&CK-BERT, DistilRoBERTa, DistilRoBERTa, DistilRoBERTa, ATT&CK-BERT

In [None]:
class System(BaseEstimator, ClassifierMixin):
    def __init__(self, filter, general, cs, os, sw, hw):
        self.filter = filter
        self.general = general  # ATT&CK-BERT
        self.cs = cs  # DistilRoBERTa
        self.os = os  # DistilRoBERTa
        self.sw = sw  # DistilRoBERTa
        self.hw = hw  # ATT&CK-BERT

    def predict(self, texts: pd.Series) -> ArrayLike:
        X_SBERT = preprocess_texts(
            list_str=texts,
            model_path='sentence-transformers/all-mpnet-base-v2',
            embedding_dim=None,
        )
        general_dim = len(self.general.mlb.classes_)
        cs_dim = len(self.cs.mlb.classes_) - 1
        os_dim = len(self.os.mlb.classes_) - 1
        sw_dim = len(self.sw.mlb.classes_) - 1
        hw_dim = len(self.hw.mlb.classes_) - 1
        Y_pred = np.zeros((texts.shape[0], general_dim + cs_dim + os_dim + sw_dim + hw_dim))
        X_ATTACK_BERT = preprocess_texts(
            list_str=texts,
            model_path='basel/ATTACK-BERT',
            embedding_dim=None,
        )
        X_DistilRoBERTa = preprocess_texts(
            list_str=texts,
            model_path='sentence-transformers/all-distilroberta-v1',
            embedding_dim=None,
        )
        y_filter = self.filter.predict(X_SBERT)
        y_general = self.general.predict(X_ATTACK_BERT)
        Y_pred[:, :general_dim] = y_general
        for idx, (t, f) in enumerate(zip(self.general.mlb.inverse_transform(y_general), y_filter)):
            if not False:
                if 'computer security' in t:
                    y_cs = self.cs.predict(X_DistilRoBERTa[idx:idx + 1])
                    Y_pred[idx, general_dim:general_dim + cs_dim] = y_cs[:, :cs_dim]
                if 'operating systems' in t:
                    y_os = self.os.predict(X_DistilRoBERTa[idx:idx + 1])
                    Y_pred[idx, general_dim + cs_dim:general_dim + cs_dim + os_dim] = np.hstack((y_os[:, :2], y_os[:, 3:]))
                if "software" in t:
                    y_sw = self.sw.predict(X_DistilRoBERTa[idx:idx + 1])
                    Y_pred[idx, general_dim + cs_dim + os_dim:general_dim + cs_dim + os_dim + sw_dim] = y_sw[:, :sw_dim]
                if "hardware" in t:
                    y_hw = self.hw.predict(X_ATTACK_BERT[idx:idx + 1])
                    Y_pred[idx, general_dim + cs_dim + os_dim + sw_dim:] = y_hw[:, :hw_dim]
            else:
                Y_pred[idx, :] = np.zeros(Y_pred.shape[1])
                Y_pred[idx, 4] = 1.0
        return Y_pred

    def predict_targets(self, texts: pd.Series) -> pd.Series:
        general_dim = len(self.general.mlb.classes_)
        cs_dim = len(self.cs.mlb.classes_) - 1
        os_dim = len(self.os.mlb.classes_) - 1
        sw_dim = len(self.sw.mlb.classes_) - 1
        Y_pred = self.predict(texts)
        y_general = Y_pred[:, :general_dim]
        y_cs = Y_pred[:, general_dim:general_dim + cs_dim]
        y_os = Y_pred[:, general_dim + cs_dim:general_dim + cs_dim + os_dim]
        y_sw = Y_pred[:, general_dim + cs_dim + os_dim:general_dim + cs_dim + os_dim + sw_dim]
        y_hw = Y_pred[:, general_dim + cs_dim + os_dim + sw_dim:]
        targets_general = self.general.mlb.inverse_transform(y_general)
        targets_cs = self.cs.mlb.inverse_transform(np.hstack((y_cs, np.zeros((y_cs.shape[0], 1)))))
        targets_os = self.os.mlb.inverse_transform(np.hstack((y_os[:, :2], np.zeros((y_os.shape[0], 1)), y_os[:, 2:])))
        targets_sw = self.sw.mlb.inverse_transform(np.hstack((y_sw, np.zeros((y_sw.shape[0], 1)))))
        targets_hw = self.hw.mlb.inverse_transform(np.hstack((y_hw, np.zeros((y_hw.shape[0], 1)))))
        return pd.Series(data=[list(set(target_general + target_cs + target_os + target_sw + target_hw)) for target_general, target_cs, target_os, target_sw, target_hw in zip(targets_general, targets_cs, targets_os, targets_sw, targets_hw)], index=texts.index)

    def evaluate(self, texts: pd.Series, targets: pd.Series) -> dict[str, float]:
        targets_general = self.general.mlb.transform(targets)
        targets_cs = self.cs.mlb.transform(targets)
        targets_os = self.os.mlb.transform(targets)
        targets_sw = self.sw.mlb.transform(targets)
        targets_hw = self.hw.mlb.transform(targets)
        Y_test = np.hstack((targets_general, targets_cs[:, :-1], targets_os[:, :2], targets_os[:, 3:], targets_sw[:, :-1], targets_hw[:, :-1]))
        Y_pred = self.predict(texts)
        return assess(Y_test, Y_pred)

    def classes(self):
        return list(self.general.mlb.classes_) + list(self.cs.mlb.classes_[:-1]) + list(self.os.mlb.classes_[:2]) + list(self.os.mlb.classes_[3:]) + list(self.sw.mlb.classes_[:-1]) + list(self.hw.mlb.classes_[:-1])


solution = System(filter, general, cs, os, sw, hw)

In [None]:
pd.DataFrame([solution.predict_targets(texts[100:110]), targets[100:110], texts[100:110]], index=["Predicted", "Actual", "Text"]).transpose()

In [None]:
system_performance = solution.evaluate(texts[:100], targets[:100])
print("==========================================================")
print("System Performance")
print("==========================================================")
print(f"Accuracy:\t{system_performance['accuracy'] * 100:.2f}%")
print(f"Hamming Loss:\t{system_performance['hamming_loss']:.3f}")
report = pd.DataFrame(system_performance["report"])
report.columns = solution.classes() + ["micro avg", "macro avg", "weighted avg", "samples avg"]
report = (report.transpose().map(lambda x: f"{x:.2f}" if isinstance(x, float) else x).to_string())
print(f"{report}")
print("==========================================================\n")

In [None]:
pd.DataFrame(data=solution.predict(pd.Series(["There are rootkits threats ongoing in the wild, and they're infecting the AMD CPUs when you open an EXE file attackers send via mail. Pay attention"])), columns=solution.classes())