# Multi-Label Classification

## Imports

In [1]:
import ast
import os
import pickle
import random

import numpy as np
import pandas as pd
from copy import copy
from dotenv import load_dotenv
from numpy.typing import ArrayLike
from sklearn import preprocessing
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    hamming_loss,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from tqdm.contrib.itertools import product
from tqdm.notebook import tqdm
from typing import Dict, Union

In [2]:
# Load key/value pairs from a local .env file into the process environment
# before any setting below is read.
load_dotenv()

# Experiment settings; the fallbacks apply when the variable is not set.
RANDOM_STATE = int(os.environ.get("RANDOM_STATE", "42"))
TEST_SIZE = float(os.environ.get("TEST_SIZE", "0.2"))

# Run from the configured project root (current directory when ROOT is unset).
os.chdir(os.environ.get("ROOT", "./"))

# Seed both the stdlib and the NumPy global generators with the same value.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [3]:
import MLC
import notebooks.utils as su

[nltk_data] Downloading package stopwords to /home/cela/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/cela/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
class ModuleSystem(BaseEstimator, ClassifierMixin):
    """Per-label ensemble that lets a different fitted model answer each label.

    Parameters
    ----------
    models : dict
        Maps a label column index (anything accepted by ``int()``) to a dict
        whose ``"model"`` entry is a fitted multi-label classifier. The other
        entries of that dict are not read by this class.
    mlb : MultiLabelBinarizer
        Fitted binarizer whose ``classes_`` name the label columns.

    Attributes
    ----------
    classes : ndarray
        Alias of ``mlb.classes_`` captured at construction time.
    """

    def __init__(self, models: Dict[str, Dict[str, Union[str, float, BaseEstimator]]], mlb: preprocessing.MultiLabelBinarizer):
        self.models = models
        self.mlb = mlb
        self.classes = mlb.classes_

    def predict(self, X_test: ArrayLike) -> ArrayLike:
        """
        Predict labels for each component using the chosen model in the dictionary.

        Column ``j`` of the result is column ``j`` of the prediction made by
        the model registered under key ``j``.

        Parameters
        ----------
        X_test : ArrayLike
            Test data; must expose ``shape[0]`` (number of samples).

        Returns
        -------
        Y_pred : ArrayLike
            Float array of shape ``(n_samples, len(self.models))`` with the
            predicted labels for each component.
        """
        n_samples = X_test.shape[0]
        n_components = len(self.models)
        Y_pred = np.zeros((n_samples, n_components))

        # The same fitted model is frequently selected for several labels, so
        # run each distinct model once and reuse its prediction matrix.
        # Keying on id() is safe here: every model stays referenced by
        # self.models for the whole loop.
        predictions_by_model = {}
        for idx, model_info in self.models.items():
            model = model_info["model"]
            column = int(idx)
            model_key = id(model)
            if model_key not in predictions_by_model:
                predictions_by_model[model_key] = model.predict(X_test)
            Y_pred[:, column] = predictions_by_model[model_key][:, column]

        return Y_pred


class System(BaseEstimator, ClassifierMixin):
    """Two-level classifier: a general module gates four specialised modules.

    The general module (``gn``) predicts top-level categories. For every text
    whose general prediction is neither empty nor "other", the sub-module
    matching each predicted category (``cs``: "computer security",
    ``os``: "operating systems", ``sw``: "software", ``hw``: "hardware") is
    asked for finer labels.

    Output columns are laid out as: all ``gn`` classes, then the classes of
    ``cs``, ``os``, ``sw`` and ``hw`` (in that order) with each sub-module's
    "other" column dropped. Column positions are derived from the modules'
    ``classes`` rather than hard-coded, so a sub-module without an "other"
    class keeps all of its columns.
    """

    # Catch-all label: flags a text as out of scope at the general level and
    # is excluded from every sub-module's output columns.
    OTHER_LABEL = "other"

    def __init__(self, gn: ModuleSystem, cs: ModuleSystem, os: ModuleSystem, sw: ModuleSystem, hw: ModuleSystem):
        # NOTE: the ``os`` parameter shadows the ``os`` module inside this
        # method only; the name is kept for backward compatibility.
        self.gn = gn
        self.cs = cs
        self.os = os
        self.sw = sw
        self.hw = hw

    def _other_column(self) -> int:
        """Return the index of the "other" class among the general classes."""
        return list(self.gn.classes).index(self.OTHER_LABEL)

    def _column_layout(self):
        """Describe where each sub-module's labels live in the output matrix.

        Returns
        -------
        gn_dim : int
            Number of general-level columns (they occupy ``[0, gn_dim)``).
        layout : list of tuple
            One ``(general_label, module, keep, start, stop)`` per sub-module,
            where ``keep`` lists the module's non-"other" class indices and
            ``[start, stop)`` is the column range they occupy.
        n_cols : int
            Total number of output columns.
        """
        submodules = (
            ("computer security", self.cs),
            ("operating systems", self.os),
            ("software", self.sw),
            ("hardware", self.hw),
        )
        gn_dim = len(self.gn.classes)
        layout = []
        start = gn_dim
        for general_label, module in submodules:
            keep = [i for i, name in enumerate(module.classes) if name != self.OTHER_LABEL]
            stop = start + len(keep)
            layout.append((general_label, module, keep, start, stop))
            start = stop
        return gn_dim, layout, start

    @staticmethod
    def _binarize_targets(module: ModuleSystem, targets: pd.Series) -> np.ndarray:
        """Binarize ``targets`` with ``module.mlb``, ignoring labels it does not know."""
        known_only = [[target for target in targetsi if target in module.classes] for targetsi in targets]
        return module.mlb.transform(known_only)

    def predict(self, texts: pd.Series) -> ArrayLike:
        """Predict the combined general + specialised label matrix.

        Parameters
        ----------
        texts : pd.Series
            Raw texts; embedded four times via ``su.preprocess_texts`` using
            the model paths read from the environment.

        Returns
        -------
        Y : ndarray
            Float array with one row per text and one column per entry of
            ``self.classes()``. Rows whose general prediction is empty or
            contains "other" have only the "other" column set.
        """
        gn_dim, layout, n_cols = self._column_layout()
        other_col = self._other_column()
        Y = np.zeros((texts.shape[0], n_cols))

        X_GLOVE_200D = su.preprocess_texts(
            list_str=texts,
            model_path=os.getenv("GLOVE_6B_200D_PATH"),
            embedding_dim=200,
        )
        X_DISTILROBERTA = su.preprocess_texts(
            list_str=texts,
            model_path=os.getenv("DISTILROBERTA_PATH"),
            embedding_dim=None,
        )
        X_SBERT = su.preprocess_texts(
            list_str=texts,
            model_path=os.getenv("SBERT_PATH"),
            embedding_dim=None,
        )
        X_ATTACK_BERT = su.preprocess_texts(
            list_str=texts,
            model_path=os.getenv("ATTACK_BERT_PATH"),
            embedding_dim=None,
        )
        # Embedding consumed by each sub-module, keyed by its general label.
        features = {
            "computer security": X_DISTILROBERTA,
            "operating systems": X_ATTACK_BERT,
            "software": X_GLOVE_200D,
            "hardware": X_SBERT,
        }

        y_gn = self.gn.predict(X_ATTACK_BERT)
        Y[:, :gn_dim] = y_gn

        for idx, categories in enumerate(self.gn.mlb.inverse_transform(y_gn)):
            if not categories or self.OTHER_LABEL in categories:
                # Nothing predicted, or explicitly out of scope: the row
                # carries the "other" flag and nothing else.
                Y[idx, :] = 0.0
                Y[idx, other_col] = 1.0
                continue

            for general_label, module, keep, start, stop in layout:
                if general_label in categories:
                    y_sub = module.predict(features[general_label][idx : idx + 1])
                    Y[idx, start:stop] = y_sub[0, keep]

        return Y

    def predict_targets(self, texts: pd.Series) -> pd.Series:
        """Predict label names (general and specialised) for each text.

        Returns
        -------
        pd.Series
            Indexed like ``texts``; each value is a list of the distinct
            predicted label names (order is not guaranteed).
        """
        Y = self.predict(texts)
        gn_dim, layout, _ = self._column_layout()

        label_sets = [self.gn.mlb.inverse_transform(Y[:, :gn_dim])]
        for _, module, keep, start, stop in layout:
            # Re-insert the dropped "other" column as zeros so the matrix
            # matches the width the module's binarizer expects.
            full = np.zeros((Y.shape[0], len(module.classes)))
            full[:, keep] = Y[:, start:stop]
            label_sets.append(module.mlb.inverse_transform(full))

        return pd.Series(
            data=[list(set(sum(parts, ()))) for parts in zip(*label_sets)],
            index=texts.index,
        )

    def evaluate(self, texts: pd.Series, targets: pd.Series) -> dict[str, float]:
        """Score predictions for ``texts`` against the true label lists.

        ``targets`` is binarized module by module (labels unknown to a module
        are ignored), arranged in the same column layout as ``predict`` and
        passed with the predictions to ``su.assess``.
        """
        _, layout, _ = self._column_layout()
        blocks = [self._binarize_targets(self.gn, targets)]
        for _, module, keep, _, _ in layout:
            blocks.append(self._binarize_targets(module, targets)[:, keep])
        Y = np.hstack(blocks)
        Y_pred = self.predict(texts)

        return su.assess(Y, Y_pred)

    def evaluate_filter(self, texts: pd.Series, targets: pd.Series) -> dict[str, float]:
        """Score only the general-level "other" column.

        Each truthy entry of ``targets`` becomes 1 and each falsy entry 0;
        that vector is compared with the predicted "other" column.
        """
        other_col = self._other_column()
        Y_test = pd.Series([1 if t else 0 for t in targets])
        Y_pred = self.predict(texts)

        return su.assess(Y_test, Y_pred[:, other_col : other_col + 1])

    def classes(self):
        """Return the label name of every output column, in column order."""
        _, layout, _ = self._column_layout()
        names = list(self.gn.classes)
        for _, module, keep, _, _ in layout:
            names.extend(module.classes[i] for i in keep)
        return names

## Model Training

In [5]:
# Multi-label strategy wrappers from the project-local MLC package, keyed by
# the short code used when naming trained models.
CLASSIFICATION_METHODS = dict(
    br=MLC.BRClassifier,
    clr=MLC.CLRClassifier,
    cc=MLC.CCClassifier,
    lp=MLC.LPClassifier,
    pst=MLC.PStClassifier,
    mbr=MLC.MBRClassifier,
    rakel=MLC.RAkELClassifier,
    homer=MLC.HOMERClassifier,
)

# Base estimators handed to the wrappers above, keyed by short code.
CLASSIFICATION_ALGORITHM = dict(
    lr=LogisticRegression(solver="liblinear", max_iter=10000, random_state=RANDOM_STATE),
    gnb=GaussianNB(),
    dt=DecisionTreeClassifier(random_state=RANDOM_STATE),
    rf=RandomForestClassifier(random_state=RANDOM_STATE),
)

# Text embedders to compare: (display name, environment variable holding the
# model path, embedding dimension or None when no dimension is passed).
EMBEDDING_SOURCES = [
    {"name": display_name, "model-path": os.getenv(path_variable), "embedding-dim": dimension}
    for display_name, path_variable, dimension in (
        ("GloVe.6B.50D", "GLOVE_6B_50D_PATH", 50),
        ("GloVe.6B.100D", "GLOVE_6B_100D_PATH", 100),
        ("GloVe.6B.200D", "GLOVE_6B_200D_PATH", 200),
        ("GloVe.6B.300D", "GLOVE_6B_300D_PATH", 300),
        ("DistilRoBERTa", "DISTILROBERTA_PATH", None),
        ("SBERT", "SBERT_PATH", None),
        ("ATT&CK-BERT", "ATTACK_BERT_PATH", None),
    )
]

# One entry per classifier module. "targets" maps a source category to the
# label used at that level; the general level folds several source categories
# into one label, the specialised levels map each category to itself.
SYSTEM_MODULES = [
    {
        "level": "general",
        "targets": {
            "computer security": "computer security",
            "operating systems": "operating systems",
            "software": "software",
            "programming languages": "software",
            "hardware": "hardware",
            "electronic components": "hardware",
            "networking": "networking",
            "internet technology": "networking",
        },
    },
    {
        "level": "computer security",
        "targets": {
            category: category
            for category in ("network security", "antivirus and malware")
        },
    },
    {
        "level": "operating systems",
        "targets": {
            category: category
            for category in ("mac os", "windows", "unix", "linux")
        },
    },
    {
        "level": "software",
        "targets": {category: category for category in ("databases",)},
    },
    {
        "level": "hardware",
        "targets": {
            category: category
            for category in ("computer", "computer components", "computer networking")
        },
    },
]

In [6]:
# Local import: every wrapper gets its own unfitted copy of the base estimator
# instead of sharing the instances stored in CLASSIFICATION_ALGORITHM.
from sklearn.base import clone

# Keep the file handle open only for the read itself.
with open(os.getenv("ONE_STEP_CATEGORIES_TRAIN_CSV"), "rb") as f:
    train = pd.read_csv(f)

# These columns are stored in the CSV as Python literals; parse them back.
train = train.assign(
    tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
    watson=lambda df: df["watson"].apply(func=ast.literal_eval),
    urls=lambda df: df["urls"].apply(func=ast.literal_eval),
    watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
    target=lambda df: df["target"].apply(func=ast.literal_eval),
)
train["text"] = [su.replace_txt_components(t) for t in train["text"]]

# Non-relevant rows that are not already tagged "other" get ["other"] as their
# whole target. The column is rebuilt in one pass so the result neither
# depends on the frame having a default RangeIndex nor on how `.loc` treats a
# list-valued assignment to a single cell.
train["target"] = pd.Series(
    [
        target if relevant or "other" in target else ["other"]
        for target, relevant in zip(train["target"], train["relevant"])
    ],
    index=train.index,
    dtype=object,
)

for module_config in tqdm(SYSTEM_MODULES, desc="Modules", unit="module"):
    # Best macro-averaged F1 seen so far for this module, and the module
    # (one chosen model per label) that achieved it.
    best_f1 = 0
    best_module = None

    for embedder in tqdm(EMBEDDING_SOURCES, desc="Embedders", unit="embedder"):
        tweets = train.assign(target=lambda df: df["target"].apply(func=su.translate_src_categories, args=(module_config["targets"],)))

        # Specialised modules only train on tweets whose watson_list contains
        # the module's level.
        if module_config["level"] != "general":
            tweets = tweets[tweets["watson_list"].apply(lambda x: module_config["level"] in x)].reset_index(drop=True)

        # NOTE(review): train["text"] already went through
        # su.replace_txt_components above; this second pass is kept to
        # preserve behaviour - confirm whether it is needed.
        tweets["text"] = [su.replace_txt_components(t) for t in tweets["text"]]
        texts = tweets["text"]

        x = su.preprocess_texts(
            list_str=texts,
            model_path=embedder["model-path"],
            embedding_dim=embedder["embedding-dim"],
        )

        targets = tweets["target"]
        mlb = preprocessing.MultiLabelBinarizer()
        y = mlb.fit_transform(y=targets)
        # Give every distinct label combination an id so the split can be
        # stratified on combinations.
        _, y_mcp = np.unique(ar=y, axis=0, return_inverse=True)

        X_train, X_val, y_train, y_val = train_test_split(
            x,
            y,
            test_size=TEST_SIZE,
            random_state=RANDOM_STATE,
            shuffle=True,
            stratify=y_mcp,
        )

        models = {}
        for mtd_name, alg_name in product(
            CLASSIFICATION_METHODS,
            CLASSIFICATION_ALGORITHM,
            desc="Methods & Algorithms",
            unit="pair",
        ):
            model = CLASSIFICATION_METHODS[mtd_name](clone(CLASSIFICATION_ALGORITHM[alg_name]))

            # Best-effort training: some method/algorithm pairs cannot be
            # fitted on some label subsets; report the failure and move on.
            try:
                model.fit(X_train, y_train)
            except Exception as e:
                print(f"{model} failed to fit due to {e}")
                continue

            model_name = f"{mtd_name.upper()}[{alg_name.upper()}] - {embedder['name']}"
            models[model_name] = {
                "model": model,
                "assess": model.evaluate(X_val, y_val),
            }

        top_performing_models = su.extract_models(mlb.classes_, models, "f1-score")
        print(top_performing_models.values())
        module = ModuleSystem(top_performing_models, mlb)
        y_val_pred = module.predict(X_val)
        validation_results = su.assess(y_val, y_val_pred)

        # Keep (and report) this embedder's module only when it beats the best
        # macro-averaged F1 seen so far for the current level.
        if validation_results["report"]["macro avg"]["f1-score"] > best_f1:
            best_f1 = validation_results["report"]["macro avg"]["f1-score"]
            best_module = copy(module)
            print("==========================================================")
            print(f"{module_config['level'].capitalize()} Performance")
            print("==========================================================")
            print(f"Accuracy:\t{validation_results['accuracy'] * 100:.2f}%")
            print(f"Hamming Loss:\t{validation_results['hamming_loss']:.3f}")
            report = pd.DataFrame(validation_results["report"])
            report.columns = list(module.classes) + ["micro avg", "macro avg", "weighted avg", "samples avg"]
            report = (
                report.transpose()
                .map(lambda x: f"{x:.2f}" if isinstance(x, float) else x)
                .to_string()
            )
            print(f"{report}")
            print("==========================================================\n")

        # Persist every fitted model for this (level, embedder) pair.
        model_dictionary_filename = f"{os.getenv('MAPPINGS_DIR')}/{module_config['level'].capitalize()}.{embedder['name']}"
        with open(f"{model_dictionary_filename}.pickle", "wb") as f:
            pickle.dump(models, f)

        # Drop the large per-embedder objects before the next iteration
        # allocates its own.
        del (
            tweets,
            texts,
            x,
            targets,
            mlb,
            y,
            y_mcp,
            X_train,
            X_val,
            y_train,
            y_val,
            models,
            top_performing_models,
            module,
            y_val_pred,
            validation_results,
            model_dictionary_filename,
        )

    if best_module is None:
        # No embedder produced a macro F1 above zero: do not write a pickle
        # that would contain None.
        print(f"No module selected for level '{module_config['level']}'; nothing saved.")
        continue

    module_filename = f"{os.getenv('MODELS_DIR')}/{''.join(module_config['level'].capitalize().split(' '))}"
    with open(f"{module_filename}.pickle", "wb") as f:
        pickle.dump(best_module, f)

Modules:   0%|          | 0/5 [00:00<?, ?module/s]

Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - GloVe.6B.50D', 'performance': 0.6824644549763034, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'HOMER[RF] - GloVe.6B.50D', 'performance': 0.6186046511627907, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.50D', 'performance': 0.5846153846153846, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.50D', 'performance': 0.7666666666666667, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'RAKEL[RF] - GloVe.6B.50D', 'performan

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - GloVe.6B.100D', 'performance': 0.6666666666666666, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.100D', 'performance': 0.6244131455399061, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.100D', 'performance': 0.5833333333333334, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.100D', 'performance': 0.7666666666666667, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'RAKEL[RF] - GloVe.6B.100D', 'performanc

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - GloVe.6B.200D', 'performance': 0.6835616438356165, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.200D', 'performance': 0.6261682242990654, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'RAKEL[RF] - GloVe.6B.200D', 'performance': 0.59375, 'model': RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=12)}, {'name': 'LP[RF] - GloVe.6B.200D', 'performance': 0.7934782608695652, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'RAKEL[RF] -

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - GloVe.6B.300D', 'performance': 0.6863013698630137, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'RAKEL[RF] - GloVe.6B.300D', 'performance': 0.6187050359712231, 'model': RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=12)}, {'name': 'LP[RF] - GloVe.6B.300D', 'performance': 0.5759162303664922, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.300D', 'performance': 0.7845303867403315, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': '

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - DistilRoBERTa', 'performance': 0.6868131868131868, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - DistilRoBERTa', 'performance': 0.6172248803827751, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'HOMER[RF] - DistilRoBERTa', 'performance': 0.532608695652174, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - DistilRoBERTa', 'performance': 0.7241379310344828, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'BR[RF] - DistilRoBERTa', 'performa

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - SBERT', 'performance': 0.7128580946035976, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'RAKEL[RF] - SBERT', 'performance': 0.6350710900473934, 'model': RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=12)}, {'name': 'RAKEL[RF] - SBERT', 'performance': 0.5698924731182796, 'model': RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=12)}, {'name': 'LP[RF] - SBERT', 'performance': 0.7868852459016393, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'n

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - ATT&CK-BERT', 'performance': 0.712491738268341, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'RAKEL[RF] - ATT&CK-BERT', 'performance': 0.6352941176470588, 'model': RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=12)}, {'name': 'RAKEL[RF] - ATT&CK-BERT', 'performance': 0.5789473684210527, 'model': RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=12)}, {'name': 'LP[RF] - ATT&CK-BERT', 'performance': 0.7956989247311828, 'model': LPClassifier(base_estimator=RandomForestClassifier(

Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'BR[RF] - GloVe.6B.50D', 'performance': 0.7913779830638953, 'model': BRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'HOMER[RF] - GloVe.6B.50D', 'performance': 0.615071283095723, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[RF] - GloVe.6B.50D', 'performance': 0.7042735042735043, 'model': CCClassifier(base_estimator=RandomForestClassifier(random_state=42))}])
Computer security Performance
Accuracy:	70.55%
Hamming Loss:	0.193
                      precision recall f1-score  support   auc
antivirus and malware      0.79   0.79     0.79   647.00  0.80
netwo

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - GloVe.6B.100D', 'performance': 0.7925407925407926, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'HOMER[RF] - GloVe.6B.100D', 'performance': 0.6300813008130082, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.100D', 'performance': 0.7027506654835848, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}])
Computer security Performance
Accuracy:	67.46%
Hamming Loss:	0.188
                      precision recall f1-score  support   auc
antivirus and malware      0.80   0.79     0.79   647.00  0.81

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - GloVe.6B.200D', 'performance': 0.804915514592934, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'HOMER[RF] - GloVe.6B.200D', 'performance': 0.6204081632653061, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[RF] - GloVe.6B.200D', 'performance': 0.712468193384224, 'model': CCClassifier(base_estimator=RandomForestClassifier(random_state=42))}])
Computer security Performance
Accuracy:	68.75%
Hamming Loss:	0.187
                      precision recall f1-score  support   auc
antivirus and malware      0.80   0.81     0.80   647.00  0.82
n

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'LP[RF] - GloVe.6B.300D', 'performance': 0.7921686746987951, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[RF] - GloVe.6B.300D', 'performance': 0.6305220883534136, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.300D', 'performance': 0.6999109528049866, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}])


Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - DistilRoBERTa', 'performance': 0.8093797276853253, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[RF] - DistilRoBERTa', 'performance': 0.6517311608961304, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[RF] - DistilRoBERTa', 'performance': 0.7132216014897579, 'model': CCClassifier(base_estimator=RandomForestClassifier(random_state=42))}])
Computer security Performance
Accuracy:	70.99%
Hamming Loss:	0.175
                      precision recall f1-score  support   auc
antivirus and malware      0.79   0.83     0.81   647.00  0.82
net

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - SBERT', 'performance': 0.8071748878923767, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'HOMER[RF] - SBERT', 'performance': 0.6521739130434783, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[RF] - SBERT', 'performance': 0.7047970479704797, 'model': CCClassifier(base_estimator=RandomForestClassifier(random_state=42))}])


Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'HOMER[RF] - ATT&CK-BERT', 'performance': 0.8042650418888042, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'HOMER[RF] - ATT&CK-BERT', 'performance': 0.6284584980237155, 'model': HOMERClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[RF] - ATT&CK-BERT', 'performance': 0.7105500450856628, 'model': CCClassifier(base_estimator=RandomForestClassifier(random_state=42))}])


Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'BR[LR] - GloVe.6B.50D', 'performance': 0.9074074074074074, 'model': BRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                               solver='liblinear'))}, {'name': 'CC[DT] - GloVe.6B.50D', 'performance': 0.8, 'model': CCClassifier(base_estimator=DecisionTreeClassifier(random_state=42))}, {'name': 'LP[LR] - GloVe.6B.50D', 'performance': 0.7636363636363637, 'model': LPClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                               solver='liblinear'))}, {'name': 'MBR[DT] - GloVe.6B.50D', 'performance': 0.952380952

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - GloVe.6B.100D', 'performance': 0.9090909090909091, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'BR[RF] - GloVe.6B.100D', 'performance': 0.6666666666666666, 'model': BRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[DT] - GloVe.6B.100D', 'performance': 0.75, 'model': MBRClassifier(base_estimator=DecisionTreeClassifier(random_state=42))}, {'name': 'BR[RF] - GloVe.6B.100D', 'performance': 0.9473684210526315, 'model': BRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[RF] - GloVe.6B.100D', 'performance': 0.866666666

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'LP[RF] - GloVe.6B.200D', 'performance': 0.8807339449541285, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[LR] - GloVe.6B.200D', 'performance': 0.7692307692307693, 'model': MBRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                                solver='liblinear'))}, {'name': 'CC[RF] - GloVe.6B.200D', 'performance': 0.7547169811320755, 'model': CCClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'BR[LR] - GloVe.6B.200D', 'performance': 1.0, 'model': BRClassifier(base_estimator=LogisticRegression(max_iter

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'RAKEL[RF] - GloVe.6B.300D', 'performance': 0.9056603773584906, 'model': RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=10)}, {'name': 'MBR[LR] - GloVe.6B.300D', 'performance': 0.7692307692307693, 'model': MBRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                                solver='liblinear'))}, {'name': 'RAKEL[DT] - GloVe.6B.300D', 'performance': 0.6923076923076923, 'model': RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=10)}, {'name': 'BR[LR] - GloVe.6B.300D', 'perfo

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[LR] - DistilRoBERTa', 'performance': 0.918918918918919, 'model': MBRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                                solver='liblinear'))}, {'name': 'BR[RF] - DistilRoBERTa', 'performance': 0.6666666666666666, 'model': BRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[GNB] - DistilRoBERTa', 'performance': 0.7619047619047619, 'model': CCClassifier(base_estimator=GaussianNB())}, {'name': 'LP[GNB] - DistilRoBERTa', 'performance': 1.0, 'model': LPClassifier(base_estimator=GaussianNB())}, {'name': 'LP[RF] - DistilRoBERTa', 'p

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[LR] - SBERT', 'performance': 0.9345794392523364, 'model': MBRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                                solver='liblinear'))}, {'name': 'BR[GNB] - SBERT', 'performance': 0.6666666666666666, 'model': BRClassifier(base_estimator=GaussianNB())}, {'name': 'CC[LR] - SBERT', 'performance': 0.8, 'model': CCClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                               solver='liblinear'))}, {'name': 'LP[RF] - SBERT', 'performance': 1.0, 'model': LPClassifier(base_estimator=RandomForestClassifie

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - ATT&CK-BERT', 'performance': 0.9019607843137255, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[DT] - ATT&CK-BERT', 'performance': 0.7692307692307693, 'model': MBRClassifier(base_estimator=DecisionTreeClassifier(random_state=42))}, {'name': 'RAKEL[LR] - ATT&CK-BERT', 'performance': 0.8363636363636363, 'model': RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=10)}, {'name': 'LP[RF] - ATT&CK-BERT',

Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                  

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                  

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                  

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                  

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                  

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                  

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                  

Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'LP[RF] - GloVe.6B.50D', 'performance': 0.8078602620087336, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[RF] - GloVe.6B.50D', 'performance': 0.5185185185185185, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[RF] - GloVe.6B.50D', 'performance': 0.6486486486486487, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[RF] - GloVe.6B.50D', 'performance': 0.6222222222222222, 'model': CCClassifier(base_estimator=RandomForestClassifier(random_state=42))}])
Hardware Performance
Accuracy:	69.27%
Hamming Loss

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'LP[RF] - GloVe.6B.100D', 'performance': 0.8173913043478261, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'LP[LR] - GloVe.6B.100D', 'performance': 0.5641025641025641, 'model': LPClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                               solver='liblinear'))}, {'name': 'MBR[RF] - GloVe.6B.100D', 'performance': 0.6153846153846154, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[RF] - GloVe.6B.100D', 'performance': 0.63, 'model': MBRClassifier(base_estimator=RandomForestClassifier(ra

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'LP[RF] - GloVe.6B.200D', 'performance': 0.8104575163398693, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[LR] - GloVe.6B.200D', 'performance': 0.5641025641025641, 'model': CCClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                               solver='liblinear'))}, {'name': 'LP[RF] - GloVe.6B.200D', 'performance': 0.6111111111111112, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'CC[RF] - GloVe.6B.200D', 'performance': 0.6095238095238096, 'model': CCClassifier(base_estimator=RandomForestCla

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'LP[RF] - GloVe.6B.300D', 'performance': 0.8111587982832618, 'model': LPClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[DT] - GloVe.6B.300D', 'performance': 0.6, 'model': MBRClassifier(base_estimator=DecisionTreeClassifier(random_state=42))}, {'name': 'MBR[GNB] - GloVe.6B.300D', 'performance': 0.6530612244897959, 'model': MBRClassifier(base_estimator=GaussianNB())}, {'name': 'MBR[RF] - GloVe.6B.300D', 'performance': 0.6161616161616161, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}])
Hardware Performance
Accuracy:	66.48%
Hamming Loss:	0.135
                    precisi

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'BR[RF] - DistilRoBERTa', 'performance': 0.8411633109619687, 'model': BRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'BR[GNB] - DistilRoBERTa', 'performance': 0.5161290322580645, 'model': BRClassifier(base_estimator=GaussianNB())}, {'name': 'MBR[GNB] - DistilRoBERTa', 'performance': 0.6976744186046512, 'model': MBRClassifier(base_estimator=GaussianNB())}, {'name': 'MBR[LR] - DistilRoBERTa', 'performance': 0.6666666666666666, 'model': MBRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                                solver='liblinear'))}])
Hardware Performan

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'MBR[RF] - SBERT', 'performance': 0.8447488584474886, 'model': MBRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'MBR[GNB] - SBERT', 'performance': 0.625, 'model': MBRClassifier(base_estimator=GaussianNB())}, {'name': 'MBR[LR] - SBERT', 'performance': 0.7, 'model': MBRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                                solver='liblinear'))}, {'name': 'MBR[GNB] - SBERT', 'performance': 0.6699029126213593, 'model': MBRClassifier(base_estimator=GaussianNB())}])
Hardware Performance
Accuracy:	71.79%
Hamming Loss:	0.112
                

Methods & Algorithms:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  random_state=42,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
dict_values([{'name': 'BR[RF] - ATT&CK-BERT', 'performance': 0.8292682926829268, 'model': BRClassifier(base_estimator=RandomForestClassifier(random_state=42))}, {'name': 'BR[GNB] - ATT&CK-BERT', 'performance': 0.5957446808510638, 'model': BRClassifier(base_estimator=GaussianNB())}, {'name': 'MBR[LR] - ATT&CK-BERT', 'performance': 0.6829268292682927, 'model': MBRClassifier(base_estimator=LogisticRegression(max_iter=10000, random_state=42,
                                                solver='liblinear'))}, {'name': 'MBR[GNB] - ATT&CK-BERT', 'performance': 0.6810810810810811, 'model': MBRClassifier(base_estimator=GaussianNB())}])


# 7. Model Evaluation

In [7]:
# Load the held-out test tweets and prepare them for evaluation.
#
# The CSV location comes from the environment. Indexing os.environ fails fast
# with a KeyError that names the missing variable, instead of the opaque
# TypeError that open(None) would raise.
test_csv_path = os.environ["ONE_STEP_CATEGORIES_TEST_CSV"]
test_tweets = pd.read_csv(test_csv_path)

# These columns are serialised in the CSV as Python-literal strings; parse
# them back into real Python objects.
for column in ("tweet", "watson", "urls", "watson_list", "target"):
    test_tweets[column] = test_tweets[column].apply(func=ast.literal_eval)

# Run every tweet text through the project's text helper
# (presumably a placeholder/normalisation step - see notebooks.utils).
test_tweets["text"] = [su.replace_txt_components(t) for t in test_tweets["text"]]

# A tweet whose target labels include "other" must not be counted as relevant:
# clear the flag for all such rows with one boolean-mask assignment instead of
# a Python-level loop over index labels.
relevant = test_tweets["relevant"] == True  # noqa: E712 (element-wise comparison)
is_other = test_tweets["target"].apply(lambda x: "other" in x)
test_tweets.loc[relevant & is_other, "relevant"] = False

texts = test_tweets["text"]
targets = test_tweets["target"]
print(f"Threat Tweets: {len(test_tweets)}")
test_tweets.head()

Threat Tweets: 4067


Unnamed: 0.1,Unnamed: 0,_id,date,id,relevant,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list,target
0,1533,b'5b8e2e39bb325e2996c66aab',2018-09-04 07:03:16+00:00,1036872307057352704,True,threatmeter: irisgraphic sql injection vulnera...,{'created_at': 'Tue Sep 04 07:03:16 +0000 2018...,vulnerability,{'technology and computing': {'computer securi...,threat,[https://ift.tt/2wNsX91],https://cxsecurity.com/issue/WLB-2018090028,True,"[technology and computing, computer security, ...","[antivirus and malware, software, databases, c..."
1,59,b'5b887f63bb325e65fa7e7922',2018-08-30 23:36:02+00:00,1035310204102078464,False,Download this how-to guide on preparing for an...,{'created_at': 'Thu Aug 30 23:36:02 +0000 2018...,ransomware,{'technology and computing': {'computer securi...,business,[http://bit.ly/2wuUTyT],https://logrhythm.com/the-ransomware-threat-eb...,True,"[technology and computing, computer security, ...","[antivirus and malware, computer security]"
2,10824,b'5b92cad0bb325e521c5ae40a',2018-09-07 19:00:31+00:00,1038139971562078208,True,The US Justice Department is poised to charge ...,{'created_at': 'Fri Sep 07 19:00:31 +0000 2018...,ransomware,"{'law, govt and politics': {'government': {}, ...",threat,[https://twitter.com/i/web/status/103813997156...,https://twitter.com/i/web/status/1038139971562...,True,"[law, govt and politics, government, armed for...","[antivirus and malware, computer security]"
3,2104,b'5b90af51bb325e70071868e2',2018-09-06 04:38:41+00:00,1037560694789554176,False,"3) Control and monitor compliance. Also, in...",{'created_at': 'Thu Sep 06 04:38:41 +0000 2018...,general,{'finance': {'personal finance': {'insurance':...,,[https://twitter.com/i/web/status/103756069478...,https://twitter.com/i/web/status/1037560694789...,True,"[finance, personal finance, insurance, busines...",[other]
4,12292,b'5b935e61bb325e521c5aeab9',2018-09-08 05:30:09+00:00,1038298421893967877,True,"Ransomware down, but not out, report reveals",{'created_at': 'Sat Sep 08 05:30:09 +0000 2018...,ransomware,{'business and industrial': {'company': {'annu...,threat,[http://fsecure.smh.re/4AS],http://fsecure.smh.re/4AS,True,"[business and industrial, company, annual repo...","[databases, software]"


In [29]:
# Load the five pickled component models from the models/ directory.
# NOTE: unpickling executes arbitrary code, so only load files you trust
# (these are presumably written by earlier cells of this notebook).
_component_models = []
for _pickle_path in (
    "models/General.pickle",
    "models/Computersecurity.pickle",
    "models/Operatingsystems.pickle",
    "models/Software.pickle",
    "models/Hardware.pickle",
):
    with open(_pickle_path, "rb") as f:
        _component_models.append(pickle.load(f))

# Unpack in the same order as the paths above.
gn_mod, cs_mod, os_mod, sw_mod, hw_mod = _component_models

In [30]:
# Combine the five loaded component models into one System instance.
# The positional order (general, computer security, operating systems,
# software, hardware) is passed through unchanged.
component_models = (gn_mod, cs_mod, os_mod, sw_mod, hw_mod)
solution = System(*component_models)

In [21]:
# Smoke test: show the raw prediction matrix for the first ten texts.
solution.predict(texts[:10])

array([[1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [22]:
# Smoke test: show the predicted label lists for the first ten texts.
solution.predict_targets(texts[:10])

0    [antivirus and malware, databases, software, c...
1                                              [other]
2           [antivirus and malware, computer security]
3                                              [other]
4                                [databases, software]
5                                              [other]
6                                              [other]
7                                              [other]
8                                              [other]
9                                              [other]
dtype: object

In [23]:
# Side-by-side view of predictions vs. ground truth for rows 10-19.
sample = slice(10, 20)
comparison_rows = {
    "Predicted": solution.predict_targets(texts[sample]),
    "Actual": targets[sample],
    "Text": texts[sample],
    "Relevant": test_tweets["relevant"][sample],
}
# Each series becomes one row labelled by its key; transposing turns the
# labels into column headers with one line per tweet.
pd.DataFrame(list(comparison_rows.values()), index=list(comparison_rows)).T

Unnamed: 0,Predicted,Actual,Text,Relevant
10,[other],[other],HOTEL MANAGEMENT - Data leak from Huazhu Hotel...,False
11,[other],"[databases, software]","Yes the vulnerability does exist and yes, the ...",True
12,[other],"[network security, computer security]",Healthier mentally &amp; more comfortable in m...,False
13,[other],"[antivirus and malware, computer security]",[] Slush doesn't give a sh*t about DDoS on his...,True
14,[other],[other],DLVRIT: US charges North Korean man in Sony ha...,False
15,"[antivirus and malware, databases, software, c...","[antivirus and malware, databases, software, c...",Designed by Tristar Software Solutions India S...,True
16,[other],"[network security, computer security]",Don’t call it a friendship unless you’re actua...,False
17,[other],[other],\nWe will be starting an exciting new project ...,False
18,"[antivirus and malware, computer security]","[network security, computer security]",attack from cripples Bank of website\n\n …,True
19,[other],"[computer, network security, hardware, compute...",7 bizarre security analogies about preventing ...,True


In [27]:
# Evaluate the combined system on the full test set and print a text report.
system_performance = solution.evaluate(texts, targets)

banner = "=========================================================="
print(banner, "System Performance", banner, sep="\n")
print(f"Accuracy:\t{system_performance['accuracy'] * 100:.2f}%")
print(f"Hamming Loss:\t{system_performance['hamming_loss']:.3f}")

# The report columns are relabelled positionally: one per class, followed by
# the four aggregate columns.
average_labels = ["micro avg", "macro avg", "weighted avg", "samples avg"]
report = pd.DataFrame(system_performance["report"])
report.columns = solution.classes() + average_labels

# One row per class/average; floats are rendered with two decimals and all
# other values are left untouched.
report_table = report.T.map(lambda x: x if not isinstance(x, float) else f"{x:.2f}")
report = report_table.to_string()
print(report)
print(f"{banner}\n")

System Performance
Accuracy:	59.65%
Hamming Loss:	0.080
                      precision recall f1-score  support   auc
computer security          0.93   0.41     0.57  1738.00  0.70
hardware                   0.98   0.37     0.54   461.00  0.68
networking                 0.95   0.31     0.47   240.00  0.66
operating systems          0.95   0.56     0.71   156.00  0.78
other                      0.54   0.98     0.70  1603.00  0.72
software                   0.93   0.54     0.68   950.00  0.76
antivirus and malware      0.92   0.48     0.63  1110.00  0.73
network security           0.91   0.24     0.37   817.00  0.61
linux                      0.98   0.62     0.76    77.00  0.81
mac os                     1.00   0.46     0.63    13.00  0.73
unix                       0.82   0.64     0.72    14.00  0.82
windows                    0.88   0.46     0.60    46.00  0.73
databases                  0.92   0.60     0.73   342.00  0.80
computer                   0.97   0.36     0.52   362.00  0.68

In [25]:
# Binary "relevant vs. not relevant" evaluation derived from the multi-label
# predictions.

# Ground truth: 1 for relevant tweets, 0 otherwise.
Y = pd.Series([1 if t else 0 for t in test_tweets["relevant"]])

# Column 4 of the prediction matrix is taken to be the "other" label
# (NOTE(review): confirm against solution.classes()). A tweet is predicted
# relevant exactly when "other" is NOT predicted, so compare the whole column
# to zero in one vectorised step.
OTHER_LABEL_COLUMN = 4
Y_pred = pd.Series(solution.predict(texts)[:, OTHER_LABEL_COLUMN] == 0)

# For a binary target roc_auc_score ignores `average`, so the "macro" and
# "weighted" values are the same number: compute it once. Both names are kept
# in case later cells use them.
# NOTE: the AUC is computed from hard True/False predictions, not from scores.
auc_score_macro = auc_score_weighted = roc_auc_score(Y, Y_pred)
report = classification_report(Y, Y_pred, output_dict=True, zero_division=0.0)
# Only the two average entries get an "auc" value; the per-class rows therefore
# show NaN in that column of the printed table.
report["macro avg"]["auc"] = auc_score_macro
report["weighted avg"]["auc"] = auc_score_weighted
system_performance = {"hamming_loss": hamming_loss(Y, Y_pred), "report": report}

print("==========================================================")
print("System Performance")
print("==========================================================")
print(f"Hamming Loss:\t{system_performance['hamming_loss']:.3f}")
report = pd.DataFrame(system_performance["report"])
# Columns are relabelled positionally: the two classes first, then the
# aggregate entries produced by classification_report.
report.columns = ["not relevant", "relevant"] + [
    "accuracy",
    "macro avg",
    "weighted avg",
]
report = (
    report.transpose()
    .map(lambda x: f"{x:.2f}" if isinstance(x, float) else x)
    .to_string()
)
print(report)
print("==========================================================\n")

System Performance
Hamming Loss:	0.179
             precision recall f1-score  support   auc
not relevant      0.80   0.94     0.86  2444.00   nan
relevant          0.88   0.64     0.74  1623.00   nan
accuracy          0.82   0.82     0.82     0.82  0.82
macro avg         0.84   0.79     0.80  4067.00  0.79
weighted avg      0.83   0.82     0.81  4067.00  0.79



In [26]:
# Put ground truth, predicted relevance and tweet text side by side for
# manual inspection; each 1-D array becomes one column.
inspection_columns = (
    Y.values,
    Y_pred.values,
    test_tweets["text"].values,
)
pd.DataFrame(
    np.column_stack(inspection_columns),
    columns=["relevant", "predicted", "text"],
)

Unnamed: 0,relevant,predicted,text
0,1,True,threatmeter: irisgraphic sql injection vulnera...
1,0,False,Download this how-to guide on preparing for an...
2,1,True,The US Justice Department is poised to charge ...
3,0,False,"3) Control and monitor compliance. Also, in..."
4,1,True,"Ransomware down, but not out, report reveals"
...,...,...,...
4062,0,False,– with full knowledge of my increased vulnerab...
4063,1,False,CVE-2018-16460\n\nA command Injection in ps pa...
4064,1,True,threatmeter: To Video Converter Professional...
4065,1,False,"New vulnerability on the NVD on September 10, ..."


In [31]:
# Persist the assembled system to disk.
with open("models/System.pickle", "wb") as f:
    pickle.dump(obj=solution, file=f)

# Drop the in-memory name so the following cell has to reload it from disk.
del solution

In [32]:
# Reload the persisted system to check that it survives a pickle round trip.
# A context manager closes the file handle deterministically; the inline
# open(...) never closed it.
# NOTE: unpickling executes arbitrary code, so only load files you trust.
with open("models/System.pickle", "rb") as f:
    system = pickle.load(file=f)

system