# Multi-Label Classification

## 1. Imports

In [1]:
import ast
import pickle
import random
import re
import string
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Any, Union, Optional, Callable

import nltk
import numpy as np
import pandas as pd
from nltk import WordNetLemmatizer, word_tokenize
from nltk.corpus import stopwords, wordnet
from numpy import asarray
from numpy.typing import ArrayLike
from sentence_transformers import SentenceTransformer
from sklearn import preprocessing
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss, accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from tqdm.contrib.itertools import product
from tqdm.notebook import tqdm

import MLC

In [2]:
def replace_text_components(
        text: str,
        replace_emails: bool = True,
        replace_urls: bool = True,
        replace_mentions: bool = True,
        replace_hashtags: bool = True,
        replace_phone_numbers: bool = True,
        custom_replacements: Optional[dict[str, str]] = None,
) -> str:
    """
    Replace specific text components (e.g., emails, URLs, mentions, hashtags) with placeholders.

    Parameters
    ----------
    text : str
        Input text to process.
    replace_emails : bool
        Whether to replace email addresses. Default is True.
    replace_urls : bool
        Whether to replace URLs. Default is True.
    replace_mentions : bool
        Whether to replace mentioned users. Default is True.
    replace_hashtags : bool
        Whether to replace hashtags. Default is True.
    replace_phone_numbers : bool
        Whether to replace phone numbers. Default is True.
    custom_replacements : dict
        Custom replacement rules as a dictionary. Default is None.

    Returns
    -------
    str : str
        Text with specified components replaced.
    """
    # Replace email addresses
    if replace_emails:
        text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "", text)

    # Replace URLs
    if replace_urls:
        text = re.sub(r"https?://\S+|www\.\S+", "", text)

    # Replace mentioned users
    if replace_mentions:
        text = re.sub(r"@\w+", "", text)

    # Replace hashtags
    if replace_hashtags:
        text = re.sub(r"#\w+", "", text)  # Remove hashtags entirely

    # Replace phone numbers
    if replace_phone_numbers:
        text = re.sub(
            r"\b(?:\+\d{1,2}\s?)?(?:\(\d{3}\)|\d{3})[\s.-]?\d{3}[\s.-]?\d{4}\b",
            "",
            text,
        )

    # Apply custom replacements if provided
    if custom_replacements is not None:
        for pattern, replacement in custom_replacements.items():
            text = re.sub(pattern, replacement, text)

    return text


def clean_text(
        text: str,
        remove_punctuation: bool = True,
        remove_emojis: bool = True,
        normalize_whitespace: bool = True,
        lowercase: bool = True,
) -> str:
    """
    Clean and preprocess text data for machine learning tasks.

    Parameters
    ----------
    text : str
        Input text to be cleaned.
    remove_punctuation : bool
        Whether to remove punctuation. Default is True.
    remove_emojis : bool
        Whether to remove emojis and emoticons. Default is True.
    normalize_whitespace : bool
        Whether to normalize whitespace. Default is True.
    lowercase : bool
        Whether to convert text to lowercase. Default is True.

    Returns
    -------
    str : str
        Cleaned and preprocessed text.
    """
    # Convert text to lowercase if specified
    if lowercase:
        text = text.lower()

    # Remove punctuation if specified
    if remove_punctuation:
        text = re.sub(r"[^\w\s]", "", text)

    # Normalize whitespace if specified
    if normalize_whitespace:
        text = re.sub(r"\s+", " ", text).strip()

    # Remove emojis and emoticons if specified
    if remove_emojis:
        # Remove emojis and emoticons using Unicode ranges
        text = re.sub(r"[\U00010000-\U0010ffff]", "", text)
        # Remove additional emoticons and symbols
        text = re.sub(r"[\u2600-\u26FF\u2700-\u27BF]", "", text)

    # Normalize Unicode characters (e.g., convert accented characters to their base form)
    text = (
        unicodedata.normalize("NFKD", text)
        .encode("ascii", "ignore")
        .decode("utf-8", "ignore")
    )

    return text


def get_wordnet_pos(treebank_tag):
    """
    Map Treebank POS tags to WordNet POS tags for lemmatization.

    Args:
        treebank_tag (str): Treebank POS tag.

    Returns:
        str: Corresponding WordNet POS tag.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if no match


def lemmatize_text(text, lemmatizer=WordNetLemmatizer()):
    """
    Lemmatize text using WordNetLemmatizer with POS tagging for better accuracy.

    Parameters
    ----------
    text : str
        Input text to be lemmatized.
    lemmatizer : WordNetLemmatizer
        Lemmatizer instance.

    Returns
    -------
        str: Lemmatized text.
    """
    # Tokenize the text
    tokens = word_tokenize(text)

    # Get POS tags for each token
    pos_tags = nltk.pos_tag(tokens)

    # Lemmatize each token with its corresponding POS tag
    lemmatized_tokens = []
    for token, tag in pos_tags:
        wordnet_pos = get_wordnet_pos(tag)  # Convert Treebank tag to WordNet POS
        lemmatized_token = lemmatizer.lemmatize(token, pos=wordnet_pos)
        lemmatized_tokens.append(lemmatized_token)

    # Join the lemmatized tokens into a single string
    return " ".join(lemmatized_tokens)


def remove_stopwords(text, language='english', custom_stopwords=None, lowercase=True):
    """
    Remove stopwords from the input text.

    Args:
        text (str): Input text to process.
        language (str): Language of the stopwords. Default is 'english'.
        custom_stopwords (set): Custom set of stopwords to use. Default is None.
        lowercase (bool): Whether to convert text to lowercase before processing. Default is True.

    Returns:
        str: Text with stopwords removed.
    """
    # Convert text to lowercase if specified
    if lowercase:
        text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Load stopwords
    if custom_stopwords is not None:
        stop_words = set(custom_stopwords)
    else:
        stop_words = set(stopwords.words(language))

    # Remove stopwords
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Join the filtered tokens into a single string
    return ' '.join(filtered_tokens)


def load_word2vec_dict(
        model_path: Path, embedding_dim: int
) -> dict[Union[str, list[str]], np.ndarray[Any, np.dtype]]:
    embeddings_dict = {}
    f = open(model_path, "r", encoding="utf-8")
    for line in f:
        values = line.split()
        word = values[:-embedding_dim]

        if type(word) is list:
            word = " ".join(word)

        vector = asarray([float(val) for val in values[-embedding_dim:]])
        embeddings_dict[word] = vector
    f.close()
    return embeddings_dict


def tokenizer_transform(
        x: pd.Series,
        embedder_addr: str,
        preprocessing_list: Optional[list[Callable[[str], str]]] = None,
) -> np.ndarray[Any, np.dtype[Any]]:
    """
    Generate embeddings for the sentences in the DataFrame.

    Parameters
    ----------
    x : pd.Series
        The DataFrame containing the sentences.
    embedder_addr : str
        Address of the embedder.
    preprocessing_list : Optional[list[Callable[[str], str]]]
        List of functions to apply to each sentence.

    Returns
    -------
    encoded_sentences : ArrayLike

    """
    # Preprocess the text
    sentences = x.tolist()

    if preprocessing_list is None:
        preprocessing_list = []

    for preprocessor in preprocessing_list:
        sentences = [preprocessor(s) for s in sentences]

    model = SentenceTransformer(model_name_or_path=embedder_addr)
    return model.encode(sentences)


def preprocess_texts(list_str, model_path, embedding_dim):
    if embedding_dim is None:
        return tokenizer_transform(x=list_str, embedder_addr=model_path)

    word2vec_dict = load_word2vec_dict(
        model_path=model_path, embedding_dim=embedding_dim
    )
    list_embedded_str = np.zeros((len(list_str), embedding_dim))

    for i, text in enumerate(list_str):
        tokens = re.findall(r"\w+|[{}]".format(re.escape(string.punctuation)), text)
        for token in tokens:
            try:
                list_embedded_str[i] += word2vec_dict[token.lower()]
            except KeyError:
                continue

    return list_embedded_str


def map_targets(watson_list, fix_targets):
    targets = set(fix_targets.keys()) & set(watson_list)
    mapped_targets = {fix_targets[category] for category in targets}
    return list(mapped_targets) if mapped_targets else ["other"]


def extract_models(classes, models, performance):
    attack_models = {}
    for idx in range(len(classes)):
        best_model = {}
        for model_name in models:
            temp_performance = models[model_name]["assess"]["report"][f"{idx}"][performance]
            if len(best_model) == 0:
                best_model["name"] = model_name
                best_model["performance"] = temp_performance
                best_model["model"] = models[model_name]["model"]
            else:
                if best_model["performance"] < temp_performance:
                    best_model["name"] = model_name
                    best_model["performance"] = temp_performance
                    best_model["model"] = models[model_name]["model"]
        attack_models[f"{idx}"] = best_model
    return attack_models


def assess(Y: ArrayLike, Y_pred: ArrayLike) -> dict[str, float]:
    """
    Evaluate the model on the given data.

    Parameters
    ----------
    Y : ArrayLike of shape (n_samples, n_labels)
        The input features.
    Y_pred : ArrayLike of shape (n_samples, n_labels)
        The true binary label matrix.

    Returns
    -------
    metrics : dict[str, float]
        Dictionary containing accuracy, micro F1 score, and hamming loss.
    """
    accuracy = accuracy_score(Y, Y_pred)

    auc_score_micro = roc_auc_score(Y, Y_pred, average="micro")
    auc_score_macro = roc_auc_score(Y, Y_pred, average="macro")
    auc_score_weighted = roc_auc_score(Y, Y_pred, average="weighted")
    auc_score_samples = roc_auc_score(Y, Y_pred, average="samples")
    auc_per_label = roc_auc_score(Y, Y_pred, average=None)

    report = classification_report(Y, Y_pred, output_dict=True, zero_division=0.0)
    report["micro avg"]["auc"] = auc_score_micro
    report["macro avg"]["auc"] = auc_score_macro
    report["weighted avg"]["auc"] = auc_score_weighted
    report["samples avg"]["auc"] = auc_score_samples

    n_classes = Y.shape[1]
    class_names = [f"{i}" for i in range(n_classes)]
    for i, target in enumerate(class_names):
        if target in report:
            report[target]["auc"] = auc_per_label[i]
        else:
            # In case labels are not printed per class, you can store them separately
            report[target] = {"auc": auc_per_label[i]}

    hamming = hamming_loss(Y, Y_pred)
    return {
        "accuracy": accuracy,
        "hamming_loss": hamming,
        "report": report
    }

In [3]:
class ModuleSystem(BaseEstimator, ClassifierMixin):
    def __init__(self, models, mlb):
        self.models = models
        self.mlb = mlb

    def predict(self, X_test: ArrayLike) -> ArrayLike:
        """
        Predict labels for each component using the chosen model in the dictionary.

        Parameters
        ----------
        X_test : ArrayLike
            Test data.

        Returns
        -------
        Y_pred : ArrayLike
            Predicted labels for each component.
        """
        n_samples = X_test.shape[0]
        n_components = len(self.models)
        Y_pred = np.zeros((n_samples, n_components))

        for idx, model_info in self.models.items():
            model = model_info['model']
            Y_pred[:, int(idx)] = (model.predict(X_test))[:, int(idx)]

        return Y_pred

## 2. Configurations & Constants

In [10]:
FIX_TARGETS = {
    'computer security': 'computer security',
    'operating systems': 'operating systems',
    'software': 'software',
    'programming languages': 'software',
    'hardware': 'hardware',
    'electronic components': 'hardware',
    'networking': 'networking',
    'internet technology': 'networking'
}
OVERWRITE = False
RETRAIN = False
RANDOM_STATE = 42
MODEL_LEVEL = 'operating systems'
TEST_SIZE = 2e-1

np.random.seed(seed=RANDOM_STATE)
random.seed(a=RANDOM_STATE)

### Directory Paths

In [5]:
COLAB_DIR = Path("/content/drive/MyDrive")
KAGGLE_DIR = Path("/kaggle/input")
LOCAL_DIR = Path("../")

try:
    import google.colab

    IN_COLAB = True
except ImportError:
    IN_COLAB = False

try:
    import kaggle_secrets

    IN_KAGGLE = True
except ImportError:
    IN_KAGGLE = False

if IN_COLAB:
    DATA_DIR = COLAB_DIR / "data"
    MODELS_DIR = COLAB_DIR / "models"
elif IN_KAGGLE:
    DATA_DIR = KAGGLE_DIR
    MODELS_DIR = KAGGLE_DIR
else:
    DATA_DIR = LOCAL_DIR / "data"
    MODELS_DIR = LOCAL_DIR / "models"

TWEETS_CSV = DATA_DIR / "tweets-dataset/categories_train.csv"

## Model Training

In [6]:
METHODS = {
    "br": MLC.BRClassifier,
    "clr": MLC.CLRClassifier,
    "cc": MLC.CCClassifier,
    "lp": MLC.LPClassifier,
    "pst": MLC.PStClassifier,
    #'cdn': MLC.CDNClassifier,
    "mbr": MLC.MBRClassifier,
    "rakel": MLC.RAkELClassifier,
    "homer": MLC.HOMERClassifier,
}

CLASSIFIERS = {
    "lr": LogisticRegression(solver="liblinear", max_iter=10000),
    "gnb": GaussianNB(),
    "dt": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "rf": RandomForestClassifier(random_state=RANDOM_STATE),
    #'xgb': XGBClassifier(random_state=RANDOM_STATE)
}

EMBEDDERS = [
    {
        "name": "GloVe.6B.50D",
        "model-path": "../embedders/glove.6B.50d.txt",
        "embedding-dim": 50,
    },
    {
        "name": "GloVe.6B.100D",
        "model-path": "../embedders/glove.6B.100d.txt",
        "embedding-dim": 100,
    },
    {
        "name": "GloVe.6B.200D",
        "model-path": "../embedders/glove.6B.200d.txt",
        "embedding-dim": 200,
    },
    {
        "name": "GloVe.6B.300D",
        "model-path": "../embedders/glove.6B.300d.txt",
        "embedding-dim": 300,
    },
    {
        "name": "DistilRoBERTa",
        "model-path": "sentence-transformers/all-distilroberta-v1",
        "embedding-dim": None,
    },
    {
        "name": "SBERT",
        "model-path": "sentence-transformers/all-mpnet-base-v2",
        "embedding-dim": None,
    },
    {"name": "ATT&CK-BERT", "model-path": "basel/ATTACK-BERT", "embedding-dim": None},
]

PROBLEMS = [
    {
        "level": "computer security",
        "targets": {
            'network security': 'network security',
            'antivirus and malware': 'antivirus and malware',
        }
    },
    {
        "level": "operating systems",
        "targets": {
            'mac os': 'mac os',
            'windows': 'windows',
            'unix': 'unix',
            'linux': 'linux'
        }
    },
    {
        "level": "software",
        "targets": {
            'databases': 'databases'
        }
    },
    {
        "level": "hardware",
        "targets": {
            'computer': 'computer',
            'computer components': 'computer components',
            'computer networking': 'computer networking'
        }
    }
]

In [27]:
with open(TWEETS_CSV, "rb") as f:
    train = pd.read_csv(f)
    train = train.assign(
        tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
        watson=lambda df: df["watson"].apply(func=ast.literal_eval),
        urls=lambda df: df["urls"].apply(func=ast.literal_eval),
        watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
        target=lambda df: df["target"].apply(func=ast.literal_eval),
    )
    train['text'] = [replace_text_components(t) for t in train['text']]

for problem in tqdm(PROBLEMS, desc="Modules", unit="module"):
    accuracy = 0
    best_module = None

    for embedder in tqdm(EMBEDDERS, desc="Embedders", unit="embedder"):
        tweets = train.assign(
            target=lambda df: df["target"].apply(func=map_targets, args=(problem["targets"],))
        )

        if problem["level"] != "general":
            tweets = tweets[
                tweets["watson_list"].apply(lambda x: ' '.join(problem["level"].split(sep=' ')) in x)
            ].reset_index(drop=True)

        tweets['text'] = [replace_text_components(t) for t in tweets['text']]
        # Find valid targets (with frequency ≥ 5)
        target_freq = Counter(tuple(item) for item in tweets["target"])
        valid_targets = {target for target, freq in target_freq.items() if freq >= 5}
        # Filter the DataFrame to keep only valid targets
        filtered_df = tweets[tweets["target"].apply(lambda valid_target: tuple(valid_target) in valid_targets)]
        filtered_df = filtered_df.reset_index(drop=True)

        mlb = preprocessing.MultiLabelBinarizer()
        y = mlb.fit_transform(y=filtered_df["target"])
        _, y_mcp = np.unique(ar=y, axis=0, return_inverse=True)
        x = preprocess_texts(
            list_str=filtered_df["text"],
            model_path=embedder["model-path"],
            embedding_dim=embedder["embedding-dim"],
        )
        X_train, X_val, y_train, y_val = train_test_split(
            x,
            y,
            test_size=TEST_SIZE,
            random_state=RANDOM_STATE,
            shuffle=True,
            stratify=y_mcp
        )

        models = {}
        models_to_embedder = []
        for name_method, name_classifier in product(METHODS, CLASSIFIERS, desc="Methods & Classifiers", unit="pair"):
            model = METHODS[name_method](CLASSIFIERS[name_classifier])
            try:
                model.fit(X_train, y_train)
                model_name = f"{name_method.upper()}[{name_classifier.upper()}] - {embedder['name']}"
            except Exception as e:
                print(f"{model} failed to fit due to {e}")
                model_name = None
                continue
            if model_name is not None:
                models_to_embedder.append(model_name)
                models[model_name] = {"model": model, "assess": model.evaluate(X_val, y_val)}

        best_models = extract_models(mlb.classes_, models, "f1-score")
        module = ModuleSystem(best_models, mlb)
        y_pred = module.predict(X_val)
        assessment = assess(y_val, y_pred)

        # Convert the predicted labels back to the original format using MultiLabelBinarizer
        if assessment["accuracy"] > accuracy:
            accuracy = assessment["accuracy"]
            best_module = module
            print("==========================================================")
            print("System Performance")
            print("==========================================================")
            print(f"Accuracy:\t{assessment['accuracy'] * 100:.2f}%")
            print(f"Hamming Loss:\t{assessment['hamming_loss']:.3f}")
            report = pd.DataFrame(assessment["report"])
            report.columns = list(module.mlb.classes_) + ["micro avg", "macro avg", "weighted avg", "samples avg"]
            report = report.transpose().map(lambda x: f"{x:.2f}" if isinstance(x, float) else x).to_string()
            print(f"{report}")
            print("==========================================================\n")

        dict_name = f"{'.'.join(problem["level"].capitalize().split(' '))}.{embedder['name']}.pkl"
        with open(f"../models/dictionaries/{dict_name}", "wb") as f:
            pickle.dump(models, f)

    with open(f"../models/{'.'.join(problem["level"].upper().split(' '))}.pkl", "wb") as f:
        pickle.dump(best_module, f)

        del x, X_train, X_val, y_train, y_val, models_to_embedder

Modules:   0%|          | 0/4 [00:00<?, ?module/s]

Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	84.52%
Hamming Loss:	0.071
                      precision recall f1-score  support   auc
antivirus and malware      0.90   0.97     0.94   673.00  0.83
network security           0.88   0.82     0.85   329.00  0.88
other                      1.00   0.14     0.25     7.00  0.57
micro avg                  0.89   0.92     0.91  1009.00  0.93
macro avg                  0.93   0.65     0.68  1009.00  0.76
weighted avg               0.89   0.92     0.90  1009.00  0.85
samples avg                0.91   0.92     0.91  1009.00  0.93



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	84.74%
Hamming Loss:	0.069
                      precision recall f1-score  support   auc
antivirus and malware      0.90   0.97     0.94   673.00  0.84
network security           0.89   0.82     0.85   329.00  0.88
other                      1.00   0.14     0.25     7.00  0.57
micro avg                  0.90   0.92     0.91  1009.00  0.93
macro avg                  0.93   0.64     0.68  1009.00  0.76
weighted avg               0.90   0.92     0.90  1009.00  0.85
samples avg                0.91   0.93     0.91  1009.00  0.93



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	85.51%
Hamming Loss:	0.064
                      precision recall f1-score  support   auc
antivirus and malware      0.90   0.97     0.94   673.00  0.84
network security           0.91   0.83     0.87   329.00  0.89
other                      1.00   0.14     0.25     7.00  0.57
micro avg                  0.91   0.92     0.91  1009.00  0.93
macro avg                  0.94   0.65     0.69  1009.00  0.77
weighted avg               0.91   0.92     0.91  1009.00  0.86
samples avg                0.91   0.93     0.91  1009.00  0.94



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	87.16%
Hamming Loss:	0.056
                      precision recall f1-score  support   auc
antivirus and malware      0.92   0.97     0.95   673.00  0.87
network security           0.92   0.85     0.89   329.00  0.91
other                      1.00   0.14     0.25     7.00  0.57
micro avg                  0.92   0.93     0.92  1009.00  0.94
macro avg                  0.95   0.66     0.69  1009.00  0.78
weighted avg               0.92   0.93     0.92  1009.00  0.88
samples avg                0.92   0.93     0.92  1009.00  0.95



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	79.47%
Hamming Loss:	0.053
             precision recall f1-score support   auc
linux             0.98   0.83     0.90   71.00  0.91
mac os            1.00   0.69     0.81   16.00  0.84
other             0.92   0.80     0.86   15.00  0.90
unix              0.88   0.83     0.86   18.00  0.91
windows           0.92   0.78     0.84   45.00  0.87
micro avg         0.95   0.80     0.87  165.00  0.89
macro avg         0.94   0.79     0.85  165.00  0.89
weighted avg      0.95   0.80     0.87  165.00  0.89
samples avg       0.81   0.80     0.81  165.00  0.90



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	80.79%
Hamming Loss:	0.050
             precision recall f1-score support   auc
linux             0.94   0.89     0.91   71.00  0.92
mac os            1.00   0.62     0.77   16.00  0.81
other             0.80   0.80     0.80   15.00  0.89
unix              0.94   0.94     0.94   18.00  0.97
windows           0.92   0.80     0.86   45.00  0.89
micro avg         0.93   0.84     0.88  165.00  0.91
macro avg         0.92   0.81     0.86  165.00  0.89
weighted avg      0.93   0.84     0.88  165.00  0.90
samples avg       0.85   0.84     0.84  165.00  0.91



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	81.46%
Hamming Loss:	0.048
             precision recall f1-score support   auc
linux             0.91   0.90     0.91   71.00  0.91
mac os            0.80   0.75     0.77   16.00  0.86
other             1.00   0.80     0.89   15.00  0.90
unix              1.00   0.94     0.97   18.00  0.97
windows           0.90   0.82     0.86   45.00  0.89
micro avg         0.92   0.86     0.89  165.00  0.92
macro avg         0.92   0.84     0.88  165.00  0.91
weighted avg      0.92   0.86     0.89  165.00  0.91
samples avg       0.86   0.86     0.86  165.00  0.92



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	88.74%
Hamming Loss:	0.028
             precision recall f1-score support   auc
linux             0.98   0.90     0.94   71.00  0.94
mac os            1.00   0.81     0.90   16.00  0.91
other             1.00   0.87     0.93   15.00  0.93
unix              1.00   0.94     0.97   18.00  0.97
windows           0.90   0.96     0.92   45.00  0.95
micro avg         0.96   0.91     0.93  165.00  0.95
macro avg         0.98   0.90     0.93  165.00  0.94
weighted avg      0.96   0.91     0.93  165.00  0.95
samples avg       0.92   0.91     0.91  165.00  0.95



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data,

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data,

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data,

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data,

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data,

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data,

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

RAkELClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear'),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=GaussianNB(), n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
RAkELClassifier(base_estimator=RandomForestClassifier(random_state=42),
                n_estimators=4) failed to fit due to Cannot take a larger sample than population when 'replace=False'
HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data,

Embedders:   0%|          | 0/7 [00:00<?, ?embedder/s]

Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	84.42%
Hamming Loss:	0.062
                    precision recall f1-score support   auc
computer                 0.89   1.00     0.94  260.00  0.74
computer components      0.60   0.64     0.62   28.00  0.80
computer networking      0.90   0.51     0.66   37.00  0.75
other                    1.00   0.40     0.57   10.00  0.70
micro avg                0.87   0.90     0.88  335.00  0.93
macro avg                0.85   0.64     0.70  335.00  0.75
weighted avg             0.87   0.90     0.87  335.00  0.74
samples avg              0.89   0.90     0.89  335.00  0.93



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	85.36%
Hamming Loss:	0.058
                    precision recall f1-score support   auc
computer                 0.90   1.00     0.95  260.00  0.75
computer components      1.00   0.43     0.60   28.00  0.71
computer networking      0.71   0.65     0.68   37.00  0.81
other                    1.00   0.40     0.57   10.00  0.70
micro avg                0.88   0.90     0.89  335.00  0.93
macro avg                0.90   0.62     0.70  335.00  0.74
weighted avg             0.89   0.90     0.88  335.00  0.75
samples avg              0.89   0.90     0.89  335.00  0.93



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)


Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	85.67%
Hamming Loss:	0.054
                    precision recall f1-score support   auc
computer                 0.90   1.00     0.95  260.00  0.77
computer components      0.78   0.64     0.71   28.00  0.81
computer networking      0.71   0.78     0.74   37.00  0.87
other                    1.00   0.50     0.67   10.00  0.75
micro avg                0.87   0.93     0.90  335.00  0.94
macro avg                0.85   0.73     0.77  335.00  0.80
weighted avg             0.87   0.93     0.90  335.00  0.78
samples avg              0.90   0.93     0.91  335.00  0.94



Methods & Classifiers:   0%|          | 0/32 [00:00<?, ?pair/s]

HOMERClassifier(base_estimator=LogisticRegression(max_iter=10000,
                                                  solver='liblinear')) failed to fit due to This solver needs samples of at least 2 classes in the data, but the data contains only one class: np.int64(1)
System Performance
Accuracy:	86.60%
Hamming Loss:	0.055
                    precision recall f1-score support   auc
computer                 0.90   0.99     0.94  260.00  0.76
computer components      0.73   0.68     0.70   28.00  0.83
computer networking      0.86   0.65     0.74   37.00  0.82
other                    1.00   0.40     0.57   10.00  0.70
micro avg                0.88   0.91     0.90  335.00  0.93
macro avg                0.87   0.68     0.74  335.00  0.78
weighted avg             0.88   0.91     0.89  335.00  0.77
samples avg              0.90   0.91     0.90  335.00  0.94



# 7. Model Evaluation

In [65]:
with open("../data/tweets-dataset/categories_test.csv", "rb") as f:
    test_tweets = pd.read_csv(f)

test_tweets = test_tweets.assign(
    tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
    watson=lambda df: df["watson"].apply(func=ast.literal_eval),
    urls=lambda df: df["urls"].apply(func=ast.literal_eval),
    watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
    target=lambda df: df["target"].apply(func=ast.literal_eval),
)
test_tweets['text'] = [replace_text_components(t) for t in test_tweets['text']]

test_tweets = test_tweets.assign(
    target=lambda df: df["target"].apply(func=map_targets, args=(
        {
            'computer security': 'computer security',
            'operating systems': 'operating systems',
            'software': 'software',
            'programming languages': 'software',
            'hardware': 'hardware',
            'electronic components': 'hardware',
            'networking': 'networking',
            'internet technology': 'networking',
            'network security': 'network security',
            'antivirus and malware': 'antivirus and malware',
            'mac os': 'mac os',
            'windows': 'windows',
            'unix': 'unix',
            'linux': 'linux',
            'databases': 'databases',
            'computer': 'computer',
            'computer components': 'computer components',
            'computer networking': 'computer networking'
        }, ))
)

print(f"Threat Tweets: {len(test_tweets)}")
test_tweets.head()

Threat Tweets: 2176


Unnamed: 0.1,Unnamed: 0,_id,date,id,relevant,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list,target
0,7083,b'5b93abb3bb325e521c5aee04',2018-09-08 11:00:03+00:00,1038381444463304705,True,Protect your customers access Prestashop Ant...,{'created_at': 'Sat Sep 08 11:00:03 +0000 2018...,ddos,{'technology and computing': {'internet techno...,threat,[http://addons.prestashop.com/en/23513-anti-dd...,http://addons.prestashop.com/en/23513-anti-ddo...,True,"[technology and computing, internet technology...","[software, computer security, networking, anti..."
1,3390,b'5b917413bb325e7007187828',2018-09-06 18:38:08+00:00,1037771948808237056,True,ProtonMail names one of the attackers behind a...,{'created_at': 'Thu Sep 06 18:38:08 +0000 2018...,ddos,{'society': {'crime': {'personal offense': {'a...,threat,[http://dlvr.it/QjLlwb],https://techcrunch.com/2018/09/06/protonmail-n...,True,"[society, crime, personal offense, assault, te...","[computer security, antivirus and malware]"
2,9759,b'5b96d920bb325e7a2e199c79',2018-09-10 20:50:37+00:00,1039254842538119168,True,stop neglecting me or i will ddos you,{'created_at': 'Mon Sep 10 20:50:37 +0000 2018...,ddos,{'technology and computing': {'computer securi...,threat,[],,,"[technology and computing, computer security, ...","[computer security, antivirus and malware]"
3,2141,b'5b911868bb325e7007186ec7',2018-09-06 12:07:03+00:00,1037673532241731586,True,"Some very good points here, but we also need t...",{'created_at': 'Thu Sep 06 12:07:03 +0000 2018...,vulnerability,{'health and fitness': {'addiction': {'substan...,,[https://twitter.com/i/web/status/103767353224...,https://twitter.com/i/web/status/1037673532241...,True,"[health and fitness, addiction, substance abus...","[network security, computer security]"
4,9696,b'5b96d003bb325e7a2e199bef',2018-09-10 20:11:46+00:00,1039245063685955584,True,British law enforcement identified George Duk...,{'created_at': 'Mon Sep 10 20:11:46 +0000 2018...,general,"{'law, govt and politics': {'law enforcement':...",threat,[https://twitter.com/i/web/status/103924506368...,https://twitter.com/i/web/status/1039245063685...,True,"[law, govt and politics, law enforcement, tech...",[computer security]


In [167]:
with open("../data/tweets-dataset/filter_test.csv", "rb") as f:
    test_filter = pd.read_csv(f)
test_filter.head()

Unnamed: 0.1,Unnamed: 0,_id,date,id,relevant,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate
0,8686,b'5b9230a1bb325e7007188268',2018-09-07 08:02:41+00:00,1037974421254721536,False,"Vulnerability and clarity, two important elements for a long and happy life, especially for all of us creatives.","{'created_at': 'Fri Sep 07 08:02:41 +0000 2018', 'id': 1037974421254721536, 'id_str': '1037974421254721536', 'text': 'Vulnerability and clarity, two important elements for a long and happy life, especially for all of us creatives.', 'source': '<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 35801905, 'id_str': '35801905', 'name': 'The Hollands!', 'screen_name': 'the_hollands', 'location': 'Australia/USA', 'url': 'http://www.thehollands.org', 'description': 'Merrymakers * Folk Revivalist * Story Tellers * Pre-order our Album, The Last Dance at http://www.thehollands.org', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 596, 'friends_count': 399, 'listed_count': 49, 'favourites_count': 7070, 'statuses_count': 7193, 'created_at': 'Mon Apr 27 17:32:09 +0000 2009', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': 'en', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme6/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme6/bg.gif', 'profile_background_tile': False, 'profile_link_color': '7FDBB6', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/575149645099225089/4bsMKofr_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/575149645099225089/4bsMKofr_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/35801905/1486743200', 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'quoted_status_id': 1037948185346334720, 'quoted_status_id_str': '1037948185346334720', 'quoted_status': {'created_at': 'Fri Sep 07 06:18:26 +0000 2018', 'id': 1037948185346334720, 'id_str': '1037948185346334720', 'text': 'This was hard to write but I’m glad I did. https://t.co/PhiDcG6jVl', 'display_text_range': [0, 42], 'source': '<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 202881464, 'id_str': '202881464', 'name': 'Charlotte Abroms', 'screen_name': 'charlotteabroms', 'location': None, 'url': None, 'description': 'Music Manager.', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 456, 'friends_count': 215, 'listed_count': 6, 'favourites_count': 1265, 'statuses_count': 482, 'created_at': 'Fri Oct 15 01:15:39 +0000 2010', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': 'en', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': True, 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/985087313423228929/t9sgjfIk_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/985087313423228929/t9sgjfIk_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/202881464/1523699084', 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'quoted_status_id': 1037854713956392961, 'quoted_status_id_str': '1037854713956392961', 'is_quote_status': True, 'quote_count': 1, 'reply_count': 1, 'retweet_count': 1, 'favorite_count': 5, 'entities': {'hashtags': [], 'urls': [{'url': 'https://t.co/PhiDcG6jVl', 'expanded_url': 'https://twitter.com/theindustry_o/status/1037854713956392961', 'display_url': 'twitter.com/theindustry_o/…', 'indices': [43, 66]}], 'user_mentions': [], 'symbols': []}, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'filter_level': 'low', 'lang': 'en'}, 'quoted_status_permalink': {'url': 'https://t.co/9bz4Pca2V6', 'expanded': 'https://twitter.com/charlotteabroms/status/1037948185346334720', 'display': 'twitter.com/charlotteabrom…'}, 'is_quote_status': True, 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'entities': {'hashtags': [], 'urls': [], 'user_mentions': [], 'symbols': []}, 'favorited': False, 'retweeted': False, 'filter_level': 'low', 'lang': 'en', 'timestamp_ms': '1536307361316'}",vulnerability,"{'usage': {'text_units': 1, 'text_characters': 112, 'features': 2}, 'language': 'en', 'entities': [], 'categories': [{'score': 0.300311, 'label': '/pets/dogs'}, {'score': 0.232011, 'label': '/health and fitness'}, {'score': 0.176324, 'label': '/technology and computing/computer security/network security'}]}",irrelevant,[],,
1,12831,b'5b9342b0bb325e521c5aea1a',2018-09-08 03:31:59+00:00,1038268685394354176,False,9 targets for #hackers and #cyberattacks aiming to #disrupt #PwC via @MikeQuindazzi @ravikikan #corpgov… https://t.co/6tMVaohEDi,"{'created_at': 'Sat Sep 08 03:31:59 +0000 2018', 'id': 1038268685394354176, 'id_str': '1038268685394354176', 'text': '9 targets for #hackers and #cyberattacks aiming to #disrupt #PwC via @MikeQuindazzi @ravikikan #corpgov… https://t.co/6tMVaohEDi', 'display_text_range': [0, 140], 'source': '<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>', 'truncated': True, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 294020478, 'id_str': '294020478', 'name': 'Ravi Kikan', 'screen_name': 'ravikikan', 'location': 'New Delhi', 'url': 'https://www.linkedin.com/in/ravikikan/', 'description': '#Cofounder of my son, Leads the largest members only group for #entrepreneurs on #LinkedIn ; Helps #Startups in Scaling up #Growthhacker #Startup #Fintech #Tech', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 10254, 'friends_count': 9024, 'listed_count': 3239, 'favourites_count': 11706, 'statuses_count': 30203, 'created_at': 'Fri May 06 11:46:40 +0000 2011', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': 'en', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '054385', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/378800000640900148/9ae8c23b6c6a4289f2eacf9a36b8bbf8_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/378800000640900148/9ae8c23b6c6a4289f2eacf9a36b8bbf8_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/294020478/1450858225', 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'extended_tweet': {'full_text': '9 targets for #hackers and #cyberattacks aiming to #disrupt #PwC via @MikeQuindazzi @ravikikan #corpgov #BoardOfDirectors #cybercrime #cybersecurity #ransomware #malware #iot #Infographics Cc: @Fisher85M @antgrasso #ai #ml #dl CC #ai #ml #dl #iot #infogr https://t.co/7shWrZuGp8', 'display_text_range': [0, 255], 'entities': {'hashtags': [{'text': 'hackers', 'indices': [14, 22]}, {'text': 'cyberattacks', 'indices': [27, 40]}, {'text': 'disrupt', 'indices': [51, 59]}, {'text': 'PwC', 'indices': [60, 64]}, {'text': 'corpgov', 'indices': [96, 104]}, {'text': 'BoardOfDirectors', 'indices': [105, 122]}, {'text': 'cybercrime', 'indices': [123, 134]}, {'text': 'cybersecurity', 'indices': [135, 149]}, {'text': 'ransomware', 'indices': [150, 161]}, {'text': 'malware', 'indices': [162, 170]}, {'text': 'iot', 'indices': [171, 175]}, {'text': 'Infographics', 'indices': [176, 189]}, {'text': 'ai', 'indices': [216, 219]}, {'text': 'ml', 'indices': [220, 223]}, {'text': 'dl', 'indices': [224, 227]}, {'text': 'ai', 'indices': [231, 234]}, {'text': 'ml', 'indices': [235, 238]}, {'text': 'dl', 'indices': [239, 242]}, {'text': 'iot', 'indices': [243, 247]}, {'text': 'infogr', 'indices': [248, 255]}], 'urls': [], 'user_mentions': [{'screen_name': 'MikeQuindazzi', 'name': 'Mike Quindazzi ✨', 'id': 2344530218, 'id_str': '2344530218', 'indices': [69, 83]}, {'screen_name': 'ravikikan', 'name': 'Ravi Kikan', 'id': 294020478, 'id_str': '294020478', 'indices': [84, 94]}, {'screen_name': 'Fisher85M', 'name': 'Michael Fisher', 'id': 846014785246367745, 'id_str': '846014785246367745', 'indices': [194, 204]}, {'screen_name': 'antgrasso', 'name': 'Antonio Grasso', 'id': 4338452835, 'id_str': '4338452835', 'indices': [205, 215]}], 'symbols': [], 'media': [{'id': 1038268673126031360, 'id_str': '1038268673126031360', 'indices': [256, 279], 'media_url': 'http://pbs.twimg.com/media/Dmir23MX4AAxNlq.jpg', 'media_url_https': 'https://pbs.twimg.com/media/Dmir23MX4AAxNlq.jpg', 'url': 'https://t.co/7shWrZuGp8', 'display_url': 'pic.twitter.com/7shWrZuGp8', 'expanded_url': 'https://twitter.com/ravikikan/status/1038268685394354176/photo/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'medium': {'w': 1200, 'h': 774, 'resize': 'fit'}, 'small': {'w': 680, 'h': 439, 'resize': 'fit'}, 'large': {'w': 1200, 'h': 774, 'resize': 'fit'}}}]}, 'extended_entities': {'media': [{'id': 1038268673126031360, 'id_str': '1038268673126031360', 'indices': [256, 279], 'media_url': 'http://pbs.twimg.com/media/Dmir23MX4AAxNlq.jpg', 'media_url_https': 'https://pbs.twimg.com/media/Dmir23MX4AAxNlq.jpg', 'url': 'https://t.co/7shWrZuGp8', 'display_url': 'pic.twitter.com/7shWrZuGp8', 'expanded_url': 'https://twitter.com/ravikikan/status/1038268685394354176/photo/1', 'type': 'photo', 'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'medium': {'w': 1200, 'h': 774, 'resize': 'fit'}, 'small': {'w': 680, 'h': 439, 'resize': 'fit'}, 'large': {'w': 1200, 'h': 774, 'resize': 'fit'}}}]}}, 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'entities': {'hashtags': [{'text': 'hackers', 'indices': [14, 22]}, {'text': 'cyberattacks', 'indices': [27, 40]}, {'text': 'disrupt', 'indices': [51, 59]}, {'text': 'PwC', 'indices': [60, 64]}, {'text': 'corpgov', 'indices': [96, 104]}], 'urls': [{'url': 'https://t.co/6tMVaohEDi', 'expanded_url': 'https://twitter.com/i/web/status/1038268685394354176', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [106, 129]}], 'user_mentions': [{'screen_name': 'MikeQuindazzi', 'name': 'Mike Quindazzi ✨', 'id': 2344530218, 'id_str': '2344530218', 'indices': [69, 83]}, {'screen_name': 'ravikikan', 'name': 'Ravi Kikan', 'id': 294020478, 'id_str': '294020478', 'indices': [84, 94]}], 'symbols': []}, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'filter_level': 'low', 'lang': 'en', 'timestamp_ms': '1536377519354'}",general,"{'usage': {'text_units': 1, 'text_characters': 129, 'features': 2}, 'language': 'en', 'entities': [{'type': 'Company', 'text': 'PwC', 'relevance': 0.33, 'count': 1}, {'type': 'TwitterHandle', 'text': '@MikeQuindazzi', 'relevance': 0.33, 'count': 1}, {'type': 'Hashtag', 'text': '#cyberattacks', 'relevance': 0.33, 'count': 1}, {'type': 'TwitterHandle', 'text': '@ravikikan', 'relevance': 0.33, 'count': 1}], 'categories': [{'score': 0.251119, 'label': '/technology and computing/computer security/antivirus and malware'}, {'score': 0.124779, 'label': '/business and industrial'}, {'score': 0.122301, 'label': '/law, govt and politics/government'}]}",business,['https://twitter.com/i/web/status/1038268685394354176'],https://twitter.com/i/web/status/1038268685394354176,True
2,2953,b'5b90df38bb325e7007186b80',2018-09-06 08:03:03+00:00,1037612127442411520,True,You can't contain me: elevation-of-privilege vulnerability in Docker for Windows https://t.co/0Xs7Og9g0y,"{'created_at': 'Thu Sep 06 08:03:03 +0000 2018', 'id': 1037612127442411520, 'id_str': '1037612127442411520', 'text': ""You can't contain me: elevation-of-privilege vulnerability in Docker for Windows https://t.co/0Xs7Og9g0y"", 'source': '<a href=""https://ifttt.com"" rel=""nofollow"">IFTTT</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 770515379626119168, 'id_str': '770515379626119168', 'name': 'Mangesh Tekale', 'screen_name': '_mangesh_tekale', 'location': 'Pune, India', 'url': None, 'description': 'iOS Developer at Fundtech #developer #IOS New on #twitter', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 150, 'friends_count': 116, 'listed_count': 17, 'favourites_count': 31, 'statuses_count': 23045, 'created_at': 'Tue Aug 30 06:55:57 +0000 2016', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': 'en', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '7FDBB6', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/770520201704120320/8R-gOz8T_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/770520201704120320/8R-gOz8T_normal.jpg', 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'entities': {'hashtags': [], 'urls': [{'url': 'https://t.co/0Xs7Og9g0y', 'expanded_url': 'https://srcincite.io/blog/2018/08/31/you-cant-contain-me-analyzing-and-exploiting-an-elevation-of-privilege-in-docker-for-windows.html', 'display_url': 'srcincite.io/blog/2018/08/3…', 'indices': [81, 104]}], 'user_mentions': [], 'symbols': []}, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'filter_level': 'low', 'lang': 'en', 'timestamp_ms': '1536220983740'}",vulnerability,"{'usage': {'text_units': 1, 'text_characters': 104, 'features': 2}, 'language': 'en', 'entities': [], 'categories': [{'score': 0.77392, 'label': '/technology and computing/operating systems/windows'}, {'score': 0.215739, 'label': '/technology and computing/software'}, {'score': 0.200026, 'label': '/technology and computing/operating systems'}]}",,['https://srcincite.io/blog/2018/08/31/you-cant-contain-me-analyzing-and-exploiting-an-elevation-of-privilege-in-docker-for-windows.html'],https://srcincite.io/blog/2018/08/31/you-cant-contain-me-analyzing-and-exploiting-an-elevation-of-privilege-in-docker-for-windows.html,True
3,15462,b'5b945ba4bb325e521c5af5b5',2018-09-08 23:30:44+00:00,1038570361493893120,True,A Google Engineer Discovered a Vulnerability Letting Him Take Control of Keycard-Controlled Doors https://t.co/fOxCxwDHoc,"{'created_at': 'Sat Sep 08 23:30:44 +0000 2018', 'id': 1038570361493893120, 'id_str': '1038570361493893120', 'text': 'A Google Engineer Discovered a Vulnerability Letting Him Take Control of Keycard-Controlled Doors https://t.co/fOxCxwDHoc', 'source': '<a href=""https://sproutsocial.com"" rel=""nofollow"">Sprout Social</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1379830590, 'id_str': '1379830590', 'name': 'Migo', 'screen_name': 'MigoKed', 'location': 'United States', 'url': None, 'description': 'Building security products. Passionate about #infosec, product management and yoga. \n#prodmgmt #cyber #security #malware #exploit #ransomware #infosec', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 776, 'friends_count': 588, 'listed_count': 80, 'favourites_count': 4139, 'statuses_count': 22332, 'created_at': 'Thu Apr 25 16:30:21 +0000 2013', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': 'en', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': '59472F', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '52555C', 'profile_sidebar_border_color': '000515', 'profile_sidebar_fill_color': '061127', 'profile_text_color': '827972', 'profile_use_background_image': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1018600418409140227/4FfWiIAa_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1018600418409140227/4FfWiIAa_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/1379830590/1531605871', 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'entities': {'hashtags': [], 'urls': [{'url': 'https://t.co/fOxCxwDHoc', 'expanded_url': 'http://bit.ly/2oTuruG', 'display_url': 'bit.ly/2oTuruG', 'indices': [98, 121]}], 'user_mentions': [], 'symbols': []}, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'filter_level': 'low', 'lang': 'en', 'timestamp_ms': '1536449444541'}",vulnerability,"{'usage': {'text_units': 1, 'text_characters': 121, 'features': 2}, 'language': 'en', 'entities': [{'type': 'Company', 'text': 'Google', 'relevance': 0.33, 'disambiguation': {'subtype': ['AcademicInstitution', 'AwardPresentingOrganization', 'OperatingSystemDeveloper', 'ProgrammingLanguageDeveloper', 'SoftwareDeveloper', 'VentureFundedCompany'], 'name': 'Google', 'dbpedia_resource': 'http://dbpedia.org/resource/Google'}, 'count': 1}, {'type': 'JobTitle', 'text': 'Engineer', 'relevance': 0.33, 'count': 1}], 'categories': [{'score': 0.432962, 'label': '/home and garden/gardening and landscaping/yard and patio'}, {'score': 0.382486, 'label': '/technology and computing/computer security/network security'}, {'score': 0.274921, 'label': '/technology and computing'}]}",threat,['http://bit.ly/2oTuruG'],http://bit.ly/2oTuruG,True
4,20246,b'5b976074bb325e7a2e19a22e',2018-09-11 06:28:03+00:00,1039400157610041345,False,"""Despite huge progress in the #vulnerability disclosure process, things remain broken when it comes to vendor-resea… https://t.co/3069kTuLj7","{'created_at': 'Tue Sep 11 06:28:03 +0000 2018', 'id': 1039400157610041345, 'id_str': '1039400157610041345', 'text': '""Despite huge progress in the #vulnerability disclosure process, things remain broken when it comes to vendor-resea… https://t.co/3069kTuLj7', 'display_text_range': [0, 140], 'source': '<a href=""https://buffer.com"" rel=""nofollow"">Buffer</a>', 'truncated': True, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 985141348910096384, 'id_str': '985141348910096384', 'name': 'Armada Innovation Labs', 'screen_name': 'armadainnolabs', 'location': 'Manila City', 'url': 'http://armadalabs.tech', 'description': 'Cyber Security Experts in the Philippines', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 20, 'friends_count': 34, 'listed_count': 0, 'favourites_count': 0, 'statuses_count': 47, 'created_at': 'Sat Apr 14 13:02:55 +0000 2018', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': 'en', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': '000000', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '1B95E0', 'profile_sidebar_border_color': '000000', 'profile_sidebar_fill_color': '000000', 'profile_text_color': '000000', 'profile_use_background_image': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1011244703298154496/WjjNH3GH_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1011244703298154496/WjjNH3GH_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/985141348910096384/1529934581', 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None}, 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'extended_tweet': {'full_text': '""Despite huge progress in the #vulnerability disclosure process, things remain broken when it comes to vendor-researcher relationships."" #cybersecurity #cybercrime https://t.co/qyJrmB4Cv5 https://t.co/TNl2yiN4k1', 'display_text_range': [0, 187], 'entities': {'hashtags': [{'text': 'vulnerability', 'indices': [30, 44]}, {'text': 'cybersecurity', 'indices': [137, 151]}, {'text': 'cybercrime', 'indices': [152, 163]}], 'urls': [{'url': 'https://t.co/qyJrmB4Cv5', 'expanded_url': 'https://buff.ly/2M3HDWS', 'display_url': 'buff.ly/2M3HDWS', 'indices': [164, 187]}], 'user_mentions': [], 'symbols': [], 'media': [{'id': 1039400155642880000, 'id_str': '1039400155642880000', 'indices': [188, 211], 'media_url': 'http://pbs.twimg.com/media/Dmyw70CUYAA5EjL.jpg', 'media_url_https': 'https://pbs.twimg.com/media/Dmyw70CUYAA5EjL.jpg', 'url': 'https://t.co/TNl2yiN4k1', 'display_url': 'pic.twitter.com/TNl2yiN4k1', 'expanded_url': 'https://twitter.com/armadainnolabs/status/1039400157610041345/photo/1', 'type': 'photo', 'sizes': {'medium': {'w': 700, 'h': 533, 'resize': 'fit'}, 'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'large': {'w': 700, 'h': 533, 'resize': 'fit'}, 'small': {'w': 680, 'h': 518, 'resize': 'fit'}}}]}, 'extended_entities': {'media': [{'id': 1039400155642880000, 'id_str': '1039400155642880000', 'indices': [188, 211], 'media_url': 'http://pbs.twimg.com/media/Dmyw70CUYAA5EjL.jpg', 'media_url_https': 'https://pbs.twimg.com/media/Dmyw70CUYAA5EjL.jpg', 'url': 'https://t.co/TNl2yiN4k1', 'display_url': 'pic.twitter.com/TNl2yiN4k1', 'expanded_url': 'https://twitter.com/armadainnolabs/status/1039400157610041345/photo/1', 'type': 'photo', 'sizes': {'medium': {'w': 700, 'h': 533, 'resize': 'fit'}, 'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'large': {'w': 700, 'h': 533, 'resize': 'fit'}, 'small': {'w': 680, 'h': 518, 'resize': 'fit'}}}]}}, 'quote_count': 0, 'reply_count': 0, 'retweet_count': 0, 'favorite_count': 0, 'entities': {'hashtags': [{'text': 'vulnerability', 'indices': [30, 44]}], 'urls': [{'url': 'https://t.co/3069kTuLj7', 'expanded_url': 'https://twitter.com/i/web/status/1039400157610041345', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}], 'user_mentions': [], 'symbols': []}, 'favorited': False, 'retweeted': False, 'possibly_sensitive': False, 'filter_level': 'low', 'lang': 'en', 'timestamp_ms': '1536647283352'}",vulnerability,"{'usage': {'text_units': 1, 'text_characters': 140, 'features': 2}, 'language': 'en', 'entities': [{'type': 'Hashtag', 'text': '#vulnerability', 'relevance': 0.01, 'count': 1}], 'categories': [{'score': 0.175489, 'label': '/technology and computing/operating systems'}, {'score': 0.156594, 'label': '/technology and computing/software/databases'}, {'score': 0.144681, 'label': '/finance/accounting and auditing'}]}",business,['https://twitter.com/i/web/status/1039400157610041345'],https://twitter.com/i/web/status/1039400157610041345,True


In [166]:
test_filter = test_filter.assign(
    tweet=lambda df: df["tweet"].apply(func=ast.literal_eval),
    watson=lambda df: df["watson"].apply(func=ast.literal_eval),
    urls=lambda df: df["urls"].apply(func=ast.literal_eval),
    watson_list=lambda df: df["watson_list"].apply(func=ast.literal_eval),
    target=lambda df: df["target"].apply(func=ast.literal_eval),
)
test_filter['text'] = [replace_text_components(t) for t in test_filter['text']]

test_filter = test_filter.assign(
    target=lambda df: df["target"].apply(func=map_targets, args=(
        {
            'computer security': 'computer security',
            'operating systems': 'operating systems',
            'software': 'software',
            'programming languages': 'software',
            'hardware': 'hardware',
            'electronic components': 'hardware',
            'networking': 'networking',
            'internet technology': 'networking',
            'network security': 'network security',
            'antivirus and malware': 'antivirus and malware',
            'mac os': 'mac os',
            'windows': 'windows',
            'unix': 'unix',
            'linux': 'linux',
            'databases': 'databases',
            'computer': 'computer',
            'computer components': 'computer components',
            'computer networking': 'computer networking'
        }, ))
)

print(f"Threat Tweets: {len(test_filter)}")
test_filter.head()

KeyError: 'watson_list'

In [66]:
texts = test_tweets['text']
targets = test_tweets['target']

In [146]:
import joblib

filter = joblib.load(filename="../models/filter.pkl")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [124]:
with open("../models/GENERAL.pkl", "rb") as f:
    general = pickle.load(f)
    general.mlb.fit(targets)
    general.scaler_ = preprocessing.StandardScaler().fit(preprocess_texts(
        list_str=texts,
        model_path='basel/ATTACK-BERT',
        embedding_dim=None,
    ))

In [129]:
for idx in general.models:
    general.models[idx]["model"].scaler_ = general.scaler_

In [68]:
with open("../models/COMPUTER.SECURITY.pkl", "rb") as f:
    cs = pickle.load(f)

In [69]:
with open("../models/OPERATING.SYSTEMS.pkl", "rb") as f:
    os = pickle.load(f)

In [70]:
with open("../models/SOFTWARE.pkl", "rb") as f:
    sw = pickle.load(f)

In [71]:
with open("../models/HARDWARE.pkl", "rb") as f:
    hw = pickle.load(f)

In [72]:
#ATT&CK-BERT, DistilRoBERTa, DistilRoBERTa, DistilRoBERTa, ATT&CK-BERT

In [161]:
class System(BaseEstimator, ClassifierMixin):
    def __init__(self, filter, general, cs, os, sw, hw):
        self.filter = filter
        self.general = general  # ATT&CK-BERT
        self.cs = cs  # DistilRoBERTa
        self.os = os  # DistilRoBERTa
        self.sw = sw  # DistilRoBERTa
        self.hw = hw  # ATT&CK-BERT

    def predict(self, texts: pd.Series) -> ArrayLike:
        X_SBERT = preprocess_texts(
            list_str=texts,
            model_path='sentence-transformers/all-mpnet-base-v2',
            embedding_dim=None,
        )
        general_dim = len(self.general.mlb.classes_)
        cs_dim = len(self.cs.mlb.classes_) - 1
        os_dim = len(self.os.mlb.classes_) - 1
        sw_dim = len(self.sw.mlb.classes_) - 1
        hw_dim = len(self.hw.mlb.classes_) - 1
        Y_pred = np.zeros((texts.shape[0], general_dim + cs_dim + os_dim + sw_dim + hw_dim))
        X_ATTACK_BERT = preprocess_texts(
            list_str=texts,
            model_path='basel/ATTACK-BERT',
            embedding_dim=None,
        )
        X_DistilRoBERTa = preprocess_texts(
            list_str=texts,
            model_path='sentence-transformers/all-distilroberta-v1',
            embedding_dim=None,
        )
        y_filter = self.filter.predict(X_SBERT)
        y_general = self.general.predict(X_ATTACK_BERT)
        Y_pred[:, :general_dim] = y_general
        for idx, (t, f) in enumerate(zip(self.general.mlb.inverse_transform(y_general), y_filter)):
            if not False:
                if 'computer security' in t:
                    y_cs = self.cs.predict(X_DistilRoBERTa[idx:idx + 1])
                    Y_pred[idx, general_dim:general_dim + cs_dim] = y_cs[:, :cs_dim]
                if 'operating systems' in t:
                    y_os = self.os.predict(X_DistilRoBERTa[idx:idx + 1])
                    Y_pred[idx, general_dim + cs_dim:general_dim + cs_dim + os_dim] = np.hstack((y_os[:, :2], y_os[:, 3:]))
                if "software" in t:
                    y_sw = self.sw.predict(X_DistilRoBERTa[idx:idx + 1])
                    Y_pred[idx, general_dim + cs_dim + os_dim:general_dim + cs_dim + os_dim + sw_dim] = y_sw[:, :sw_dim]
                if "hardware" in t:
                    y_hw = self.hw.predict(X_ATTACK_BERT[idx:idx + 1])
                    Y_pred[idx, general_dim + cs_dim + os_dim + sw_dim:] = y_hw[:, :hw_dim]
            else:
                Y_pred[idx, :] = np.zeros(Y_pred.shape[1])
                Y_pred[idx, 4] = 1.0
        return Y_pred

    def predict_targets(self, texts: pd.Series) -> pd.Series:
        general_dim = len(self.general.mlb.classes_)
        cs_dim = len(self.cs.mlb.classes_) - 1
        os_dim = len(self.os.mlb.classes_) - 1
        sw_dim = len(self.sw.mlb.classes_) - 1
        Y_pred = self.predict(texts)
        y_general = Y_pred[:, :general_dim]
        y_cs = Y_pred[:, general_dim:general_dim + cs_dim]
        y_os = Y_pred[:, general_dim + cs_dim:general_dim + cs_dim + os_dim]
        y_sw = Y_pred[:, general_dim + cs_dim + os_dim:general_dim + cs_dim + os_dim + sw_dim]
        y_hw = Y_pred[:, general_dim + cs_dim + os_dim + sw_dim:]
        targets_general = self.general.mlb.inverse_transform(y_general)
        targets_cs = self.cs.mlb.inverse_transform(np.hstack((y_cs, np.zeros((y_cs.shape[0], 1)))))
        targets_os = self.os.mlb.inverse_transform(np.hstack((y_os[:, :2], np.zeros((y_os.shape[0], 1)), y_os[:, 2:])))
        targets_sw = self.sw.mlb.inverse_transform(np.hstack((y_sw, np.zeros((y_sw.shape[0], 1)))))
        targets_hw = self.hw.mlb.inverse_transform(np.hstack((y_hw, np.zeros((y_hw.shape[0], 1)))))
        return pd.Series(data=[list(set(target_general + target_cs + target_os + target_sw + target_hw)) for target_general, target_cs, target_os, target_sw, target_hw in zip(targets_general, targets_cs, targets_os, targets_sw, targets_hw)], index=texts.index)

    def evaluate(self, texts: pd.Series, targets: pd.Series) -> dict[str, float]:
        targets_general = self.general.mlb.transform(targets)
        targets_cs = self.cs.mlb.transform(targets)
        targets_os = self.os.mlb.transform(targets)
        targets_sw = self.sw.mlb.transform(targets)
        targets_hw = self.hw.mlb.transform(targets)
        Y_test = np.hstack((targets_general, targets_cs[:, :-1], targets_os[:, :2], targets_os[:, 3:], targets_sw[:, :-1], targets_hw[:, :-1]))
        Y_pred = self.predict(texts)
        return assess(Y_test, Y_pred)

    def classes(self):
        return list(self.general.mlb.classes_) + list(self.cs.mlb.classes_[:-1]) + list(self.os.mlb.classes_[:2]) + list(self.os.mlb.classes_[3:]) + list(self.sw.mlb.classes_[:-1]) + list(self.hw.mlb.classes_[:-1])


solution = System(filter, general, cs, os, sw, hw)

In [164]:
pd.DataFrame([solution.predict_targets(texts[100:110]), targets[100:110], texts[100:110]], index=["Predicted", "Actual", "Text"]).transpose()

Unnamed: 0,Predicted,Actual,Text
100,"[computer security, antivirus and malware]","[computer security, antivirus and malware]",My dad and 02 are they DDoS protection provider as well we don't like i deactivated temporarily if u ever notice how to be ugly word I may
101,"[computer, software, databases, hardware]","[computer, software, databases, hardware]",CVE-2018-3952 An exploitable code execution vulnerability exists in the connect functionality of NordVPN 6.14.28.0.…
102,[],"[network security, computer security, antivirus and malware]",A Real-world Deep Dive into the Top Cloud Threats - ➱ LinkedIn (The Top Threats: Data Breaches; Insufficient Identi…
103,"[computer security, antivirus and malware]","[computer, computer security, hardware, antivirus and malware]","So, . Why is this Blue Botnet C2 still alive? hxxp://softnew[.]website/"
104,[software],[software],"There is a vulnerability in IBM Java SDK Technology Edition, Version 8 used by Transparent Cloud Tiering. This issu…"
105,"[computer, software, hardware]","[computer, software, hardware]",Exploit Code POC Published for Intel Chipset Vulnerability - Latest Hacking News by
106,"[software, databases, computer security, antivirus and malware]","[software, databases, computer security, antivirus and malware]",Indian Society of Tele Dermatology Insted SQL Injection Vulnerability
107,"[software, network security, computer security]","[software, network security, computer security]",Sony Global - Software Vulnerability Prevention Initiative\n
108,"[computer security, antivirus and malware]","[computer security, antivirus and malware]",New Fallout Exploit Kit Drops GandCrab Ransomware or Redirects to PUPs. ⁦⁩
109,[other],[software],"RT druvainc ""RT TeamWebinar: Join us to learn ""How Hatco Leverages AWS and Druva to Beat Ransomware"" on Sept 21st.…"


In [153]:
system_performance = solution.evaluate(texts[:100], targets[:100])
print("==========================================================")
print("System Performance")
print("==========================================================")
print(f"Accuracy:\t{system_performance['accuracy'] * 100:.2f}%")
print(f"Hamming Loss:\t{system_performance['hamming_loss']:.3f}")
report = pd.DataFrame(system_performance["report"])
report.columns = solution.classes() + ["micro avg", "macro avg", "weighted avg", "samples avg"]
report = (report.transpose().map(lambda x: f"{x:.2f}" if isinstance(x, float) else x).to_string())
print(f"{report}")
print("==========================================================\n")



System Performance
Accuracy:	19.00%
Hamming Loss:	0.193
                      precision recall f1-score support   auc
computer security          0.83   0.09     0.16   57.00  0.53
hardware                   0.00   0.00     0.00   20.00  0.50
networking                 1.00   0.12     0.22    8.00  0.56
operating systems          0.00   0.00     0.00   11.00  0.50
other                      0.18   0.94     0.30   17.00  0.52
software                   1.00   0.05     0.09   41.00  0.52
antivirus and malware      1.00   0.05     0.10   39.00  0.53
network security           0.75   0.14     0.23   22.00  0.56
linux                      0.00   0.00     0.00    7.00  0.50
mac os                     0.00   0.00     0.00    0.00   nan
unix                       0.00   0.00     0.00    1.00  0.50
windows                    0.00   0.00     0.00    3.00  0.50
databases                  0.00   0.00     0.00   14.00  0.50
computer                   0.00   0.00     0.00   18.00  0.50
computer compo



In [136]:
pd.DataFrame(data=solution.predict(pd.Series(["There are rootkits threats ongoing in the wild, and they're infecting the AMD CPUs when you open an EXE file attackers send via mail. Pay attention"])), columns=solution.classes())

Unnamed: 0,computer security,hardware,networking,operating systems,other,software,antivirus and malware,network security,linux,mac os,unix,windows,databases,computer,computer components,computer networking
0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
