# System Classifier

## 1. Imports

In [1]:
import ast
import random
import re
import string
from pathlib import Path
from typing import Any, Union

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.preprocessing import MultiLabelBinarizer

from preprocess_functions import build_tree, extract_keys, map_targets, pos_tagging
from utils import tokenizer_transform, replace_text_components, clean_text

[nltk_data] Downloading package punkt to /home/cela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/cela/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/cela/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/cela/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to /home/cela/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-02-12 11:50:37.404073: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow 

## 2. Configurations & Constants

### Global Configuration Constants

In [2]:
OVERWRITE = True
RANDOM_STATE = 42

### Set Random Seeds

In [3]:
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

### Category Mappings

In [26]:
FIX_TARGETS = {
    'computer security': 'computer security',
    'operating systems': 'operating systems',
    'software': 'software',
    'programming languages': 'software',
    'hardware': 'hardware',
    'electronic components': 'hardware',
    'networking': 'networking',
    'internet technology': 'networking',
    #'network security': 'network security',
    'antivirus and malware': 'antivirus and malware',
    #'mac os': 'mac os',
    #'windows': 'windows',
    #'unix': 'unix',
    #'linux': 'linux',
    #'databases': 'databases',
    #'computer': 'computer',
    #'computer components': 'computer components',
    #'computer networking': 'computer networking',
    #'servers': 'servers',
    #'portable computer': 'portable computer',
    #'desktop computer': 'desktop computer',
    #'router': 'router',
    #'wireless technology': 'wireless technology'
}

### Directory Paths

In [27]:
COLAB_DIR = Path('/content/drive/MyDrive')
KAGGLE_DIR = Path('/kaggle/input')
LOCAL_DIR = Path('./')

try:
    import google.colab

    IN_COLAB = True
except ImportError:
    IN_COLAB = False

try:
    import kaggle_secrets

    IN_KAGGLE = True
except ImportError:
    IN_KAGGLE = False

if IN_COLAB:
    DATA_DIR = COLAB_DIR / 'data'
    MODELS_DIR = COLAB_DIR / 'models'
elif IN_KAGGLE:
    DATA_DIR = KAGGLE_DIR
    MODELS_DIR = KAGGLE_DIR
else:
    DATA_DIR = LOCAL_DIR / 'data'
    MODELS_DIR = LOCAL_DIR / 'models'

### Dataset & Model Paths

In [28]:
FILTER_PKL = Path('models/filter.pkl')
GENERAL_PKL = Path('models/general.pkl')
COMPUTER_PKL = Path('models/computer.pkl')
COMPUTER_NETWORKING_PKL = Path('models/computer_networking.pkl')
#COMPUTER_SECURITY_PKL = Path('models/computer_security.pkl')
COMPUTER_SECURITY_PKL = Path('models/fine_grained_computer_security.pkl')
HARDWARE_PKL = Path('models/hardware.pkl')
OPERATING_SYSTEMS_PKL = Path('models/operating_systems.pkl')
SOFTWARE_PKL = Path('models/software.pkl')

BINARY_PROBLEMS_DIR = MODELS_DIR / 'binary_problems'
ENSEMBLES_DIR = MODELS_DIR / 'ensembles'
MULTICLASS_PROBLEMS_DIR = MODELS_DIR / 'multiclass_problems'
GLOVE_6B_DIR = MODELS_DIR / 'glove-embeddings'
GLOVE_6B_300D_TXT = GLOVE_6B_DIR / 'glove.6B.300d.txt'

DATASET_DIR = DATA_DIR / 'tweets-dataset-for-cyberattack-detection'
THREAT_TWEETS_CSV = DATASET_DIR / 'tweets_test.csv'

## 3. Data Loading & Preprocessing

The general classifier detects the classes of a text among:
- computer security
- operating systems
- software
- hardware
- networking
- other

The "computer security" classifier detects the classes of a text already labeled as "computer security" among:
- antivirus and malware
- network security

The "operating systems" classifier detects the classes of a text already labeled as "operating systems" among:
- linux
- mac os
- unix
- windows

The "software" classifier detects the classes of a text already labeled as "software" among:
- databases

The "hardware" classifier detects the classes of a text already labeled as "hardware" among:
- computer
- computer components
- computer networking

The "computer" classifier detects the classes of a text already labeled as "computer" among:
- servers
- portable computer
<!-- - desktop computer

The "computer networking" classifier detects the classes of a text already labeled as "computer networking" among:
- router
<!-- - wireless technology

In [29]:
tweets = (
    pd.read_csv(filepath_or_buffer=THREAT_TWEETS_CSV)
    .assign(
        tweet=lambda df: df['tweet'].apply(func=ast.literal_eval),
        watson=lambda df: df['watson'].apply(func=ast.literal_eval)
        .apply(func=lambda x: x.get('categories', []))
        .apply(func=build_tree),
        watson_list=lambda df: df['watson'].apply(func=extract_keys),
        target=lambda df: df['watson_list'].apply(func=map_targets, args=(FIX_TARGETS,)),
        pos_tags=lambda df: df['text'].apply(func=pos_tagging)
    )
)

print(f"Threat Tweets: {len(tweets)}")
tweets.head()

Threat Tweets: 2223


Unnamed: 0,_id,date,id,relevant,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list,target,pos_tags
0,b'5b952d42bb325e521c5afd7b',2018-09-09 14:25:06+00:00,1038795435564048385,True,"@Opulxx @Jirxeh @JakeZenith @n9ire Yeah same, ...",{'created_at': 'Sun Sep 09 14:25:06 +0000 2018...,botnet,{'technology and computing': {'computer securi...,threat,[],,,"[technology and computing, computer security, ...","[antivirus and malware, computer security]","[(@, ADJ), (Opulxx, NOUN), (@, NOUN), (Jirxeh,..."
1,b'5b92b148bb325e521c5ae1f7',2018-09-07 17:11:36+00:00,1038112561240174592,True,What is a Distributed Denial of Service (DDoS)...,{'created_at': 'Fri Sep 07 17:11:36 +0000 2018...,ddos,{'technology and computing': {'computer securi...,,['https://ift.tt/2wSnU7f'],https://www.varonis.com/blog/what-is-a-ddos-at...,True,"[technology and computing, computer security, ...","[antivirus and malware, computer security]","[(What, PRON), (is, VERB), (a, DET), (Distribu..."
2,b'5b9260d0bb325e7007188503',2018-09-07 11:28:15+00:00,1038026156107358210,True,Troldesh Ransomware Spreading Via Weaponized W...,{'created_at': 'Fri Sep 07 11:28:15 +0000 2018...,ransomware,{'technology and computing': {'networking': {'...,threat,['https://gbhackers.com/troldesh-ransomware-wo...,https://gbhackers.com/troldesh-ransomware-word...,True,"[technology and computing, networking, vpn and...","[networking, hardware]","[(Troldesh, NOUN), (Ransomware, NOUN), (Spread..."
3,b'5b950a4cbb325e521c5afbe5',2018-09-09 11:55:55+00:00,1038757891942088705,True,#cybercriminalite : Nestled in hacked sites–Ne...,{'created_at': 'Sun Sep 09 11:55:55 +0000 2018...,ransomware,{'technology and computing': {'computer securi...,threat,['https://twitter.com/i/web/status/10387578919...,https://twitter.com/i/web/status/1038757891942...,True,"[technology and computing, computer security, ...","[antivirus and malware, computer security, har...","[(#, .), (cybercriminalite, NOUN), (:, .), (Ne..."
4,b'5b93032dbb325e521c5ae749',2018-09-07 23:01:00+00:00,1038200492411822081,True,O no rest of wondering how to animals b i don’...,{'created_at': 'Fri Sep 07 23:01:00 +0000 2018...,ddos,{'technology and computing': {'computer securi...,threat,[],,,"[technology and computing, computer security, ...","[antivirus and malware, computer security]","[(O, NOUN), (no, DET), (rest, NOUN), (of, ADP)..."


## 5. Final Classifier Definition


In [31]:
class ClassifierSystem:
    def __init__(self, models: dict[str, (BaseEstimator, list[str])], embedder_path: str, embedding_dim: Any):
        self.models = models
        self.embedder_path = embedder_path
        self.embedding_dim = embedding_dim

    def predict(self, x):
        x_emb = self.preprocess_texts(list_str=x)
        filter_predict = self.models['filter']['model'].predict(x_emb)

        translate = lambda predictions, classes: [[
            classes[yi] for yi in y_hat
        ] for y_hat in [
            np.where(y_hat == 1)[0]
            for y_hat in predictions
        ]]

        general_predict = translate(
            predictions=self.models['general_classifier']['model'].predict(x_emb),
            classes=self.models['general_classifier']['classes']
        )
        y_prediction = general_predict.copy()

        for idx, yp in enumerate(general_predict):
            y_prediction_i = set(yp)
            if 'computer security' in y_prediction_i:
                y_prediction_i = set(translate(
                    predictions=self.models['computer_security_classifier']['model'].predict(x_emb),
                    classes=self.models['computer_security_classifier']['classes']
                )[0]) - {'other'} | set(y_prediction_i)
            #if 'operating systems' in y_prediction_i:
            #    y_prediction_i = set(translate(
            #        predictions=self.models['operating_systems_classifier']['model'].predict(x_emb),
            #        classes=self.models['operating_systems_classifier']['classes']
            #    )[0]) - {'other'} | set(y_prediction_i)
            #if 'software' in y_prediction_i:
            #    y_prediction_i = set(translate(
            #        predictions=self.models['software_classifier']['model'].predict(x_emb),
            #        classes=self.models['software_classifier']['classes']
            #    )[0]) - {'other'} | set(y_prediction_i)
            #if 'hardware' in y_prediction_i:
            #    y_prediction_i = set(translate(
            #        predictions=self.models['hardware_classifier']['model'].predict(x_emb),
            #        classes=self.models['hardware_classifier']['classes']
            #    )[0]) - {'other'} | set(y_prediction_i)
            #if 'computer' in y_prediction_i:
            #    y_prediction_i = set(translate(
            #        predictions=self.models['computer_classifier']['model'].predict(x_emb),
            #        classes=self.models['computer_classifier']['classes']
            #    )[0]) - {'other'} | set(y_prediction_i)
            #if 'computer networking' in y_prediction_i:
            #    y_prediction_i = set(translate(
            #        predictions=self.models['computer_networking_classifier']['model'].predict(x_emb),
            #        classes=self.models['computer_networking_classifier']['classes']
            #    )[0]) - {'other'} | set(y_prediction_i)

            y_prediction[idx] = list(y_prediction_i)

        for idx in range(len(filter_predict)):
            if not filter_predict[idx]:
                y_prediction[idx] = ['other']
        return y_prediction

    def load_embedder_dict(self) -> dict[Union[str, list[str]], np.ndarray[Any, np.dtype]]:
        embeddings_dict = {}
        f = open(self.embedder_path, 'r', encoding='utf-8')

        for line in f:
            values = line.split()
            word = values[:-self.embedding_dim]

            if type(word) is list:
                word = ' '.join(word)

            vector = np.asarray([float(val) for val in values[-self.embedding_dim:]])
            embeddings_dict[word] = vector

        f.close()

        return embeddings_dict

    def preprocess_texts(self, list_str: pd.Series) -> np.ndarray[Any, np.dtype]:
        if self.embedding_dim is None:
            return tokenizer_transform(
                x=list_str,
                embedder_addr=self.embedder_path,
                preprocessing_list=[replace_text_components, clean_text]
            )

        embedder = self.load_embedder_dict()
        list_embedded_str = np.zeros((len(list_str), self.embedding_dim))

        for i, text in enumerate(list_str):
            tokens = re.findall(r'\w+|[{}]'.format(re.escape(string.punctuation)), text)

            for token in tokens:
                try:
                    list_embedded_str[i] += embedder[token.lower()]
                except KeyError:
                    continue

        return list_embedded_str

In [32]:
models_dict = {
    'filter': {
        'model': joblib.load(filename=FILTER_PKL),
        'classes': ['not relevant', 'relevant']
    },
    'general_classifier': {
        'model': joblib.load(filename=GENERAL_PKL),
        'classes': ['computer security', 'hardware', 'networking', 'operating systems', 'other', 'software'],
    },
    'computer_security_classifier': {
        'model': joblib.load(filename=COMPUTER_SECURITY_PKL),
        #'classes': ['antivirus and malware', 'network security', 'other']
        'classes': ['antivirus and malware', 'other']
    },
    #'operating_systems_classifier': {
    #    'model': joblib.load(filename=OPERATING_SYSTEMS_PKL),
    #    'classes': ['linux', 'mac os', 'other', 'unix', 'windows']
    #},
    #'software_classifier': {
    #    'model': joblib.load(filename=SOFTWARE_PKL),
    #    'classes': ['databases', 'other']
    #},
    #'hardware_classifier': {
    #    'model': joblib.load(filename=HARDWARE_PKL),
    #    'classes': ['computer', 'computer components', 'computer networking', 'other']
    #},
    #'computer_classifier': {
    #    'model': joblib.load(filename=COMPUTER_PKL),
    #    'classes': ['desktop computer', 'other', 'portable computer', 'servers', ]
    #},
    #'computer_networking_classifier': {
    #    'model': joblib.load(filename=COMPUTER_NETWORKING_PKL),
    #    'classes': ['router', 'wireless technology']
    #}
}

In [33]:
system = ClassifierSystem(
    models=models_dict,
    embedder_path='sentence-transformers/all-mpnet-base-v2',
    embedding_dim=None
)

In [34]:
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_auc_score

from sklearn.metrics import classification_report

end = len(tweets)
mlb = MultiLabelBinarizer()
y_test = mlb.fit_transform(y=tweets['target'][:end])
y_hat_raw = system.predict(tweets['text'][:end])
y_hat = mlb.transform(y_hat_raw)

In [35]:
print(classification_report(
    y_true=y_test,
    y_pred=y_hat,
    target_names=mlb.classes_,
    zero_division=0
))

print(f"Accuracy\t{accuracy_score(y_true=y_test, y_pred=y_hat) * 100:.2f}")
print(f"AUC\t{roc_auc_score(y_true=y_test, y_score=y_hat) * 100:.2f}")

                       precision    recall  f1-score   support

antivirus and malware       0.65      0.80      0.72       857
    computer security       0.83      0.76      0.80      1164
             hardware       0.81      0.38      0.52       429
           networking       0.84      0.35      0.49       176
    operating systems       0.94      0.58      0.71       227
                other       0.60      0.67      0.63       472
             software       0.86      0.65      0.74       822

            micro avg       0.75      0.67      0.71      4147
            macro avg       0.79      0.60      0.66      4147
         weighted avg       0.78      0.67      0.70      4147
          samples avg       0.66      0.66      0.64      4147

Accuracy	45.52
AUC	75.22


In [36]:
print(system.predict(pd.Series(['Microsoft Windows Storage contains a link following vulnerability that could allow for privilege escalation. This vulnerability could allow an attacker to delete data including data that results in the service being unavailable.'])))
print(system.predict(pd.Series(['Microsoft Windows Ancillary Function Driver for WinSock contains a heap-based buffer overflow vulnerability that allows for privilege escalation, enabling a local attacker to gain SYSTEM privileges. '])))
print(system.predict(pd.Series(['Multiple Zyxel DSL CPE devices contain a post-authentication command injection vulnerability in the CGI program that could allow an authenticated attacker to execute OS commands via a crafted HTTP request. '])))
print(system.predict(pd.Series(['7-Zip contains a protection mechanism failure vulnerability that allows remote attackers to bypass the Mark-of-the-Web security feature to execute arbitrary code in the context of the current user. '])))
print(system.predict(pd.Series(['Microsoft Outlook contains an improper input validation vulnerability that allows for remote code execution. Successful exploitation of this vulnerability would allow an attacker to bypass the Office Protected View and open in editing mode rather than protected mode. '])))

[[]]
[['antivirus and malware', 'computer security', 'hardware', 'operating systems']]
[['hardware']]
[['software']]
[['antivirus and malware', 'computer security', 'software']]


In [37]:
joblib.dump(system, 'systems.pkl', compress=9)

['systems.pkl']

# Anastasia's Dataset

In [38]:
systems = joblib.load('systems.pkl')

In [2]:
anastasia = pd.read_csv(filepath_or_buffer='data/anastasia.csv')

print(f"Threat Tweets: {len(anastasia)}")
anastasia.head()

Threat Tweets: 24474


Unnamed: 0,full_text,lang,Related
0,Improperly Implemented Security Check vulnerab...,en,1
1,profanity through 1.60 has only four billion p...,en,1
2,Cross-Site Request Forgery (CSRF) in GitHub re...,en,1
3,gailcosme the art of love is largely th,en,0
4,Pimcore is an Open Source Data & Experience Ma...,en,1


In [7]:
anastasia.keys()

Index(['full_text', 'lang', 'Related'], dtype='object')

In [6]:
from collections import Counter
import numpy as np
class_freq = Counter([item for item in anastasia['Related']])

print("Target Frequencies:")
for target, freq in class_freq.most_common():
    print(f"{target}: {freq}")

Target Frequencies:
1: 17468
0: 7006


In [40]:
anastasia = anastasia[anastasia['lang'] == 'en']

In [42]:
anastasia['categories'] = system.predict(anastasia['full_text'])

In [43]:
anastasia.head()

Unnamed: 0,full_text,lang,Related,categories
0,Improperly Implemented Security Check vulnerab...,en,1,"[computer security, hardware]"
1,profanity through 1.60 has only four billion p...,en,1,[computer security]
2,Cross-Site Request Forgery (CSRF) in GitHub re...,en,1,[software]
3,gailcosme the art of love is largely th,en,0,[other]
4,Pimcore is an Open Source Data & Experience Ma...,en,1,[software]


In [44]:
anastasia.to_csv(path_or_buf='data/anastasia_with_categories.csv')

In [49]:
anastasia.tail(40)

Unnamed: 0,full_text,lang,Related,categories
24432,URL Confusion When Scheme Not Supplied in GitH...,en,1,[other]
24433,TP-LINK TL-WR840N(ES)_V6.20 was discovered to ...,en,1,"[hardware, software]"
24434,Tenda AC6 v15.03.05.09_multi was discovered to...,en,1,[software]
24435,rt hizarther also i just found this concept...,en,0,[other]
24436,robschneider if you need any kind of line art...,en,0,[other]
24437,A flaw was found in moodle where ID numbers di...,en,1,[software]
24438,CSCMS Music Portal System v4.2 was discovered ...,en,1,"[computer security, software]"
24439,Foxit PDF Reader before 11.2.2 and PDF Editor ...,en,1,[]
24440,A CWE-669: Incorrect Resource Transfer Between...,en,1,[]
24441,There is a flaw in convert2rhel. When the --ac...,en,1,[]


# X + Reddit Dataset