In [1]:
import ast
import random
import re
import string
from pathlib import Path
from typing import Any, Union

import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from preprocess_functions import build_tree, extract_keys, preprocess_texts

OVERWRITE = False
RANDOM_STATE = 42
INIT_POINTS = 1
N_ITER = 5
TEST_SIZE = 2e-1
COLAB_PATH = Path('/content/drive/MyDrive')
KAGGLE_PATH = Path('/kaggle/input')
LOCAL_PATH = Path('./')

np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

try:
    import google.colab

    DATA_PATH = COLAB_PATH / Path('data')
    MODELS_PATH = COLAB_PATH / Path('models')
except ImportError:
    try:
        import kaggle_secrets

        DATA_PATH = KAGGLE_PATH
        MODELS_PATH = KAGGLE_PATH
    except ImportError:
        DATA_PATH = LOCAL_PATH / Path('data')
        MODELS_PATH = LOCAL_PATH / Path('models')

GLOVE_6B_PATH = MODELS_PATH / Path('glove-embeddings')
THREAT_TWEETS_PATH = DATA_PATH / Path('tweets-dataset-for-cyberattack-detection')

GLOVE_6B_300D_TXT = GLOVE_6B_PATH / Path('glove.6B.300d.txt')
THREAT_TWEETS_CSV = THREAT_TWEETS_PATH / Path('tweets_final.csv')


In [2]:
BASE_CLASSIFIERS = {
    'logistic_regression': LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
    'gaussian_nb': GaussianNB(),
    'decision_tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'random_forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'xgb': XGBClassifier(random_state=RANDOM_STATE)
}

GENERAL_CLASSIFIER_PATH = Path('models/lp.pkl')
COMPUTER_SECURITY_CLASSIFIER_PATH = Path('models/multiclass_problems/pst_computer_security.pkl')
OPERATING_SYSTEMS_CLASSIFIER_PATH = Path('models/multiclass_problems/lp_operating_systems.pkl')
SOFTWARE_CLASSIFIER_PATH = Path('models/multiclass_problems/lp_software.pkl')
HARDWARE_CLASSIFIER_PATH = Path('models/multiclass_problems/lp_hardware.pkl')
COMPUTER_CLASSIFIER_PATH = Path('models/binary_problems/cc_computer.pkl')
COMPUTER_NETWORKING_CLASSIFIER_PATH = Path('models/multiclass_problems/lp_computer_networking.pkl')


In [3]:
threat_tweets = (
    pd.read_csv(filepath_or_buffer=THREAT_TWEETS_CSV)
    .assign(
        tweet=lambda df: df['tweet'].apply(func=ast.literal_eval),
        watson=lambda df: df['watson'].apply(func=ast.literal_eval)
        .apply(func=lambda x: x.get('categories', []))
        .apply(func=build_tree),
        watson_list=lambda df: df['watson'].apply(func=extract_keys),
    )
    .query(expr='relevant == True')
    .drop(labels=['relevant'], axis=1)
    .dropna(subset=['text'], ignore_index=True)
)

threat_tweets.head()


Unnamed: 0,_id,date,id,text,tweet,type,watson,annotation,urls,destination_url,valid_certificate,watson_list
0,b'5b8876f9bb325e65fa7e78e4',2018-08-30 23:00:08+00:00,1035301167952211969,Protect your customers access Prestashop Ant...,{'created_at': 'Thu Aug 30 23:00:08 +0000 2018...,ddos,{'technology and computing': {'internet techno...,threat,['http://addons.prestashop.com/en/23513-anti-d...,https://addons.prestashop.com/en/23513-anti-dd...,True,"[technology and computing, internet technology..."
1,b'5b8876f9bb325e65fa7e78e5',2018-08-30 23:00:09+00:00,1035301173178249217,Data leak from Huazhu Hotels may affect 130 mi...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,leak,"{'travel': {'hotels': {}}, 'home and garden': ...",threat,['http://www.hotelmanagement.net/tech/data-lea...,http://www.hotelmanagement.net/tech/data-leak-...,True,"[travel, hotels, home and garden, home improve..."
2,b'5b8876fabb325e65fa7e78e6',2018-08-30 23:00:09+00:00,1035301174583353344,Instagram App 41.1788.50991.0 #Denial Of #Serv...,{'created_at': 'Thu Aug 30 23:00:09 +0000 2018...,general,{'science': {'weather': {'meteorological disas...,threat,['https://packetstormsecurity.com/files/149120...,https://packetstormsecurity.com/files/149120/i...,True,"[science, weather, meteorological disaster, hu..."
3,b'5b88770abb325e65fa7e78e7',2018-08-30 23:00:25+00:00,1035301242271096832,(good slides): \n\nThe Advanced Exploitation o...,{'created_at': 'Thu Aug 30 23:00:25 +0000 2018...,vulnerability,{'business and industrial': {'business operati...,threat,['https://twitter.com/i/web/status/10353012422...,https://twitter.com/i/web/status/1035301242271...,True,"[business and industrial, business operations,..."
4,b'5b887713bb325e65fa7e78e8',2018-08-30 23:00:35+00:00,1035301282095853569,CVE-2018-1000532 (beep)\nhttps://t.co/CaKbo38U...,{'created_at': 'Thu Aug 30 23:00:35 +0000 2018...,vulnerability,{'technology and computing': {'computer securi...,threat,['https://web.nvd.nist.gov/view/vuln/detail?vu...,https://nvd.nist.gov/vuln/detail/CVE-2018-1000532,True,"[technology and computing, computer security, ..."


In [4]:
X = preprocess_texts(
    list_str=threat_tweets['text'],
    model_path=GLOVE_6B_300D_TXT,
    embedding_dim=300
)


In [5]:
FIX_TARGETS = {
    'computer security': 'computer security',
    'operating systems': 'operating systems',
    'software': 'software',
    'programming languages': 'software',
    'hardware': 'hardware',
    'electronic components': 'hardware',
    'networking': 'networking',
    'internet technology': 'networking'
}

chosen_categories = [
    list(set(FIX_TARGETS.keys()) & set(s))
    for s in threat_tweets['watson_list']
]

for i, watson_list in enumerate(chosen_categories):
    temp = list(set([FIX_TARGETS[c] for c in watson_list]))
    if len(temp) < 1:
        temp = ['other']
    chosen_categories[i] = temp

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y=chosen_categories)

mlb.classes_


array(['computer security', 'hardware', 'networking', 'operating systems',
       'other', 'software'], dtype=object)

The general classifier detects the classes of a text among:
- computer security
- operating systems
- software
- hardware
- networking
- other


In [6]:
general_classifier = joblib.load(filename=GENERAL_CLASSIFIER_PATH)
general_classifier = general_classifier['xgb']


The "computer security" classifier detects the classes of a text already labeled as "computer security" among:
- antivirus and malware
- network security


In [7]:
computer_security_classifier = joblib.load(filename=COMPUTER_SECURITY_CLASSIFIER_PATH)
computer_security_classifier = computer_security_classifier['xgb']


The "operating systems" classifier detects the classes of a text already labeled as "operating systems" among:
- linux
- mac os
- unix
- windows


In [8]:
operating_systems_classifier = joblib.load(filename=OPERATING_SYSTEMS_CLASSIFIER_PATH)
operating_systems_classifier = operating_systems_classifier['logistic_regression']


The "software" classifier detects the classes of a text already labeled as "software" among:
- databases


In [9]:
software_classifier = joblib.load(filename=SOFTWARE_CLASSIFIER_PATH)
software_classifier = software_classifier['logistic_regression']


The "hardware" classifier detects the classes of a text already labeled as "hardware" among:
- computer
- computer components
- computer networking


In [10]:
hardware_classifier = joblib.load(filename=HARDWARE_CLASSIFIER_PATH)
hardware_classifier = hardware_classifier['xgb']

The "computer" classifier detects the classes of a text already labeled as "computer" among:
- servers
- portable computer
- desktop computer


In [11]:
computer_classifier = joblib.load(filename=COMPUTER_CLASSIFIER_PATH)
computer_classifier = computer_classifier['xgb']


The "computer networking" classifier detects the classes of a text already labeled as "computer networking" among:
- router
- wireless technology


In [12]:
computer_networking_classifier = joblib.load(filename=COMPUTER_NETWORKING_CLASSIFIER_PATH)
computer_networking_classifier = computer_networking_classifier['logistic_regression']


In [13]:
pd.DataFrame(data=general_classifier.predict(X[:1]), columns=mlb.classes_)


Unnamed: 0,computer security,hardware,networking,operating systems,other,software
0,1,0,1,0,0,1


# Final Classifier


In [135]:
class ClassifierSystem:
    def __init__(self, models: dict[str, (BaseEstimator, list[str])], embedder_path: Path, embedding_dim: int):
        self.models = models
        self.embedder_path = embedder_path
        self.embedding_dim = embedding_dim

    def predict(self, x_emb):
        #x_emb = self.preprocess_texts(list_str=x)

        translate = lambda predictions, classes: [[
            classes[yi] for yi in y_hat
        ] for y_hat in [
            np.where(y_hat == 1)[0]
            for y_hat in predictions
        ]]

        general_predict = translate(
            predictions=self.models['general_classifier']['model'].predict(x_emb),
            classes=self.models['general_classifier']['classes']
        )
        y_prediction = general_predict.copy()

        for idx, yp in enumerate(general_predict):
            y_prediction_i = set(yp)
            if 'computer security' in y_prediction_i:
                y_prediction_i = set(translate(
                    predictions=self.models['computer_security_classifier']['model'].predict(x_emb),
                    classes=self.models['computer_security_classifier']['classes']
                )[0]) | set(y_prediction_i)
            if 'operating systems' in y_prediction_i:
                y_prediction_i = set(translate(
                    predictions=self.models['operating_systems_classifier']['model'].predict(x_emb),
                    classes=self.models['operating_systems_classifier']['classes']
                )[0]) | set(y_prediction_i)
            if 'software' in y_prediction_i:
                y_prediction_i = set(translate(
                    predictions=self.models['software_classifier']['model'].predict(x_emb),
                    classes=self.models['software_classifier']['classes']
                )[0]) | set(y_prediction_i)
            if 'hardware' in y_prediction_i:
                y_prediction_i = set(translate(
                    predictions=self.models['hardware_classifier']['model'].predict(x_emb),
                    classes=self.models['hardware_classifier']['classes']
                )[0]) | set(y_prediction_i)
            if 'computer' in y_prediction_i:
                y_prediction_i = set(translate(
                    predictions=self.models['computer_classifier']['model'].predict(x_emb),
                    classes=self.models['computer_classifier']['classes']
                )[0]) | set(y_prediction_i)
            if 'computer networking' in y_prediction_i:
                y_prediction_i = set(translate(
                    predictions=self.models['computer_networking_classifier']['model'].predict(x_emb),
                    classes=self.models['computer_networking_classifier']['classes']
                )[0]) | set(y_prediction_i)

            y_prediction[idx] = list(y_prediction_i)

        return y_prediction

    def load_embedder_dict(self) -> dict[Union[str, list[str]], np.ndarray[Any, np.dtype]]:
        embeddings_dict = {}
        f = open(self.embedder_path, 'r', encoding='utf-8')

        for line in f:
            values = line.split()
            word = values[:-self.embedding_dim]

            if type(word) is list:
                word = ' '.join(word)

            vector = np.asarray([float(val) for val in values[-self.embedding_dim:]])
            embeddings_dict[word] = vector

        f.close()

        return embeddings_dict

    def preprocess_texts(self, list_str: list[str]) -> np.ndarray[Any, np.dtype]:
        embedder = self.load_embedder_dict()
        list_embedded_str = np.zeros(shape=(len(list_str), self.embedding_dim))

        for i, text in enumerate(list_str):
            tokens = re.findall(r'\w+|[{}]'.format(re.escape(pattern=string.punctuation)), text)

            for token in tokens:
                try:
                    list_embedded_str[i] += embedder[token.lower()]
                except KeyError:
                    continue

        return list_embedded_str



In [132]:
models_dict = {
    'general_classifier': {
        'model': general_classifier,
        'classes': ['computer security', 'hardware', 'networking', 'operating systems', 'other', 'software'],
    },
    'computer_security_classifier': {
        'model': computer_security_classifier,
        'classes': ['antivirus and malware', 'network security', 'computer security']
    },
    'operating_systems_classifier': {
        'model': operating_systems_classifier,
        'classes': ['mac os', 'linux', 'unix', 'windows', 'operating systems']
    },
    'software_classifier': {
        'model': software_classifier,
        'classes': ['databases', 'software']
    },
    'hardware_classifier': {
        'model': hardware_classifier,
        'classes': ['computer', 'computer components', 'computer networking', 'hardware']
    },
    'computer_classifier': {
        'model': computer_classifier,
        'classes': ['servers', 'portable computer', 'desktop computer', 'computer']
    },
    'computer_networking_classifier': {
        'model': computer_networking_classifier,
        'classes': ['router', 'wireless technology', 'computer networking']
    }
}


In [136]:
system = ClassifierSystem(
    models=models_dict,
    embedder_path=GLOVE_6B_300D_TXT,
    embedding_dim=300
)


In [145]:
y_predict = system.predict(X[:10])
y_predict

[['antivirus and malware', 'networking', 'software', 'computer security'],
 ['other'],
 ['computer security', 'antivirus and malware'],
 ['operating systems'],
 ['software'],
 ['hardware',
  'computer',
  'antivirus and malware',
  'computer security',
  'portable computer'],
 ['computer security', 'antivirus and malware'],
 ['computer security', 'antivirus and malware'],
 ['computer security', 'antivirus and malware'],
 ['computer security', 'antivirus and malware']]

In [144]:
threat_tweets[['text', 'watson_list']]

Unnamed: 0,text,watson_list
0,Protect your customers access Prestashop Anti DDoS Attack #Prestashop https://t.co/xUGxiI8zw8 https://t.co/hXmPX30FmK,"[technology and computing, internet technology, ecommerce, software, computer security, antivirus and malware]"
1,Data leak from Huazhu Hotels may affect 130 million customers https://t.co/RRXVTAWnIu,"[travel, hotels, home and garden, home improvement and repair, plumbing, health and fitness, disease, incontinence]"
2,Instagram App 41.1788.50991.0 #Denial Of #Service https://t.co/pTsuQe2HT9 #PacketStorm,"[science, weather, meteorological disaster, hurricane, technology and computing, hardware, computer networking, router, art and entertainment, movies and tv, movies]"
3,(good slides): \n\nThe Advanced Exploitation of 64-bit Edge Browser Use-After-Free Vulnerability on Windows 10:… https://t.co/crTHE7uTUt,"[business and industrial, business operations, business plans, technology and computing, operating systems, windows, sports, martial arts, sumo]"
4,CVE-2018-1000532 (beep)\nhttps://t.co/CaKbo38UEg\nbeep version 1.3 and up contains a External Control of File Name or Path vulnerability in --,"[technology and computing, computer security, network security, software, databases, hardware, computer]"
...,...,...
11107,@PicturesFoIder They're not wrong because chrome is a botnet,"[technology and computing, computer security, antivirus and malware, law, govt and politics, legal issues, legislation, government]"
11108,Incredibly powerful presentation by @rlovell100 on victim vulnerability. Map of unsubmitted sexual assault kits fro… https://t.co/VKZq38SSGh,"[society, crime, personal offense, assault, health and fitness, disease, aids and hiv, technology and computing, computer security, network security]"
11109,@NarutoRomania ransomware content blackmail followers,"[art and entertainment, comics and animation, anime and manga, movies and tv, movies, travel, tourist destinations, eastern europe]"
11110,Make sure you have upgraded to #tor 8.0 #privacy @torproject https://t.co/yfGmKnAcXs,"[technology and computing, hardware, computer networking, router, art and entertainment, books and literature, science fiction]"
