# Imports


In [None]:
import gzip
import os
import shutil
import string
from pathlib import Path

import gensim.models
import inflect
import nltk
import numpy as np
import pandas as pd
import requests
import torch.cuda
#from google.colab import drive
from matplotlib import pyplot as plt
from nltk import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Input, MaxPooling1D
from tensorflow.keras.models import Sequential
from transformers import pipeline


In [None]:
# Fetch the NLTK 'stopwords' corpus; the preprocessing cell reads it through
# stopwords.words('english').
nltk.download('stopwords')


# Environment Settings


In [None]:
# --- Regular expressions ---------------------------------------------------
# "RT @" followed by any characters.
RETWEET_PATTERN = r"RT\ \@.*"
# Any sequence of characters.
EVERY_PATTERN = r".*"

# --- Base directories (relative to the working directory) -------------------
DATA_DIR = Path("data")
MODELS_DIR = Path("models")

# --- CSV files ---------------------------------------------------------------
X_CSV = DATA_DIR / "X.csv"

# --- JSON files --------------------------------------------------------------
THREAT_TWEETS_JSON = DATA_DIR / "threat.tweets.json"
X_JSON = DATA_DIR / "X.json"

# --- Models ------------------------------------------------------------------
WORD2VEC_BIN = MODELS_DIR / "GoogleNews-vectors-negative300.bin"
# Same file name with a trailing ".gz" (the compressed download).
WORD2VEC_BIN_GZ = WORD2VEC_BIN.with_name(WORD2VEC_BIN.name + ".gz")
# NOTE(review): the query string carries `uuid` and `at` parameters that look
# session-specific -- confirm the link is still valid before relying on it.
WORD2VEC_BIN_GZ_URL = 'https://drive.usercontent.google.com/download?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download&authuser=0&confirm=t&uuid=7d62ae10-fee5-4471-a14b-d3fc3c8de6cf&at=AENtkXYp0oeqJDsqv8DR2sbelnZ5%3A1732188868578'

# --- Constant variables ------------------------------------------------------
EMBEDDING_50D_DIM = 50
NUM_FOLDS = 10
RANDOM_SEED = 42


In [None]:
def _decompress_gzip(src, dst):
    """Decompress the gzip archive *src* into *dst*.

    Returns True on success. On failure the error is printed, any partially
    written *dst* is removed (so a later run does not mistake it for a
    complete file) and False is returned.
    """
    try:
        with gzip.open(src, 'rb') as f_in, open(dst, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    except (OSError, EOFError) as e:
        # EOFError is what gzip raises for a truncated archive.
        if os.path.exists(dst):
            os.remove(dst)
        print(f'An error occurred during decompression: {e}')
        return False

    print('File decompressed successfully.')
    return True


def _download_file(url, dst, chunk_size=1024 * 1024, timeout=60):
    """Stream *url* to *dst* in chunks.

    Returns True only when the server answered with HTTP 200 and the body was
    written completely. Any other outcome is printed, a partial *dst* is
    removed and False is returned.
    """
    try:
        # The target directory may not exist on a fresh checkout.
        Path(dst).parent.mkdir(parents=True, exist_ok=True)

        # stream=True keeps the (large) body out of memory; it is written
        # chunk by chunk instead of being materialised in response.content.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f'Download failed with HTTP status {response.status_code}.')
                return False

            with open(dst, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        if os.path.exists(dst):
            os.remove(dst)
        print(f'An error occurred during download: {e}')
        return False

    print('File downloaded successfully.')
    return True


# Make sure the decompressed word2vec binary is available:
#   1. already decompressed      -> nothing to do
#   2. compressed archive on disk -> decompress it
#   3. otherwise                  -> download the archive, then decompress it
# Decompression is only attempted when an archive is actually present.
if os.path.exists(WORD2VEC_BIN):
    print('File already decompressed.')
elif os.path.exists(WORD2VEC_BIN_GZ) or _download_file(WORD2VEC_BIN_GZ_URL, WORD2VEC_BIN_GZ):
    _decompress_gzip(WORD2VEC_BIN_GZ, WORD2VEC_BIN)


# Functions


In [None]:
# Shared inflect engine, created on first use instead of once per call.
_INFLECT_ENGINE = None


def _get_inflect_engine():
    """Return the shared inflect engine, creating it the first time it is needed."""
    global _INFLECT_ENGINE
    if _INFLECT_ENGINE is None:
        _INFLECT_ENGINE = inflect.engine()
    return _INFLECT_ENGINE


def replace_digits_with_words(tokens):
    """Replace every all-digit token with its word representation (e.g. 4 -> four).

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list of the same length in which each token for which
        ``str.isdigit()`` is true has been passed through inflect's
        ``number_to_words``; all other tokens are returned unchanged.
    """
    # NOTE(review): for multi-digit numbers number_to_words presumably yields
    # multi-word text inside a single token -- confirm that is what the
    # downstream embedding lookup expects.
    return [
        _get_inflect_engine().number_to_words(token) if token.isdigit() else token
        for token in tokens
    ]


# [A Framework for Unsupervised Classification and Data Mining of Tweets about Cyber Vulnerabilities](https://arxiv.org/abs/2104.11695)

Recent studies have indicated that the NVD is not always up to date, with known vulnerabilities being discussed publicly on social media platforms, like Twitter and Reddit, months before they are published to the NVD. To that end, we present a framework for unsupervised classification to filter tweets for relevance to cybersecurity. We consider and evaluate two unsupervised ML techniques for inclusion in our framework, and show that zero-shot classification using a Bidirectional and Auto-Regressive Transformers (BART) model outperforms the other technique with 83.52% accuracy and a F1 score of 83.88%, allowing for accurate filtering of tweets without human intervention or labelled data for training.

Additionally, we discuss different insights that can be derived from these cyber-relevant tweets, such as trending topics of tweets and the counts of Twitter mentions for Common Vulnerabilities and Exposures (CVEs), that can be used in an alert or report to augment current NVD-based risk assessment tools.


In this experiment, the unsupervised techniques were evaluated on a labeled dataset of tweets from Behzadan et al.


## [Corpus and Deep Learning Classifier for Collection of Cyber Threat Indicators in Twitter Stream](https://ieeexplore.ieee.org/document/8622506)

A corpus of 21,000 tweets was curated directly from Twitter and is published in the [cybertweets repository](https://github.com/behzadanksu/cybertweets).


In [None]:
# Load the labelled tweet corpus from THREAT_TWEETS_JSON into a pandas DataFrame.
# Later cells read its 'text', 'relevant', 'annotation' and 'type' columns.
threat_tweets = pd.read_json(path_or_buf=THREAT_TWEETS_JSON)


In [None]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Built once: the punctuation-stripping table is the same for every token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _preprocess_tokens(tokens):
    """Apply clean-up steps 3-8 to one lower-cased, whitespace-tokenised tweet.

    Args:
        tokens: List of lower-case string tokens of a single tweet.

    Returns:
        A new list of stemmed tokens.
    """
    # 3. Remove tokens that are not encoded in ASCII
    tokens = [token for token in tokens if token.isascii()]

    # 4. Remove punctuation from each token
    tokens = [token.translate(_PUNCTUATION_TABLE) for token in tokens]

    # 5. Remove tokens that are not composed of alphanumeric characters
    #    (this also drops tokens left empty by step 4)
    tokens = [token for token in tokens if token.isalnum()]

    # 6. Substitute digits with word representations (e.g., 4 -> four)
    tokens = replace_digits_with_words(tokens)

    # 7. Remove stop words (tokens were already lower-cased in step 1)
    tokens = [token for token in tokens if token not in stop_words]

    # 8. Stem tokens
    return [stemmer.stem(token) for token in tokens]


# 1. Conversion of all characters of the tweet to lower case
# 2. Tokenize the text according to white-space separations
# 3-8. Token-level clean-up, done in a single pass per tweet
threat_tweets['preprocess-text'] = (
    threat_tweets['text'].str.lower().str.split().apply(_preprocess_tokens)
)


In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts) to check
# the loaded data and the derived 'preprocess-text' column.
threat_tweets.info()


In [None]:
# Imported in this cell so the explicit F1 metric below is self-contained.
from tensorflow.keras.metrics import F1Score

# Instantiate the model: a small 1-D CNN over a (300, 1) input that ends in a
# single sigmoid unit for binary classification.
model = Sequential(layers=[
    Input(shape=(300, 1)),
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(30, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model.
# The F1 metric is configured explicitly instead of through the 'f1_score'
# string: with the defaults (threshold=None, average=None) the prediction is
# taken as the argmax over the output units -- always "positive" for a single
# sigmoid unit -- and the result is a per-class vector rather than a scalar.
# threshold=0.5 binarises the sigmoid output and average='micro' yields one
# scalar score. The metric keeps the name 'f1_score'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        'recall',
        'precision',
        F1Score(average='micro', threshold=0.5, name='f1_score'),
    ]
)

# Display the model summary
model.summary()


In [None]:
def sentence_to_embedding(sentence, model, embedding_dim):
    """Return the mean word vector of *sentence*.

    Args:
        sentence: Iterable of word tokens.
        model: Object exposing ``key_to_index`` (vocabulary membership) and
            ``model[word]`` (vector lookup).
        embedding_dim: Length of the zero vector used for out-of-vocabulary
            words and for an empty sentence.

    Returns:
        The element-wise mean over one vector per token, where tokens missing
        from the vocabulary contribute a zero vector. An empty sentence
        yields a zero vector of length ``embedding_dim``.
    """
    vectors = [
        model[token] if token in model.key_to_index else np.zeros(embedding_dim)
        for token in sentence
    ]

    # Nothing to average: fall back to an all-zero embedding.
    if not vectors:
        return np.zeros(embedding_dim)

    return np.mean(vectors, axis=0)


In [None]:
# Load the word2vec vectors from the decompressed binary file.
embedder = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_BIN, binary=True)

# Feature matrix: one averaged embedding per preprocessed tweet.
tweet_vectors = []
for words in threat_tweets['preprocess-text']:
    tweet_vectors.append(sentence_to_embedding(words, embedder, embedding_dim=300))
X = np.array(tweet_vectors)

# Targets as floats: 1.0 where 'relevant' equals True, 0.0 for anything else.
relevance_flags = threat_tweets['relevant']
y = np.array([1. if flag == True else 0. for flag in relevance_flags])


In [None]:
# Stratified 80/10/10 split: hold out 20 % of the data first, then halve that
# hold-out into a validation set and a test set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=RANDOM_SEED, stratify=y_holdout,
)

# 'balanced' class weights from the training labels only. This runs before the
# labels are reshaped below, i.e. while y_train is still one-dimensional.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train), y=y_train,
)
class_weights_dict = {index: weight for index, weight in enumerate(class_weights)}

# Add a trailing axis of length one: features become (rows, columns, 1) and
# labels become (rows, 1).
X_train, X_val, X_test = (
    split.reshape(split.shape[0], split.shape[1], 1)
    for split in (X_train, X_val, X_test)
)
y_train, y_val, y_test = (
    labels.reshape(-1, 1) for labels in (y_train, y_val, y_test)
)

# Train with per-class weighting and keep only the per-epoch metric history.
training_run = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    class_weight=class_weights_dict,
)
history = training_run.history


In [None]:
# Plot training and validation accuracy per epoch from the recorded history.
for series_name in ('accuracy', 'val_accuracy'):
    plt.plot(history[series_name], label=series_name)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Evaluate on the held-out test split. The result is unpacked as the loss
# followed by the four compiled metrics.
loss, accuracy, recall, precision, f1 = model.evaluate(X_test, y_test, verbose=0)

test_report = (
    ("Loss", loss),
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
    ("F1 Score", f1),
)
for metric_label, metric_value in test_report:
    print(f"{metric_label}: {metric_value:.4f}")


## Continue


In [None]:
# List every distinct value of the 'annotation' column, one bullet per line.
annotation_values = threat_tweets["annotation"].unique()
values = "\n".join(f"\t· {value!s}" for value in annotation_values)

print(f'The original values in the field "annotation":\n{values}')


The dataset contains 21,368 tweets collected over four days using common cybersecurity keywords; each tweet was labeled as 'threat,' 'business,' 'unknown,' or 'irrelevant.' Since we focused on vulnerabilities, we first filtered the dataset for tweets that contain the term 'vulnerability,' which came out to 9,963 tweets.


In [None]:
# Keep the rows whose 'type' column has at least one entry containing
# 'vulnerability' (case-insensitive substring match), then drop rows without
# an annotation.
# NOTE(review): the filter reads the 'type' column, not the tweet 'text' --
# presumably 'type' holds a list of type strings per tweet; confirm against
# the dataset.
is_vulnerability = threat_tweets['type'].apply(
    lambda entries: any('vulnerability' in entry.lower() for entry in entries)
)

# Explicit copy: later cells add columns to this frame, which must not be
# ambiguous with respect to the parent `threat_tweets` frame.
vulnerability_threat_tweets = (
    threat_tweets[is_vulnerability]
    .dropna(subset=['annotation'])
    .copy()
)

values = "\n".join([f"\t· {str(s)}" for s in vulnerability_threat_tweets["annotation"].unique()])

print(f'The original values in the field "annotation":\n{values}\n')
vulnerability_threat_tweets.info()


Tweets labeled as ‘business,’ ‘unknown,’ and ‘threat’ were replaced with a ‘cyber-relevant’ label as they also appeared to be relevant to cybersecurity, and comprised 54.5% of the filtered dataset.

In [None]:
# Show how many filtered tweets carry each annotation value.
vulnerability_threat_tweets['annotation'].value_counts()


In [None]:
# Binary ground truth: 'business', 'unknown' and 'threat' count as
# cyber-relevant (1), 'irrelevant' as 0. Any other annotation value stays
# unmapped (NaN) and makes the integer cast below fail.
annotation_to_label = {
    'business': 1,
    'unknown': 1,
    'threat': 1,
    'irrelevant': 0,
}
vulnerability_threat_tweets['y-test'] = (
    vulnerability_threat_tweets['annotation'].map(annotation_to_label).astype(int)
)

vulnerability_threat_tweets['y-test'].value_counts()


In [None]:
# Run on the first CUDA device when one is available, otherwise pass -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Zero-shot classifier backed by the facebook/bart-large-mnli checkpoint.
classifier = pipeline(
    task="zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device,
)

# Position in this list doubles as the numeric class: index 0 is the
# non-cybersecurity description, index 1 the cybersecurity one.
candidate_labels = [
    "The text describes a personal or emotional vulnerability, unrelated to technology or cybersecurity.",
    "The text describes a cybersecurity-related vulnerability, such as a weakness in software, systems, or networks.",
]

# Label text -> list position.
candidate_labels_dict = dict(zip(candidate_labels, range(len(candidate_labels))))


In [None]:
def _first_zero_shot_label(text):
    """Return the first entry of the classifier's 'labels' output for *text*."""
    return classifier(text, candidate_labels)['labels'][0]


# Classify every tweet individually, then translate the winning label text
# into its numeric class through candidate_labels_dict.
top_labels = pd.Series(
    [_first_zero_shot_label(tweet_text) for tweet_text in vulnerability_threat_tweets['text']],
    index=vulnerability_threat_tweets.index,
)
vulnerability_threat_tweets['zero-shot-bart-large-mnli'] = top_labels.map(candidate_labels_dict)


In [None]:
# Score the zero-shot predictions: 'y-test' holds the reference labels and
# 'zero-shot-bart-large-mnli' the predicted ones.
y_true = vulnerability_threat_tweets['y-test']
y_pred = vulnerability_threat_tweets['zero-shot-bart-large-mnli']

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")


In [None]:
# Display the predicted label, the reference label and the tweet text side by
# side for manual inspection.
vulnerability_threat_tweets[['zero-shot-bart-large-mnli', 'y-test', 'text']]
