In [None]:
!pip install tensorflow==2.10.1 transformers==4.30.2

Collecting tensorflow==2.10.1
  Downloading tensorflow-2.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.1 kB)
Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.10.1)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting keras<2.11,>=2.10.0 (from tensorflow==2.10.1)
  Downloading keras-2.10.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting keras-preprocessing>=1.1.1 (from tensorflow==2.10.1)
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting protobuf<3.20,>=3.9.2 (from tensorflow==2.10.1)
  Downloading protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (787 bytes)
Collecting tensorboard<2.11,>=2.10 (from tensorflow==2.10.1)
  Downloading tensorboard

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=2d2ffb3878690e5bc06a0f04b03b57906340eb8a39882e5505167b1dd99aaf36
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import transformers
from transformers import AutoTokenizer, TFAutoModelForTokenClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.metrics import classification_report

## Read Data

In [None]:
# Show full cell contents when frames are displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# Load the three pre-split CSV files.
threat_train = pd.read_csv('../data/cyber-threat-intelligence-splited_train.csv')
threat_validate = pd.read_csv('../data/cyber-threat-intelligence-splited_validate.csv')
threat_test = pd.read_csv('../data/cyber-threat-intelligence-splited_test.csv')

# Rename the first column of every frame to 'id'; all other column names are kept.
for _frame in (threat_train, threat_validate, threat_test):
    _frame.columns = ['id', *_frame.columns[1:]]

In [None]:
threat_train.head()

Unnamed: 0,id,index,text,entities,relations,Comments,id.1,label,start_offset,end_offset
0,0,11709,"Malicious version of My First APP, which adds Metasploit SideWinder has used malicious apps as part of its operation before.","[{'id': 36090, 'label': 'malware', 'start_offset': 46, 'end_offset': 56}, {'id': 36091, 'label': 'threat-actor', 'start_offset': 61, 'end_offset': 71}]",[],[],36090.0,malware,46.0,56.0
1,1,4382,HYPERSCRAPE demonstrates Charming Kitten’s commitment to developing and maintaining purpose-built capabilities.,"[{'id': 13145, 'label': 'threat-actor', 'start_offset': 25, 'end_offset': 40}, {'id': 50195, 'label': 'malware', 'start_offset': 0, 'end_offset': 11}]",[],[],13145.0,threat-actor,25.0,40.0
2,2,12581,This looked similar to an earlier in-the-wild iOS vulnerability analyzed by Ian Beer of Google Project Zero.,[],[],[],,,,
3,3,11292,We need to request that a download link be generated for the collected files with the “trendmicro-visionone-download-information-for-collected-forensic-file”.,[],[],[],,,,
4,4,13494,"Some malicious actors, such as ransomware operators, earn directly from their cyberattacks.",[],[],[],,,,


## Combine All Rows with Non-Empty Entities from the Dataset

In [None]:
# Keep only the rows of each split whose 'label' value is present.
filtered_train = threat_train.loc[threat_train['label'].notna()]
filtered_validate = threat_validate.loc[threat_validate['label'].notna()]
filtered_test = threat_test.loc[threat_test['label'].notna()]

# Stack the three filtered splits under a fresh 0..n-1 index and expose the
# 'label' column under the name 'entity'.
combined_df = (
    pd.concat([filtered_train, filtered_validate, filtered_test], ignore_index=True)
    .rename(columns={'label': 'entity'})
)

In [None]:
combined_df.head()

Unnamed: 0,id,index,text,entities,relations,Comments,id.1,entity,start_offset,end_offset
0,0,11709,"Malicious version of My First APP, which adds Metasploit SideWinder has used malicious apps as part of its operation before.","[{'id': 36090, 'label': 'malware', 'start_offset': 46, 'end_offset': 56}, {'id': 36091, 'label': 'threat-actor', 'start_offset': 61, 'end_offset': 71}]",[],[],36090.0,malware,46.0,56.0
1,1,4382,HYPERSCRAPE demonstrates Charming Kitten’s commitment to developing and maintaining purpose-built capabilities.,"[{'id': 13145, 'label': 'threat-actor', 'start_offset': 25, 'end_offset': 40}, {'id': 50195, 'label': 'malware', 'start_offset': 0, 'end_offset': 11}]",[],[],13145.0,threat-actor,25.0,40.0
2,5,5386,"We have evidence that the routers of at least one vendor other than Asus and WatchGuard are connecting to Cyclops Blink C&Cs as well, but so far we have been unable to collect malware samples for this router brand.","[{'id': 51380, 'label': 'malware', 'start_offset': 106, 'end_offset': 119}]",[],[],51380.0,malware,106.0,119.0
3,11,4,The first known campaign was launched by Crimeware on November 2014.,"[{'id': 45806, 'label': 'TIME', 'start_offset': 55, 'end_offset': 68}, {'id': 48942, 'label': 'malware', 'start_offset': 42, 'end_offset': 51}]",[],[],45806.0,TIME,55.0,68.0
4,13,13297,The TSSL project has 64-bit version.,"[{'id': 41403, 'label': 'identity', 'start_offset': 24, 'end_offset': 27}]",[],[],41403.0,identity,24.0,27.0


In [None]:
print(combined_df.shape)

(4731, 10)


## Exploratory Data Analysis (EDA)

In [None]:
print(combined_df['entity'].value_counts())

entity
malware           885
location          691
SOFTWARE          602
attack-pattern    581
identity          564
threat-actor      411
TIME              229
tools             186
FILEPATH          149
vulnerability     105
SHA2               80
URL                62
campaign           54
IPV4               30
SHA1               30
DOMAIN             25
Infrastucture      18
EMAIL              12
REGISTRYKEY         9
MD5                 8
Name: count, dtype: int64


Fix the `Infrastucture` typo in the entity labels and convert the text and entities columns into Python lists.

In [None]:
# Correct the misspelled label in the per-row 'entity' column.
combined_df['entity'] = combined_df['entity'].replace('Infrastucture', 'Infrastructure')

# The label vocabulary and the per-token labels are built from the stringified
# 'entities' lists, not from the 'entity' column, so the misspelling has to be
# corrected there too. Only the label text changes; the character offsets refer
# to 'text' and are unaffected.
combined_df['entities'] = combined_df['entities'].str.replace(
    'Infrastucture', 'Infrastructure', regex=False
)

texts = combined_df['text'].tolist()
entities = combined_df['entities'].tolist()



Create labels from all of the entities in the dataset

In [None]:
import ast

# Collect every entity type that occurs in any row's annotation list.
entity_types = set()
for entity in entities:
    for ent in ast.literal_eval(entity):
        entity_types.add(ent['label'])

# "O" comes first, followed by the B-/I- tags in sorted order. Building the list
# from a sorted sequence (rather than iterating a set of strings) keeps the
# label <-> id assignment identical on every run of the notebook.
label_list = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in sorted(entity_types)
    for prefix in ("B", "I")
]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

## Named Entity Recognition

In [None]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenize each text and align its entity annotations with the resulting tokens (one BIO label per token).

In [None]:
def align_labels_with_tokens(text, entities, tokenizer, label_map=None):
    """
    Tokenize ``text`` and produce one BIO label id per token position.

    The tokenizer is called with ``padding='max_length'``, ``max_length=128``,
    ``truncation=True``, ``return_offsets_mapping=True`` and
    ``return_tensors='np'``, so every sample yields 128 positions.

    A token receives an entity label only when its character span lies entirely
    inside the entity span: ``B-<label>`` if the token starts exactly at the
    entity start, ``I-<label>`` otherwise. All other positions are ``"O"``.

    Parameters
    ----------
    text : str
        Raw sentence to tokenize.
    entities : str or list of dict
        Entity annotations, each with ``start_offset``, ``end_offset`` and
        ``label`` keys. May be given as a list or as its string representation.
    tokenizer : callable
        Tokenizer returning ``input_ids`` and ``offset_mapping`` with a leading
        batch dimension of size one.
    label_map : dict, optional
        Mapping from label string to integer id. Defaults to the notebook-level
        ``label2id``.

    Returns
    -------
    tuple
        ``(tokens, label_ids)`` where ``tokens`` is the tokenizer output and
        ``label_ids`` is a list of integer ids, one per token position.

    Raises
    ------
    KeyError
        If an entity label has no ``B-``/``I-`` entry in the label mapping.
    """
    tokens = tokenizer(
        text,
        padding='max_length',
        max_length=128,
        truncation=True,
        return_offsets_mapping=True,
        return_tensors='np',
    )
    mapping = label2id if label_map is None else label_map
    offsets = tokens["offset_mapping"][0]

    # Every position starts as "O" (outside any entity).
    labels = ["O"] * len(tokens["input_ids"][0])

    # Accept both the stringified list stored in the CSV and a parsed list.
    spans = ast.literal_eval(entities) if isinstance(entities, str) else entities

    for entity in spans:
        start, end, label = entity['start_offset'], entity['end_offset'], entity['label']
        for i, (token_start, token_end) in enumerate(offsets):
            # Zero-width spans carry no text (this is how special and padding
            # positions appear in the offset mapping). Without this guard they
            # would satisfy the containment test for any entity that starts at
            # offset 0 and be tagged "B-<label>".
            if token_start == token_end:
                continue
            if token_start >= start and token_end <= end:
                labels[i] = f"B-{label}" if token_start == start else f"I-{label}"

    # Convert the string labels into their numerical ids.
    label_ids = [mapping[label] for label in labels]
    return tokens, label_ids

# Tokenize every sentence once and keep the (encoding, label ids) pairs together,
# then split them into the two parallel lists used downstream.
_aligned = [
    align_labels_with_tokens(sentence, annotation, tokenizer)
    for sentence, annotation in zip(texts, entities)
]
tokenized_texts = [encoding for encoding, _ in _aligned]
token_labels = [ids for _, ids in _aligned]

Create tensorflow dataset using input IDs, attention masks, and label IDs with batch size of 16.

In [None]:
def create_tf_dataset(tokenized_texts, token_labels, batch_size=16):
    """
    Build a batched ``tf.data.Dataset`` of (features, labels) pairs.

    ``tokenized_texts`` is a sequence of tokenizer outputs whose ``input_ids``
    and ``attention_mask`` entries carry a leading batch dimension of one; the
    first row of each is taken. ``token_labels`` supplies the matching label-id
    sequences. Samples keep their input order and are grouped into batches of
    ``batch_size``.
    """
    features = {
        "input_ids": [encoding["input_ids"][0] for encoding in tokenized_texts],
        "attention_mask": [encoding["attention_mask"][0] for encoding in tokenized_texts],
    }
    samples = tf.data.Dataset.from_tensor_slices((features, token_labels))
    return samples.batch(batch_size)

train_dataset = create_tf_dataset(tokenized_texts, token_labels)


Create model using the label2id

In [None]:
model = TFAutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [None]:
# Compile the model
# from_logits=True: the loss is applied to the model's raw logits (the evaluation
# code below also reads `.logits` from the model output).
# NOTE(review): every sample is padded to 128 positions and padded/special
# positions are labelled "O", so they appear to be included in both the loss and
# the reported accuracy — confirm whether they should be masked out.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

# Train the model
# NOTE(review): train_dataset is built from combined_df, which concatenates the
# train, validate and test splits; no validation data is passed to fit().
model.fit(train_dataset, epochs=3)


Save model

In [None]:
model.save_pretrained("../model/NER_model")
tokenizer.save_pretrained("../model/NER_tokenizer")

### Evaluate NER

In [None]:
def evaluate_combined_df(df, tokenizer, model, label2id):
    """
    Run the NER model over every row of ``df`` and build a token-level report.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'text' column and an 'entities' column (annotations in
        the format accepted by ``align_labels_with_tokens``).
    tokenizer : callable
        Tokenizer forwarded to ``align_labels_with_tokens``.
    model : object
        Model whose ``predict`` accepts ``input_ids``/``attention_mask`` and
        returns an object with a ``logits`` attribute.
    label2id : dict
        Mapping from label string to integer id. It is used both to decode ids
        back to label strings and to fix the label set of the report.

    Returns
    -------
    tuple
        ``(y_true, y_pred, report)``: flat lists of true and predicted label
        strings for all positions with attention mask 1, and the text produced
        by ``classification_report``.
    """
    # Decode with the mapping that was passed in rather than a notebook global,
    # so the function always agrees with its own ``label2id`` argument.
    id_to_label = {idx: label for label, idx in label2id.items()}

    y_true = []
    y_pred = []

    for text, entity_list in zip(df['text'], df['entities']):
        # Tokenize the row and get its gold label ids.
        tokens, true_label_ids = align_labels_with_tokens(text, entity_list, tokenizer)
        attention_mask = tokens["attention_mask"][0]

        # verbose=0: predict() is called once per row, so the default progress
        # bar would flood the cell output.
        predictions = model.predict(
            {"input_ids": tokens["input_ids"], "attention_mask": tokens["attention_mask"]},
            verbose=0,
        )
        predicted_ids = np.argmax(predictions.logits, axis=-1)[0]

        # Keep only positions the attention mask marks as real (drops padding).
        for true_id, predicted_id, keep in zip(true_label_ids, predicted_ids, attention_mask):
            if keep == 1:
                y_true.append(id_to_label[true_id])
                y_pred.append(id_to_label[int(predicted_id)])

    print("Label-label yang diprediksi:")
    print(list(set(y_pred)))

    return y_true, y_pred, classification_report(
        y_true,
        y_pred,
        labels=list(label2id.keys()),
        zero_division=0
    )

y_true, y_pred, class_report = evaluate_combined_df(combined_df, tokenizer, model, label2id)



In [None]:
!pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from transformers import DistilBertTokenizer, AutoModelForSequenceClassification
from itertools import combinations

  from .autonotebook import tqdm as notebook_tqdm


## Relation Extraction Task

In [None]:
def generate_new_data_relation_feature(text, entities):
    """
    Build relation-classifier inputs for every pair of distinct entities.

    Entries equal to ``'O'`` are dropped and a leading ``B-``/``I-`` tag prefix
    is stripped. The cleaned names are then de-duplicated (first occurrence
    wins) so that an entity is never paired with itself and no pair is emitted
    twice.

    Parameters
    ----------
    text : str
        Sentence the entities belong to.
    entities : iterable of str
        Entity names or BIO tags.

    Returns
    -------
    list of list
        One ``[input_text, entity_1, entity_2]`` item per pair, where
        ``input_text`` is ``"<text> [SEP] <entity_1> [SEP] <entity_2>"``.
        Empty when fewer than two distinct entities remain.
    """
    cleaned = []
    for entity in entities:
        if entity == 'O':
            continue
        cleaned.append(entity[2:] if entity.startswith(("B-", "I-")) else entity)

    # dict.fromkeys removes duplicates while preserving first-seen order.
    unique_entities = list(dict.fromkeys(cleaned))

    # Pair the cleaned entities (the raw list would still contain 'O' entries
    # and tag prefixes).
    return [
        [f"{text} [SEP] {entity_1} [SEP] {entity_2}", entity_1, entity_2]
        for entity_1, entity_2 in combinations(unique_entities, 2)
    ]

In [None]:
# Tokenizer and sequence classifier used for relation extraction.
relation_extraction_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
relation_extraction_model = AutoModelForSequenceClassification.from_pretrained("GoDillonAudris/distilbert-relation-extraction")

# Relation names, indexed by the class id predicted by the classifier.
# NOTE(review): the order presumably matches the label ids the model was trained
# with — confirm against the model's configuration.
# NOTE(review): this rebinds `label_list`, which earlier in the notebook holds
# the NER tag list; re-running earlier cells after this one will see this value.
label_list = [
    'attributed-to', 'authored-by', 'beacons-to', 'communicates-with',
    'compromises', 'consists-of', 'controls', 'delivers',
    'downloads', 'drops', 'duplicate-of', 'exfiltrates-to',
    'exploits', 'has', 'hosts', 'impersonates',
    'indicates', 'located-at', 'no_relation', 'originates-from',
    'owns', 'related-to', 'targets', 'uses',
]

In [None]:
def predict_relations(text, entities):
    """
    Predict a relation for every entity pair found in ``text``.

    Each pair produced by ``generate_new_data_relation_feature`` is tokenized
    and classified with the relation-extraction model; the predicted class
    index is decoded through ``label_list``.

    Parameters
    ----------
    text : str
        Sentence the entities belong to.
    entities : iterable of str
        Entity names or BIO tags, as accepted by
        ``generate_new_data_relation_feature``.

    Returns
    -------
    list of str
        One ``"<entity_1> <relation> <entity_2>"`` string per pair whose
        predicted relation is not ``'no_relation'``.
    """
    # Local import: torch is not imported at notebook level, but it is already
    # required by return_tensors="pt" below.
    import torch

    relations = []
    for input_text, entity_1, entity_2 in generate_new_data_relation_feature(text, entities):
        # NOTE(review): inputs are not truncated; very long sentences may exceed
        # the model's maximum sequence length — confirm expected input sizes.
        inputs = relation_extraction_tokenizer(input_text, return_tensors="pt")

        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = relation_extraction_model(**inputs)

        predicted_label = outputs.logits.argmax(dim=1).item()
        decoded_label = label_list[predicted_label]

        if decoded_label != 'no_relation':
            relations.append(entity_1 + " " + decoded_label + " " + entity_2)

    return relations

## Pipeline

In [None]:
# Assume there is text, and entities list
# NOTE(review): `text` is an empty placeholder here; supply the sentence that the
# entities were extracted from.
# NOTE(review): `entities` rebinds the name that earlier holds the dataset's
# annotation strings, and `y_pred` holds one label per non-padding token across
# all evaluated rows, so the number of entity pairs grows quadratically with it.
text = ""
entities = y_pred

# Relation extraction task
relations = predict_relations(text, entities)