## Preparation

In [18]:
!pip install tensorflow==2.10.1 transformers==4.30.2



In [19]:
!pip install seqeval



In [20]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

In [21]:
# Show text cells in full when a frame is displayed.
pd.set_option('display.max_colwidth', None)


def _read_split(csv_path):
    """Read one split CSV and relabel its first column as 'id' (other columns keep their names)."""
    frame = pd.read_csv(csv_path)
    frame.columns = ['id', *frame.columns[1:]]
    return frame


threat_train = _read_split('cyber-threat-intelligence-splited_train.csv')
threat_validate = _read_split('cyber-threat-intelligence-splited_validate.csv')
threat_test = _read_split('cyber-threat-intelligence-splited_test.csv')

In [22]:
# Preview the first rows of the training split.
threat_train.head()

Unnamed: 0,id,index,text,entities,relations,Comments,id.1,label,start_offset,end_offset
0,0,11709,"Malicious version of My First APP, which adds Metasploit SideWinder has used malicious apps as part of its operation before.","[{'id': 36090, 'label': 'malware', 'start_offset': 46, 'end_offset': 56}, {'id': 36091, 'label': 'threat-actor', 'start_offset': 61, 'end_offset': 71}]",[],[],36090.0,malware,46.0,56.0
1,1,4382,HYPERSCRAPE demonstrates Charming Kitten’s commitment to developing and maintaining purpose-built capabilities.,"[{'id': 13145, 'label': 'threat-actor', 'start_offset': 25, 'end_offset': 40}, {'id': 50195, 'label': 'malware', 'start_offset': 0, 'end_offset': 11}]",[],[],13145.0,threat-actor,25.0,40.0
2,2,12581,This looked similar to an earlier in-the-wild iOS vulnerability analyzed by Ian Beer of Google Project Zero.,[],[],[],,,,
3,3,11292,We need to request that a download link be generated for the collected files with the “trendmicro-visionone-download-information-for-collected-forensic-file”.,[],[],[],,,,
4,4,13494,"Some malicious actors, such as ransomware operators, earn directly from their cyberattacks.",[],[],[],,,,


### Combine All Rows with Non-Empty Entities from the Three Splits

In [23]:
# Keep only the annotated rows (non-null 'label') of each split.
filtered_train, filtered_validate, filtered_test = (
    split[split['label'].notna()]
    for split in (threat_train, threat_validate, threat_test)
)

# Stack the three filtered splits under a fresh index and expose 'label' as 'entity'.
combined_df = pd.concat(
    [filtered_train, filtered_validate, filtered_test], ignore_index=True
).rename(columns={'label': 'entity'})

In [24]:
# Preview the combined frame; the former 'label' column now appears as 'entity'.
combined_df.head()

Unnamed: 0,id,index,text,entities,relations,Comments,id.1,entity,start_offset,end_offset
0,0,11709,"Malicious version of My First APP, which adds Metasploit SideWinder has used malicious apps as part of its operation before.","[{'id': 36090, 'label': 'malware', 'start_offset': 46, 'end_offset': 56}, {'id': 36091, 'label': 'threat-actor', 'start_offset': 61, 'end_offset': 71}]",[],[],36090.0,malware,46.0,56.0
1,1,4382,HYPERSCRAPE demonstrates Charming Kitten’s commitment to developing and maintaining purpose-built capabilities.,"[{'id': 13145, 'label': 'threat-actor', 'start_offset': 25, 'end_offset': 40}, {'id': 50195, 'label': 'malware', 'start_offset': 0, 'end_offset': 11}]",[],[],13145.0,threat-actor,25.0,40.0
2,5,5386,"We have evidence that the routers of at least one vendor other than Asus and WatchGuard are connecting to Cyclops Blink C&Cs as well, but so far we have been unable to collect malware samples for this router brand.","[{'id': 51380, 'label': 'malware', 'start_offset': 106, 'end_offset': 119}]",[],[],51380.0,malware,106.0,119.0
3,11,4,The first known campaign was launched by Crimeware on November 2014.,"[{'id': 45806, 'label': 'TIME', 'start_offset': 55, 'end_offset': 68}, {'id': 48942, 'label': 'malware', 'start_offset': 42, 'end_offset': 51}]",[],[],45806.0,TIME,55.0,68.0
4,13,13297,The TSSL project has 64-bit version.,"[{'id': 41403, 'label': 'identity', 'start_offset': 24, 'end_offset': 27}]",[],[],41403.0,identity,24.0,27.0


In [25]:
# (rows, columns) of the combined frame of annotated rows.
print(combined_df.shape)

(4731, 10)


## Exploratory Data Analysis (EDA)

In [26]:
# Row count per value of the 'entity' column, most frequent first.
print(combined_df['entity'].value_counts())

entity
malware           885
location          691
SOFTWARE          602
attack-pattern    581
identity          564
threat-actor      411
TIME              229
tools             186
FILEPATH          149
vulnerability     105
SHA2               80
URL                62
campaign           54
IPV4               30
SHA1               30
DOMAIN             25
Infrastucture      18
EMAIL              12
REGISTRYKEY         9
MD5                 8
Name: count, dtype: int64


In [27]:
# Correct the misspelled 'Infrastucture' label. The tag vocabulary and the
# token labels are derived from the serialized 'entities' column, not from
# 'entity', so the spelling has to be fixed in both places to take effect.
combined_df['entity'] = combined_df['entity'].replace('Infrastucture', 'Infrastructure')
combined_df['entities'] = combined_df['entities'].str.replace(
    "'Infrastucture'", "'Infrastructure'", regex=False
)

texts = combined_df['text'].tolist()
entities = combined_df['entities'].tolist()



In [28]:
# Collect every entity type that occurs in the serialized annotations.
entity_types = {
    ent['label']
    for entity in entities
    for ent in ast.literal_eval(entity)
}

# Build the BIO tag list in a fixed order ("O" first, then B-/I- pairs sorted by
# entity type). Iterating a set of strings can yield a different order in each
# kernel session, which would assign different ids to the same tag on every run.
label_list = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in sorted(entity_types)
    for prefix in ("B", "I")
]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [29]:
# Checkpoint name shared by the tokenizer created here and the model loaded later.
MODEL_NAME = "distilbert-base-uncased"
# NOTE(review): align_labels_with_tokens requests return_offsets_mapping, which
# presumably requires a fast tokenizer — confirm AutoTokenizer returns one here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



In [30]:
def align_labels_with_tokens(text, entities, tokenizer, max_length=128, label_map=None):
    """Tokenize ``text`` and assign a BIO label id to every token position.

    Args:
        text: Raw sentence to tokenize.
        entities: String holding a Python-literal list of dicts with
            'start_offset', 'end_offset' and 'label' keys (character offsets).
        tokenizer: Callable tokenizer that supports ``return_offsets_mapping``.
        max_length: Length every sequence is padded/truncated to.
        label_map: Mapping from tag string to integer id. Defaults to the
            notebook-level ``label2id``.

    Returns:
        ``(tokens, label_ids)``: the tokenizer output and a list of
        ``max_length`` integer label ids.

    Raises:
        KeyError: If a produced tag is missing from the label mapping.
    """
    if label_map is None:
        label_map = label2id

    tokens = tokenizer(
        text,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_offsets_mapping=True,
        return_tensors='np',
    )
    offsets = tokens["offset_mapping"][0]
    labels = ["O"] * len(tokens["input_ids"][0])

    for entity in ast.literal_eval(entities):
        start, end, label = entity['start_offset'], entity['end_offset'], entity['label']
        for i, (token_start, token_end) in enumerate(offsets):
            # Zero-length offsets carry no text (special and padding tokens report
            # (0, 0)). Without this guard they would match any entity starting at
            # character 0 and all be tagged B-<label>.
            if token_start == token_end:
                continue
            if token_start >= start and token_end <= end:
                labels[i] = f"B-{label}" if token_start == start else f"I-{label}"

    label_ids = [label_map[tag] for tag in labels]
    return tokens, label_ids

# Encode every sentence once, keeping the tokenizer output and the aligned
# label ids in two parallel lists.
tokenized_texts, token_labels = [], []

for sample_text, sample_entities in zip(texts, entities):
    encoded, aligned_ids = align_labels_with_tokens(sample_text, sample_entities, tokenizer)
    tokenized_texts.append(encoded)
    token_labels.append(aligned_ids)

In [31]:
def create_tf_dataset(tokenized_texts, token_labels, batch_size=16):
    """Build a batched ``tf.data.Dataset`` of (features, labels) pairs.

    Args:
        tokenized_texts: Sequence of tokenizer outputs; row 0 of each one's
            "input_ids" and "attention_mask" is used.
        token_labels: Per-example sequences of label ids, in the same order.
        batch_size: Number of examples per batch.

    Returns:
        A dataset yielding ``({"input_ids", "attention_mask"}, labels)`` batches.
    """
    features = {
        "input_ids": [encoding["input_ids"][0] for encoding in tokenized_texts],
        "attention_mask": [encoding["attention_mask"][0] for encoding in tokenized_texts],
    }
    examples = tf.data.Dataset.from_tensor_slices((features, token_labels))
    return examples.batch(batch_size)


train_dataset = create_tf_dataset(tokenized_texts, token_labels)


In [32]:
# Load the MODEL_NAME checkpoint for token classification, sized to the tag
# vocabulary and given the id<->tag mappings.
model = TFAutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

In [None]:
# Training hyperparameters.
LEARNING_RATE = 5e-5
NUM_EPOCHS = 3

# The loss consumes raw logits; integer label ids are used as sparse targets.
# NOTE(review): no sample weights are passed, so padded positions (labelled "O")
# count towards both the loss and the reported accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])

# Fine-tune on the batched dataset.
model.fit(train_dataset, epochs=NUM_EPOCHS)


In [None]:
# Persist the fine-tuned model and the tokenizer to separate local directories.
model.save_pretrained("./NER_model")
tokenizer.save_pretrained("./NER_tokenizer")

In [37]:
def evaluate_combined_df(df, tokenizer, model, label2id, batch_size=32):
    """Compare token-level predictions with gold labels for every row of ``df``.

    Args:
        df: DataFrame with a 'text' column and an 'entities' column (serialized
            entity annotations), as consumed by ``align_labels_with_tokens``.
        tokenizer: Tokenizer passed through to ``align_labels_with_tokens``.
        model: Model whose ``predict`` output exposes ``.logits``.
        label2id: Mapping from tag string to id. It is inverted here to decode
            both gold and predicted ids, and its keys fix the report's label set.
        batch_size: Batch size used for the single ``model.predict`` call.

    Returns:
        ``(y_true, y_pred, report)``: flat lists of tag strings for all
        positions with attention mask 1, plus the classification report text.
    """
    # Decode with the mapping that was passed in, not with a global.
    id_to_label = {idx: label for label, idx in label2id.items()}

    encodings, gold_ids = [], []
    for text, entity_list in zip(df['text'], df['entities']):
        tokens, true_labels = align_labels_with_tokens(text, entity_list, tokenizer)
        encodings.append(tokens)
        gold_ids.append(true_labels)

    y_true = []
    y_pred = []

    if encodings:
        # Stack all rows and predict once in batches instead of calling
        # model.predict (and printing a progress bar) for every sentence.
        input_ids = np.concatenate([enc["input_ids"] for enc in encodings], axis=0)
        attention_mask = np.concatenate([enc["attention_mask"] for enc in encodings], axis=0)
        predictions = model.predict(
            {"input_ids": input_ids, "attention_mask": attention_mask},
            batch_size=batch_size,
        )
        predicted_ids = np.argmax(predictions.logits, axis=-1)

        for row_true, row_pred, row_mask in zip(gold_ids, predicted_ids, attention_mask):
            for true_id, pred_id, mask_value in zip(row_true, row_pred, row_mask):
                # Filter out padding tokens
                if mask_value == 1:
                    y_true.append(id_to_label[true_id])
                    y_pred.append(id_to_label[int(pred_id)])

    # Message is Indonesian for "Predicted labels:".
    print("Label-label yang diprediksi:")
    print(list(set(y_pred)))

    return y_true, y_pred, classification_report(
        y_true,
        y_pred,
        labels=list(label2id.keys()),
        zero_division=0
    )

In [None]:
# NOTE(review): combined_df is the same data the model was fitted on
# (train_dataset is built from its texts), so this report reflects fit on seen
# data rather than held-out performance.
y_true, y_pred, class_report = evaluate_combined_df(combined_df, tokenizer, model, label2id)

In [None]:
# class_report is the text report returned by evaluate_combined_df.
print(class_report)