In [1]:
pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [19]:
import ast
import json
from itertools import combinations

import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification

### Read and Add ID to Dataset

In [3]:
# Do not truncate long cell contents when a frame is displayed
pd.set_option('display.max_colwidth', None)

# Load the train / validate / test CSV files
threat_train = pd.read_csv('/kaggle/input/cyber-threat/cyber-threat-intelligence-splited_train.csv')
threat_validate = pd.read_csv('/kaggle/input/cyber-threat/cyber-threat-intelligence-splited_validate.csv')
threat_test = pd.read_csv('/kaggle/input/cyber-threat/cyber-threat-intelligence-splited_test.csv')

# Rename the first column of every split to 'id'; all other column names are kept
for split_frame in (threat_train, threat_validate, threat_test):
    split_frame.columns = ['id', *split_frame.columns[1:]]

### Convert Dataset Column Values

In [4]:
# Function to convert dataset columns
def convert_columns(dataset):
    """Parse the raw ``entities`` / ``relations`` columns and rewrite them in place.

    Each cell of both columns is expected to be the text form of a list of
    dicts. Entity dicts are read through their ``'id'`` and ``'label'`` keys;
    relation dicts through ``'from_id'``, ``'to_id'`` and ``'type'``.

    After the call:

    * ``dataset['relations']`` holds, per row, a list of
      ``{"from": <label>, "to": <label>, "type": <type>}`` dicts with exact
      duplicates removed (first occurrence kept).
    * ``dataset['entities']`` holds, per row, the distinct entity labels in
      first-seen order, so the result is the same on every run.

    Args:
        dataset: DataFrame with string ``entities`` and ``relations`` columns.
            It is modified in place.

    Returns:
        None.

    Raises:
        KeyError: if a relation references an entity id that is not present
            in the same row's entity list.
    """

    def parse_cell(cell):
        # The cells are Python-literal text (single-quoted). Swapping quotes
        # and feeding the result to json.loads corrupts any value containing
        # an apostrophe, so parse the literal directly first.
        try:
            return ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            # Not a valid Python literal (e.g. JSON null/true/false):
            # fall back to the quote-swapping JSON parse.
            return json.loads(cell.replace("'", '"'))

    def relation_triples(entities, relations):
        label_by_id = {entity['id']: entity['label'] for entity in entities}
        triples = (
            (label_by_id[rel['from_id']], label_by_id[rel['to_id']], rel['type'])
            for rel in relations
        )
        # dict.fromkeys drops duplicates while keeping first-seen order.
        return [
            {"from": source, "to": target, "type": kind}
            for source, target, kind in dict.fromkeys(triples)
        ]

    def unique_labels(entities):
        return list(dict.fromkeys(entity['label'] for entity in entities))

    # Parse both columns up front: the relation conversion needs the raw
    # entity dicts, which are about to be replaced by plain labels.
    parsed_entities = [parse_cell(cell) for cell in dataset['entities']]
    parsed_relations = [parse_cell(cell) for cell in dataset['relations']]

    # Explicit object Series keep each list as a single cell value and also
    # work when the frame has no rows.
    dataset['relations'] = pd.Series(
        [relation_triples(ents, rels) for ents, rels in zip(parsed_entities, parsed_relations)],
        index=dataset.index,
        dtype=object,
    )
    dataset['entities'] = pd.Series(
        [unique_labels(ents) for ents in parsed_entities],
        index=dataset.index,
        dtype=object,
    )

In [5]:
# Parse the entity / relation columns of every split in place
for split_frame in (threat_train, threat_validate, threat_test):
    convert_columns(split_frame)

### Generate Features

In [6]:
# Build the (input_text, label) examples used for training
def generate_features(dataset):
    """Turn every row into one example per unordered pair of its entities.

    For each pair the label is the relation type recorded for that pair in
    either direction (the ``(first, second)`` direction is checked first), or
    ``"no_relation"`` when the row's relations mention neither direction.

    Args:
        dataset: DataFrame with ``text``, ``entities`` (list of labels) and
            ``relations`` (list of dicts with ``from`` / ``to`` / ``type``)
            columns.

    Returns:
        A list of ``(input_text, label)`` tuples, in row order.
    """
    features = []
    rows = zip(dataset['text'], dataset['entities'], dataset['relations'])

    for text, entities, relations in rows:
        # Later relations for the same (from, to) pair overwrite earlier ones.
        type_by_pair = {(rel['from'], rel['to']): rel['type'] for rel in relations}

        for first, second in combinations(entities, 2):
            reversed_label = type_by_pair.get((second, first), "no_relation")
            label = type_by_pair.get((first, second), reversed_label)
            features.append((f"{text} [SEP] {first} [SEP] {second}", label))

    return features

In [7]:
# Build the example lists for the three splits
threat_train_features, threat_validate_features, threat_test_features = map(
    generate_features,
    (threat_train, threat_validate, threat_test),
)

### Training Model

In [8]:
# Fit the label encoder on the relation labels of the training examples
labels = [example[1] for example in threat_train_features]
label_encoder = LabelEncoder()
label_encoder.fit(labels)

In [9]:
class RelationDataset(Dataset):
    """Torch dataset that tokenizes ``(input_text, label)`` examples on access.

    Args:
        data: Indexable collection of ``(input_text, label)`` pairs.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            return_tensors='pt', truncation=True)`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Length every example is padded / truncated to.
        label_encoder: Optional object whose ``transform([label])`` returns
            the integer class id. When omitted, the module-level
            ``label_encoder`` is looked up at item-access time, as before.
    """

    def __init__(self, data, tokenizer, max_length=128, label_encoder=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_encoder = label_encoder

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for one example."""
        input_text, label = self.data[index]

        encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            truncation=True,
        )

        # Resolve lazily so the dataset can be created before the global
        # encoder exists, matching the original behaviour.
        encoder = self.label_encoder if self.label_encoder is not None else label_encoder
        label_id = encoder.transform([label])[0]

        return {
            # squeeze(0) removes only the leading dimension added by
            # return_tensors='pt'; a bare squeeze() would also drop the
            # sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(label_id), dtype=torch.long),
        }

In [26]:
# Tokenizer shared by the datasets below and by prediction later on
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Training split: dataset plus a shuffled loader
train_dataset = RelationDataset(data=threat_train_features, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)

# Validation split: dataset plus an in-order loader
validate_dataset = RelationDataset(data=threat_validate_features, tokenizer=tokenizer)
validate_loader = DataLoader(dataset=validate_dataset, batch_size=8, shuffle=False)

In [11]:
# Classifier with one output per label seen by the encoder
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

# Evaluate and checkpoint once per epoch; no external experiment reporting
training_args = TrainingArguments(
    output_dir="./model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
)

# Run training
trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.580546
2,1.064100,0.502068
3,0.519600,0.48898


TrainOutput(global_step=1176, training_loss=0.7357732357622004, metrics={'train_runtime': 3763.6202, 'train_samples_per_second': 2.499, 'train_steps_per_second': 0.312, 'total_flos': 311586170480640.0, 'train_loss': 0.7357732357622004, 'epoch': 3.0})

### Model Evaluation

In [12]:
# Evaluate model on the eval dataset given to the Trainer (validate_dataset)
trainer.evaluate()

{'eval_loss': 0.4889800548553467,
 'eval_runtime': 63.3999,
 'eval_samples_per_second': 9.606,
 'eval_steps_per_second': 1.215,
 'epoch': 3.0}

### Model Prediction

In [18]:
def generate_new_data_feature(text, entities):
    """Build model inputs for every pair of distinct entity labels in new text.

    The tags are cleaned before pairing: ``'O'`` tags are dropped, a leading
    ``"B-"`` / ``"I-"`` prefix is stripped, and repeated labels are collapsed
    (first occurrence kept) so the pairs are formed from unique labels, as
    they are for the training examples.

    Args:
        text: The sentence or document the entities were found in.
        entities: Iterable of entity tags, e.g. ``["O", "B-malware", "I-malware"]``.

    Returns:
        A list of ``[input_text, entity_1, entity_2]`` lists, one per
        unordered pair of cleaned labels; empty when fewer than two distinct
        labels remain.
    """
    cleaned = (
        tag[2:] if tag.startswith(("B-", "I-")) else tag
        for tag in entities
        if tag != 'O'
    )
    # The original paired the raw tags, so the cleaning above had no effect.
    unique_labels = list(dict.fromkeys(cleaned))

    return [
        [f"{text} [SEP] {entity_1} [SEP] {entity_2}", entity_1, entity_2]
        for entity_1, entity_2 in combinations(unique_labels, 2)
    ]

In [25]:
# Checkpoint directory the fine-tuned classifier is loaded from
model_dir = "./model/checkpoint-1176"
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Relation names indexed by predicted class id.
# NOTE(review): presumably mirrors label_encoder.classes_ from training — confirm
# the order matches before trusting decoded predictions.
label_list = [
    'attributed-to', 'authored-by', 'beacons-to', 'communicates-with',
    'compromises', 'consists-of', 'controls', 'delivers',
    'downloads', 'drops', 'duplicate-of', 'exfiltrates-to',
    'exploits', 'has', 'hosts', 'impersonates',
    'indicates', 'located-at', 'no_relation', 'originates-from',
    'owns', 'related-to', 'targets', 'uses',
]

In [27]:
def predict_relations(text, entities):
    """Predict the relation between every pair of entities found in ``text``.

    Uses the module-level ``tokenizer``, ``loaded_model`` and ``label_list``.

    Args:
        text: The sentence or document to analyse.
        entities: Entity tags for the text, passed to
            ``generate_new_data_feature``.

    Returns:
        A list of ``"<entity_1> <relation> <entity_2>"`` strings, one for each
        pair whose predicted label is not ``'no_relation'``.
    """
    relations = []

    for input_text, entity_1, entity_2 in generate_new_data_feature(text, entities):
        # Truncate to the same 128-token length RelationDataset uses by
        # default, so long texts are handled as they were during training
        # instead of overflowing the model's input size.
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=128)

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = loaded_model(**inputs).logits

        decoded_label = label_list[logits.argmax(dim=1).item()]

        if decoded_label != 'no_relation':
            relations.append(f"{entity_1} {decoded_label} {entity_2}")

    # The original built this list but never returned it.
    return relations

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5ddb5738-f049-4079-b804-6b2a5e8558e2' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>