# AI Chatbot for MITRE ATT&CK Threat Classification and Organizational Impact Analysis

In [9]:
from tqdm.auto import tqdm
import torch, torchtext
from torch import nn
import torch.nn.functional as F
import random, math, time
from datasets import load_dataset
import pandas as pd
import numpy as np
import re

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cuda:1


# 1. Load Dataset

In [10]:
torch.__version__

'2.3.0+cu121'

In [11]:
from datasets import load_dataset

# Load the dataset (replace 'your_dataset_name' with the actual name)
dataset = load_dataset('tumeteor/Security-TTP-Mapping')

# # Optionally, select a specific split or a subset
# dataset = dataset['train']  # or 'test', depending on the split

# # Optionally select a specific range
# dataset = dataset.select(range(10000))

# Display the dataset
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text1', 'labels'],
        num_rows: 14936
    })
    validation: Dataset({
        features: ['text1', 'labels'],
        num_rows: 2630
    })
    test: Dataset({
        features: ['text1', 'labels'],
        num_rows: 3170
    })
})


In [12]:
# from datasets import load_dataset, DatasetDict

# # Load your dataset
# dataset = load_dataset('sarahwei/cyber_MITRE_technique_CTI_dataset_v16')

# # Split the dataset into train, validation, and test
# dataset_split = dataset['train'].train_test_split(test_size=0.1, seed=42)

# # Further split the train dataset into train and validation
# train_dataset, val_dataset = dataset_split['train'].train_test_split(test_size=0.125, seed=42).values()

# # Create a DatasetDict containing the splits
# dataset_dict = {
#     'train': train_dataset,
#     'validation': val_dataset,
#     'test': dataset_split['test']
# }

# # Convert to a DatasetDict
# dataset = DatasetDict(dataset_dict)

# # Print the DatasetDict to get the desired output
# print(dataset)


In [13]:
from datasets import Dataset, DatasetDict
import ast


def flatten_labels(split):
    new_rows = []
    for example in split:
        lst = ast.literal_eval(example['labels'])
        for i in lst:
            new_rows.append({'g': example['text1'],'labels': i})
    return Dataset.from_list(new_rows)

# Apply to all splits
dataset = DatasetDict({
    'train': flatten_labels(dataset['train']),
    'validation': flatten_labels(dataset['validation']),
    'test': flatten_labels(dataset['test']),
})

## EDA

In [14]:
print(dataset['train'][1000]['text1'])
print(dataset['train'][1000]['labels'])

Inception’s malware is modular and the attackers will load plugins based on requirements for each attack. The group has used a range of plugins in recent attacks, some of which are improved versions of plugins used in 2014, while others were previously unseen
T1057


In [15]:
print(dataset['train'][5]['text1'])
print(dataset['train'][5]['labels'])

When communicating with its C2 server, Psylo will use HTTPS with a unique user-agent of (notice the lack of a space between "5.0" and "(Windows
T1071.001


In [16]:
print(dataset['train'][:5]['text1'])
print(dataset['train'][:5]['labels'])

['The command processing function starts by substituting the main module name and path in the hosting process PEB, with the one of the default internet browser. The path of the main browser of the workstation is obtained by reading the registry value', 'Along the way, HermeticWiper’s more mundane operations provide us with further IOCs to monitor for. These include the momentary creation of the abused driver as well as a system service. It also modifies several registry keys, including setting the SYSTEM\\CurrentControlSet\\Control\\CrashControl CrashDumpEnabled key to 0, effectively disabling crash dumps before the abused driver’s execution starts', 'These Microsoft Office templates are hosted on a command and control server and the downloaded link is embedded in the first stage malicious document', 'Additionally, the IP 211[.]72 [.]242[.]120 is one of the hosts for the domain microsoftmse[.]com, which has been used by several KIVARS variants', 'Additionally, the IP 211[.]72 [.]242[.]

In [17]:
train = [(row['text1'], row['labels']) for row in dataset['train']]

In [18]:
#let's take a look at one example of train
sample = next(iter(train))
sample

('The command processing function starts by substituting the main module name and path in the hosting process PEB, with the one of the default internet browser. The path of the main browser of the workstation is obtained by reading the registry value',
 'T1057')

# 2. Preprocess Data

In [19]:
from datasets import load_dataset
from transformers import BertTokenizer

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('ibm-research/CTI-BERT')

# Function to tokenize the text data
def tokenize_function(examples):
    return tokenizer(examples['text1'], padding="max_length", truncation=True, max_length=64)

# Apply tokenization to train and validation datasets
train_data = dataset['train'].map(tokenize_function, batched=True)
val_data = dataset['validation'].map(tokenize_function, batched=True)
test_data = dataset['test'].map(tokenize_function, batched=True)

# Print an example to verify
print(train_data[0]['text1'])  # It should show tokenized input

# # Decode the tokenized text back to human-readable text
# decoded_text = tokenizer.decode(train_data[0]['input_ids'], skip_special_tokens=True)
# print(f"Decoded Text: {decoded_text}")


Map:   0%|          | 0/16340 [00:00<?, ? examples/s]

Map:   0%|          | 0/2882 [00:00<?, ? examples/s]

Map:   0%|          | 0/3785 [00:00<?, ? examples/s]

The command processing function starts by substituting the main module name and path in the hosting process PEB, with the one of the default internet browser. The path of the main browser of the workstation is obtained by reading the registry value


In [20]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('ibm-research/CTI-BERT')

# Example sentence
text = "The command processing function starts by substituting the main module name and path in the hosting process PEB, with the one of the default internet browser. The path of the main browser of the workstation is obtained by reading the registry value."

# Tokenize the text
tokens = tokenizer.tokenize(text)
print(tokens)


['the', 'command', 'processing', 'function', 'starts', 'by', 'substituting', 'the', 'main', 'module', 'name', 'and', 'path', 'in', 'the', 'hosting', 'process', 'peb', ',', 'with', 'the', 'one', 'of', 'the', 'default', 'internet', 'browser', '.', 'the', 'path', 'of', 'the', 'main', 'browser', 'of', 'the', 'workstation', 'is', 'obtained', 'by', 'reading', 'the', 'registry', 'value', '.']


# 3. Tokenizer and Model

In [21]:
# Extract unique labels (MITRE techniques) from both train and validation datasets
labels = list(set(dataset['train']['labels']).union(set(dataset['validation']['labels'])))  # Extract unique labels
label_map = {label: i for i, label in enumerate(labels)}

# Function to encode labels into integers
def encode_labels(examples):
    # Safely map the labels, providing a default value if a label is not found in the label_map
    examples['labels'] = [label_map.get(label, -1) for label in examples['labels']]
    return examples

# Apply label encoding
train_data = train_data.map(encode_labels, batched=True)
val_data = val_data.map(encode_labels, batched=True)

# Print an example to verify
print(train_data[0])  # It should show tokenized input along with the encoded label


Map:   0%|          | 0/16340 [00:00<?, ? examples/s]

Map:   0%|          | 0/2882 [00:00<?, ? examples/s]

{'text1': 'The command processing function starts by substituting the main module name and path in the hosting process PEB, with the one of the default internet browser. The path of the main browser of the workstation is obtained by reading the registry value', 'labels': 395, 'input_ids': [2, 114, 979, 3435, 962, 4929, 229, 41301, 114, 1069, 2014, 810, 137, 1508, 120, 114, 4393, 612, 24978, 16, 214, 114, 479, 135, 114, 1074, 904, 1437, 18, 114, 1508, 135, 114, 1069, 1437, 135, 114, 7966, 146, 4395, 229, 2979, 114, 2782, 887, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [22]:
# # Extract unique labels (MITRE techniques) and map to integers
# labels = list(set(dataset['train']['label']))  # Extract unique labels
# label_map = {label: i for i, label in enumerate(labels)}

# # # Extract unique labels (MITRE techniques) from both train and validation datasets
# # labels = list(set(dataset['train']['label']).union(set(dataset['validation']['label'])))  # Extract unique labels
# # label_map = {label: i for i, label in enumerate(labels)}

# # Function to encode labels into integers
# def encode_labels(examples):
#     examples['label'] = [label_map[label] for label in examples['label']]
#     return examples

# # # Function to encode labels into integers
# # def encode_labels(examples):
# #     for label in examples['label']:
# #         if label not in label_map:
# #             print(f"Warning: Label '{label}' not found in label_map!")  # Optional, for debugging
# #     examples['label'] = [label_map.get(label, -1) for label in examples['label']]  # Default to -1 if missing
# #     return examples

# # Apply label encoding
# train_data = train_data.map(encode_labels, batched=True)
# val_data = val_data.map(encode_labels, batched=True)

# # Print an example to verify
# print(train_data[0])  # It should show tokenized input along with the encoded label


In [23]:
# # Extract unique labels (MITRE techniques) from both train and validation datasets
# labels = list(set(dataset['train']['label']).union(set(dataset['validation']['label'])))  # Extract unique labels
# label_map = {label: i for i, label in enumerate(labels)}

In [24]:
# from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# # Load the pre-trained BERT model for sequence classification
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels))

# # Define the training arguments
# training_args = TrainingArguments(
#     output_dir='./results',          # Output directory
#     num_train_epochs=3,              # Number of training epochs
#     per_device_train_batch_size=8,   # Batch size for training
#     per_device_eval_batch_size=16,   # Batch size for evaluation
#     evaluation_strategy="epoch",     # Evaluate after each epoch
#     # logging_dir='./logs',            # Directory for storing logs
# )

# # Initialize the Trainer
# trainer = Trainer(
#     model=model,                         # The pre-trained model
#     args=training_args,                  # The training arguments
#     train_dataset=train_data,            # The training dataset
#     eval_dataset=val_data,               # The validation dataset
# )

# # Train the model
# trainer.train()

In [25]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    preds, labels = p
    # If using softmax, we need to use argmax to get the final class prediction
    preds = preds.argmax(axis=-1)
    
    # Calculate precision, recall, F1-score, and accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    
    # Return the metrics as a dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


## Train with CTI-Bert

In [26]:
"""Training hyperparameters
The following hyperparameters were used during training:

learning_rate: 0.0005
train_batch_size: 128
eval_batch_size: 128
seed: 42
gradient_accumulation_steps: 16
total_train_batch_size: 2048
optimizer: Adam with betas=(0.9,0.98) and epsilon=1e-06
lr_scheduler_type: linear
lr_scheduler_warmup_steps: 10000
training_steps: 200000"""

'Training hyperparameters\nThe following hyperparameters were used during training:\n\nlearning_rate: 0.0005\ntrain_batch_size: 128\neval_batch_size: 128\nseed: 42\ngradient_accumulation_steps: 16\ntotal_train_batch_size: 2048\noptimizer: Adam with betas=(0.9,0.98) and epsilon=1e-06\nlr_scheduler_type: linear\nlr_scheduler_warmup_steps: 10000\ntraining_steps: 200000'

In [27]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('ibm-research/CTI-BERT', num_labels=len(labels))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=8,   
    per_device_eval_batch_size=16,   
    evaluation_strategy="epoch",     
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The pre-trained model
    args=training_args,                  # The training arguments
    train_dataset=train_data,            # The training dataset
    eval_dataset=val_data,               # The validation dataset
    compute_metrics=compute_metrics      # Add the compute_metrics function
)

# Train the model
trainer.train()

# Optionally save the final model manually (this step is usually not required as the trainer saves it automatically)
trainer.save_model("./final_model1")  # You can specify any directory you prefer

# Also save the tokenizer (if necessary)
tokenizer.save_pretrained("./final_model1")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ibm-research/CTI-BERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mworamethr[0m ([33mworamethr-asian-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,3.8684,2.775704,0.47467,0.371924,0.47467,0.392578
2,2.3919,2.341063,0.527412,0.44669,0.527412,0.462944
3,1.9618,2.228477,0.544414,0.473586,0.544414,0.487865


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


('./final_model1/tokenizer_config.json',
 './final_model1/special_tokens_map.json',
 './final_model1/vocab.txt',
 './final_model1/added_tokens.json')

In [28]:
# Evaluate the model on the validation dataset
eval_results = trainer.evaluate(eval_dataset=val_data)

# Print the evaluation results
print("Evaluation results:", eval_results)




Evaluation results: {'eval_loss': 2.2284772396087646, 'eval_accuracy': 0.5444136016655101, 'eval_precision': 0.47358587962405074, 'eval_recall': 0.5444136016655101, 'eval_f1': 0.4878651706700848, 'eval_runtime': 6.7708, 'eval_samples_per_second': 425.65, 'eval_steps_per_second': 6.794, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# Assuming test_data is defined and contains the test set
test_results = trainer.evaluate(eval_dataset=test_data)

# Print the test results
print("Test results:", test_results)


Test results: {'eval_runtime': 9.0208, 'eval_samples_per_second': 419.585, 'eval_steps_per_second': 6.651, 'epoch': 3.0}


In [30]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('./final_model1')
# model = BertForSequenceClassification.from_pretrained('ibm-research/CTI-BERT')
tokenizer = BertTokenizer.from_pretrained('ibm-research/CTI-BERT')

# Function to get prediction for manual input text
def predict(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Get the model's output
    with torch.no_grad():  # Disable gradients for inference
        outputs = model(**inputs)
    
    # Get the logits (model output before applying softmax)
    logits = outputs.logits
    
    # Apply softmax to get probabilities (for multi-class classification)
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Get the predicted label (index of max probability)
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    
    return predicted_label, probabilities

# Test with manual input
input_text = input("Enter text for classification: ")

predicted_label, probabilities = predict(input_text)

# Assuming `labels` contains the label names and was created previously (e.g., from `dataset['train']['label']`)
# You can map the predicted label index back to the label name
predicted_label_name = labels[predicted_label]

print(f"Predicted Label: {predicted_label_name}")
print(f"Prediction Probabilities: {probabilities}")


Enter text for classification:  hello


Predicted Label: T1071
Prediction Probabilities: tensor([[2.8728e-04, 6.2711e-04, 4.3520e-04, 3.5924e-03, 7.3747e-04, 3.5778e-03,
         4.8376e-04, 2.7944e-03, 3.3418e-04, 2.0135e-04, 1.0493e-03, 7.8652e-03,
         1.8842e-03, 1.1182e-04, 6.5573e-05, 3.1861e-03, 2.2755e-04, 1.2578e-04,
         1.2454e-04, 1.4929e-04, 9.0229e-05, 4.8688e-03, 3.3605e-04, 5.4051e-04,
         4.2112e-04, 4.6200e-04, 1.6454e-03, 5.7851e-04, 1.0574e-04, 2.2906e-04,
         3.1178e-04, 4.3422e-03, 1.1435e-04, 3.0856e-04, 6.2763e-05, 2.7954e-04,
         9.9990e-05, 1.5496e-03, 6.2040e-03, 9.6692e-04, 8.5517e-05, 1.1505e-03,
         8.7555e-04, 3.4619e-04, 2.0657e-03, 4.6954e-03, 1.1256e-04, 3.1992e-03,
         3.7916e-04, 2.8691e-04, 1.5573e-04, 2.8532e-04, 5.9052e-04, 1.4731e-03,
         4.9593e-03, 1.0899e-04, 1.1049e-03, 6.6293e-04, 1.7441e-04, 8.5080e-05,
         2.5543e-04, 5.9926e-04, 9.7129e-05, 1.4882e-02, 9.6313e-05, 9.9160e-05,
         2.8374e-04, 9.3408e-05, 3.4460e-04, 2.2904e-03, 4.9

Predicted Label: ['T1203']
Prediction Probabilities: tensor([[8.5942e-05, 1.7216e-04, 2.1037e-04,  ..., 4.3844e-04, 1.2077e-03,
         1.0486e-03]])


In [31]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import nltk
nltk.download('punkt_tab')

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('./final_model1')
# model = BertForSequenceClassification.from_pretrained('ibm-research/CTI-BERT')
tokenizer = BertTokenizer.from_pretrained('ibm-research/CTI-BERT')

# # Assuming you have a list of labels
# labels = ['Label_0', 'Label_1', 'Label_2']  # Update with your actual labels

# Function to get prediction for manual input text
def predict(input_text):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Get the model's output
    with torch.no_grad():  # Disable gradients for inference
        outputs = model(**inputs)
    
    # Get the logits (model output before applying softmax)
    logits = outputs.logits
    
    # Apply softmax to get probabilities (for multi-class classification)
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Get the predicted label (index of max probability)
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    
    return predicted_label, probabilities

# Function to break text into sentences using nltk
def sentence_tokenizer(text):
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)
    return sentences

# Test with article input
input_text = """
An Iranian state-sponsored actor has been observed scanning and attempting to abuse the Log4Shell flaw in publicly-exposed Java applications to deploy a hitherto undocumented PowerShell-based modular backdoor dubbed "CharmPower" for follow-on post-exploitation. "The actor's attack setup was obviously rushed, as they used the basic open-source tool for the exploitation and based their operations on previous infrastructure, which made the attack easier to detect and attribute," researchers from Check Point said in a report published this week. The Israeli cybersecurity company linked the attack to a group known as APT35, which is also tracked using the codenames Charming Kitten, Phosphorus, and TA453, citing overlaps with toolsets previously identified as infrastructure used by the threat actor. Cybersecurity Log4Shell aka CVE-2021-44228 (CVSS score: 10.0) concerns a critical security vulnerability in the popular Log4j logging library that, if successfully exploited, could lead to remote execution of arbitrary code on compromised systems.
"""

# Break the article into sentences
sentences = sentence_tokenizer(input_text)

# Process each sentence through the model
for sentence in sentences:
    predicted_label, probabilities = predict(sentence)
    
    # Map the predicted label index to the label name
    predicted_label_name = labels[predicted_label]
    
    print(f"Sentence: {sentence}")
    print(f"Predicted Label: {predicted_label_name}")
    print(f"Prediction Probabilities: {probabilities}\n")


[nltk_data] Downloading package punkt_tab to /home/jupyter-
[nltk_data]     st124903/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Sentence: 
An Iranian state-sponsored actor has been observed scanning and attempting to abuse the Log4Shell flaw in publicly-exposed Java applications to deploy a hitherto undocumented PowerShell-based modular backdoor dubbed "CharmPower" for follow-on post-exploitation.
Predicted Label: T1190
Prediction Probabilities: tensor([[5.1243e-04, 8.1736e-04, 1.9886e-03, 6.2065e-04, 5.7447e-04, 1.0098e-02,
         8.5801e-04, 1.2725e-03, 1.2196e-03, 7.8081e-04, 1.6700e-03, 1.7396e-03,
         9.5309e-04, 6.1896e-04, 5.0379e-04, 9.5596e-04, 5.8830e-04, 8.1963e-04,
         4.7799e-04, 8.1372e-04, 7.2214e-04, 1.5212e-03, 5.9348e-04, 1.8004e-03,
         8.8915e-04, 9.3078e-04, 1.5904e-03, 1.3662e-03, 2.5677e-04, 6.9799e-04,
         2.1121e-03, 1.4003e-03, 5.5481e-04, 9.2005e-04, 3.9183e-04, 9.1157e-04,
         7.2661e-04, 6.3973e-04, 5.8083e-04, 1.6261e-03, 3.4794e-04, 7.4668e-04,
         5.6557e-04, 5.8662e-04, 1.4305e-03, 1.1343e-03, 7.2599e-04, 3.5753e-03,
         3.4277e-04, 1.0854e-0

In [32]:
import spacy
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load the pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('./final_model1')  # Replace with your model path
tokenizer = BertTokenizer.from_pretrained('ibm-research/CTI-BERT')  # Replace with your tokenizer if needed

# # Assuming you have a list of labels (e.g., ['Label_0', 'Label_1', 'Label_2'])
# labels = model.config.id2label # Update with your actual labels
# print(labels)
# Function to get prediction for a given sentence
def predict(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    with torch.no_grad():  # Disable gradients for inference
        outputs = model(**inputs)
    
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    predicted_label = torch.argmax(probabilities, dim=-1).item()
    
    return predicted_label, probabilities

# Function to break text into sentences using spaCy
def sentence_tokenizer_spacy(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]  # Strip leading/trailing spaces
    return sentences

# Example news article
input_text = """
An Iranian state-sponsored actor has been observed scanning and attempting to abuse the Log4Shell flaw in publicly-exposed Java applications to deploy a hitherto undocumented PowerShell-based modular backdoor dubbed "CharmPower" for follow-on post-exploitation. "The actor's attack setup was obviously rushed, as they used the basic open-source tool for the exploitation and based their operations on previous infrastructure, which made the attack easier to detect and attribute," researchers from Check Point said in a report published this week. The Israeli cybersecurity company linked the attack to a group known as APT35, which is also tracked using the codenames Charming Kitten, Phosphorus, and TA453, citing overlaps with toolsets previously identified as infrastructure used by the threat actor. Cybersecurity Log4Shell aka CVE-2021-44228 (CVSS score: 10.0) concerns a critical security vulnerability in the popular Log4j logging library that, if successfully exploited, could lead to remote execution of arbitrary code on compromised systems.
"""

# Break the article into sentences using spaCy
sentences = sentence_tokenizer_spacy(input_text)

# Process each sentence through the model and get predictions
for sentence in sentences:
    predicted_label, probabilities = predict(sentence)
    
    # Map the predicted label index to the label name
    predicted_label_name = labels[predicted_label]
    
    print(f"Sentence: {sentence}")
    print(f"Predicted Label: {predicted_label_name}")
    print(f"Prediction Probabilities: {probabilities}\n")


Sentence: An Iranian state-sponsored actor has been observed scanning and attempting to abuse the Log4Shell flaw in publicly-exposed Java applications to deploy a hitherto undocumented PowerShell-based modular backdoor dubbed "CharmPower" for follow-on post-exploitation.
Predicted Label: T1190
Prediction Probabilities: tensor([[5.1243e-04, 8.1736e-04, 1.9886e-03, 6.2065e-04, 5.7447e-04, 1.0098e-02,
         8.5801e-04, 1.2725e-03, 1.2196e-03, 7.8081e-04, 1.6700e-03, 1.7396e-03,
         9.5309e-04, 6.1896e-04, 5.0379e-04, 9.5596e-04, 5.8830e-04, 8.1963e-04,
         4.7799e-04, 8.1372e-04, 7.2214e-04, 1.5212e-03, 5.9348e-04, 1.8004e-03,
         8.8915e-04, 9.3078e-04, 1.5904e-03, 1.3662e-03, 2.5677e-04, 6.9799e-04,
         2.1121e-03, 1.4003e-03, 5.5481e-04, 9.2005e-04, 3.9183e-04, 9.1157e-04,
         7.2661e-04, 6.3973e-04, 5.8083e-04, 1.6261e-03, 3.4794e-04, 7.4668e-04,
         5.6557e-04, 5.8662e-04, 1.4305e-03, 1.1343e-03, 7.2599e-04, 3.5753e-03,
         3.4277e-04, 1.0854e-03

In [35]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load model and tokenizer
model = BertForSequenceClassification.from_pretrained('./final_model1')
tokenizer = BertTokenizer.from_pretrained('ibm-research/CTI-BERT')
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Your label map
labels = [...]  # List of label names, e.g., ['T1190', 'T1078', 'T1059', ...]
label_map = {label: idx for idx, label in enumerate(labels)}
label_map_rev = {idx: label for label, idx in label_map.items()}

# Prediction function
def predict_full_text(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return probabilities.squeeze().cpu().numpy()  # shape (num_classes,)

# NDCG computation
def compute_ndcg_for_sample(probs, true_labels, label_map, k=5):
    num_classes = probs.shape[0]
    true_relevance = np.zeros((1, num_classes))
    for label in true_labels:
        idx = label_map.get(label)
        if idx is not None:
            true_relevance[0, idx] = 1  # mark relevant classes
    probs = probs.reshape(1, -1)
    return ndcg_score(true_relevance, probs, k=k)

# Load your CSS-like dataset
data = pd.read_csv('real_news_maps.csv')  # 'news' and 'maps' columns

all_ndcg_scores = []

for idx, row in data.iterrows():
    text = row['news']
    true_labels = eval(row['maps'])  # Convert string to list
    
    probs = predict_full_text(text)

    ndcg = compute_ndcg_for_sample(probs, true_labels, label_map, k=20)

    all_ndcg_scores.append(ndcg)
    print(f"Sample {idx}: NDCG@5 = {ndcg:.4f}")

# After all samples
mean_ndcg = np.mean(all_ndcg_scores)
print(f"\nMean NDCG@5 over dataset = {mean_ndcg:.4f}")


Sample 0: NDCG@5 = 0.0000
Sample 1: NDCG@5 = 0.0000
Sample 2: NDCG@5 = 0.0000
Sample 3: NDCG@5 = 0.0000
Sample 4: NDCG@5 = 0.0000
Sample 5: NDCG@5 = 0.0000
Sample 6: NDCG@5 = 0.0000
Sample 7: NDCG@5 = 0.0000
Sample 8: NDCG@5 = 0.0000
Sample 9: NDCG@5 = 0.0000

Mean NDCG@5 over dataset = 0.0000
