# Setup 

In [1]:
import torch
import datasets
import pandas as pd 
from tqdm import TqdmWarning
from tokenizers import ByteLevelBPETokenizer 
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers.integrations import WandbCallback
from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM
from sklearn.model_selection import train_test_split

import os 
import json 
import pickle
import warnings

# Hide all warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=TqdmWarning)

# Set up weights & biases 
os.environ["WANDB_PROJECT"] = "malbert-hf"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# Config options 
MAX_LENGTH = 10         # Max number of tokens in an instruction
VOCAB_SIZE = 10000      # Number of tokens 
SUBSET_SIZE = 1e-3      # Size of dataset as a fraction of the total number of files (0 - 1)
SEED = 42               # Seed for file sub-sampling so the selected subset is reproducible

# Directory containing files of disassembled executables
# NOTE: machine-specific absolute path; adjust it for your environment.
DATASET_BASE = "/Volumes/New Volume/malware-detection-dataset/opcodes/disasm"

# Evaluation samples
EVAL_DS_PATH = "./data.pickle"


# labels.json is keyed by file name; only the keys are used to build the paths.
with open(os.path.join(DATASET_BASE, 'labels.json'), 'r') as dataset_file:
    dataset = json.load(dataset_file)
files = [os.path.join(DATASET_BASE, name) for name in dataset.keys()]

# Keep a random SUBSET_SIZE fraction of the files. The split is skipped for the
# full dataset because train_test_split rejects a test_size of 0.
if SUBSET_SIZE < 1:
    files, _ = train_test_split(files, test_size=1 - SUBSET_SIZE, random_state=SEED)
print(f"{len(files)} files will be used in training")

8 files will be used in training


# Tokenizer

Train a new tokenizer if we haven't already.

In [None]:
# Make sure the output directory exists. Unlike `!mkdir`, this is portable and
# does not fail when the directory is already present.
os.makedirs("./MalBERT", exist_ok=True)

# Train only when the saved tokenizer files are missing. Testing for the
# directory itself would always skip training, because it was just created.
tokenizer_files = [os.path.join("./MalBERT", name) for name in ("vocab.json", "merges.txt")]

if not all(os.path.exists(path) for path in tokenizer_files):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=files, vocab_size=VOCAB_SIZE, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    tokenizer.save_model('MalBERT')

# Dataset

Set up the dataset

## Un-tokenized dataset

In [2]:
# Load the tokenizer files saved under ./MalBERT as a fast RoBERTa tokenizer.
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(
    './MalBERT',
    max_len=MAX_LENGTH,
)


def tokenize_fn(line):
    """Tokenize the ``text`` field of a dataset record (or batch of records).

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return roberta_tokenizer(line['text'], **encoding_options)

# Split at file level so no executable contributes lines to both splits.
# A fixed random_state keeps the train/test assignment identical across runs.
train_files, test_files = train_test_split(files, random_state=42)

# Each line of each file becomes one example with a single 'text' column.
raw_dataset = datasets.load_dataset('text', data_files={
    "train": train_files, 
    "test": test_files
})

print(f"{len(raw_dataset['train'])} lines in training dataset")
print(f"{len(raw_dataset['test'])} lines in testing dataset")

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

630086 lines in training dataset
129868 lines in testing dataset


## Tokenized dataset

WARNING: This cell may take a long time to run depending on the size of the dataset. 

In [3]:
# Tokenize every split in parallel batches and drop the raw 'text' column so
# only the tokenizer outputs remain.
dataset = raw_dataset.map(
    tokenize_fn,
    batched=True,
    batch_size=1024,
    num_proc=8,
    remove_columns=['text'],
)

# Persist the tokenized splits to disk.
dataset.save_to_disk("data/tokenized")

Map (num_proc=8):   0%|          | 0/630086 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/129868 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/630086 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/129868 [00:00<?, ? examples/s]

# Train

## Custom weights & biases callback 

In [4]:
# Weights & Biases callback to log evaluation samples
class LogPredictionsCallback(WandbCallback):
    """Log a table of masked-token predictions to Weights & Biases when training ends.

    The evaluation samples are read once from a pickle file. The loaded object
    is indexed with the keys ``input_ids``, ``attention_mask`` and ``labels``
    in ``on_train_end``.
    """

    def __init__(self, data_path, tokenizer):
        """Store the tokenizer and load the pickled evaluation samples.

        Args:
            data_path: Path to the pickle file holding the evaluation samples.
            tokenizer: Tokenizer used to find mask tokens and decode token ids.
        """
        super().__init__()
        self.tokenizer = tokenizer
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file. Only point this at evaluation data you produced yourself.
        with open(data_path, 'rb') as data_file: 
            self.data = pickle.load(data_file)
        
    def on_train_end(self, args, state, control, **kwargs):
        """Run the model on the stored samples and log input/actual/predicted text.

        Args:
            args, state, control: Standard Trainer callback arguments (unused).
            **kwargs: Must contain ``model``, the model being trained.
        """
        model = kwargs['model']
        device = model.device 
        # Use the tokenizer given to the constructor rather than a notebook global.
        tokenizer = self.tokenizer
        mask_id = tokenizer.mask_token_id

        # Build the tensors once and leave self.data untouched, so the callback
        # gives the same result if it is invoked again.
        input_ids = torch.as_tensor(self.data['input_ids'])
        attention_mask = torch.as_tensor(self.data['attention_mask'])
        labels = torch.as_tensor(self.data['labels'])

        # Switch to eval mode for the forward pass, then restore the previous mode.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                output = model(
                    input_ids=input_ids.to(device),
                    attention_mask=attention_mask.to(device),
                    labels=labels.to(device)
                )
        finally:
            model.train(was_training)

        # Move logits to the CPU so they can be indexed with the CPU mask positions.
        logits = output.logits.detach().cpu()

        # (row indices, column indices) of every mask token in the inputs.
        mask_pos = torch.where(input_ids == mask_id)

        actual = input_ids.clone()
        predicted = input_ids.clone()

        # Fill each masked slot with the label token / the highest-scoring token.
        actual[mask_pos] = labels[mask_pos]
        predicted[mask_pos] = logits[mask_pos].argmax(dim=-1)

        # Strip every special token except the mask from the displayed input,
        # so the masked position stays visible in the table.
        hidden_ids = torch.tensor(
            [token_id for token_id in tokenizer.all_special_ids if token_id != mask_id],
            dtype=input_ids.dtype,
        )
        x = [tokenizer.decode(row[~torch.isin(row, hidden_ids)]) for row in input_ids]
        y = tokenizer.batch_decode(actual, skip_special_tokens=True)
        y_hat = tokenizer.batch_decode(predicted, skip_special_tokens=True)

        df = pd.DataFrame({"Input": x, "Actual": y, "Predicted": y_hat})
        table = self._wandb.Table(dataframe=df)
        self._wandb.log({"sample": table})

## Model Creation 

In [5]:
# RoBERTa numbers positions starting at pad_token_id + 1 (i.e. 2 with the
# default pad_token_id of 1), so a sequence of MAX_LENGTH tokens needs
# MAX_LENGTH + 2 rows in the position-embedding table. Sizing the table to
# MAX_LENGTH makes full-length sequences index past its end.
config = RobertaConfig(
    vocab_size=VOCAB_SIZE, 
    max_position_embeddings=MAX_LENGTH + 2, 
    num_attention_heads=4,
    num_hidden_layers=3,
    type_vocab_size=1
)

# Randomly initialised masked-language model built from the config above.
model = RobertaForMaskedLM(config=config)

## Training Config

In [6]:
# Training hyper-parameters: a single epoch, checkpoints written to ./MalBERT
# every 10,000 steps (at most two kept), metrics reported to W&B.
train_args = TrainingArguments(
    report_to="wandb",
    output_dir="./MalBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64, 
    prediction_loss_only=True,  
    save_steps=10_000, 
    save_total_limit=2,
)

# Masked-language-modelling collator; 0.15 is the masking probability it is given.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=roberta_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Callback that logs sample predictions once training has finished.
callback = LogPredictionsCallback(EVAL_DS_PATH, roberta_tokenizer)

trainer = Trainer(
    args=train_args, 
    model=model,
    data_collator=data_collator,
    processing_class=roberta_tokenizer,
    train_dataset=dataset['train'], 
    eval_dataset=dataset['test'],
)

trainer.add_callback(callback)

## Start training

In [7]:
# Run the training loop configured on `trainer`.
trainer.train()
# Write the trained model to ./MalBERT.
trainer.save_model("./MalBERT")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlainon[0m ([33mhenry-williams[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,3.3363
1000,2.0923
1500,1.8523
2000,1.6865
2500,1.6511
3000,1.5754
3500,1.5532
4000,1.4868
4500,1.4912
5000,1.4622


[34m[1mwandb[0m: Adding directory to artifact (./MalBERT/checkpoint-9846)... Done. 1.1s
[34m[1mwandb[0m: Adding directory to artifact (./MalBERT/checkpoint-9846)... Done. 1.2s


# Evaluation

Predictions made from the evaluation set

In [13]:
# Read the logged sample table from disk; the context manager closes the file
# handle instead of leaving it open until garbage collection.
with open("artifacts/run-159ev9rd-sample:v0/sample.table.json", "r") as table_file:
    data = json.load(table_file)

# Rebuild the table from its 'columns' and 'data' entries; as the last
# expression in the cell it is rendered as the cell output.
pd.DataFrame(columns=data['columns'], data=data['data'])

Unnamed: 0,Input,Actual,Predicted
0,"lea edx, [rip<mask> 0x","lea edx, [rip + 0x","lea edx, [rip + 0x"
1,mov word gs:[eax - 0<mask>,mov word gs:[eax - 0x,mov word gs:[eax - 0x
2,jo 0x<mask>1113,jo 0x140051113,jo 0x140001113
3,<mask>,das,iretd
4,"<mask> edx, dword [rsp + 0","mov edx, dword [rsp + 0","mov edx, dword [rsp + 0"
5,mov qword [rsp063 0x<mask>,mov qword [rsp063 0x10,mov qword [rsp063 0x10
6,"xor al, 0xf<mask>","xor al, 0xf7","xor al, 0xf8"
7,jmp qword [rsi +<mask>x3,jmp qword [rsi + 0x3,jmp qword [rsi + 0x3
8,fld dword [edi<mask> 0x1,fld dword [edi - 0x1,fld dword [edi - 0x1
9,"mov rcx, qword [rdi<mask> 8","mov rcx, qword [rdi + 8","mov rcx, qword [rdi + 8"
