# Setup 

In [None]:
import datasets
from tokenizers import ByteLevelBPETokenizer 
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import RobertaConfig,RobertaTokenizerFast, RobertaForMaskedLM
from sklearn.model_selection import train_test_split

import os 
# Weights & Biases settings picked up by the Hugging Face Trainer integration
# (project name, and checkpoint logging as model artifacts).
os.environ["WANDB_PROJECT"] = "malbert-hf"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

# Config options
MAX_LENGTH = 64     # number of tokens every example is truncated/padded to
VOCAB_SIZE = 10000  # target size of the BPE vocabulary

# Base directory of dataset
DATA_PATH = "/Volumes/New Volume/malware-detection-dataset/opcodes/disasm"

# Full paths of the input files. Entries whose name starts with "._" are
# skipped (presumably macOS AppleDouble metadata files -- confirm), as are
# sub-directories, which the tokenizer/dataset loaders cannot read as text.
# The result is sorted because directory listing order is arbitrary and the
# later train/test split should not depend on it.
with os.scandir(DATA_PATH) as entries:
    files = sorted(
        entry.path
        for entry in entries
        if entry.is_file() and not entry.name.startswith("._")
    )

# Tokenizer

In [None]:
# Create the output directory in plain Python rather than with the IPython
# `!mkdir` shell escape: this also works outside a notebook and does not
# fail when the cell is re-run and the directory already exists.
os.makedirs("MalBERT", exist_ok=True)

# Train a byte-level BPE tokenizer on the raw files. The special tokens are
# listed in the order RoBERTa expects (<s>=0, <pad>=1, </s>=2, ...), since
# ids are assigned in list order.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=files,
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Writes the vocabulary and merges files into the MalBERT directory.
tokenizer.save_model("MalBERT")

# Dataset

In [None]:
# Reload the trained BPE files as a fast RoBERTa tokenizer.
# `model_max_length` is the documented name of the length limit; `max_len`
# is only a legacy alias for it.
roberta_tokenizer = RobertaTokenizerFast.from_pretrained(
    "./MalBERT", model_max_length=MAX_LENGTH
)


def tokenize_fn(line):
    """Tokenize a batch of text lines into fixed-length model inputs.

    Args:
        line: A batch from the ``text`` dataset; only its ``"text"`` entry
            is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        or padded to exactly ``MAX_LENGTH`` tokens.
    """
    return roberta_tokenizer(
        line["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

# Split at file level (scikit-learn's default 75/25 split). The input is
# sorted and the shuffle is seeded so the same files land in the same split
# on every run; otherwise the datasets saved below cannot be regenerated.
train_files, test_files = train_test_split(sorted(files), random_state=42)

# One example per line of each file, grouped into "train" and "test" splits.
raw_dataset = datasets.load_dataset(
    "text",
    data_files={
        "train": train_files,
        "test": test_files,
    },
)

# Keep an untokenized copy on disk.
raw_dataset.save_to_disk("data/raw")

In [None]:
# Tokenize both splits in batches of 1024 lines across 8 worker processes,
# dropping the raw "text" column so only the tokenizer outputs remain.
dataset = raw_dataset.map(
    tokenize_fn,
    batched=True,
    batch_size=1024,
    num_proc=8,
    remove_columns=["text"],
)

# Persist the tokenized splits next to the raw copy.
dataset.save_to_disk("data/tokenized")

# Train

In [None]:

# Small RoBERTa configuration for masked-language-model pre-training.
#
# RoBERTa numbers positions starting at padding_idx + 1 (= 2), so a sequence
# of MAX_LENGTH real tokens uses position ids up to MAX_LENGTH + 1. The
# position-embedding table therefore needs MAX_LENGTH + 2 rows; with only
# MAX_LENGTH rows a full-length sequence indexes past the end of the table.
config = RobertaConfig(
    vocab_size=VOCAB_SIZE,
    max_position_embeddings=MAX_LENGTH + 2,
    num_attention_heads=4,
    num_hidden_layers=3,
    type_vocab_size=1,
)

# Randomly initialised model (trained from scratch, no pretrained weights).
model = RobertaForMaskedLM(config=config)

# Builds MLM batches, selecting 15% of the tokens as prediction targets.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=roberta_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Training hyper-parameters, gathered in one mapping and expanded below.
training_options = {
    # Where checkpoints go; existing contents may be overwritten.
    "output_dir": "./MalBERT",
    "overwrite_output_dir": True,
    # Schedule and batch size.
    "num_train_epochs": 20,
    "per_device_train_batch_size": 64,
    # Checkpoint every 10k steps, keeping at most two on disk.
    "save_steps": 10_000,
    "save_total_limit": 2,
    # Only the loss is reported when evaluating.
    "prediction_loss_only": True,
    # Send metrics to Weights & Biases.
    "report_to": ["wandb"],
}
train_args = TrainingArguments(**training_options)

# Wire the model, the data and the MLM collator into a Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    processing_class=roberta_tokenizer,
)

In [None]:
# Run the full training loop configured above.
trainer.train()

# Write the final model into the same directory used for checkpoints.
trainer.save_model(output_dir="./MalBERT")