In [1]:
import datasets 
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer, trainers, models
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaForMaskedLM, RobertaConfig
from transformers import PreTrainedTokenizerFast, RobertaTokenizerFast

import os 
from typing import List
from collections import defaultdict

# Directory holding the pre-processed opcode ``.txt`` files. The default is the
# original author's local volume; set MALBERTA_DATA_PATH to point elsewhere
# without editing the notebook.
DATA_PATH = os.environ.get(
    "MALBERTA_DATA_PATH",
    "/Volumes/New Volume/malware-detection-dataset/opcodes/processed-data",
)
# Number of tokens per training chunk (used for padding/truncation and to size
# the model's position embeddings further down).
MAX_LENGTH = 64

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# NOTE(review): this cell carries execution count In [5], but `tokenizer` is
# only assigned in the In [6] cell below -- it appears to have been run out of
# order and would fail on a fresh "Restart & Run All". Confirm and move/remove.
tokenizer.vocab_size

1293

In [2]:
def get_data(path: os.PathLike, full_path: bool = True) -> List[str]:
    """List the opcode ``.txt`` files found directly inside ``path``.

    Only entries ending in ``.txt`` are kept, and entries starting with ``._``
    are skipped. The same filter is applied whether or not full paths are
    requested, and the result is sorted so that downstream steps see a
    deterministic order (``os.listdir`` makes no ordering guarantee).

    Args:
        path: Directory to scan (not recursive).
        full_path: If True, return ``os.path.join(path, name)`` for each file;
            otherwise return the bare file names.

    Returns:
        Sorted list of matching file paths or file names.
    """
    names = sorted(
        name
        for name in os.listdir(path)
        if name.endswith('.txt') and not name.startswith("._")
    )

    if full_path:
        return [os.path.join(path, name) for name in names]
    return names

def get_labels(filenames):
    """Derive a binary label for each file from its name.

    A file is labelled 1 when its base name contains ``"VirusShare"`` and 0
    otherwise. Only the final path component is inspected, so a parent
    directory that happens to contain ``"VirusShare"`` does not turn every
    file into a positive.

    Args:
        filenames: Iterable of file names or paths.

    Returns:
        List of 0/1 labels, in the same order as ``filenames``.
    """
    return [
        1 if "VirusShare" in os.path.basename(filename) else 0
        for filename in filenames
    ]

# Collect the opcode files under DATA_PATH and derive one 0/1 label per file
# from its name (1 when the name contains "VirusShare").
paths = get_data(DATA_PATH)
labels = get_labels(paths)

In [3]:
class OpcodeDataset(Dataset):
    """Map-style dataset that lazily reads one opcode file per item.

    Each item is a ``(text, label)`` tuple where ``text`` is the file's lines,
    right-stripped and joined with single spaces, and ``label`` is the entry of
    ``labels`` at the same index.
    """

    def __init__(self, paths, labels):
        """Store parallel sequences of file paths and labels.

        Args:
            paths: Sequence of opcode file paths.
            labels: Sequence of labels, one per path.
        """
        assert len(paths) == len(labels), "Mismatch between number of files and labels"
        self.paths = paths
        self.labels = labels

    def __len__(self):
        """Return the number of files in the dataset."""
        return len(self.paths)

    def __getitem__(self, idx):
        """Read file ``idx`` and return ``(space-joined opcodes, label)``.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``. An explicit
                IndexError (rather than an assert) is what the sequence
                iteration protocol relies on to stop ``for item in dataset``,
                and it still fires when Python runs with assertions disabled.
        """
        if not 0 <= idx < len(self):
            raise IndexError("Index out of range")

        with open(self.paths[idx], 'r') as file:
            # One opcode per line; drop the trailing newline/whitespace.
            opcodes = [line.rstrip() for line in file]

        return ' '.join(opcodes), self.labels[idx]

# Wrap the file list and labels; file contents are only read on item access.
opcode_dataset = OpcodeDataset(paths, labels)

In [6]:
# Train a word-level tokenizer over the opcode files on first run, otherwise
# reload the saved one.
#
# The cache check looks for `tokenizer_config.json` (written by
# `save_pretrained` below) instead of just the directory: `./MalBERTa` is also
# the Trainer's output directory, so the directory alone existing does not
# mean a complete tokenizer has been saved there.
if not os.path.exists('./MalBERTa/tokenizer_config.json'):
    # On a fresh run the target directory does not exist yet, and
    # `Tokenizer.save` writes straight to the given path, so create it first.
    os.makedirs('MalBERTa', exist_ok=True)

    tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))
    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = Whitespace()

    # The special-token order fixes their ids: <s>=0, <pad>=1, </s>=2,
    # <unk>=3, <mask>=4.
    trainer = trainers.WordLevelTrainer(
        vocab_size=1293,
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )
    tokenizer.train(paths, trainer)
    tokenizer.save('MalBERTa/tokenizer.json')

    # Wrap the raw tokenizer so it exposes the transformers tokenizer API
    # (padding, truncation, special-token attributes).
    hf_tokenizer = PreTrainedTokenizerFast(
        tokenizer_file="MalBERTa/tokenizer.json",
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        mask_token="<mask>"
    )
    hf_tokenizer.save_pretrained("MalBERTa")
    tokenizer = hf_tokenizer
else:
    tokenizer = PreTrainedTokenizerFast.from_pretrained("MalBERTa")

In [7]:
def dataset_generator():
    """Yield one ``{"text": ..., "label": ...}`` record per item of ``opcode_dataset``.

    Iteration is wrapped in ``tqdm`` so progress is shown while files are read.
    """
    for sample in tqdm(opcode_dataset):
        text, label = sample
        yield dict(text=text, label=label)
# Build the raw text/label dataset once and cache it on disk; later runs load
# the cached copy (including its train/test split) instead of re-reading files.
if not os.path.exists('./data/raw'):
    dataset = datasets.Dataset.from_generator(dataset_generator)
    # Fixed seed so that rebuilding the cache reproduces the same 80/20 split.
    dataset = dataset.train_test_split(test_size=0.2, seed=42)
    dataset.save_to_disk("data/raw")
else:
    dataset = datasets.load_from_disk("./data/raw")

In [9]:
def handle_sample(sample):
    """Tokenize a batch of documents into fixed-length chunks.

    Each text in ``sample['text']`` is tokenized on its own with truncation to
    ``MAX_LENGTH``, padding to ``MAX_LENGTH`` and overflow returned, so one
    document may produce several rows. Every row carries the label of the
    document it was cut from.

    Args:
        sample: Batched mapping with parallel ``'text'`` and ``'label'`` lists.

    Returns:
        Dict of column name -> list, one entry per chunk: every key returned
        by the tokenizer plus ``'label'``.
    """
    columns = defaultdict(list)

    for document, target in zip(sample['text'], sample['label']):
        encoding = tokenizer(
            document,
            padding='max_length',
            max_length=MAX_LENGTH,
            return_overflowing_tokens=True,
            truncation=True
        )

        n_chunks = len(encoding['input_ids'])
        if not n_chunks:
            continue

        # Copy the per-chunk values of every tokenizer field, then repeat the
        # document label once per chunk.
        for field in encoding:
            columns[field].extend(encoding[field][:n_chunks])
        columns['label'].extend([target] * n_chunks)

    return dict(columns)

# Tokenize/chunk every split. `handle_sample` returns one row per chunk rather
# than one per input document, so the original columns are dropped
# (NOTE(review): this uses the test split's column names for all splits --
# presumably identical to the train split's; confirm).
processed_dataset = dataset.map(
    handle_sample,
    remove_columns=dataset['test'].column_names,
    batch_size=64,  # documents handed to handle_sample per call
    batched=True,  # handle_sample receives lists of texts/labels
    num_proc=8,  # worker processes used by map
)


Map (num_proc=8): 100%|██████████| 5552/5552 [03:05<00:00, 29.96 examples/s]
Map (num_proc=8): 100%|██████████| 1388/1388 [00:48<00:00, 28.73 examples/s]


In [10]:
# Small RoBERTa configured for masked-language-model pre-training on the
# opcode chunks. Special-token ids are taken from the tokenizer rather than
# left to RobertaConfig's defaults, so the model and tokenizer cannot silently
# disagree if the tokenizer's special-token order ever changes.
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    # RoBERTa numbers positions starting at padding_idx + 1, so a sequence of
    # MAX_LENGTH tokens needs MAX_LENGTH + padding_idx + 1 position slots
    # (MAX_LENGTH + 2 with the current <pad> id of 1).
    max_position_embeddings=MAX_LENGTH + tokenizer.pad_token_id + 1,
    num_attention_heads=4,
    num_hidden_layers=4,
    type_vocab_size=1,
    hidden_size=128,
    intermediate_size=2048,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

model = RobertaForMaskedLM(config=config)
# Masks 15% of tokens per batch and builds the MLM labels.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# The malware/benign label is not used for MLM pre-training.
train_ds = processed_dataset['train'].remove_columns('label')
test_ds = processed_dataset['test'].remove_columns('label')

train_args = TrainingArguments(
    output_dir="./MalBERTa",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=train_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlainon[0m ([33mhenry-williams[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,4.7885
1000,3.6554
1500,3.4939
2000,3.4126
2500,3.3506
3000,3.319
3500,3.2799
4000,3.2642
4500,3.2475
5000,3.2309


KeyboardInterrupt: 