In [None]:
import torch
import r2pipe
import numpy as np
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast
from transformers import RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import os 
import warnings
from os import PathLike


# Silence every Python warning for the rest of the session so library
# deprecation chatter does not clutter the cell outputs.
warnings.filterwarnings("ignore")

# Binary to classify. The default is the original hard-coded sample location;
# set the MALBERTA_SAMPLE_PATH environment variable to analyse another file
# without editing the notebook.
SAMPLE_PATH = os.environ.get(
    "MALBERTA_SAMPLE_PATH",
    "/Volumes/New Volume/malware-detection-dataset/malware-source-code/DanielStott-CryptoLite/x64/Debug/CryptoLite.exe",
)

# Local checkpoint directories, relative to the notebook's working directory.
# NOTE(review): the two directory names differ in casing ("MalBERTa" vs
# "MalBERTA") - confirm both match the on-disk names on case-sensitive
# filesystems.
TOKENIZER_DIR = '../MalBERTa'
CLASSIFIER_DIR = '../MalBERTA-pretrained-classifier'

# Tokenizer and sequence classifier used by the inference cells below.
tokenizer = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_DIR)
model = RobertaForSequenceClassification.from_pretrained(CLASSIFIER_DIR)

In [None]:
def get_disassembly(path: PathLike) -> list:
    """Disassemble every executable section of a binary with radare2.

    Opens the file through r2pipe, runs radare2's ``aaa`` analysis, and then
    disassembles each section whose permission string contains ``"x"`` using
    ``pdaj <vsize> @ <vaddr>``. Entries whose ``bytes`` field consists solely
    of the character ``"0"`` (zero padding) are dropped.

    Args:
        path: Location of the binary to disassemble (``str`` or path-like).

    Returns:
        A list of the instruction dicts returned by ``pdaj`` across all
        executable sections, in section order. Returns an empty list when
        radare2 does not report the architecture as ``"x86"`` or produces no
        usable JSON.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Could not find specified file at {path}")

    # r2pipe works with plain strings; normalise path-like objects first.
    r2 = r2pipe.open(os.fspath(path))
    try:
        r2.cmd("aaa")

        # cmdj() yields None when radare2's output is not valid JSON, so every
        # result is defaulted to an empty container before it is inspected.
        info = r2.cmdj("ij") or {}
        if info.get("bin", {}).get("arch") != "x86":
            return []

        section_info = r2.cmdj("iSj") or []
        executable_sections = [
            section for section in section_info if "x" in section.get("perm", "")
        ]

        full_disassembly = []
        for section in executable_sections:
            start = section["vaddr"]
            size = section["vsize"]

            disassembly = r2.cmdj(f"pdaj {size} @ {start}") or []

            # Skip all-zero byte runs (padding) that carry no opcode signal.
            full_disassembly.extend(
                instr for instr in disassembly if set(instr["bytes"]) != {"0"}
            )

        return full_disassembly
    finally:
        # Always terminate the radare2 child process, even on early return or
        # error, so repeated calls do not leak processes.
        r2.quit()

# Disassemble the sample, keep only the mnemonic (first space-separated token)
# of each instruction, discard radare2's "invalid" placeholders, and join the
# rest into one space-separated string for the tokenizer.
disassembly = get_disassembly(SAMPLE_PATH)
opcodes = ' '.join(
    mnemonic
    for mnemonic in (instr['inst'].split(' ')[0] for instr in disassembly)
    if mnemonic != "invalid"
)


In [None]:
# Tokenize the opcode stream into fixed windows of 32 tokens. With
# truncation plus return_overflowing_tokens, the text beyond the first window
# is returned as additional windows rather than being discarded, and
# padding='max_length' pads every window to the same length so they can be
# stacked into one tensor.
# NOTE(review): the name `input` shadows the Python builtin; it is kept here
# only so any other cell that reads it keeps working.
input = tokenizer(
    opcodes,
    padding='max_length',
    max_length=32,
    return_overflowing_tokens=True,
    truncation=True,
    return_special_tokens_mask=True,
)

# Inference only: disable autograd so no computation graph is retained for
# the (potentially very large) batch of windows. Tensors are passed by
# keyword so the call does not depend on the positional order of forward().
with torch.no_grad():
    labels = model(
        input_ids=torch.tensor(input['input_ids']),
        attention_mask=torch.tensor(input['attention_mask']),
    )

In [None]:
# Predicted class index for each 32-token window.
window_predictions = labels.logits.argmax(dim=-1)

# Average the per-window class indices and round to obtain one verdict for
# the whole sample (a majority vote, assuming a two-class model - TODO confirm).
window_predictions.mean(dtype=torch.float).round()