In [1]:
import torch
import wandb 
import r2pipe 
import numpy as np 
from tqdm import trange, tqdm
import torch.nn.functional as F
from sklearn.metrics import classification_report
from transformers import RobertaForSequenceClassification, PreTrainedTokenizerFast

import os
import random
import tempfile
from os import PathLike

device = torch.device('mps')

os.environ['TOKENIZERS_PARALLELISM'] = 'false'
api = wandb.Api() 
runs = {
    'frozen_pretrained': "t4o7wvla",
    'pretrained': "e1tosi4k",
    'base': "e96b8h5a"
}

MAL_PATH = "/Volumes/malware-dataset/unobfuscated-exes/"
BEN_PATH = "/Volumes/malware-dataset/benign_program_dataset_WinXP_SP3/benign_programs_WinXP_SP3"
tokenizer = PreTrainedTokenizerFast.from_pretrained("/Users/henrywilliams/Documents/programming/python/ai/malbert-test/MalBERTa")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_model(id):
    with tempfile.TemporaryDirectory() as tdir: 
        artifact = api.artifact(f'henry-williams/opcode-malberta/model-{id}:v1', type='model')
        base_path = artifact.download(root=tdir)

        return RobertaForSequenceClassification.from_pretrained(base_path)

model = get_model(runs['base']).to(device)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Downloading large artifact model-e96b8h5a:v1, 87.83MB. 7 files... 
[34m[1mwandb[0m:   7 of 7 files downloaded.  
Done. 0:0:0.6


In [3]:
def get_disassm(path: PathLike):
    r2 = r2pipe.open(path, ['-12'])
    r2.cmd("aaa")

    info = r2.cmdj("ij")

    if info["bin"]["arch"] != "x86":
        return []

    section_info = r2.cmdj("iSj")
    executable_sections = [
        section for section in section_info if "x" in section.get("perm", "")
    ]

    full_disassembly = []

    for section in executable_sections:
        start = section["vaddr"]
        size = section["vsize"]

        disassembly = r2.cmdj(f"pdaj {size} @ {start}")

        valid = [instr for instr in disassembly if set(instr["bytes"]) != {"0"}]
        full_disassembly.extend(valid)

    return full_disassembly


In [4]:
def make_opcode_sequences(instrs):
    valid_instrs = [
        instruction["inst"]
        for instruction in instrs
        if instruction["inst"] != "invalid"
    ]

    return ' '.join([instr.split(' ')[0] for instr in valid_instrs])

In [5]:
def tokenize(sample, model):
    seq_length = model.config.max_position_embeddings

    input = tokenizer(
        sample,
        padding='max_length',
        max_length=seq_length - 2,
        return_overflowing_tokens=True,
        truncation=True,
        return_special_tokens_mask=True,
        return_tensors='pt'
    )

    return input

In [7]:
def pipeline(model, path: os.PathLike, batch_size=64):
    if not os.path.exists(path):
        raise Exception(f"Could not find specified file at {path}")
    disassembly = get_disassm(path)
    opcodes = make_opcode_sequences(disassembly)
    input = tokenize(opcodes, model)

    model.eval()

    logits = []
    input_ids = input['input_ids'].split(batch_size)
    attention_mask = input['attention_mask'].split(batch_size)
    token_type_ids = input['token_type_ids'].split(batch_size)
    torch.mps.empty_cache()
    for ids, attn_mask, tok_ty_ids in zip(input_ids, attention_mask, token_type_ids):
        ids = ids.to(device)
        attn_mask = attn_mask.to(device)
        tok_ty_ids = tok_ty_ids.to(device)

        with torch.no_grad():
            logits.append(model(
                input_ids=ids,
                attention_mask=attn_mask,
                token_type_ids=tok_ty_ids
            ))
    
    logits = torch.vstack([logit.logits for logit in logits])
    return F.softmax(logits.mean(dim=0), dim=0)

# pipeline(model, os.path.join(MAL_PATH, '85.exe'))

In [8]:
ben_files = [os.path.join(BEN_PATH, file) for file in os.listdir(BEN_PATH) if file.endswith('.exe') and not file.startswith('._')]
mal_files = [os.path.join(MAL_PATH, file) for file in os.listdir(MAL_PATH) if file.endswith('.exe') and not file.startswith('._')]
random.shuffle(ben_files)
ben_files = ben_files[:len(mal_files)]

files = mal_files + ben_files 
labels = [1] * len(mal_files) + [0] * len(ben_files)

In [11]:
import json 

dataset = {f: l for f, l in zip(files, labels)}
with open('obfuscation-experiment-files.json', 'w') as file: 
    json.dump(dataset, file)

In [144]:
import subprocess

def experiment_step(model, file, p=1.0, batch_size=64):
    ''' 
    p is the likelihood of the file being obfuscated
    '''
    with tempfile.TemporaryDirectory() as tdir: 
        if random.random() < p: 
            obfuscated_path = os.path.join(tdir, file.split('/')[-1])
            subprocess.run(['upx', '-o', obfuscated_path, file], stdout=subprocess.DEVNULL)
            return pipeline(model, obfuscated_path, batch_size=batch_size)
        else: 
            return pipeline(model, file, batch_size=batch_size)

# experiment_step(model, ben_files[0])

In [145]:
def run_experiment(files, labels, model, p=0.0, batch_size=256):
    predicted = []
    actual = []

    for file, label in tqdm(zip(files, labels), total=len(files), leave=False, desc=f'p = {p:.2}'): 

        try: 
            logits = experiment_step(model, file, p=p, batch_size=batch_size)
        except: 
            continue 

        predicted.append(logits.argmax().item())
        actual.append(label)

    return classification_report(actual, predicted, output_dict=True)

In [146]:
obfuscation_likelihood = np.linspace(0, 1, 20)

results = []

for p in obfuscation_likelihood:
    results.append(run_experiment(files, labels, model, p))

p = 0.053:  18%|█▊        | 21/120 [01:20<07:09,  4.34s/it]upx: /Volumes/malware-dataset/unobfuscated-exes/HappyVirus_V_Edition.exe: CantPackException: .NET files are not yet supported
p = 0.053:  30%|███       | 36/120 [03:24<04:35,  3.28s/it]upx: /Volumes/malware-dataset/unobfuscated-exes/CryptoLocker_SI.exe: CantPackException: .NET files are not yet supported
p = 0.053:  33%|███▎      | 40/120 [03:45<08:25,  6.32s/it]upx: /Volumes/malware-dataset/unobfuscated-exes/zombie64.exe: CantPackException: .NET files are not yet supported
                                                            

KeyboardInterrupt: 

In [147]:
results

[{'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 60.0},
  '1': {'precision': 0.3103448275862069,
   'recall': 0.45,
   'f1-score': 0.3673469387755102,
   'support': 60.0},
  'accuracy': 0.225,
  'macro avg': {'precision': 0.15517241379310345,
   'recall': 0.225,
   'f1-score': 0.1836734693877551,
   'support': 120.0},
  'weighted avg': {'precision': 0.15517241379310345,
   'recall': 0.225,
   'f1-score': 0.1836734693877551,
   'support': 120.0}},
 {'0': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 60.0},
  '1': {'precision': 0.29411764705882354,
   'recall': 0.46296296296296297,
   'f1-score': 0.3597122302158273,
   'support': 54.0},
  'accuracy': 0.21929824561403508,
  'macro avg': {'precision': 0.14705882352941177,
   'recall': 0.23148148148148148,
   'f1-score': 0.17985611510791366,
   'support': 114.0},
  'weighted avg': {'precision': 0.1393188854489164,
   'recall': 0.21929824561403508,
   'f1-score': 0.17039000378644453,
   'support': 114.0}}]

In [156]:
experiment_step(model, os.path.join(MAL_PATH, 'davidTrojan.exe'), p=1.)

tensor([0.9348, 0.0652], device='mps:0')