<a href="https://colab.research.google.com/github/Nicoa1409031501/Niao/blob/main/MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multilayer Perceptron (MLP) 範例
- 基於惡意程式的系統調用 (API call) 序列，預測惡意程式家族

In [None]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from random import shuffle

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under it (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


### 載入原始資料集並將其切割成 trainset, validset, testset

In [None]:
def split_dataset(family2sample, seed=None):
    """Split samples into train/valid/test sets, stratified by family.

    Each family's samples are shuffled and then cut 80% / 10% / 10%, so every
    family contributes to all three sets in roughly the same proportion.

    Args:
        family2sample: dict mapping a family name to a list of sample names.
            The lists are left untouched.
        seed: optional seed for a private random generator, which makes the
            split reproducible. When None, the module-level `shuffle` (global
            random state) is used, exactly as before.

    Returns:
        Tuple (trainset, validset, testset) of flat lists of sample names.
    """
    # Local import: the file only imports `shuffle` from `random`.
    from random import Random

    rng = Random(seed) if seed is not None else None
    trainset, validset, testset = [], [], []

    for samples in family2sample.values():
        # Shuffle a copy; shuffling in place would silently reorder the
        # caller's lists stored in family2sample.
        shuffled = list(samples)
        if rng is None:
            shuffle(shuffled)
        else:
            rng.shuffle(shuffled)

        train_end = int(len(shuffled) * 0.8)
        valid_end = int(len(shuffled) * 0.9)
        trainset.extend(shuffled[:train_end])
        validset.extend(shuffled[train_end:valid_end])
        testset.extend(shuffled[valid_end:])
    return trainset, validset, testset

In [None]:
# Preload the dataset.
# NOTE: adjust the two paths below to wherever your copy of the dataset is stored.
trace_path = glob('/content/drive/MyDrive/demo_dataset/trace/*.trace')
df_label = pd.read_csv('/content/drive/MyDrive/demo_dataset/label.csv')

# Distinct family names, then the sample names that belong to each family.
families = list(df_label['family'].unique())
family2sample = {
    family: df_label.loc[df_label['family'] == family, 'sample'].tolist()
    for family in families
}
# Reverse lookup: sample name -> family name.
sample2family = dict(zip(df_label['sample'], df_label['family']))

# Split the samples into trainset, validset and testset.
trainset, validset, testset = split_dataset(family2sample)

In [None]:
# Display the distinct family labels read from label.csv.
families

['Emotet', 'Fareit', 'Gandcrab', 'Lokibot', 'Tofsee']

In [None]:
# Peek at the first 20 API calls recorded in the first trace file.
with open(trace_path[0]) as fp:
  trace = list(map(str.strip, fp))
trace[:20]

['NtDelayExecution',
 'NtAllocateVirtualMemory',
 'RegOpenKeyExW',
 'RegQueryValueExW',
 'RegCloseKey',
 'NtAllocateVirtualMemory',
 'NtAllocateVirtualMemory',
 'GetSystemWindowsDirectoryW',
 'NtOpenFile',
 'NtQueryInformationFile',
 'NtClose',
 'RegOpenKeyExW',
 'RegQueryValueExW',
 'RegCloseKey',
 'RegOpenKeyExW',
 'RegQueryValueExW',
 'RegCloseKey',
 'LdrGetDllHandle',
 'LdrGetProcedureAddress',
 'NtFreeVirtualMemory']

### 資料集輸入特徵預處理

In [None]:
# Collect the API-call sequence of every sample, keyed by sample name.
sample2api = {}
for p in tqdm(trace_path):
  # Sample name = file name without its directory and '.trace' suffix.
  s = p.split('/')[-1].split('.trace')[0]
  with open(p) as fp:
    apis = [l.strip() for l in fp]
  sample2api[s] = apis

# Input feature preprocessing (one-hot encoding).
# Accumulate the vocabulary directly in a set instead of first building one
# huge list of every call from every trace.
api_vocab = set()
for s in tqdm(sample2api):
  api_vocab.update(sample2api[s])
# Sort the vocabulary: iterating a set of strings has no stable order across
# interpreter runs, so an unsorted list would give every run a different
# API -> index mapping (and make saved weights unusable in a later session).
all_api = sorted(api_vocab)
# Row i of the identity matrix is the one-hot vector of the API with index i.
api_onehot = np.eye(len(all_api))

# Lookup tables between API names / family labels and their integer indices.
idx2api = {i:a for i, a in enumerate(all_api)}
api2idx = {a:i for i, a in enumerate(all_api)}
idx2label = {i:f for i, f in enumerate(families)}
label2idx = {f:i for i, f in enumerate(families)}

100%|██████████| 4126/4126 [01:57<00:00, 35.10it/s] 
100%|██████████| 4126/4126 [00:01<00:00, 2599.09it/s]


### 製作 torch dataset


In [None]:
def data_processed(dataset, label, padded_len=100):
    """Turn API-call sequences into fixed-length, flattened one-hot vectors.

    Reads the module-level lookup tables `api_onehot`, `api2idx` and
    `label2idx`.

    Args:
        dataset: dict mapping a sample name to its list of API names.
        label: dict mapping a sample name to its family name.
        padded_len: number of API calls kept per sample. Longer sequences are
            truncated; shorter ones are padded with all-zero vectors.

    Returns:
        List of dicts, one per sample, with keys:
            'api'   - float32 array of length padded_len * vocabulary size,
            'name'  - the sample name,
            'label' - the integer index of the sample's family.

    Raises:
        KeyError: if an API name or family is missing from the lookup tables.
    """
    # Take the vector width from the one-hot table itself rather than a
    # hard-coded vocabulary size, so padding always matches the encoding.
    vocab_size = api_onehot.shape[1]

    processed_dataset = []
    for s, apis in tqdm(dataset.items()):
        indices = np.array([api2idx[a] for a in apis[:padded_len]], dtype=np.intp)

        # Start from an all-zero (padded_len, vocab_size) matrix and fill the
        # leading rows in one indexed assignment; rows left at zero are the
        # padding. This avoids re-copying the array once per API call.
        features = np.zeros((padded_len, vocab_size), dtype='float32')
        features[:len(indices)] = api_onehot[indices]

        processed_dataset.append({
            'api': features.reshape(-1),
            'name': s,
            'label': label2idx[label[s]]
        })
    return processed_dataset

In [None]:
processed_dataset = data_processed(sample2api, sample2family)

# Membership tests against lists are linear scans; converting the split name
# lists to sets makes routing each sample a constant-time lookup.
train_names, valid_names, test_names = set(trainset), set(validset), set(testset)

# Route every processed sample to the split its name was assigned to.
# Samples whose name appears in none of the splits are dropped.
train_dataset, valid_dataset, test_dataset = [], [], []
for data in processed_dataset:
    if data['name'] in train_names:
        train_dataset.append(data)
    elif data['name'] in valid_names:
        valid_dataset.append(data)
    elif data['name'] in test_names:
        test_dataset.append(data)

# batch_size=64: each step draws 64 samples for training/validation/testing.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=True)

100%|██████████| 4126/4126 [00:03<00:00, 1220.24it/s]


## 訓練 Model

In [None]:
# Define the MLP model.
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features per sample.
        output_size: number of output scores (one per class).
        hidden_size: width of the hidden layer. Defaults to 100, the value
            that was previously hard-coded.
    """

    def __init__(self, input_size, output_size, hidden_size=100):
        super().__init__()
        # The attribute name `mlp` is part of the state_dict keys; keep it.
        self.mlp = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for input `x`."""
        return self.mlp(x)

In [None]:
# Every sample is a fixed-length sequence of 100 API calls; each call is a
# one-hot vector over the API vocabulary (244 entries for this dataset), and
# the whole sequence is flattened into a single input vector.
padded_len = 100
input_size = padded_len * len(all_api) # 100*244 = 24400
output_size = len(families)

model = MLP(input_size, output_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 30
for epoch in range(epochs):
  # ---- training pass: one optimizer step per batch ----
  model.train()
  train_losses = []
  for batch in train_loader:
    loss = loss_fn(model(batch['api']), batch['label'])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

  # ---- validation pass: no gradient tracking, model in eval mode ----
  model.eval()
  valid_losses = []
  correct, total = 0, 0
  with torch.no_grad():
    for batch in valid_loader:
      labels = batch['label']
      outputs = model(batch['api'])
      valid_losses.append(loss_fn(outputs, labels).item())

      # Predicted class = index of the largest score in each row.
      predicted = torch.max(outputs, 1)[1]
      total += labels.size(0)
      correct += (predicted == labels).sum().item()
  # Losses are averaged over batches; accuracy is a percentage over samples.
  print(f'epoch:{epoch} | train loss:{np.mean(train_losses):.4f} | valid loss:{np.mean(valid_losses):.4f} | valid acc:{100*correct/total:.3f}')

epoch:0 | train loss:0.5044 | valid loss:0.2358 | valid acc:88.378
epoch:1 | train loss:0.1735 | valid loss:0.2051 | valid acc:89.104
epoch:2 | train loss:0.1417 | valid loss:0.2218 | valid acc:92.010
epoch:3 | train loss:0.1299 | valid loss:0.2074 | valid acc:90.315
epoch:4 | train loss:0.1251 | valid loss:0.2029 | valid acc:92.252
epoch:5 | train loss:0.1186 | valid loss:0.2170 | valid acc:91.768
epoch:6 | train loss:0.1208 | valid loss:0.1937 | valid acc:92.252
epoch:7 | train loss:0.1203 | valid loss:0.2114 | valid acc:90.557
epoch:8 | train loss:0.1153 | valid loss:0.2378 | valid acc:92.010
epoch:9 | train loss:0.1144 | valid loss:0.2179 | valid acc:92.494
epoch:10 | train loss:0.1143 | valid loss:0.2009 | valid acc:92.010
epoch:11 | train loss:0.1142 | valid loss:0.2042 | valid acc:92.494
epoch:12 | train loss:0.1177 | valid loss:0.2018 | valid acc:92.252
epoch:13 | train loss:0.1131 | valid loss:0.2237 | valid acc:90.557
epoch:14 | train loss:0.1128 | valid loss:0.2116 | valid a