<a href="https://colab.research.google.com/github/Nicoa1409031501/Niao/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recurrent Neural Network(RNN)範例
- 基於惡意程式系統調用(api call)序列預測惡意程式家族

## 練習重點
- 載入原始資料集並將其切割成 trainset, validset, testset
- 資料集輸入特徵預處理
- 製作 torch dataset
- 定義RNN model
- 訓練 model
- 測試 model

## 下載資料集
請先利用底下連結下載練習用資料集並解壓縮後放置到google drive上(對應使用colab的google帳號)  
[資料集](https://drive.google.com/file/d/1K9NtS_JcvRQnXh5nUxhpIXZaa2Nh4x8t/view?usp=sharing)  
[惡意程式trace範例](https://drive.google.com/file/d/1phZG6bPq5tvSKAirGfVouVlnLEkxnVFO/view?usp=sharing)

## Appendix=>Malware behavior trace 範例

In [None]:
# Jcry: load the JSON behavior trace of one sample.
# The path points into Google Drive, so the drive must already be mounted at /content/drive.
import json

jcry_trace_file = '/content/drive/MyDrive/d7e118a3753a132fbedd262fdf4809a76ce121f758eb6c829d9c5de1ffab5a3b.json'
with open(jcry_trace_file) as trace_fp:
    trace = json.load(trace_fp)

In [None]:
# Jcry process tree: show the entry at index 1 of the recorded process tree.
process_tree = trace['behavior']['processtree']
process_tree[1]

{'track': True,
 'pid': 2932,
 'process_name': 'd7e118a3753a132fbedd262fdf4809a76ce121f758eb6c829d9c5de1ffab5a3b.exe',
 'command_line': '"C:\\Users\\Baka\\AppData\\Local\\Temp\\d7e118a3753a132fbedd262fdf4809a76ce121f758eb6c829d9c5de1ffab5a3b.exe" ',
 'first_seen': 1599493475.686914,
 'ppid': 2520,
 'children': [{'track': True,
   'pid': 1676,
   'process_name': 'wscript.exe',
   'command_line': '"C:\\Windows\\System32\\WScript.exe" "C:\\Users\\Baka\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\msg.vbs" ',
   'first_seen': 1599497832.904499,
   'ppid': 2932,
   'children': []},
  {'track': True,
   'pid': 912,
   'process_name': 'Enc.exe',
   'command_line': '"C:\\Users\\Baka\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\Enc.exe" ',
   'first_seen': 1599497833.029501,
   'ppid': 2932,
   'children': [{'track': True,
     'pid': 3300,
     'process_name': 'vssadmin.exe',
     'command_line': 'vssadmin delete shadows /all',
     'first_seen'

In [None]:
# API call example: call record #980 of the process at index 1.
process_calls = trace['behavior']['processes'][1]['calls']
process_calls[980]

{'category': 'file',
 'status': 1,
 'stacktrace': [],
 'api': 'NtCreateFile',
 'return_value': 0,
 'arguments': {'create_disposition': 5,
  'file_handle': '0x0000015c',
  'filepath': 'C:\\Users\\Baka\\AppData\\Roaming\\Microsoft\\Windows\\Start Menu\\Programs\\Startup\\Dec.exe',
  'desired_access': '0x40100080',
  'file_attributes': 0,
  'filepath_r': 'Dec.exe',
  'create_options': 96,
  'status_info': 2,
  'share_access': 1},
 'time': 1599493476.108914,
 'tid': 2124,
 'flags': {'create_disposition': 'FILE_OVERWRITE_IF',
  'desired_access': 'FILE_READ_ATTRIBUTES|SYNCHRONIZE|GENERIC_WRITE',
  'create_options': 'FILE_NON_DIRECTORY_FILE|FILE_SYNCHRONOUS_IO_NONALERT',
  'file_attributes': '',
  'status_info': 'FILE_CREATED',
  'share_access': 'FILE_SHARE_READ'}}

In [None]:
# Pick three call records of the process at index 1 and print, for each one,
# the API name followed by its target: the file path for 'file' calls,
# otherwise the command line.
process_calls = trace['behavior']['processes'][1]['calls']
calls = [*process_calls[1035:1037], process_calls[5711]]
for call in calls:
    print(call['api'])
    target_key = 'filepath' if call['category'] == 'file' else 'command_line'
    print(call['arguments'][target_key])

GetFileAttributesW
C:\Users\Baka\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Startup\Enc.exe
NtCreateFile
C:\Users\Baka\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Startup\Enc.exe
CreateProcessInternalW
"C:\Users\Baka\AppData\Roaming\Microsoft\Windows\Start Menu\Programs\Startup\Enc.exe" 


## 正文開始

In [None]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from random import shuffle

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 載入原始資料集並將其切割成 trainset, validset, testset

In [None]:
def split_dataset(family2sample, seed=None):
    """Split samples into train/valid/test sets (80% / 10% / 10%) per family.

    The split is stratified: every family is shuffled and cut on its own, so
    each family contributes roughly the same proportions to every subset.

    Args:
        family2sample: mapping ``family name -> list of sample names``.
            The lists are NOT modified; a copy is shuffled instead, so calling
            this function again (e.g. re-running the cell) starts from the
            same input.
        seed: optional integer. When given, the shuffle uses a private
            ``random.Random(seed)`` and the split is reproducible. When
            ``None`` (default) the module-level ``random`` generator is used,
            exactly as before.

    Returns:
        Tuple ``(trainset, validset, testset)`` of flat lists of sample names.
    """
    import random

    rng = random if seed is None else random.Random(seed)

    trainset = []
    validset = []
    testset = []

    for family, samples in family2sample.items():
        # Shuffle a copy so the caller's lists keep their original order.
        shuffled = list(samples)
        rng.shuffle(shuffled)

        # Cut points: first 80% -> train, next 10% -> valid, remainder -> test.
        train_end = int(len(shuffled) * 0.8)
        valid_end = int(len(shuffled) * 0.9)

        trainset.extend(shuffled[:train_end])
        validset.extend(shuffled[train_end:valid_end])
        testset.extend(shuffled[valid_end:])
    return trainset, validset, testset

In [None]:
# Preload the dataset.
# NOTE: adjust the two paths below (trace directory and label.csv) to wherever
# you stored the dataset on your own Google Drive.
trace_path = glob('/content/drive/MyDrive/demo_dataset/trace/*.trace')
df_label = pd.read_csv('/content/drive/MyDrive/demo_dataset/label.csv')

# Distinct family names, then family -> list of its sample names.
families = list(df_label['family'].unique())
family2sample = {
    family: df_label.loc[df_label['family'] == family, 'sample'].tolist()
    for family in families
}
# Reverse lookup: sample name -> family name.
sample2family = dict(zip(df_label['sample'], df_label['family']))

# Split the dataset into trainset, validset and testset.
trainset, validset, testset = split_dataset(family2sample)

In [None]:
# Display the distinct family names found in the 'family' column of label.csv.
families

['Emotet', 'Fareit', 'Gandcrab', 'Lokibot', 'Tofsee']

In [None]:
# Peek at the first trace file: one API name per line; show the first 20 entries.
with open(trace_path[0]) as trace_file:
    trace = [line.strip() for line in trace_file]
trace[:20]

['NtDelayExecution',
 'NtAllocateVirtualMemory',
 'RegOpenKeyExW',
 'RegQueryValueExW',
 'RegCloseKey',
 'NtAllocateVirtualMemory',
 'NtAllocateVirtualMemory',
 'GetSystemWindowsDirectoryW',
 'NtOpenFile',
 'NtQueryInformationFile',
 'NtClose',
 'RegOpenKeyExW',
 'RegQueryValueExW',
 'RegCloseKey',
 'RegOpenKeyExW',
 'RegQueryValueExW',
 'RegCloseKey',
 'LdrGetDllHandle',
 'LdrGetProcedureAddress',
 'NtFreeVirtualMemory']

## 資料集輸入特徵預處理

In [None]:
# Collect the input of every sample: sample name -> list of API names.
# The sample name is the file name with the '.trace' suffix removed.
sample2api = {}
for p in tqdm(trace_path):
    s = p.split('/')[-1].split('.trace')[0]
    with open(p) as fp:
        sample2api[s] = [l.strip() for l in fp]

# Build the API vocabulary from every sequence.
api_vocab = set()
for s in tqdm(sample2api):
    api_vocab.update(sample2api[s])
# Sort the vocabulary: iterating a set of strings yields a different order on
# every kernel restart (string hash randomisation), which would silently change
# api2idx between runs and make results / saved models non-reproducible.
all_api = sorted(api_vocab)

# Index <-> token lookup tables for API names and family labels.
idx2api = {i: a for i, a in enumerate(all_api)}
api2idx = {a: i for i, a in enumerate(all_api)}
idx2label = {i: f for i, f in enumerate(families)}
label2idx = {f: i for i, f in enumerate(families)}

100%|██████████| 4126/4126 [01:36<00:00, 42.97it/s] 
100%|██████████| 4126/4126 [00:00<00:00, 5459.45it/s]


## 製作 torch dataset


In [None]:
def data_processed(dataset, label, padded_len=500, pad_idx=None):
    """Turn raw API-name sequences into fixed-length index tensors.

    Every sequence is truncated to ``padded_len`` items, or right-padded with
    ``pad_idx`` up to ``padded_len`` items, and converted to a 1-D
    ``torch.long`` tensor.

    Reads the module-level lookup tables ``api2idx`` (API name -> index) and
    ``label2idx`` (family name -> index).

    Args:
        dataset: mapping ``sample name -> list of API names``.
        label: mapping ``sample name -> family name``.
        padded_len: length of every produced sequence (default 500).
        pad_idx: index used for padding. Defaults to ``len(api2idx)``, i.e. the
            first index after the real vocabulary, which is the slot the model
            reserves via ``padding_idx = n_vocab``. The previous hard-coded
            value (244) was only correct for a vocabulary of exactly 244 APIs.

    Returns:
        List of dicts with keys ``'api'`` (tensor of length ``padded_len``),
        ``'name'`` (sample name) and ``'label'`` (family index).

    Raises:
        KeyError: if an API name is missing from ``api2idx``, a sample is
            missing from ``label``, or its family is missing from ``label2idx``.
    """
    if pad_idx is None:
        pad_idx = len(api2idx)

    processed_dataset = []
    for s, apis in tqdm(dataset.items()):
        # Slicing handles both cases: long sequences are truncated, short ones
        # are kept whole and padded below.
        input_apis = [api2idx[a] for a in apis[:padded_len]]
        input_apis.extend([pad_idx] * (padded_len - len(input_apis)))

        processed_dataset.append({
            # Explicit dtype: nn.Embedding needs integer indices.
            'api': torch.tensor(input_apis, dtype=torch.long),
            'name': s,
            'label': label2idx[label[s]]
        })
    return processed_dataset

In [None]:
processed_dataset = data_processed(sample2api, sample2family)

# Use sets for the membership tests below: `name in list` is a linear scan,
# which makes the loop quadratic in the number of samples.
train_names, valid_names, test_names = set(trainset), set(validset), set(testset)

# Route every processed sample to the subset its name was assigned to.
# Samples whose name is in none of the three subsets are dropped.
train_dataset, valid_dataset, test_dataset = [], [], []
for data in processed_dataset:
    if data['name'] in train_names:
        train_dataset.append(data)
    elif data['name'] in valid_names:
        valid_dataset.append(data)
    elif data['name'] in test_names:
        test_dataset.append(data)

# batch_size=64: 64 samples are drawn per training / validation / test step.
# Only the training loader is shuffled; evaluation metrics do not depend on
# batch order, so valid/test are iterated in a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

100%|██████████| 4126/4126 [00:01<00:00, 2085.38it/s]


## 定義 RNN Model

In [None]:
# Define the RNN model
class RNN(nn.Module):
    """Embedding -> multi-layer unidirectional RNN -> linear classifier.

    Args:
        n_vocab: number of real tokens. The embedding table has ``n_vocab + 1``
            rows; index ``n_vocab`` is reserved as the padding index.
        embedding_dim: size of each token embedding.
        hidden_size: size of the RNN hidden state.
        layers: number of stacked RNN layers.
        output_size: number of output classes (logits).
        use_cuda: stored on the instance for callers; ``forward`` itself runs
            on whatever device the model parameters live on.
    """

    def __init__(self, n_vocab, embedding_dim, hidden_size, layers, output_size, use_cuda):
        super(RNN, self).__init__()

        self.use_cuda = use_cuda
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.embeddings = nn.Embedding(n_vocab+1, embedding_dim, padding_idx = n_vocab)
        self.RNN = nn.RNN(embedding_dim, hidden_size, num_layers = self.num_layers, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Compute class logits for a batch of index sequences.

        Args:
            x: integer tensor of shape ``(batch, seq_len)`` holding token indices.

        Returns:
            Tensor of shape ``(batch, output_size)`` with unnormalised logits.
        """
        # Follow the device of the model's own parameters. The previous code only
        # built the embedded input inside `if self.use_cuda:`, so a CPU run failed
        # with UnboundLocalError.
        device = self.embeddings.weight.device
        x = x.to(device)

        embedded = self.embeddings(x)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=device)

        # out = (N, L, D*H), hn = (D*num_layers, N, H)
        out, hn = self.RNN(embedded, h0)
        # Classify from the output of the last time step.
        # NOTE(review): if inputs are right-padded, this step corresponds to a
        # padding position for short sequences -- consider packed sequences.
        out = self.fc(out[:, -1, :])
        return out

## 訓練 Model

In [None]:
# ---- Hyperparameters ----
embedding_dim = 100
hidden_size = 64
num_layers = 2
n_vocab = len(all_api)          # vocabulary size (padding index is n_vocab)
output_size = len(families)     # one logit per malware family
use_cuda = torch.cuda.is_available()

# ---- Model, optimizer, loss ----
model = RNN(n_vocab, embedding_dim, hidden_size, num_layers, output_size, use_cuda)
if use_cuda:
    model = model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

# ---- Training loop: one optimisation pass and one validation pass per epoch ----
epochs = 30
for epoch in range(epochs):
    # Training pass
    model.train()
    train_losses = []
    for data in train_loader:
        labels = data['label'].cuda() if use_cuda else data['label']
        outputs = model(data['api'])
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # Validation pass (no gradients): accumulate loss and accuracy
    model.eval()
    correct, total = 0, 0
    valid_losses = []
    with torch.no_grad():
        for data in valid_loader:
            labels = data['label'].cuda() if use_cuda else data['label']
            outputs = model(data['api'])
            loss = loss_fn(outputs, labels)

            # Predicted class = index of the largest logit in each row
            predicted = torch.max(outputs.data, 1)[1]
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

            valid_losses.append(loss.item())
    print(f'epoch:{epoch} | train loss:{np.mean(train_losses):.4f} | valid loss:{np.mean(valid_losses):.4f} | valid acc:{100*correct/total:.3f}')

epoch:0 | train loss:0.6725 | valid loss:0.3845 | valid acc:87.651
epoch:1 | train loss:0.3777 | valid loss:0.3132 | valid acc:89.588
epoch:2 | train loss:0.2955 | valid loss:0.2999 | valid acc:89.104
epoch:3 | train loss:0.2999 | valid loss:0.3135 | valid acc:90.073
epoch:4 | train loss:0.2831 | valid loss:0.3091 | valid acc:89.831
epoch:5 | train loss:0.2739 | valid loss:0.2921 | valid acc:90.315
epoch:6 | train loss:0.2449 | valid loss:0.3506 | valid acc:90.799
epoch:7 | train loss:0.5209 | valid loss:0.3669 | valid acc:89.346
epoch:8 | train loss:0.3015 | valid loss:0.3188 | valid acc:90.073
epoch:9 | train loss:0.2728 | valid loss:0.3105 | valid acc:91.525
epoch:10 | train loss:0.2524 | valid loss:0.3107 | valid acc:90.557
epoch:11 | train loss:0.2481 | valid loss:0.2917 | valid acc:91.041
epoch:12 | train loss:0.2347 | valid loss:0.3274 | valid acc:92.010
epoch:13 | train loss:0.2894 | valid loss:0.2917 | valid acc:91.283
epoch:14 | train loss:0.2425 | valid loss:0.3103 | valid a

## 測試 Model

In [None]:
# Evaluate on the test set: collect predictions / ground truth and report accuracy.
model.eval()
correct, total = 0, 0
all_predict = []
all_label = []
with torch.no_grad():
    for data in test_loader:
        labels = data['label'].cuda() if use_cuda else data['label']
        outputs = model(data['api'])

        # Predicted class = index of the largest logit in each row
        predicted = torch.max(outputs.data, 1)[1]
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        all_predict.extend(predicted.tolist())
        all_label.extend(labels.tolist())

print(f'test acc:{100*correct/total:.3f}')

test acc:89.639


In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
# Pass `labels` explicitly so the matrix always has one row/column per family,
# in label-index order, even when a class is absent from both the test labels
# and the predictions (otherwise the matrix shrinks and indices shift).
confusion_matrix(all_label, all_predict, labels=list(range(len(families))))

array([[163,   0,   3,   1,   1],
       [  7,  63,   2,   5,   0],
       [  4,   0, 112,   0,   0],
       [  1,  18,   0,  17,   0],
       [  1,   0,   0,   0,  17]])