In [None]:
from scapy.all import *
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pickle as pk
import csv
import time
import random
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, f1_score

In [None]:
def timer(start, end):
    """Format the span between two timestamps (in seconds) as ``HH:MM:SS``.

    Each field is truncated to an integer and left-padded with zeros to at
    least two characters.
    """
    elapsed = end - start
    whole_hours, leftover = divmod(elapsed, 3600)
    whole_minutes, secs = divmod(leftover, 60)
    fields = (int(whole_hours), int(whole_minutes), int(secs))
    return "{:0>2}:{:0>2}:{:0>2}".format(*fields)

In [None]:
def binary_accuracy(preds, y):
    """Count how many thresholded predictions agree with the labels.

    Scores strictly above 0.5 are treated as the positive class. The result is
    the number of matches (a scalar tensor), not a ratio; callers divide by the
    sample count themselves.
    """
    hits = ((preds > 0.5) == y).float()
    return hits.sum()

def confusion_matrix_(y, preds):
    """Return the flattened binary confusion matrix ``[tn, fp, fn, tp]``.

    Args:
        y: array-like of ground-truth labels (0 or 1).
        preds: array-like of scores; values strictly above 0.5 count as 1.

    Returns:
        1-D array of four counts in the order tn, fp, fn, tp.
    """
    rounded_preds = (np.asarray(preds) > 0.5).astype(int)
    y_true = np.asarray(y).astype(int)
    # Pin both classes explicitly: without `labels`, a batch in which only one
    # class occurs yields a 1x1 matrix and the four-way layout is lost.
    cm = confusion_matrix(y_true, rounded_preds, labels=[0, 1]).ravel()
    return cm

def f1_score_(y, preds):
    """Compute the F1 score of `preds` against `y` after thresholding.

    Scores strictly above 0.5 are treated as the positive class before being
    passed to scikit-learn's ``f1_score``.
    """
    hard_labels = preds > 0.5
    return f1_score(y, hard_labels)

In [None]:
def binary_accuracy_flow(preds, y, flow):
    """Count label matches after predictions are aggregated per flow.

    The per-sample scores are first turned into flow-level decisions by
    ``rounded_preds_by_flow``; the return value is the number of samples whose
    aggregated decision equals the label (a scalar tensor, not a ratio).
    """
    flow_decisions = rounded_preds_by_flow(preds, y, flow)
    matches = (flow_decisions == y).float()
    return matches.sum()

def confusion_matrix_flow(y, preds, flow):
    """Return ``[tn, fp, fn, tp]`` for flow-aggregated predictions.

    Args:
        y: array-like of ground-truth labels (0 or 1).
        preds: tensor of per-sample scores.
        flow: per-sample flow ids, forwarded to ``rounded_preds_by_flow``.

    Returns:
        1-D array of four counts in the order tn, fp, fn, tp.
    """
    rounded_preds = rounded_preds_by_flow(preds, y, flow).cpu().detach().numpy().astype(int)
    y_true = np.asarray(y).astype(int)
    # Pin both classes explicitly: without `labels`, a batch in which only one
    # class occurs yields a 1x1 matrix and the four-way layout is lost.
    cm = confusion_matrix(y_true, rounded_preds, labels=[0, 1]).ravel()
    return cm

def f1_score_flow(y, preds, flow):
    """Compute the F1 score of flow-aggregated predictions against `y`.

    The scores are converted to flow-level decisions by
    ``rounded_preds_by_flow`` and moved to a NumPy array before scoring.
    """
    flow_decisions = rounded_preds_by_flow(preds, y, flow)
    decisions_np = flow_decisions.cpu().detach().numpy()
    return f1_score(y, decisions_np)

In [None]:
def rounded_preds_by_flow(preds, y, flow):
    """Replace per-sample decisions by a majority vote within each flow run.

    Scores strictly above 0.5 count as a positive vote. Every maximal run of
    consecutive samples that share the same flow id then receives a single
    decision: 1 if more than half of its votes are positive, otherwise 0
    (ties resolve to 0).

    Args:
        preds: 1-D tensor of per-sample scores.
        y: unused; kept so existing call sites keep working.
        flow: 1-D sequence of flow ids aligned with `preds`.

    Returns:
        Float tensor shaped like `preds` holding only 0.0 and 1.0.
    """
    rounded_preds = (preds > 0.5).float()
    n = len(flow)
    if n == 0:
        return rounded_preds

    flow_ids = torch.as_tensor(flow)
    # Indices at which a new flow starts (the id differs from its predecessor).
    change_points = (torch.nonzero(flow_ids[1:] != flow_ids[:-1]).flatten() + 1).tolist()
    # Include the batch start and end so the first and last runs (and a batch
    # made of a single flow) are voted on as well.
    bounds = [0] + change_points + [n]

    for start, end in zip(bounds[:-1], bounds[1:]):
        if rounded_preds[start:end].mean() > 0.5:
            rounded_preds[start:end] = 1
        else:
            rounded_preds[start:end] = 0

    return rounded_preds

In [None]:
# Run on the first CUDA device when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Load the pre-built dataset from disk.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load 'dataset.pk' when it comes from a trusted source.
with open('dataset.pk', 'rb') as dataset_file:
    dataset = pk.load(dataset_file)

In [None]:
# Column layout of every dataset row, as used by the slicing below:
# columns 0..32 are the model inputs, column 33 is the label and column 34 is
# the flow id.
N_FEATURES = 33
LABEL_COL = 33
FLOW_COL = 34
LOADER_BATCH_SIZE = 100


def _sort_by_flow(rows):
    """Return `rows` as an int64 array ordered by the flow-id column."""
    return np.asarray(pd.DataFrame(rows).sort_values(by=FLOW_COL), dtype=np.int64)


def _to_tensors(rows):
    """Split `rows` into float32 (features, labels, flow ids) tensors."""
    return (
        torch.tensor(rows[:, :N_FEATURES], dtype=torch.float32),
        torch.tensor(rows[:, LABEL_COL], dtype=torch.float32),
        torch.tensor(rows[:, FLOW_COL], dtype=torch.float32),
    )


dataset = np.asarray(dataset, dtype=np.int64)

# train : validation : test = 8 : 1 : 1
# Naming note: `test_data` is the split evaluated after every training epoch,
# while `validation_data` is kept aside and only used by the final
# "Test Sets" section further down.
train_data, test_data = train_test_split(dataset, test_size=0.2, stratify=dataset[:, LABEL_COL], random_state=1)
test_data, validation_data = train_test_split(test_data, test_size=0.5, stratify=test_data[:, LABEL_COL], random_state=1)

# Order each split by flow id so that samples of the same flow are contiguous,
# which the flow-level metrics rely on.
train_data = _sort_by_flow(train_data)
test_data = _sort_by_flow(test_data)
validation_data = _sort_by_flow(validation_data)

# Features are stored as float32 here and converted back to integer ids with
# `.long()` inside the training / evaluation loops.
# NOTE(review): flow ids are stored as float32 too; ids above 2**24 would no
# longer be exactly representable - confirm the id range.
train_data, train_label, train_flow = _to_tensors(train_data)
test_data, test_label, test_flow = _to_tensors(test_data)

train_data = TensorDataset(train_data, train_label, train_flow)
test_data = TensorDataset(test_data, test_label, test_flow)

# NOTE(review): the training loader is not shuffled, so every batch is drawn
# from rows sorted by flow id - confirm this is intended for optimisation.
train_loader = DataLoader(train_data, batch_size=LOADER_BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=LOADER_BATCH_SIZE, shuffle=False)

In [None]:
class LSTMmodel(nn.Module):
    """Embedding -> stacked LSTM -> linear head with a sigmoid output.

    Args:
        input_dim: width of the token embedding, which is also the LSTM input
            size.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        output_dim: width of the final linear layer.
        dropout: dropout probability handed to ``nn.LSTM`` and ``nn.Dropout``.
        batch_size: stored on the instance; not used by any layer.
        vocab_size: number of rows in the embedding table. Defaults to
            0x10000, i.e. ids 0..0xffff.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout, batch_size,
                 vocab_size=0xffff+1):
        super(LSTMmodel, self).__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.batch_size = batch_size

        # Kept as an attribute for compatibility; forward() does not apply it.
        self.drop = nn.Dropout(dropout)
        # The embedding width must equal the LSTM input size, so derive it from
        # input_dim instead of hard-coding it.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.input_dim)

        self.lstm = nn.LSTM(input_size=self.input_dim, hidden_size=self.hidden_dim,
                            num_layers=self.num_layers, dropout=dropout, batch_first=True)

        self.fc = nn.Linear(self.hidden_dim, self.output_dim)

    def forward(self, x):
        """Map integer id sequences to sigmoid scores.

        Args:
            x: integer tensor of ids, indexed as (batch, sequence).

        Returns:
            Tensor of shape (batch, output_dim) with values in [0, 1], computed
            from the LSTM output at the last sequence position.
        """
        emb = self.embed(x)
        out, (hn, cn) = self.lstm(emb)
        out = self.fc(out[:, -1, :])

        return torch.sigmoid(out)

In [None]:
# Model hyper-parameters.
input_dim, hidden_dim, output_dim = 64, 128, 1
num_layers = 3
dropout = 0.2
# Passed to the model constructor only; the DataLoaders set their batch size
# independently.
batch_size = 100
learning_rate = 1e-4

model = LSTMmodel(input_dim, hidden_dim, num_layers, output_dim, dropout, batch_size)
model = model.to(device)

# The model ends in a sigmoid, so plain binary cross-entropy is applied to its output.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator` and report packet-level metrics.

    Args:
        model: network called as ``model(x)``; its output is flattened to one
            score per sample.
        iterator: iterable of ``(features, labels, flow_ids)`` batches; the
            flow ids are not used here.
        optimizer: optimiser stepped once per batch.
        criterion: loss called as ``criterion(preds, labels)``.

    Returns:
        Tuple ``(loss, acc, f1, tn, fp, fn, tp)``. ``loss`` and ``acc`` are
        averaged over the samples actually seen, ``f1`` is the mean of the
        per-batch F1 scores, and the last four values are confusion-matrix
        counts summed over all batches.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    epoch_f1_score = 0.0

    epoch_tn = 0.0
    epoch_fp = 0.0
    epoch_fn = 0.0
    epoch_tp = 0.0

    n_samples = 0
    n_batches = 0
    # A mean-reduced criterion returns the batch average, so it has to be
    # re-weighted by the batch size before it can be summed across batches.
    mean_reduced = getattr(criterion, "reduction", "mean") == "mean"

    model.train()

    for x_, y_, _flow in iterator:
        x_ = x_.long().to(device)
        y_ = y_.to(device)
        batch_n = y_.size(0)

        optimizer.zero_grad()
        preds = model(x_).reshape(-1)
        loss = criterion(preds, y_)
        loss.backward()
        optimizer.step()

        y_np = y_.cpu().detach().numpy()
        preds_np = preds.cpu().detach().numpy()

        acc = binary_accuracy(preds, y_)
        cm = confusion_matrix_(y_np, preds_np)
        f1 = f1_score_(y_np, preds_np)

        epoch_loss += (loss.item() * batch_n) if mean_reduced else loss.item()
        epoch_acc += acc.item()
        epoch_f1_score += f1
        n_samples += batch_n
        n_batches += 1

        if len(cm) == 1:
            # A single-entry matrix means labels and predictions all share one
            # class, so the count is TN for class 0 and TP for class 1.
            if y_np[0] == 0:
                cm = np.array([cm[0], 0, 0, 0])
            else:
                cm = np.array([0, 0, 0, cm[0]])

        epoch_tn += cm[0]
        epoch_fp += cm[1]
        epoch_fn += cm[2]
        epoch_tp += cm[3]

    # Normalise by what was actually iterated (not by a global dataset) and
    # avoid dividing by zero on an empty iterator.
    sample_denom = max(n_samples, 1)
    batch_denom = max(n_batches, 1)
    return (epoch_loss / sample_denom, epoch_acc / sample_denom, epoch_f1_score / batch_denom,
            epoch_tn, epoch_fp, epoch_fn, epoch_tp)

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate `model` on `iterator` and report packet-level metrics.

    Runs in eval mode with gradient tracking disabled.

    Args:
        model: network called as ``model(x)``; its output is flattened to one
            score per sample.
        iterator: iterable of ``(features, labels, flow_ids)`` batches; the
            flow ids are not used here.
        criterion: loss called as ``criterion(preds, labels)``.

    Returns:
        Tuple ``(loss, acc, f1, tn, fp, fn, tp)``. ``loss`` and ``acc`` are
        averaged over the samples actually seen, ``f1`` is the mean of the
        per-batch F1 scores, and the last four values are confusion-matrix
        counts summed over all batches.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    epoch_f1_score = 0.0

    epoch_tn = 0.0
    epoch_fp = 0.0
    epoch_fn = 0.0
    epoch_tp = 0.0

    n_samples = 0
    n_batches = 0
    # A mean-reduced criterion returns the batch average, so it has to be
    # re-weighted by the batch size before it can be summed across batches.
    mean_reduced = getattr(criterion, "reduction", "mean") == "mean"

    model.eval()
    with torch.no_grad():
        for x_, y_, _ in iterator:
            x_ = x_.long().to(device)
            y_ = y_.to(device)
            batch_n = y_.size(0)

            preds = model(x_).reshape(-1)
            loss = criterion(preds, y_)

            y_np = y_.cpu().detach().numpy()
            preds_np = preds.cpu().detach().numpy()

            acc = binary_accuracy(preds, y_)
            cm = confusion_matrix_(y_np, preds_np)
            f1 = f1_score_(y_np, preds_np)

            epoch_loss += (loss.item() * batch_n) if mean_reduced else loss.item()
            epoch_acc += acc.item()
            epoch_f1_score += f1
            n_samples += batch_n
            n_batches += 1

            if len(cm) == 1:
                # A single-entry matrix means labels and predictions all share
                # one class, so the count is TN for class 0 and TP for class 1.
                if y_np[0] == 0:
                    cm = np.array([cm[0], 0, 0, 0])
                else:
                    cm = np.array([0, 0, 0, cm[0]])

            epoch_tn += cm[0]
            epoch_fp += cm[1]
            epoch_fn += cm[2]
            epoch_tp += cm[3]

    # Normalise by what was actually iterated: this function is called with
    # more than one loader, so a fixed global dataset length would be wrong.
    sample_denom = max(n_samples, 1)
    batch_denom = max(n_batches, 1)
    return (epoch_loss / sample_denom, epoch_acc / sample_denom, epoch_f1_score / batch_denom,
            epoch_tn, epoch_fp, epoch_fn, epoch_tp)

In [None]:
def evaluate_flow(model, iterator, criterion):
    """Evaluate `model` on `iterator` with predictions aggregated per flow.

    Runs in eval mode with gradient tracking disabled. The loss is computed on
    the raw per-sample scores; accuracy, F1 and the confusion counts use the
    flow-level decisions produced by the ``*_flow`` helpers.

    Args:
        model: network called as ``model(x)``; its output is flattened to one
            score per sample.
        iterator: iterable of ``(features, labels, flow_ids)`` batches.
        criterion: loss called as ``criterion(preds, labels)``.

    Returns:
        Tuple ``(loss, acc, f1, tn, fp, fn, tp)``. ``loss`` and ``acc`` are
        averaged over the samples actually seen, ``f1`` is the mean of the
        per-batch F1 scores, and the last four values are confusion-matrix
        counts summed over all batches.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    epoch_f1_score = 0.0

    epoch_tn = 0.0
    epoch_fp = 0.0
    epoch_fn = 0.0
    epoch_tp = 0.0

    n_samples = 0
    n_batches = 0
    # A mean-reduced criterion returns the batch average, so it has to be
    # re-weighted by the batch size before it can be summed across batches.
    mean_reduced = getattr(criterion, "reduction", "mean") == "mean"

    model.eval()
    with torch.no_grad():
        for x_, y_, flow in iterator:
            x_ = x_.long().to(device)
            y_ = y_.to(device)
            batch_n = y_.size(0)

            preds = model(x_).reshape(-1)
            loss = criterion(preds, y_)

            y_np = y_.cpu().detach().numpy()

            acc = binary_accuracy_flow(preds, y_, flow)
            cm = confusion_matrix_flow(y_np, preds, flow)
            f1 = f1_score_flow(y_np, preds, flow)

            epoch_loss += (loss.item() * batch_n) if mean_reduced else loss.item()
            epoch_acc += acc.item()
            epoch_f1_score += f1
            n_samples += batch_n
            n_batches += 1

            if len(cm) == 1:
                # A single-entry matrix means labels and predictions all share
                # one class, so the count is TN for class 0 and TP for class 1.
                if y_np[0] == 0:
                    cm = np.array([cm[0], 0, 0, 0])
                else:
                    cm = np.array([0, 0, 0, cm[0]])

            epoch_tn += cm[0]
            epoch_fp += cm[1]
            epoch_fn += cm[2]
            epoch_tp += cm[3]

    # Normalise by what was actually iterated: this function is called with a
    # loader other than the per-epoch one, so a fixed global dataset length
    # would be wrong.
    sample_denom = max(n_samples, 1)
    batch_denom = max(n_batches, 1)
    return (epoch_loss / sample_denom, epoch_acc / sample_denom, epoch_f1_score / batch_denom,
            epoch_tn, epoch_fp, epoch_fn, epoch_tp)

In [None]:
# MAIN Process
import os

n_epochs = 20
start_time = time.time()

# Per-epoch metric histories (one entry appended per epoch).
train_loss_hist = []
train_acc_hist = []
test_loss_hist = []
test_acc_hist = []
# Flow-level (majority-vote) accuracy on the per-epoch evaluation split; the
# accuracy plot reads this list.
test_acc_hist2 = []

train_f1_hist = []
train_tn_hist = []
train_fp_hist = []
train_fn_hist = []
train_tp_hist = []

test_f1_hist = []
test_tn_hist = []
test_fp_hist = []
test_fn_hist = []
test_tp_hist = []

train_cm_hist = []
test_cm_hist = []

train_cm = np.zeros((2, 2))
test_cm = np.zeros((2, 2))

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first epoch finishes.
os.makedirs('Results', exist_ok=True)

for epoch in range(0, n_epochs):
    train_loss, train_acc, train_f1, train_tn, train_fp, train_fn, train_tp = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc, test_f1, test_tn, test_fp, test_fn, test_tp = evaluate(model, test_loader, criterion)
    # Second element of the returned tuple is the accuracy.
    test_acc_flow = evaluate_flow(model, test_loader, criterion)[1]

    train_loss_hist.append(train_loss)
    train_acc_hist.append(train_acc)
    test_loss_hist.append(test_loss)
    test_acc_hist.append(test_acc)
    test_acc_hist2.append(test_acc_flow)

    train_f1_hist.append(train_f1)
    test_f1_hist.append(test_f1)

    train_tn_hist.append(train_tn)
    train_fp_hist.append(train_fp)
    train_fn_hist.append(train_fn)
    train_tp_hist.append(train_tp)

    test_tn_hist.append(test_tn)
    test_fp_hist.append(test_fp)
    test_fn_hist.append(test_fn)
    test_tp_hist.append(test_tp)

    train_cm_hist.append((train_tn, train_fp, train_fn, train_tp))
    test_cm_hist.append((test_tn, test_fp, test_fn, test_tp))

    # Checkpoint file names use the zero-based epoch index, whereas the log
    # line below prints the one-based epoch number.
    torch.save(model, 'Results/epoch_{}.model'.format(epoch))

    print('| Epoch : {:02} | Elapsed time : {} | Train Loss : {:.6f} | Train Acc : {:.2f}% | Test Loss : {:.6f} | Test Acc : {:.2f}% |'
          .format(epoch+1, timer(start_time, time.time()), train_loss, train_acc*100, test_loss, test_acc*100))
    print('| Train f-1 : {:.6f} | Test f-1 : {:.6f} |'.format(train_f1, test_f1))

In [None]:
# Accuracy per epoch. The x axis is derived from each history's own length so
# the cell keeps working when the number of recorded epochs is not 20.
curves = [
    ('Training_Accuracy', train_acc_hist),
    ('Validation_Accuracy(by packet)', test_acc_hist),
]
# Plot the flow-level curve only when `test_acc_hist2` has been populated by an
# earlier cell, so this cell does not raise NameError otherwise.
if 'test_acc_hist2' in globals():
    curves.append(('Validation_Accuracy(by flow)', test_acc_hist2))

for _, history in curves:
    plt.plot(np.arange(len(history)), history)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.xticks(np.arange(0, len(train_acc_hist)))
plt.title('Training and Validation Accuracy')
plt.legend([label for label, _ in curves])
plt.show()

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Rows are the true class and columns the predicted class (0 = benign,
# 1 = malicious), so the layout is [[TN, FP], [FN, TP]].
# `np.int` no longer exists in current NumPy releases; the builtin `int` is
# the equivalent dtype.
con_mat = np.zeros([2, 2], dtype=int)
con_mat[0][0] = test_tn
con_mat[0][1] = test_fp
con_mat[1][0] = test_fn
con_mat[1][1] = test_tp

plt.imshow(con_mat, interpolation='nearest', cmap=plt.cm.Blues)

plt.tight_layout()
plt.colorbar()
label = ["Benign(0)", "Malicious(1)"]  # tick label text
tick_marks = np.arange(len(label))
plt.xticks(tick_marks, label)
plt.yticks(tick_marks, label)
plt.xlabel('Predicted', fontsize=15)
plt.ylabel('True', fontsize=15)
# Write the count into each cell; the names must follow the same
# [[TN, FP], [FN, TP]] layout as `con_mat`.
name = [['TN', 'FP'], ['FN', 'TP']]
thresh = con_mat.max() / 2.
for i in range(2):
    for j in range(2):
        plt.text(j, i, str(name[i][j])+" = "+str(con_mat[i, j]),
                 horizontalalignment="center",
                 color="white" if con_mat[i, j] > thresh else "black",
                 fontsize=16)

plt.show()

### Test Sets

In [None]:
# Build the loader for the split that was held out of the training loop.
# Columns 0..32 are the inputs, column 33 the label and column 34 the flow id.
testset_data = np.array(validation_data[:, :33])
testset_label = np.array(validation_data[:, 33])
testset_flow = np.array(validation_data[:, 34])

testset_data, testset_label, testset_flow = (
    torch.Tensor(part) for part in (testset_data, testset_label, testset_flow)
)

testset_data = TensorDataset(testset_data, testset_label, testset_flow)

# Order is preserved so that samples of one flow stay adjacent within a batch.
testset_loader = DataLoader(testset_data, batch_size=100, shuffle=False)

In [None]:
# Restore the checkpoint written with zero-based epoch index 8 by the training
# loop. `map_location` remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU can also be loaded on a CPU-only machine.
# NOTE(review): the file is a fully pickled model; recent PyTorch releases may
# additionally require `weights_only=False` here - confirm against the
# installed version, and only load checkpoints from a trusted source.
model = torch.load('Results/epoch_8.model', map_location=device)

In [None]:
# Packet-level metrics of the restored model on the held-out split.
(test_loss, test_acc, test_f1,
 test_tn, test_fp, test_fn, test_tp) = evaluate(model, testset_loader, criterion)
print('Test loss : {}, Test Acc : {}, Test F1 : {}'.format(test_loss, test_acc, test_f1))

In [None]:
# Flow-level (majority-vote) metrics of the restored model on the held-out split.
(test_loss, test_acc, test_f1,
 test_tn, test_fp, test_fn, test_tp) = evaluate_flow(model, testset_loader, criterion)
print('Test loss : {}, Test Acc : {}, Test F1 : {}'.format(test_loss, test_acc, test_f1))

### Calibration Curve

In [None]:
from sklearn.calibration import calibration_curve

def train_calibrated(model, iterator, optimizer, criterion):
    """Collect labels and predicted scores over `iterator` for calibration.

    Despite its name this function performs no training: it only runs the
    model forward, so it uses eval mode with gradient tracking disabled.

    Args:
        model: network called as ``model(x)``; its output is flattened to one
            score per sample.
        iterator: iterable of batches whose first two items are the features
            and the labels; any further items (such as flow ids) are ignored.
        optimizer: unused; kept so existing call sites keep working.
        criterion: unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(y_array, preds_array)`` of 1-D float64 NumPy arrays holding
        the labels and the scores in iteration order.
    """
    # Run on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    label_chunks = []
    pred_chunks = []

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            x_, y_ = batch[0], batch[1]
            preds = model(x_.long().to(model_device)).reshape(-1)
            pred_chunks.append(preds.cpu().numpy())
            label_chunks.append(y_.cpu().numpy())

    if not pred_chunks:
        return np.array([], dtype=np.float64), np.array([], dtype=np.float64)

    # Concatenate once at the end instead of growing an array per batch.
    y_array = np.concatenate(label_chunks).astype(np.float64)
    preds_array = np.concatenate(pred_chunks).astype(np.float64)
    return y_array, preds_array
   
# NOTE(review): this checkpoint is read from 'Result/' while the training loop
# writes its checkpoints to 'Results/' - confirm the path is intentional.
# `map_location` lets a GPU-saved checkpoint load on a CPU-only machine.
model = torch.load('Result/iscx-33-8epoch.model', map_location=device)
y_array, preds_array = train_calibrated(model, train_loader, optimizer, criterion)

# calibration_curve returns (fraction of positives, mean predicted value) per bin.
prob_y, prob_preds = calibration_curve(y_array, preds_array, n_bins=10)
plt.plot([0, 1], [0, 1], linestyle='--')  # reference line of perfect calibration
plt.plot(prob_preds, prob_y, marker='.')
plt.xlabel('Mean predicted probability')
plt.ylabel('Fraction of positives')
plt.show()