In [18]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [20]:
IDS2017URL = "/gdrive/MyDrive/IDS2017/IDS2017.csv"

In [23]:
import torch
import pandas as pd
import tensorflow as tf
import numpy as np
import torch.nn as nn
import random

from numpy import vstack
from numpy import argmax
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from torch import Tensor
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.nn.init import kaiming_uniform_
from torch.nn.init import xavier_uniform_
from torchvision import transforms

In [24]:
def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)
     torch.backends.cudnn.deterministic = True

setup_seed(20)

In [25]:
class IDSDataset(Dataset):

    def __init__(self, IDS2017URL, tranform=True):
      df = pd.read_csv(IDS2017URL)
      df.loc[df['Label'].isin([ 'Web Attack - Sql Injection','Web Attack - XSS', 'Web Attack - Brute Force']), 'Label'] = 'Web Attack'
      df = df[['Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Flow IAT Max', 'Bwd Packet Length Std', 'Fwd Packet Length Max', 'Flow Bytes/s', 'Total Length of Bwd Packets', 'Fwd Packet Length Mean', 'Flow Duration', 'Flow IAT Min', 'Total Length of Fwd Packets', 'Flow IAT Mean', 'Total Backward Packets', 'Bwd Packet Length Max', 'Flow Packets/s', 'Flow IAT Std', 'Fwd IAT Total', 'Bwd Packet Length Min', 'Fwd Packet Length Min', 'Label']]
      
      #df = df[~df['Label'].isin(['Heartbleed', 'Web Attack - Sql Injection', 'Infiltration', 'Web Attack - XSS', 'Web Attack - Brute Force'])]
      df = df.replace([-np.inf, np.inf], np.nan)
      df = df.dropna()
      #df.drop(df.columns[[37,39,61,62,63,64,65,66]],axis=1,inplace=True)
      self.X = df.iloc[:, :-1].apply(lambda x: (x-np.mean(x))/np.std(x)).values
      self.y = df.values[:, -1]
      self.X = self.X.astype('float32')
      le = LabelEncoder()
      le.fit(["BENIGN", "DoS Hulk", "PortScan", "DDoS", "DoS GoldenEye","FTP-Patator","SSH-Patator","DoS slowloris","DoS Slowhttptest","Web Attack","Bot","Infiltration","Heartbleed"])
      self.y = le.transform(self.y)


    def __len__(self):
      return len(self.X)
    
    def __getitem__(self, idx):
      return [self.X[idx], self.y[idx]]
 
    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])

class MLP(Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        # input to first hidden layer
        self.hidden1 = Linear(n_inputs, 20)
        kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
        self.act1 = ReLU()
        # second hidden layer
        self.hidden2 = Linear(20, 20)
        kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
        self.act2 = ReLU()
        # third hidden layer and output
        self.hidden3 = Linear(20, 13)
        xavier_uniform_(self.hidden3.weight)
        self.act3 = Softmax(dim=1)

    def forward(self, X):
        # input to first hidden layer
        X = self.hidden1(X)
        X = self.act1(X)

        # second hidden layer
        X = self.hidden2(X)
        X = self.act2(X)

        # output layer
        X = self.hidden3(X)
        X = self.act3(X)

        return X
 
def prepare_data(path):
    # load the dataset
    dataset = IDSDataset(path)
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=10000, shuffle=True)
    test_dl = DataLoader(test, batch_size=10000, shuffle=False)
    return train_dl, test_dl

def train_model(train_dl, model):
    # define the optimization
    criterion = CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)
    # enumerate epochs
    for epoch in range(25):
        # enumerate mini batches
        print(model.state_dict())
        for i, (inputs, targets) in enumerate(train_dl):
            # clear the gradients
            optimizer.zero_grad()
            #torch.autograd.set_detect_anomaly(True)
            # compute the model output
            #print(inputs)
            #print("******************")
            yhat = model(inputs)
            #print(yhat)
            # calculate loss
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()
            if i % 10 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, i * len(inputs), len(train_dl.dataset),
                100. * i / len(train_dl), loss.item()))
            # print('[%d, %5d] loss: %.3f' %(epoch + 1, i + 1, loss.item()))
            # running_loss = 0.0

def evaluate_model(test_dl, model):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()
        # convert to class labels
        yhat = argmax(yhat, axis=1)
        # reshape for stacking
        actual = actual.reshape((len(actual), 1))
        yhat = yhat.reshape((len(yhat), 1))
        # store
        predictions.append(yhat)
        actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    # calculate accuracy
    acc = accuracy_score(actuals, predictions)
    return acc

def predict(row, model):
    # convert row to data
    row = Tensor([row])
    # make prediction
    yhat = model(row)
    # retrieve numpy array
    yhat = yhat.detach().numpy()
    return yhat

In [None]:
# class IDSDataset(Dataset):

#     def __init__(self, IDS2017URL, tranform=True):
#       df = pd.read_csv(IDS2017URL)
#       df = df[~df['Label'].isin(['Heartbleed', 'Web Attack - Sql Injection', 'Infiltration', 'Web Attack - XSS', 'Web Attack - Brute Force'])]
#       df = df.replace([-np.inf, np.inf], np.nan)
#       df = df.dropna()
#       df = pd.concat([df.iloc[:,0:37], df[:,38:],df.iloc[:, -2:]], axis=1)
#       self.X = df.iloc[:, :-2].apply(lambda x: (x-np.mean(x))/np.std(x)+0.00001).values
#       self.y = df.values[:, -2]
#       self.X = self.X.astype('float32')
#       self.y = LabelEncoder().fit_transform(self.y)

#     def __len__(self):
#       return len(self.X)
    
#     def __getitem__(self, idx):
#       return [self.X[idx], self.y[idx]]
 
#     # get indexes for train and test rows
#     def get_splits(self, n_test=0.33):
#         # determine sizes
#         test_size = round(n_test * len(self.X))
#         train_size = len(self.X) - test_size
#         # calculate the split
#         return random_split(self, [train_size, test_size])

# class MLP(Module):
#     # define model elements
#     def __init__(self, n_inputs):
#         super(MLP, self).__init__()
#         # input to first hidden layer
#         self.hidden1 = Linear(n_inputs, 76)
#         #kaiming_uniform_(self.hidden1.weight, nonlinearity='relu')
#         self.act1 = ReLU()
#         # second hidden layer
#         self.hidden2 = Linear(76, 76)
#         #kaiming_uniform_(self.hidden2.weight, nonlinearity='relu')
#         self.act2 = ReLU()
#         # third hidden layer and output
#         self.hidden3 = Linear(76, 10)
#         #xavier_uniform_(self.hidden3.weight)
#         self.act3 = Softmax(dim=1)

#     def forward(self, X):
#         # input to first hidden layer
#         X = self.hidden1(X)
#         X = self.act1(X)
#         print(X)
#         # second hidden layer
#         X = self.hidden2(X)
#         X = self.act2(X)
#         print(X)
#         # output layer
#         X = self.hidden3(X)
#         X = self.act3(X)
#         print(X)
#         return X
 
# def prepare_data(path):
#     # load the dataset
#     dataset = IDSDataset(path)
#     # calculate split
#     train, test = dataset.get_splits()
#     # prepare data loaders
#     train_dl = DataLoader(train, batch_size=1000, shuffle=True)
#     test_dl = DataLoader(test, batch_size=1000, shuffle=False)
#     return train_dl, test_dl

# def train_model(train_dl, model):
#     # define the optimization
#     criterion = CrossEntropyLoss()
#     optimizer = Adam(model.parameters(), lr=0.0001)
#     # enumerate epochs
#     for epoch in range(10):
#         # enumerate mini batches
#         print(model.state_dict())
#         for i, (inputs, targets) in enumerate(train_dl):
#             # clear the gradients
#             optimizer.zero_grad()
#             #torch.autograd.set_detect_anomaly(True)
#             # compute the model output
#             #print(inputs)
#             #print("******************")
#             yhat = model(inputs)
#             #print(yhat)
#             # calculate loss
#             loss = criterion(yhat, targets)
#             # credit assignment
#             loss.backward()
#             # update model weights
#             optimizer.step()

# def evaluate_model(test_dl, model):
#     predictions, actuals = list(), list()
#     for i, (inputs, targets) in enumerate(test_dl):
#         # evaluate the model on the test set
#         yhat = model(inputs)
#         # retrieve numpy array
#         yhat = yhat.detach().numpy()
#         actual = targets.numpy()
#         # convert to class labels
#         yhat = argmax(yhat, axis=1)
#         # reshape for stacking
#         actual = actual.reshape((len(actual), 1))
#         yhat = yhat.reshape((len(yhat), 1))
#         # store
#         predictions.append(yhat)
#         actuals.append(actual)
#     predictions, actuals = vstack(predictions), vstack(actuals)
#     # calculate accuracy
#     acc = accuracy_score(actuals, predictions)
#     return acc

# def predict(row, model):
#     # convert row to data
#     row = Tensor([row])
#     # make prediction
#     yhat = model(row)
#     # retrieve numpy array
#     yhat = yhat.detach().numpy()
#     return yhat

In [26]:
train_dl, test_dl = prepare_data(IDS2017URL)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
train_dl.dataset

<torch.utils.data.dataset.Subset at 0x7f69b50f3f98>

In [None]:
print(len(train_dl.dataset), len(test_dl.dataset))

1893185 932464


In [27]:
model = MLP(19)

In [28]:
train_model(train_dl, model)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [-9.9500e-02,  1.9927e-01, -5.2509e-01,  4.3303e-01,  3.9011e-01,
         -3.1734e-01,  2.6255e-01, -5.1008e-01, -3.1534e-01, -5.2731e-01,
          4.9107e-03, -3.3957e-01,  4.8918e-01, -5.3295e-01,  3.1064e-01,
         -5.3652e-01, -1.6870e-01,  2.7702e-01, -3.5768e-01],
        [-7.8027e-02, -3.6576e-01,  5.8888e-02, -2.2922e-01, -3.9776e-01,
          3.6890e-01, -1.9972e-01, -3.0960e-02,  1.5485e-01,  5.2338e-01,
         -4.8843e-01,  3.9346e-01,  2.6886e-01, -1.5970e-04,  4.5283e-01,
         -4.2455e-01,  3.6887e-01, -5.0473e-02,  1.9975e-01],
        [ 1.9205e-01,  3.5297e-01, -2.8362e-02, -2.9976e-01,  3.9631e-01,
          4.7439e-01, -2.5648e-01, -1.9890e-01,  4.6720e-01,  5.1174e-01,
         -1.2174e-01, -3.2142e-01, -9.9177e-02,  2.7200e-01,  3.8439e-02,
         -3.1218e-01, -3.8657e-01, -1.7081e-01,  4.6206e-01],
        [-5.3422e-01, -9.2193e-02,  1.2230e-01,  1.4425e-01, -3.9535e-01,
         

In [29]:
torch.save(model.state_dict(), '/gdrive/MyDrive/IDS2017/feature19_classes13_lr001_batch10000.pth')

In [30]:
acc = evaluate_model(test_dl, model)
print('Accuracy: %.3f' % acc)

Accuracy: 0.889


In [31]:
print(model.state_dict())

OrderedDict([('hidden1.weight', tensor([[ 4.0304e-01,  2.8209e-01, -1.1475e-01, -3.1173e-01,  9.9177e-01,
         -3.1921e-01, -3.9182e-01,  3.3784e-01, -1.2826e-01, -5.4374e-01,
          3.6627e-01,  6.4200e-01, -1.0975e-01, -5.4544e-02,  1.8861e-01,
          5.5847e-01, -4.7923e-01, -3.4175e-01,  3.4496e-01],
        [-9.7538e-02,  3.2713e-01,  4.1662e-01, -6.2477e-02,  3.9992e-02,
         -5.5977e-01,  2.0580e-02,  4.3077e-01, -1.2978e-01, -2.3266e-02,
         -6.4869e-02,  2.2357e-01,  2.8159e-01, -4.5599e-01, -3.7437e-01,
          1.4339e-01, -4.1953e-01,  4.0766e-01,  1.7903e-01],
        [-3.9161e-01, -3.7330e-01, -7.1030e-01, -1.0550e-01,  3.6135e-01,
         -6.4580e-01, -5.2059e-01,  4.6459e-01, -1.1069e-01,  4.6605e-01,
          1.1983e-01,  9.8748e-02,  1.4392e-01,  3.2739e-01, -5.4159e-01,
         -5.8278e-01, -3.9269e-01, -1.1758e-01,  1.9668e-01],
        [-7.7826e-02,  4.4533e-01, -5.1356e-01,  4.9808e-01,  4.7940e-01,
          3.1495e-01, -8.7443e-02,  1.1709

In [33]:
fc = model.state_dict()['hidden3.weight'].squeeze()

In [34]:
from sklearn.cluster import AgglomerativeClustering

In [35]:
clustering = AgglomerativeClustering(
        linkage='ward',
        n_clusters=2,
        affinity='euclidean',
    ).fit(fc)

In [36]:
clustering.children_

array([[10, 11],
       [ 5,  8],
       [ 2,  6],
       [12, 13],
       [ 1,  3],
       [ 9, 14],
       [ 7, 18],
       [15, 19],
       [16, 20],
       [17, 21],
       [ 0, 22],
       [ 4, 23]])