In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler

In [3]:
filename = 'NF-UNSW-NB15-v3'
df = pd.read_csv(f'../db/{filename}.csv')

df.drop(['IPV4_SRC_ADDR',
         'IPV4_DST_ADDR',
         'L4_SRC_PORT',
         'L4_DST_PORT',
         'L7_PROTO',
         'TCP_FLAGS',
         'CLIENT_TCP_FLAGS',
         'SERVER_TCP_FLAGS',
         'MIN_TTL', 
         'MAX_TTL',
         'SHORTEST_FLOW_PKT',
         'MIN_IP_PKT_LEN', 
         'TCP_WIN_MAX_IN', 
         'TCP_WIN_MAX_OUT', 
         'DNS_QUERY_ID', 
         'DNS_TTL_ANSWER',
         'FTP_COMMAND_RET_CODE',
         'SRC_TO_DST_SECOND_BYTES',
         'DST_TO_SRC_SECOND_BYTES',
         'FLOW_START_MILLISECONDS',
         'FLOW_END_MILLISECONDS',], inplace=True, axis=1)

In [4]:
dictionary_sets_by_attack_type = {}
attack_types = df['Attack'].unique()

In [5]:
for attack_type in attack_types:
    print(f"Processando a categoria: '{attack_type}'")
    df_current_attack = df[df['Attack'] == attack_type]

    df_train_current_attack, df_aux_current_attack = train_test_split(df_current_attack, train_size=0.5, random_state=42)
    df_test_current_attack, df_val_current_attack = train_test_split(df_aux_current_attack, train_size=0.5, random_state=42)

    dictionary_sets_by_attack_type[attack_type] = {
        'treino': df_train_current_attack,
        'teste': df_test_current_attack,
        'validacao': df_val_current_attack
    }
    print(f"  -> Treino: {len(df_train_current_attack)} | Teste: {len(df_test_current_attack)} | Validação: {len(df_val_current_attack)}")


Processando a categoria: 'Benign'
  -> Treino: 1118865 | Teste: 559433 | Validação: 559433
Processando a categoria: 'Fuzzers'
  -> Treino: 16908 | Teste: 8454 | Validação: 8454
Processando a categoria: 'Exploits'
  -> Treino: 21374 | Teste: 10687 | Validação: 10687
Processando a categoria: 'Backdoor'
  -> Treino: 2329 | Teste: 1165 | Validação: 1165
Processando a categoria: 'Reconnaissance'
  -> Treino: 8537 | Teste: 4268 | Validação: 4269
Processando a categoria: 'Generic'
  -> Treino: 9825 | Teste: 4913 | Validação: 4913
Processando a categoria: 'DoS'
  -> Treino: 2990 | Teste: 1495 | Validação: 1495
Processando a categoria: 'Shellcode'
  -> Treino: 1190 | Teste: 595 | Validação: 596
Processando a categoria: 'Analysis'
  -> Treino: 613 | Teste: 306 | Validação: 307
Processando a categoria: 'Worms'
  -> Treino: 79 | Teste: 39 | Validação: 40


In [6]:
list_train = [dictionary_sets_by_attack_type[attack_type]['treino'] for attack_type in attack_types]
df_train = pd.concat(list_train)

list_test = [dictionary_sets_by_attack_type[attack_type]['teste'] for attack_type in attack_types]
df_test = pd.concat(list_test)

list_val = [dictionary_sets_by_attack_type[attack_type]['validacao'] for attack_type in attack_types]
df_val = pd.concat(list_val)

In [7]:
print(f"--- Base de Treino ---")
print(f"Tamanho: {len(df_train)} linhas")
print(f"Categorias presentes: {df_train['Attack'].unique()}")
print(df_train['Attack'].value_counts())
print("-" * 25)

print(f"\n--- Base de Teste ---")
print(f"Tamanho: {len(df_test)} linhas")
print(f"Categorias presentes: {df_test['Attack'].unique()}")
print(df_test['Attack'].value_counts())
print("-" * 25)

print(f"\n--- Base de Validação ---")
print(f"Tamanho: {len(df_val)} linhas")
print(f"Categorias presentes: {df_val['Attack'].unique()}")
print(df_val['Attack'].value_counts())
print("-" * 25)

--- Base de Treino ---
Tamanho: 1182710 linhas
Categorias presentes: ['Benign' 'Fuzzers' 'Exploits' 'Backdoor' 'Reconnaissance' 'Generic' 'DoS'
 'Shellcode' 'Analysis' 'Worms']
Attack
Benign            1118865
Exploits            21374
Fuzzers             16908
Generic              9825
Reconnaissance       8537
DoS                  2990
Backdoor             2329
Shellcode            1190
Analysis              613
Worms                  79
Name: count, dtype: int64
-------------------------

--- Base de Teste ---
Tamanho: 591355 linhas
Categorias presentes: ['Benign' 'Fuzzers' 'Exploits' 'Backdoor' 'Reconnaissance' 'Generic' 'DoS'
 'Shellcode' 'Analysis' 'Worms']
Attack
Benign            559433
Exploits           10687
Fuzzers             8454
Generic             4913
Reconnaissance      4268
DoS                 1495
Backdoor            1165
Shellcode            595
Analysis             306
Worms                 39
Name: count, dtype: int64
-------------------------

--- Base de Valida

In [8]:
scaler = MinMaxScaler()

In [9]:
df_train_benign = df_train[df_train['Attack'] == 'Benign']
df_train_attacks = df_train[df_train['Attack'] != 'Benign']

rus = df_train_attacks['Attack'].value_counts().min()
if rus < 1000:
    rus = 1000

df_train_attacks_balanced = df_train_attacks.groupby('Attack').sample(n=rus, replace=True, random_state=42)

num_attack_classes = len(df_train_attacks['Attack'].unique())
num_benign_samples = num_attack_classes * rus
df_train_benign_sampled = df_train_benign.sample(n=num_benign_samples, random_state=42)

df_train = pd.concat([df_train_attacks_balanced, df_train_benign_sampled])
df_train = shuffle(df_train, random_state=42).reset_index(drop=True)


X_train = df_train.drop(['Label', 'Attack'], axis=1)
y_train = df_train['Label'].to_numpy()

X_train = scaler.fit_transform(X_train)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)

In [20]:
print(df_train['Label'].value_counts())
print()
print(df_train['Attack'].value_counts())
print()
print(X_train.shape)
print()
print(y_train.unique(return_counts=True))
print(X_train.min(), X_train.max(), X_train.mean())

Label
1    9000
0    9000
Name: count, dtype: int64

Attack
Benign            9000
DoS               1000
Shellcode         1000
Generic           1000
Analysis          1000
Reconnaissance    1000
Fuzzers           1000
Worms             1000
Exploits          1000
Backdoor          1000
Name: count, dtype: int64

torch.Size([18000, 32])

(tensor([0, 1]), tensor([9000, 9000]))
tensor(0.) tensor(1.) tensor(0.0519)


In [21]:
df_test_benign = df_test[df_test['Attack'] == 'Benign']
df_test_attacks = df_test[df_test['Attack'] != 'Benign']

rus = df_test_attacks['Attack'].value_counts().min()
if rus < 1000:
    rus = 1000

df_test_attacks_balanced = df_test_attacks.groupby('Attack').sample(n=rus, replace=True, random_state=42)

num_attack_classes = len(df_test_attacks['Attack'].unique())
num_benign_samples = num_attack_classes * rus
df_test_benign_sampled = df_test_benign.sample(n=num_benign_samples, random_state=42)

df_test = pd.concat([df_test_attacks_balanced, df_test_benign_sampled])
df_test = shuffle(df_test, random_state=42).reset_index(drop=True)


X_test = df_test.drop(['Label', 'Attack'], axis=1)
y_test = df_test['Label'].to_numpy()

X_test = scaler.fit_transform(X_test)

X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

In [22]:
print(df_test['Label'].value_counts())
print()
print(df_test['Attack'].value_counts())
print()
print(X_test.shape)
print()
print(y_test.unique(return_counts=True))
print(X_test.min(), X_test.max(), X_test.mean())

Label
1    9000
0    9000
Name: count, dtype: int64

Attack
Benign            9000
DoS               1000
Shellcode         1000
Generic           1000
Analysis          1000
Reconnaissance    1000
Fuzzers           1000
Worms             1000
Exploits          1000
Backdoor          1000
Name: count, dtype: int64

torch.Size([18000, 32])

(tensor([0, 1]), tensor([9000, 9000]))
tensor(0.) tensor(1.) tensor(0.0487)


In [23]:
df_val_benign = df_val[df_val['Attack'] == 'Benign']
df_val_attacks = df_val[df_val['Attack'] != 'Benign']

rus = df_val_attacks['Attack'].value_counts().min()
if rus < 1000:
    rus = 1000

df_val_attacks_balanced = df_val_attacks.groupby('Attack').sample(n=rus, replace=True, random_state=42)

num_attack_classes = len(df_val_attacks['Attack'].unique())
num_benign_samples = num_attack_classes * rus
df_val_benign_sampled = df_val_benign.sample(n=num_benign_samples, random_state=42)

df_val = pd.concat([df_val_attacks_balanced, df_val_benign_sampled])
df_val = shuffle(df_val, random_state=42).reset_index(drop=True)


X_val = df_val.drop(['Label', 'Attack'], axis=1)
y_val = df_val['Label'].to_numpy()

X_val = scaler.fit_transform(X_val)

X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)

In [24]:
print(df_val['Label'].value_counts())
print()
print(df_val['Attack'].value_counts())
print()
print(X_val.shape)
print()
print(y_val.unique(return_counts=True))
print(X_val.min(), X_val.max(), X_val.mean())

Label
1    9000
0    9000
Name: count, dtype: int64

Attack
Benign            9000
DoS               1000
Shellcode         1000
Generic           1000
Analysis          1000
Reconnaissance    1000
Fuzzers           1000
Worms             1000
Exploits          1000
Backdoor          1000
Name: count, dtype: int64

torch.Size([18000, 32])

(tensor([0, 1]), tensor([9000, 9000]))
tensor(0.) tensor(1.) tensor(0.0508)


In [25]:
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

NEW_BATCH_SIZE = 64 

train_loader = DataLoader(train_dataset, batch_size=NEW_BATCH_SIZE, shuffle=True, num_workers=80)
val_loader = DataLoader(val_dataset, batch_size=NEW_BATCH_SIZE, shuffle=False, num_workers=80)
test_loader = DataLoader(test_dataset, batch_size=NEW_BATCH_SIZE, shuffle=False, num_workers=80)