In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Notebook-wide configuration: tune these values here rather than inline.
BATCH_SIZE = 64
SEED = 42

# Seed the global RNGs so that stochastic steps that rely on them (for
# example DataLoader(shuffle=True), which draws from torch's global generator)
# are repeatable after "Restart Kernel -> Run All".
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

Using device: cuda


In [None]:
def df_balancer(df, seed, min_per_class=1000):
    """Return a class-balanced, shuffled copy of ``df``.

    Every non-'Benign' value of the 'Attack' column is resampled (with
    replacement) to the same number of rows, and 'Benign' rows are sampled so
    that their total equals the total number of attack rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'Attack' column; rows whose value is 'Benign' are
        treated as the negative class.
    seed : int
        Random state used for every sampling step and for the final shuffle.
    min_per_class : int, default 1000
        Lower bound for the per-attack-class sample size. The size actually
        used is ``max(size of the rarest attack class, min_per_class)``.

    Returns
    -------
    pandas.DataFrame
        Balanced frame in random order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` has no attack rows or no 'Benign' rows.
    """
    is_benign = df['Attack'] == 'Benign'
    df_benign = df[is_benign]
    df_attacks = df[~is_benign]

    # Fail early with a readable message instead of an obscure pandas error.
    if df_attacks.empty:
        raise ValueError("df_balancer: no attack rows to balance against")
    if df_benign.empty:
        raise ValueError("df_balancer: no 'Benign' rows available to sample")

    # Per-class target: size of the rarest attack class, but never below the floor.
    per_class = int(max(df_attacks['Attack'].value_counts().min(), min_per_class))

    # replace=True because classes smaller than the target must be oversampled.
    df_attacks_balanced = df_attacks.groupby('Attack').sample(
        n=per_class, replace=True, random_state=seed
    )

    num_attack_classes = df_attacks['Attack'].nunique()
    num_benign_samples = num_attack_classes * per_class

    # Sample benign rows without replacement when enough exist; otherwise
    # oversample instead of raising.
    df_benign_sampled = df_benign.sample(
        n=num_benign_samples,
        replace=num_benign_samples > len(df_benign),
        random_state=seed,
    )

    balanced = pd.concat([df_attacks_balanced, df_benign_sampled])
    # DataFrame.sample(frac=1) is a pandas-native shuffle of all rows.
    return balanced.sample(frac=1, random_state=seed).reset_index(drop=True)

In [3]:
def dfs_creator(filename, seed, data_dir='../db'):
    """Load one NetFlow CSV and build balanced train/val/test DataFrames.

    The CSV is read from ``{data_dir}/{filename}.csv``, a fixed list of
    columns is dropped, and every value of the 'Attack' column is split
    50% / 25% / 25% into train / test / validation. The per-class pieces are
    then concatenated, shuffled and passed through ``df_balancer``.

    Parameters
    ----------
    filename : str
        CSV file name without the ``.csv`` extension.
    seed : int
        Random state used for splitting, shuffling and balancing.
    data_dir : str, default '../db'
        Directory that contains the CSV file.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'val': DataFrame, 'test': DataFrame}``.
    """
    print(f"Processing: {filename}...")
    df = pd.read_csv(f'{data_dir}/{filename}.csv')

    # Columns removed before modelling (same list as before; a missing column
    # still raises KeyError).
    dropped_columns = [
        'IPV4_SRC_ADDR',
        'IPV4_DST_ADDR',
        'L4_SRC_PORT',
        'L4_DST_PORT',
        'L7_PROTO',
        'TCP_FLAGS',
        'CLIENT_TCP_FLAGS',
        'SERVER_TCP_FLAGS',
        'MIN_TTL',
        'MAX_TTL',
        'SHORTEST_FLOW_PKT',
        'MIN_IP_PKT_LEN',
        'TCP_WIN_MAX_IN',
        'TCP_WIN_MAX_OUT',
        'DNS_QUERY_ID',
        'DNS_TTL_ANSWER',
        'FTP_COMMAND_RET_CODE',
        'SRC_TO_DST_SECOND_BYTES',
        'DST_TO_SRC_SECOND_BYTES',
        'FLOW_START_MILLISECONDS',
        'FLOW_END_MILLISECONDS',
    ]
    df = df.drop(columns=dropped_columns)

    dictionary_sets_by_attack_type = {}
    attack_types = df['Attack'].unique()

    # Split each class on its own so every class reaches all three partitions.
    for attack_type in attack_types:
        print(f"Processando a categoria: '{attack_type}'")
        df_current_attack = df[df['Attack'] == attack_type]

        # 50% train, then the remaining half is split evenly into test / validation.
        df_train_current_attack, df_aux_current_attack = train_test_split(
            df_current_attack, train_size=0.5, random_state=seed
        )
        df_test_current_attack, df_val_current_attack = train_test_split(
            df_aux_current_attack, train_size=0.5, random_state=seed
        )

        dictionary_sets_by_attack_type[attack_type] = {
            'treino': df_train_current_attack,
            'teste': df_test_current_attack,
            'validacao': df_val_current_attack
        }
        print(f"  -> Treino: {len(df_train_current_attack)} | Teste: {len(df_test_current_attack)} | Validação: {len(df_val_current_attack)}")

    # (output key, per-class dictionary key, report header), in report order.
    partitions = [
        ('train', 'treino', "--- Base de Treino ---"),
        ('test', 'teste', "\n--- Base de Teste ---"),
        ('val', 'validacao', "\n--- Base de Validação ---"),
    ]

    balanced = {}
    for out_key, part_key, _ in partitions:
        pieces = [dictionary_sets_by_attack_type[attack_type][part_key] for attack_type in attack_types]
        merged = pd.concat(pieces).sample(frac=1, random_state=seed).reset_index(drop=True)
        # The balancing helper defined in this notebook is `df_balancer`.
        balanced[out_key] = df_balancer(merged, seed)

    for out_key, _, header in partitions:
        part = balanced[out_key]
        print(header)
        print(f"Tamanho: {len(part)} linhas")
        print(f"Categorias presentes: {part['Attack'].unique()}")
        print(part['Attack'].value_counts())
        print("-" * 25)

    return {
        'train': balanced['train'],
        'val': balanced['val'],
        'test': balanced['test']
    }

In [4]:
def Xy_creator(df, seed, scaler=None, fit=True):
    """Convert a DataFrame into scaled feature and label tensors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Label' column (used as the target), an 'Attack' column
        (discarded) and feature columns.
    seed : int
        Unused; kept so existing calls ``Xy_creator(df, seed)`` keep working.
    scaler : object, optional
        Object exposing ``fit_transform`` / ``transform``. When omitted a new
        ``MinMaxScaler`` is created and fitted on ``df``.
    fit : bool, default True
        If True the scaler is fitted on ``df`` before transforming. Pass a
        scaler already fitted on the training split together with
        ``fit=False`` for validation/test data, so their statistics do not
        leak into the scaling.

    Returns
    -------
    tuple of torch.Tensor
        ``(X, y)`` with ``X`` as float32 and ``y`` as int64 (``torch.long``).
    """
    features = df.drop(columns=['Label', 'Attack'])
    labels = df['Label'].to_numpy()

    if scaler is None:
        # No scaler supplied: build one locally instead of relying on a global.
        scaler = MinMaxScaler()
        fit = True  # a brand-new scaler has to be fitted before use

    scaled = scaler.fit_transform(features) if fit else scaler.transform(features)

    X = torch.tensor(scaled, dtype=torch.float32)
    y = torch.tensor(labels, dtype=torch.long)

    return X, y

In [7]:
def dataloader_creator(X, y, bs, train=False, num_workers=0):
    """Wrap feature/label tensors in a ``DataLoader``.

    Parameters
    ----------
    X, y : torch.Tensor
        Tensors with the same first dimension; paired via ``TensorDataset``.
    bs : int
        Batch size.
    train : bool, default False
        When True the loader reshuffles the data every epoch; otherwise the
        samples are served in their stored order.
    num_workers : int, default 0
        Number of worker subprocesses. The data is already in memory, so the
        default loads in the main process; raise it only if profiling shows
        a benefit.

    Returns
    -------
    torch.utils.data.DataLoader
    """
    dataset = TensorDataset(X, y)
    # bool() so truthy values such as 1 are accepted for `train`.
    return DataLoader(dataset, batch_size=bs, shuffle=bool(train), num_workers=num_workers)

In [None]:
# Containers keyed by dataset file name; the inner dicts are keyed by split
# name ('train' / 'val' / 'test').
datasets = {}
Xs = {}
ys = {}
dataloaders = {}
filenames = ['NF-UNSW-NB15-v3', 'NF-BoT-IoT-v3', 'NF-CICIDS2018-v3']

# Load, split and balance every dataset.
for f in filenames:
    datasets[f] = dfs_creator(f, SEED)

# Turn every split of every dataset into (features, labels) tensors.
# Iterate over .items(): looping over the dict itself yields only the keys,
# and looping over a key (a string) yields single characters.
for dataset, splits in datasets.items():
    auxX = {}
    auxy = {}
    for i, split_df in splits.items():
        x, y = Xy_creator(split_df, SEED)
        auxX[i] = x
        auxy[i] = y
    Xs[dataset] = auxX
    ys[dataset] = auxy

for x, y in zip(Xs, ys):
    dataloaders[f] = dataloaders_creator(X[f], y[f], BATCH_SIZE, 