In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# --- Run configuration -------------------------------------------------------
# Every tunable the rest of the notebook reads lives in this cell.
BATCH_SIZE = 64    # mini-batch size handed to the DataLoaders
SEED = 42          # random_state used for splitting and resampling
NUM_WORKERS = 0    # DataLoader worker processes; 0 loads batches in the main process

# Seed torch's global RNG so that DataLoader(shuffle=True) yields the same
# batch order after every "Restart Kernel -> Run All".
torch.manual_seed(SEED)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Columns removed from each loaded CSV before features are extracted.
# process_dataset only drops the ones that are actually present in the file.
DROP_COLS = ['IPV4_SRC_ADDR',
             'IPV4_DST_ADDR',
             'L4_SRC_PORT',
             'L4_DST_PORT',
             'L7_PROTO',
             'TCP_FLAGS',
             'CLIENT_TCP_FLAGS',
             'SERVER_TCP_FLAGS',
             'MIN_TTL',
             'MAX_TTL',
             'SHORTEST_FLOW_PKT',
             'MIN_IP_PKT_LEN',
             'TCP_WIN_MAX_IN',
             'TCP_WIN_MAX_OUT',
             'DNS_QUERY_ID',
             'DNS_TTL_ANSWER',
             'FTP_COMMAND_RET_CODE',
             'SRC_TO_DST_SECOND_BYTES',
             'DST_TO_SRC_SECOND_BYTES',
             'FLOW_START_MILLISECONDS',
             'FLOW_END_MILLISECONDS']

Using device: cuda


In [None]:
def balance_data(df, seed):
    """Return a class-balanced, shuffled copy of ``df`` based on its 'Attack' column.

    Each non-'Benign' attack class is resampled *with replacement* to the same
    size, ``max(1000, size of the rarest attack class)``. 'Benign' rows are then
    sampled so that their count equals the total number of attack rows
    (with replacement only when there are fewer benign rows than needed).

    Args:
        df: DataFrame containing an 'Attack' column in which benign traffic is
            labelled 'Benign'. It is not modified.
        seed: Random state forwarded to every sampling step, so the result is
            deterministic for a given input.

    Returns:
        A new DataFrame with the balanced rows in random order and a fresh
        0..n-1 index.
    """
    is_benign = df['Attack'] == 'Benign'
    df_benign = df[is_benign]
    df_attacks = df[~is_benign]

    # Every attack class is brought to the same size: at least 1000 rows, or
    # the size of the rarest class when that is larger.
    min_samples = df_attacks['Attack'].value_counts().min()
    target_samples = max(1000, min_samples)

    df_attacks_balanced = df_attacks.groupby('Attack').sample(
        n=target_samples, replace=True, random_state=seed
    )

    # Benign traffic is matched 1:1 against the combined attack rows.
    total_attacks = len(df_attacks_balanced)
    replace_benign = len(df_benign) < total_attacks
    df_benign_sampled = df_benign.sample(
        n=total_attacks, replace=replace_benign, random_state=seed
    )

    df_final = pd.concat([df_attacks_balanced, df_benign_sampled])

    # Shuffle with pandas itself: sample(frac=1) returns all rows in random
    # order. The previous call to an un-imported `shuffle` raised NameError.
    return df_final.sample(frac=1, random_state=seed).reset_index(drop=True)

In [None]:
def create_xy_tensors(X, y):
    """Turn a feature array and a label array into a pair of torch tensors.

    Args:
        X: Array-like of features; copied into a ``torch.float32`` tensor.
        y: Array-like of labels; copied into a ``torch.long`` tensor.

    Returns:
        Tuple ``(features, labels)`` of the two tensors, in that order.
    """
    return (
        torch.tensor(X, dtype=torch.float32),
        torch.tensor(y, dtype=torch.long),
    )

In [None]:
def process_dataset(filename, seed, batch_size, num_workers=None):
    """Load one CSV dataset and turn it into balanced, scaled DataLoaders.

    Steps: read ``../db/<filename>.csv``; drop the columns listed in
    ``DROP_COLS`` that exist in the file; make a stratified 50/25/25
    train/validation/test split on 'Attack'; balance each split with
    ``balance_data``; min-max scale the features using statistics fitted on the
    training split only; wrap each split in a ``DataLoader``.

    Args:
        filename: Dataset file name without the ``.csv`` extension.
        seed: Random state for the splits and for ``balance_data``.
        batch_size: Batch size for all three loaders.
        num_workers: Worker processes per DataLoader. When ``None`` (default)
            the module-level ``NUM_WORKERS`` is used if it exists, otherwise 0.

    Returns:
        ``None`` if the CSV file does not exist; otherwise a dict with keys
        'train_loader', 'val_loader', 'test_loader', 'num_features' and
        'scaler' (the fitted ``MinMaxScaler``).
    """
    print(f"\n--- Processando Arquivo: {filename} ---")

    path = f'../db/{filename}.csv'
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        print(f"Erro: Arquivo {path} não encontrado.")
        return None

    if num_workers is None:
        # Fall back to the notebook-level setting when one is defined, so the
        # function no longer fails with NameError when it is not.
        num_workers = globals().get('NUM_WORKERS', 0)

    # Non-mutating drop: only the configured columns present in this file.
    df = df.drop(columns=[c for c in DROP_COLS if c in df.columns])

    # Stratified split on the attack type: 50% train, then the remaining half
    # is divided evenly into validation and test (25% each).
    df_train, df_temp = train_test_split(
        df, test_size=0.5, random_state=seed, stratify=df['Attack']
    )
    df_val, df_test = train_test_split(
        df_temp, test_size=0.5, random_state=seed, stratify=df_temp['Attack']
    )

    print(f"Splits iniciais -> Treino: {len(df_train)}, Val: {len(df_val)}, Teste: {len(df_test)}")

    # Balance every split the same way and report it under its own label.
    balanced = {}
    for label, split in (('Treino', df_train), ('Validação', df_val), ('Teste', df_test)):
        balanced[label] = balance_data(split, seed)
        print(f"{label} após balanceamento: {len(balanced[label])}")
        print(balanced[label]['Attack'].value_counts())

    def get_numpy_data(dataframe):
        # Features are every column except the two target columns; the
        # 'Label' column is what the loaders serve as the target.
        x_data = dataframe.drop(['Label', 'Attack'], axis=1).to_numpy()
        y_data = dataframe['Label'].to_numpy()
        return x_data, y_data

    X_train, y_train = get_numpy_data(balanced['Treino'])
    X_val, y_val = get_numpy_data(balanced['Validação'])
    X_test, y_test = get_numpy_data(balanced['Teste'])

    # Fit the scaler on the training data only and reuse it for val/test.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    train_dataset = TensorDataset(*create_xy_tensors(X_train, y_train))
    val_dataset = TensorDataset(*create_xy_tensors(X_val, y_val))
    test_dataset = TensorDataset(*create_xy_tensors(X_test, y_test))

    return {
        'train_loader': DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers),
        'val_loader': DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers),
        'test_loader': DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers),
        'num_features': X_train.shape[1],
        'scaler': scaler
    }

In [None]:
# Result containers. `datasets`, `Xs` and `ys` are filled below and keyed by
# the dataset filename.
datasets = {}
Xs = {}
ys = {}
dataloaders = {}
filenames = ['NF-UNSW-NB15-v3', 'NF-BoT-IoT-v3', 'NF-CICIDS2018-v3']

# NOTE(review): `dfs_creator` is not defined in the visible cells of this
# notebook — confirm where it comes from; on a fresh kernel this raises
# NameError unless it is defined elsewhere.
for f in filenames:
    datasets[f] = dfs_creator(f, SEED)

# NOTE(review): iterating a dict yields its keys, so `dataset` is a filename
# string and `for i in dataset` walks that string one character at a time.
# This presumably was meant to iterate over `datasets[dataset]` — confirm.
# `Xy_creator` is likewise not defined in the visible cells.
for dataset in datasets:
    auxX = {}
    auxy = {}
    for i in dataset:
        x, y = Xy_creator(i, SEED)
        auxX[i] = x
        auxy[i] = y
    Xs[dataset] = auxX
    ys[dataset] = auxy

for x, y in zip(Xs, ys):
    dataloaders[f] = dataloaders_creator(X[f], y[f], BATCH_SIZE, 