In [109]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [110]:
# Column names for the data files, which are read without a header row
# (load_and_preprocess_data passes this list as `names=` to pd.read_csv).
# The order must match the column order in the files. The last two entries are
# the target ('label') and 'difficulty', which is dropped right after loading.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'difficulty'
]

# Functions for preparing data
Functions are used so that the same preparation steps are applied consistently to the training, validation, and test data.

In [111]:
def load_and_preprocess_data(file_path, columns):
    """Read a header-less CSV file and turn its ``label`` column into a binary target.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it is read without a header row.
    columns : list of str
        Names assigned to the file's columns, in order.

    Returns
    -------
    pandas.DataFrame
        The loaded data without the ``difficulty`` column (if it was present),
        with ``label`` set to 0 where it was ``'normal'`` and 1 everywhere else.
    """
    frame = pd.read_csv(file_path, names=columns, header=None, index_col=False)

    # 'difficulty' is not used as a feature; a file without it is accepted as well.
    frame = frame.drop(columns=['difficulty'], errors='ignore')

    # Collapse every non-'normal' value into the single class 'attack' ...
    binary_label = frame['label'].where(frame['label'] == 'normal', 'attack')
    # ... and encode the two classes as integers.
    frame['label'] = binary_label.map({'normal': 0, 'attack': 1})
    return frame

In [112]:
def encode_categorical_columns(train_data, test_data, categorical_cols=None):
    """Label-encode categorical columns of the train and test frames with a shared mapping.

    For every column one ``LabelEncoder`` is fitted on the values of both frames
    together, so that a category that only occurs in the test frame still has a
    code and the same category gets the same integer in both frames.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the columns to encode. They are not modified.
    categorical_cols : list of str, optional
        Columns to encode. Defaults to ``['protocol_type', 'service', 'flag']``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``: copies of the inputs in which the
        selected columns hold the encoder's integer codes.
    """
    if categorical_cols is None:
        categorical_cols = ['protocol_type', 'service', 'flag']

    # Encode into copies: assigning into the arguments would silently change the
    # caller's frames (and warn when a slice of another frame is passed in).
    train_encoded = train_data.copy()
    test_encoded = test_data.copy()

    for col in categorical_cols:
        encoder = LabelEncoder()
        # Fit on the union of both frames so transform() never meets an unseen value.
        encoder.fit(pd.concat([train_data[col], test_data[col]]))
        train_encoded[col] = encoder.transform(train_data[col])
        test_encoded[col] = encoder.transform(test_data[col])
    return train_encoded, test_encoded

In [113]:
def preprocess_features(train_features, test_features=None):
    """Fit a one-hot/standard-scaling preprocessor on one frame and apply it.

    Columns of ``train_features`` with ``object`` dtype are one-hot encoded
    (categories not seen during fitting are ignored at transform time); all other
    columns are standardised with ``StandardScaler``. The preprocessor is fitted
    on ``train_features`` only and, if given, ``test_features`` is transformed
    with that fitted preprocessor.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Frame the preprocessor is fitted on.
    test_features : pandas.DataFrame, optional
        Second frame to transform with the fitted preprocessor.

    Returns
    -------
    tuple
        ``(train_processed, preprocessor)`` when ``test_features`` is None,
        otherwise ``(train_processed, test_processed, preprocessor)``. The
        processed frames use the preprocessor's output feature names as columns
        and keep the row index of their input frame, so they stay aligned with
        label Series that share that index.
    """
    categorical_columns = train_features.select_dtypes(include=['object']).columns
    numerical_columns = train_features.select_dtypes(exclude=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns),
            ('num', StandardScaler(), numerical_columns)
        ]
    )

    train_array = preprocessor.fit_transform(train_features)
    # Output names only exist after fitting; look them up once and reuse them.
    feature_names = preprocessor.get_feature_names_out()

    # Keep the input index: without it the result gets a fresh 0..n-1 index and no
    # longer lines up with labels that still carry the original (shuffled) index.
    train_processed = pd.DataFrame(
        train_array, columns=feature_names, index=train_features.index
    )

    if test_features is None:
        return train_processed, preprocessor

    test_processed = pd.DataFrame(
        preprocessor.transform(test_features),
        columns=feature_names,
        index=test_features.index,
    )
    return train_processed, test_processed, preprocessor

# Preparation

In [114]:
# Load datasets: each call reads the file using the names in `columns`, drops
# 'difficulty' and converts 'label' to 0 ('normal') or 1 (anything else).
train_data = load_and_preprocess_data('../NSL-KDD/KDDTrain+.txt', columns)
test_data = load_and_preprocess_data('../NSL-KDD/KDDTest+.txt', columns)

In [120]:
# Quick check of (n_rows, n_columns) for the train and test frames.
train_data.shape, test_data.shape

((125973, 42), (22544, 42))

## Encode categorical columns

In [115]:
# Replace the values of 'protocol_type', 'service' and 'flag' with integer codes,
# using one mapping per column that is shared by the train and test frames.
# NOTE(review): once these columns hold integers they are no longer object-dtype,
# so preprocess_features() will not one-hot encode them and will standardise the
# codes as if they were numeric — confirm that this ordinal treatment is intended.
train_data, test_data = encode_categorical_columns(train_data, test_data)

## Separate features and labels

In [116]:
# Training frame: target column first, then everything else as the feature matrix.
y = train_data['label']
X = train_data.drop(columns=['label'])

# Same separation for the test frame.
y_test = test_data['label']
X_test = test_data.drop(columns=['label'])

## Split training data into training and validation sets

In [117]:
# Hold out 20% of the training rows for validation. stratify=y keeps the class
# proportions the same in both parts; random_state=42 makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Preprocess train, validation, and test sets

In [118]:
# Fit the preprocessor on the training split only and apply it to the validation split.
X_train_processed, X_val_processed, preprocessor = preprocess_features(X_train, X_val)

# Transform the test set with the SAME train-fitted preprocessor. Calling
# preprocess_features(X_test) here would fit a new scaler on the test data, so the
# test features would be standardised with test-set statistics (inconsistent with
# what a model trained on X_train_processed expects) and `preprocessor` would be
# replaced by the test-fitted one.
X_test_processed = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)

# Save preprocessed datasets

In [119]:
# Write all splits to one output directory. The directory is created first so the
# cell also works on a fresh checkout, where to_csv would otherwise fail because
# './DATASET' does not exist yet.
output_dir = Path('./DATASET')
output_dir.mkdir(parents=True, exist_ok=True)

datasets_to_save = {
    'train_features.csv': X_train_processed,
    'train_labels.csv': y_train,
    'val_features.csv': X_val_processed,
    'val_labels.csv': y_val,
    'test_features.csv': X_test_processed,
    'test_labels.csv': y_test,
}

# index=False: rows are matched by position between a features file and its labels file.
for file_name, dataset in datasets_to_save.items():
    dataset.to_csv(output_dir / file_name, index=False)

print("Data preparation completed and datasets saved.")

Data preparation completed and datasets saved.
