# New Class Preprocessing
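This notebook preprocesses the data once per held-out attack class to test the model's ability to identify new classes. The held-out class is removed from the training set only; the validation and test sets keep every class. The preprocessing steps, both fitted on the filtered training set, are: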

1. Min/max scaling
2. PCA

There are a total of 10 possible classes. Benign traffic will be present in all subsets, and each type of possible attack will be left out once, resulting in 9 datasets.

In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Traffic classes in label order: the position in this list is the integer class id,
# so "Benign" is 0 and "Host_Discovery" is 9.
class_names = [
    "Benign",
    "ACK_Flooding",
    "ARP_Spoofing",
    "Port_Scanning",
    "Service_Detection",
    "SYN_Flooding",
    "UDP_Flooding",
    "HTTP_Flooding",
    "Telnet-brute_Force",
    "Host_Discovery",
]
labels = {name: class_id for class_id, name in enumerate(class_names)}

# Traffic types that will each be held out once and tested as novel.
# By default this is every attack class (everything except benign traffic);
# replace it with an explicit subset to test fewer classes.
unknown_attacks = [name for name in labels if name != "Benign"]

In [7]:
# Load the sampled train / validation / test splits (read in that order)
train_set, val_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./sampled_datasets/train_set.csv",
        "./sampled_datasets/val_set.csv",
        "./sampled_datasets/test_set.csv",
    )
)

In [9]:
# Scale features, reduce them with PCA, and save one train/val/test triple
# (plus the fitted scaler and PCA model) per held-out attack.
columns_to_normalize = ["length"] + [f"kit_fe_{i}" for i in range(0, 100)]

# Create the output folders up front: to_csv and joblib.dump do not create
# missing parent directories, so a fresh checkout would otherwise fail here.
for output_dir in ("./new_class_datasets", "./preprocessing_models"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)


def scale_split(scaler, split_df):
    """Return `split_df` with `columns_to_normalize` replaced by their scaled values.

    Parameters
    ----------
    scaler : an already fitted scaler exposing `transform`.
    split_df : DataFrame that contains every column in `columns_to_normalize`.

    Returns
    -------
    A new DataFrame holding the untouched columns first, followed by the
    scaled columns. `split_df` itself is not modified.
    """
    scaled = pd.DataFrame(
        scaler.transform(split_df[columns_to_normalize]),
        columns=columns_to_normalize,
        index=split_df.index,  # keep rows aligned with split_df in the concat below
    )
    return pd.concat([split_df.drop(columns=columns_to_normalize), scaled], axis=1)


def to_pca_frame(components, label_values):
    """Wrap a PCA-transformed array in a DataFrame with columns PC1..PCk plus `label`.

    Parameters
    ----------
    components : 2-D array returned by `pca.transform`, one row per sample.
    label_values : labels for the same rows, in the same order.
    """
    pca_frame = pd.DataFrame(components, columns=[f'PC{i+1}' for i in range(components.shape[1])])
    pca_frame['label'] = label_values
    return pca_frame


for attack in unknown_attacks:
    # Filter out the unknown attack label from the training set only;
    # validation and test sets keep every class.
    new_train = train_set[train_set["label"] != labels[attack]].reset_index(drop=True)

    # Fit the min/max scaler on the filtered training data
    scaler = MinMaxScaler()
    scaler.fit(new_train[columns_to_normalize])

    # Scale training, validation, and test sets with the training-set statistics
    train_norm = scale_split(scaler, new_train)
    val_norm = scale_split(scaler, val_set)
    test_norm = scale_split(scaler, test_set)

    # Initialize PCA and fit to the scaled training data
    pca = PCA(n_components=0.95)  # Retain 95% of the variance
    pca.fit(train_norm.drop('label', axis=1))

    # Save PCA-transformed train data
    X_train_pca = pca.transform(train_norm.drop('label', axis=1))
    train_pca_df = to_pca_frame(X_train_pca, train_norm['label'].values)
    train_pca_df.to_csv(f"./new_class_datasets/pca_train_no_{attack}.csv", index=False)

    # Apply PCA to the test and validation sets
    X_test_pca = pca.transform(test_norm.drop('label', axis=1))
    test_pca_df = to_pca_frame(X_test_pca, test_norm['label'].values)
    test_pca_df.to_csv(f"./new_class_datasets/pca_test_no_{attack}.csv", index=False)

    X_val_pca = pca.transform(val_norm.drop('label', axis=1))
    val_pca_df = to_pca_frame(X_val_pca, val_norm['label'].values)
    val_pca_df.to_csv(f"./new_class_datasets/pca_val_no_{attack}.csv", index=False)

    # Save min-max scaler and PCA model
    joblib.dump(scaler, f'./preprocessing_models/scaler_no_{attack}.joblib')
    joblib.dump(pca, f'./preprocessing_models/pca_no_{attack}.joblib')

  C = X.T @ X


Since one class is left out to simulate an unknown attack, remap the remaining labels so that they are sequential (0–8), and assign the held-out class the label -1.

In [10]:
# Sequential relabelling
# Relabel the saved datasets of EVERY held-out attack. Relying on the `attack`
# and `*_pca_df` variables left over from the preprocessing loop would only
# relabel the last attack in `unknown_attacks`.
for attack in unknown_attacks:
    current_labels = list(labels.values())
    current_labels.remove(labels[attack])

    # Create a mapping from original labels to new labels
    label_mapping = {label: i for i, label in enumerate(current_labels)}

    # Add label for unknown class
    label_mapping[labels[attack]] = -1

    # Relabel all three splits in memory first, so nothing is written if any
    # split contains a label the mapping does not cover.
    relabelled = {}
    for split in ("train", "val", "test"):
        path = f"./new_class_datasets/pca_{split}_no_{attack}.csv"
        # round_trip parsing keeps the stored component values unchanged when
        # the file is written back.
        split_df = pd.read_csv(path, float_precision="round_trip")
        new_labels = split_df['label'].map(label_mapping)
        if new_labels.isna().any():
            # Series.map turns unmapped values into NaN. This happens, for
            # example, when the files were already relabelled by an earlier
            # run of this cell.
            unexpected = sorted(split_df.loc[new_labels.isna(), 'label'].unique().tolist())
            raise ValueError(
                f"Labels {unexpected} in {path} are not original class ids; "
                "re-run the preprocessing cell before relabelling."
            )
        split_df['label'] = new_labels
        relabelled[split] = (path, split_df)

    # Apply the mapping to the datasets on disk
    for path, split_df in relabelled.values():
        split_df.to_csv(path, index=False)

    # Keep the relabelled frames of the current attack available by name
    train_pca_df = relabelled["train"][1]
    val_pca_df = relabelled["val"][1]
    test_pca_df = relabelled["test"][1]