In [729]:
import numpy as np 
import pandas as pd
import copy
import pickle
import joblib

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler as under_sam

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

### ICFS function
Takes a dataframe as parameter and saves to file all the features necessary to describe DoS+Probe and U2R+R2L

In [730]:
def pearson_correlated_features(x, y, threshold):
    y['target'] = y['target'].astype(int)

    for p in x.columns:
        x[p] = x[p].astype(float)

    # Ensure y is a DataFrame for consistency
    if isinstance(y, pd.Series):
        y = pd.DataFrame(y, columns=['target'])

    # Calculate the Pearson's correlation coefficients between features and the target variable(s)
    corr_matrix = x.corrwith(y['target'])

    # Select features with correlations above the threshold
    selected_features = x.columns[corr_matrix.abs() > threshold].tolist()

    return selected_features

In [731]:
def compute_set_difference(df1, df2):
    # Create a new DataFrame containing the set difference of the two DataFrames.
    df_diff = df1[~df1.index.isin(df2.index)]
    # Return the DataFrame.
    return df_diff

In [732]:
def perform_icfs(x_train):
    # now ICFS only on the numerical features
    num_train = copy.deepcopy(x_train)
    del num_train['protocol_type']
    del num_train['service']
    del num_train['flag']

    target = pd.DataFrame()
    target['target'] = np.array([1 if x != 'normal' else 0 for x in num_train['label']])
    num_train = pd.concat([num_train, target], axis=1)

    # These are how attacks are categorized in the trainset
    dos_list = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop']
    probe_list = ['ipsweep', 'portsweep', 'satan', 'nmap']
    u2r_list = ['loadmodule', 'perl', 'rootkit', 'buffer_overflow']
    r2l_list = ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy', 'warezclient', 'warezmaster']
    normal = ['normal']

    # useful sub-sets
    x_normal = num_train[num_train['label'].isin(normal)]
    x_u2r = num_train[num_train['label'].isin(u2r_list)]
    x_r2l = num_train[num_train['label'].isin(r2l_list)]
    x_dos = num_train[num_train['label'].isin(dos_list)]
    x_probe = num_train[num_train['label'].isin(probe_list)]

    # start the ICFS with l1

    # features for dos
    dos = copy.deepcopy(num_train)
    del dos['target']
    y = np.array([1 if x in dos_list else 0 for x in dos['label']])
    y_dos = pd.DataFrame(y, columns=['target'])
    del dos['label']
    dos_all = pearson_correlated_features(dos, y_dos, 0.1)
    print(dos_all)

    # features for probe
    probe = copy.deepcopy(num_train)
    del probe['target']
    y = np.array([1 if x in probe_list else 0 for x in probe['label']])
    y_probe = pd.DataFrame(y, columns=['target'])
    del probe['label']
    probe_all = pearson_correlated_features(probe, y_probe, 0.1)
    print(probe_all)

    # intersect for the optimal features
    set_dos = set(dos_all)
    set_probe = set(probe_all)

    comm_features_l1 = set_probe & set_dos

    print('common features to train l1: ', comm_features_l1)

    # now l2 needs the features to describe the difference between rare attacks and normal traffic

    # features for u2r
    u2r = pd.concat([x_u2r, x_normal], axis=0)
    del u2r['target']
    y = np.array([1 if x in u2r_list else 0 for x in u2r['label']])
    y_u2r = pd.DataFrame(y, columns=['target'])
    del u2r['label']
    u2r_all = pearson_correlated_features(u2r, y_u2r, 0.01)
    print(u2r_all)

    # features for r2l
    r2l = pd.concat([x_r2l, x_normal], axis=0)
    del r2l['target']
    y = np.array([1 if x in r2l_list else 0 for x in r2l['label']])
    y_r2l = pd.DataFrame(y, columns=['target'])
    del r2l['label']
    r2l_all = pearson_correlated_features(r2l, y_r2l, 0.01)
    print(r2l_all)

    # intersect for the optimal features
    set_r2l = set(r2l_all)
    set_u2r = set(u2r_all)

    comm_features_l2 = set_r2l & set_u2r
    # print('Common features to train l2: ', len(common_features_l2), common_features_l2)

    with open('NSL-KDD Files/NSL_features_l1.txt', 'w') as g:
        for a, x in enumerate(comm_features_l1):
            if a < len(comm_features_l1) - 1:
                g.write(x + ',' + '\n')
            else:
                g.write(x)

    # read the common features from file
    with open('NSL-KDD Files/NSL_features_l2.txt', 'w') as g:
        for a, x in enumerate(comm_features_l2):
            if a < len(comm_features_l2) - 1:
                g.write(x + ',' + '\n')
            else:
                g.write(x)

# Main implementation

In [733]:
# loading the train set
df_train = pd.read_csv('NSL-KDD Original Datasets/KDDTrain+.txt', sep=",", header=None)
df_train = df_train[df_train.columns[:-1]]  # tags column
titles = pd.read_csv('NSL-KDD Original Datasets/Field Names.csv', header=None)
label = pd.Series(['label'], index=[41])
titles = pd.concat([titles[0], label])
df_train.columns = titles.to_list()
df_train = df_train.drop(['num_outbound_cmds'],axis=1)
df_train_original = df_train
# df_train_original

In [734]:
# load test set
df_test = pd.read_csv('NSL-KDD Original Datasets/KDDTest+.txt', sep=",", header=None)
df_test_ = df_test.sort_index(axis=1)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
df_test = df_test.drop(['num_outbound_cmds'],axis=1)
df_test_original = df_test
# df_test_original

### Execution Parameters

In [735]:
EXPORT_MODELS = 1
EXPORT_DATASETS = 1
EXPORT_PCA = 1
EXPORT_ENCODERS = 1

### Perform ICFS if needed

In [736]:
# It is possible to compute the ICFS again

# perform_icfs(df_train_original)

# DoS + Probe classifier (NBC)

In [737]:
# list of single attacks 
dos_attacks = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'worm', 'apache2', 'mailbomb', 'processtable', 'udpstorm']
probe_attacks = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']
r2l_attacks = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
                'snmpguess', 'spy', 'warezclient', 'httptunnel', 'named', 'sendmail', 'snmpgetattack', 'xlock', 'xsnoop']
u2r_attacks = ['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm'] 

# list of attack classes split according to detection layer
dos_probe_list = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'ipsweep', 'nmap', 'portsweep', 'satan']
dos_probe_test = ['apache2', 'mailbomb', 'processtable', 'udpstorm', 'mscan', 'saint']
u2r_r2l_list = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
                'snmpguess', 'spy', 'warezclient', 'buffer_overflow', 'loadmodule', 'rootkit', 'perl']
u2r_r2l_test = ['httptunnel', 'named', 'sendmail', 'snmpgetattack', 'xlock', 'xsnoop', 'ps', 'xterm', 'sqlattack']
normal_list = ['normal']
categorical_features = ['protocol_type', 'service', 'flag']

# load the features obtained with ICFS for both layer 1 and layer 2
with open('NSL-KDD Files/NSL_features_l1.txt', 'r') as f:
    common_features_l1 = f.read().split(',')

with open('NSL-KDD Files/NSL_features_l2.txt', 'r') as f:
    common_features_l2 = f.read().split(',')
    
df_train_and_validate = copy.deepcopy(df_train_original)
df_test = copy.deepcopy(df_test_original)

In [738]:
# split in test and validation set for BOTH layers
df_train_original, df_val_original = train_test_split(df_train_and_validate, test_size=0.2, random_state=42)
df_train_original = df_train_original.sort_index(axis=1)
df_val_original = df_val_original.sort_index(axis=1)

# dataframes specifically for layer 1
df_train = copy.deepcopy(df_train_original)
df_val = copy.deepcopy(df_val_original)

# set the target variables accordingly
y_train = np.array([1 if x in (dos_attacks+probe_attacks) else 0 for x in df_train['label']])
y_val = np.array([1 if x in (dos_attacks+probe_attacks) else 0 for x in df_val['label']])

# this dataframe contains the whole train set 
df_train = df_train.drop(['label'],axis=1)
df_train = df_train.reset_index().drop(['index'], axis=1)
df_train

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,serror_rate,service,src_bytes,srv_count,srv_diff_host_rate,srv_rerror_rate,srv_serror_rate,su_attempted,urgent,wrong_fragment
0,16,0.00,14939,52,0.00,0.0,0.02,1.00,0.0,255,...,0.0,http,214,18,0.17,0.0,0.0,0,0,0
1,142,0.06,0,255,0.06,0.0,0.00,0.01,1.0,2,...,1.0,private,0,2,0.00,0.0,1.0,0,0,0
2,273,0.06,0,255,0.06,1.0,0.00,0.03,0.0,8,...,0.0,http,0,8,0.00,1.0,0.0,0,0,0
3,20,0.00,259,255,0.00,0.0,0.00,1.00,0.0,255,...,0.0,http,257,20,0.00,0.0,0.0,0,0,0
4,274,0.00,4,255,0.00,0.0,1.00,1.00,0.0,255,...,0.0,other,516,274,0.00,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100773,258,0.07,0,255,0.09,1.0,0.00,0.02,0.0,4,...,0.0,echo,0,4,0.00,1.0,0.0,0,0,0
100774,24,0.08,0,255,0.07,0.0,0.01,0.02,1.0,4,...,1.0,telnet,0,4,0.00,0.0,1.0,0,0,0
100775,258,0.07,0,255,0.07,1.0,0.00,0.02,0.0,6,...,0.0,http,0,6,0.00,1.0,0.0,0,0,0
100776,5,0.00,4281,21,0.00,0.0,0.05,1.00,0.0,255,...,0.0,http,309,5,0.00,0.0,0.0,0,0,0


In [739]:
# this dataframe contains the whole validation set
df_val = df_val.drop(['label'],axis=1)
df_val = df_val.reset_index().drop(['index'], axis=1)

df_val

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,serror_rate,service,src_bytes,srv_count,srv_diff_host_rate,srv_rerror_rate,srv_serror_rate,su_attempted,urgent,wrong_fragment
0,2,0.00,0,67,0.00,0.00,1.00,1.00,0.00,171,...,0.0,domain_u,36,4,0.75,0.0,0.0,0,0,0
1,42,0.10,0,255,0.05,0.00,0.01,0.17,1.00,44,...,1.0,http,0,11,0.00,0.0,1.0,0,0,0
2,284,0.06,0,255,0.06,0.00,0.00,0.08,1.00,20,...,1.0,pop_3,0,20,0.00,0.0,1.0,0,0,0
3,110,0.06,0,255,0.07,1.00,0.00,0.11,0.00,27,...,0.0,private,0,8,0.00,1.0,0.0,0,0,0
4,1,0.00,0,134,0.64,0.63,0.64,0.01,0.04,1,...,0.0,private,0,1,0.00,1.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25190,11,0.00,944,255,0.00,0.00,0.00,1.00,0.00,255,...,0.0,http,199,13,0.15,0.0,0.0,0,0,0
25191,1,0.00,0,2,0.00,0.00,1.00,1.00,0.00,129,...,0.0,eco_i,8,3,1.00,0.0,0.0,0,0,0
25192,2,0.00,0,140,0.04,0.02,0.06,0.06,0.00,9,...,0.0,ftp_data,12983,2,0.00,0.0,0.0,0,0,0
25193,1,0.00,329,255,0.06,0.00,0.00,0.55,0.02,141,...,0.0,smtp,813,3,1.00,0.0,0.0,0,0,0


In [740]:
# now the real processing for layer 1 starts
X_train = df_train[common_features_l1]
X_train

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,1,16,0.0,0.0,0.0,1.00,0.00,0.17,255,1.00,0.00,0.02,0.06,0.0,0.0,0.0,0.0
1,0,142,1.0,0.0,0.0,0.01,0.06,0.00,2,0.01,0.06,0.00,0.00,1.0,1.0,0.0,0.0
2,0,273,0.0,1.0,1.0,0.03,0.06,0.00,8,0.03,0.06,0.00,0.00,0.0,0.0,1.0,1.0
3,1,20,0.0,0.0,0.0,1.00,0.00,0.00,255,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0
4,0,274,0.0,0.0,0.0,1.00,0.00,0.00,255,1.00,0.00,1.00,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100773,0,258,0.0,1.0,1.0,0.02,0.07,0.00,4,0.02,0.09,0.00,0.00,0.0,0.0,1.0,1.0
100774,0,24,1.0,0.0,0.0,0.17,0.08,0.00,4,0.02,0.07,0.01,0.00,1.0,1.0,0.0,0.0
100775,0,258,0.0,1.0,1.0,0.02,0.07,0.00,6,0.02,0.07,0.00,0.00,0.0,0.0,1.0,1.0
100776,1,5,0.0,0.0,0.0,1.00,0.00,0.00,255,1.00,0.00,0.05,0.05,0.0,0.0,0.0,0.0


In [741]:
X_validate = df_val[common_features_l1]
X_validate

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0,2,0.0,0.0,0.0,1.00,0.00,0.75,171,1.00,0.00,1.00,0.01,0.00,0.00,0.0,0.00
1,0,42,1.0,0.0,0.0,0.26,0.10,0.00,44,0.17,0.05,0.01,0.00,1.00,1.00,0.0,0.00
2,0,284,1.0,0.0,0.0,0.07,0.06,0.00,20,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.00
3,0,110,0.0,1.0,1.0,0.07,0.06,0.00,27,0.11,0.07,0.00,0.00,0.00,0.00,1.0,1.00
4,0,1,0.0,1.0,1.0,1.00,0.00,0.00,1,0.01,0.64,0.64,0.00,0.04,0.00,1.0,0.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25190,1,11,0.0,0.0,0.0,1.00,0.00,0.15,255,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00
25191,0,1,0.0,0.0,0.0,1.00,0.00,1.00,129,1.00,0.00,1.00,0.50,0.00,0.00,0.0,0.00
25192,0,2,0.0,0.0,0.0,1.00,0.00,0.00,9,0.06,0.04,0.06,0.00,0.00,0.00,0.0,0.02
25193,1,1,0.0,0.0,0.0,1.00,0.00,1.00,141,0.55,0.06,0.00,0.00,0.02,0.02,0.0,0.00


In [742]:
# 2 one-hot encoders, one for the features of layer1 and one for the features of layer2
ohe = OneHotEncoder(handle_unknown='ignore')
ohe2 = OneHotEncoder(handle_unknown='ignore')
scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()

In [743]:
# scaling the train set for layer1
df_minmax = scaler1.fit_transform(X_train)
X_train = pd.DataFrame(df_minmax, columns=X_train.columns)

X_train

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,1.0,0.031311,0.0,0.0,0.0,1.00,0.00,0.17,1.000000,1.00,0.00,0.02,0.06,0.0,0.0,0.0,0.0
1,0.0,0.277886,1.0,0.0,0.0,0.01,0.06,0.00,0.007843,0.01,0.06,0.00,0.00,1.0,1.0,0.0,0.0
2,0.0,0.534247,0.0,1.0,1.0,0.03,0.06,0.00,0.031373,0.03,0.06,0.00,0.00,0.0,0.0,1.0,1.0
3,1.0,0.039139,0.0,0.0,0.0,1.00,0.00,0.00,1.000000,1.00,0.00,0.00,0.00,0.0,0.0,0.0,0.0
4,0.0,0.536204,0.0,0.0,0.0,1.00,0.00,0.00,1.000000,1.00,0.00,1.00,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100773,0.0,0.504892,0.0,1.0,1.0,0.02,0.07,0.00,0.015686,0.02,0.09,0.00,0.00,0.0,0.0,1.0,1.0
100774,0.0,0.046967,1.0,0.0,0.0,0.17,0.08,0.00,0.015686,0.02,0.07,0.01,0.00,1.0,1.0,0.0,0.0
100775,0.0,0.504892,0.0,1.0,1.0,0.02,0.07,0.00,0.023529,0.02,0.07,0.00,0.00,0.0,0.0,1.0,1.0
100776,1.0,0.009785,0.0,0.0,0.0,1.00,0.00,0.00,1.000000,1.00,0.00,0.05,0.05,0.0,0.0,0.0,0.0


In [744]:
# scaling the validation set for layer1
df_minmax_val = scaler1.transform(X_validate)
X_validate = pd.DataFrame(df_minmax_val, columns=X_validate.columns)

X_validate

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.75,0.670588,1.00,0.00,1.00,0.01,0.00,0.00,0.0,0.00
1,0.0,0.082192,1.0,0.0,0.0,0.26,0.10,0.00,0.172549,0.17,0.05,0.01,0.00,1.00,1.00,0.0,0.00
2,0.0,0.555773,1.0,0.0,0.0,0.07,0.06,0.00,0.078431,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.00
3,0.0,0.215264,0.0,1.0,1.0,0.07,0.06,0.00,0.105882,0.11,0.07,0.00,0.00,0.00,0.00,1.0,1.00
4,0.0,0.001957,0.0,1.0,1.0,1.00,0.00,0.00,0.003922,0.01,0.64,0.64,0.00,0.04,0.00,1.0,0.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25190,1.0,0.021526,0.0,0.0,0.0,1.00,0.00,0.15,1.000000,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00
25191,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.505882,1.00,0.00,1.00,0.50,0.00,0.00,0.0,0.00
25192,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.00,0.035294,0.06,0.04,0.06,0.00,0.00,0.00,0.0,0.02
25193,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.552941,0.55,0.06,0.00,0.00,0.02,0.02,0.0,0.00


In [745]:
# perform One-hot encoding for the train set
label_enc = ohe.fit_transform(df_train[categorical_features])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_train = pd.concat([X_train, df_enc], axis=1)

df_train.iloc[:,1:4]

Unnamed: 0,diff_srv_rate,dst_bytes,dst_host_count
0,0.00,14939,52
1,0.06,0,255
2,0.06,0,255
3,0.00,259,255
4,0.00,4,255
...,...,...,...
100773,0.07,0,255
100774,0.08,0,255
100775,0.07,0,255
100776,0.00,4281,21


In [746]:
# perform One-hot encoding for the validation set
label_enc = ohe.transform(df_val[categorical_features])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_validate = pd.concat([X_validate, df_enc], axis=1)

X_validate

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.75,0.670588,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.082192,1.0,0.0,0.0,0.26,0.10,0.00,0.172549,0.17,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.555773,1.0,0.0,0.0,0.07,0.06,0.00,0.078431,0.08,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.215264,0.0,1.0,1.0,0.07,0.06,0.00,0.105882,0.11,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.001957,0.0,1.0,1.0,1.00,0.00,0.00,0.003922,0.01,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25190,1.0,0.021526,0.0,0.0,0.0,1.00,0.00,0.15,1.000000,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25191,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.505882,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25192,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.00,0.035294,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25193,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.552941,0.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [747]:
# do the same for testset
y_test = np.array([1 if x in (dos_attacks+probe_attacks) else 0 for x in df_test['label']])

df_test = df_test.drop(['label'],axis=1)
df_test = df_test.reset_index().drop(['index'], axis=1)
df_test

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,private,REJ,0,0,0,0,0,0,...,255,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0,tcp,private,REJ,0,0,0,0,0,0,...,255,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,134,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,3,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,29,86,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,333,0,0,0,0,...,100,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22540,0,tcp,http,SF,317,938,0,0,0,0,...,197,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22541,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22542,0,udp,domain_u,SF,42,42,0,0,0,0,...,255,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [748]:
X_test = df_test[common_features_l1]

X_test

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0,229,0.0,1.0,1.0,0.04,0.06,0.00,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0,136,0.0,1.0,1.0,0.01,0.06,0.00,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,0,1,0.0,0.0,0.0,1.00,0.00,0.00,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0,1,0.0,0.0,0.0,1.00,0.00,1.00,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,0,1,0.0,1.0,0.5,1.00,0.00,0.75,86,0.31,0.17,0.03,0.02,0.00,0.0,0.71,0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,1,1,0.0,0.0,0.0,1.00,0.00,0.00,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22540,1,2,0.0,0.0,0.0,1.00,0.00,0.18,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22541,1,5,0.0,0.0,0.0,1.00,0.00,0.20,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22542,0,4,0.0,0.0,0.0,1.00,0.00,0.33,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [749]:
df_minmax = scaler1.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
X_test

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0.0,0.448141,0.0,1.0,1.0,0.04,0.06,0.00,0.039216,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0.0,0.266145,0.0,1.0,1.0,0.01,0.06,0.00,0.003922,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.337255,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.223529,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,0.0,0.001957,0.0,1.0,0.5,1.00,0.00,0.75,0.337255,0.31,0.17,0.03,0.02,0.00,0.0,0.71,0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.552941,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22540,1.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.18,1.000000,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22541,1.0,0.009785,0.0,0.0,0.0,1.00,0.00,0.20,1.000000,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22542,0.0,0.007828,0.0,0.0,0.0,1.00,0.00,0.33,0.988235,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [750]:
label_enc = ohe.transform(df_test.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)
X_test

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.448141,0.0,1.0,1.0,0.04,0.06,0.00,0.039216,0.04,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.266145,0.0,1.0,1.0,0.01,0.06,0.00,0.003922,0.00,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.337255,0.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.223529,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.001957,0.0,1.0,0.5,1.00,0.00,0.75,0.337255,0.31,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.552941,0.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22540,1.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.18,1.000000,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22541,1.0,0.009785,0.0,0.0,0.0,1.00,0.00,0.20,1.000000,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22542,0.0,0.007828,0.0,0.0,0.0,1.00,0.00,0.33,0.988235,0.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [751]:
print('Shape of the whole train set: ', X_train.shape)
print('Shape of its targets: ', y_train.shape)
print('Shape of the whole test set: ', X_test.shape)
print('Shape of its targets: ', y_test.shape)

Shape of the whole train set:  (100778, 101)
Shape of its targets:  (100778,)
Shape of the whole test set:  (22544, 101)
Shape of its targets:  (22544,)


In [752]:
# Export the dataset for training layer 1
if EXPORT_DATASETS:
    X_train.to_csv('NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l1.txt', index=False)
    X_validate.to_csv('NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l1.txt', index=False)
    np.save('NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l1_targets', y_train)
    np.save('NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l1_targets', y_val)

### Principal Component Analysis

In [753]:
pca_dos_probe = PCA(n_components=0.95)
X_train_dos_probe = pca_dos_probe.fit_transform(X_train)
X_test_dos_probe = pca_dos_probe.transform(X_test)
X_validate_dos_probe = pca_dos_probe.transform(X_validate)

X_train.columns

Index(['logged_in', 'count', 'serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       ...
       'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0',
       'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH'],
      dtype='object', length=101)

In [754]:
if EXPORT_PCA:
    # save the pca transformed as well as the transformer
    joblib.dump(X_test_dos_probe, 'NSL-KDD Encoded Datasets/pca_transformed/pca_test1.pkl')
    joblib.dump(X_train_dos_probe, 'NSL-KDD Encoded Datasets/pca_transformed/pca_train1.pkl')
    joblib.dump(X_validate_dos_probe, 'NSL-KDD Encoded Datasets/pca_transformed/pca_validate1.pkl')
    joblib.dump(pca_dos_probe, 'NSL-KDD Encoded Datasets/pca_transformed/layer1_transformer.pkl')

### Building the classifier for the layer1

In [755]:
# Using Random Forest Classifier
# dos_probe_classifier = RandomForestClassifier(n_estimators=100, criterion='gini')

# Using the Naive Bayes Classifier
dos_probe_classifier = GaussianNB()
dos_probe_classifier.fit(X_train_dos_probe, y_train)
predicted = dos_probe_classifier.predict(X_test_dos_probe)
# class_probabilities = dos_probe_classifier.predict_proba(X_test_dos_probe)

In [756]:
print('Metrics for layer 1:')
print('Confusion matrix: [TP FN / FP TN]\n', confusion_matrix(y_test,predicted))
print('Accuracy = ', accuracy_score(y_test,predicted))
print('F1 Score = ', f1_score(y_test,predicted))
print('Precision = ', precision_score(y_test,predicted))
print('Recall = ', recall_score(y_test,predicted))
print('Shape of the train set for l1: ', X_train_dos_probe.shape)

Metrics for layer 1:
Confusion matrix: [TP FN / FP TN]
 [[9557 3106]
 [ 903 8978]]
Accuracy =  0.8221699787083038
F1 Score =  0.8174823582972911
Precision =  0.7429659053293611
Recall =  0.9086124886145127
Shape of the train set for l1:  (100778, 28)


# R2L+U2R classifier

In [757]:
df_train = copy.deepcopy(df_train_original)
df_test = copy.deepcopy(df_test_original)
df_val = copy.deepcopy(df_val_original)

# load targeted attacks (Normal + r2l + u2r)
df_train = df_train[df_train['label'].isin(normal_list+u2r_attacks+r2l_attacks)]
df_val = df_val[df_val['label'].isin(normal_list+u2r_attacks+r2l_attacks)]

# set the target variables accordingly
y_train = np.array([1 if x in (u2r_attacks+r2l_attacks) else 0 for x in df_train['label']])
y_val = np.array([1 if x in (u2r_attacks+r2l_attacks) else 0 for x in df_val['label']])

df_train = df_train.drop(['label'],axis=1)
df_train = df_train.reset_index().drop(['index'], axis=1)
df_train

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,serror_rate,service,src_bytes,srv_count,srv_diff_host_rate,srv_rerror_rate,srv_serror_rate,su_attempted,urgent,wrong_fragment
0,16,0.00,14939,52,0.00,0.0,0.02,1.00,0.0,255,...,0.0,http,214,18,0.17,0.0,0.0,0,0,0
1,20,0.00,259,255,0.00,0.0,0.00,1.00,0.0,255,...,0.0,http,257,20,0.00,0.0,0.0,0,0,0
2,274,0.00,4,255,0.00,0.0,1.00,1.00,0.0,255,...,0.0,other,516,274,0.00,0.0,0.0,0,0,0
3,5,0.00,0,79,0.05,0.0,0.70,0.70,0.0,129,...,0.0,ftp_data,7940,5,0.00,0.0,0.0,0,0,0
4,1,0.00,105,255,0.62,0.0,0.90,0.00,0.0,1,...,0.0,other,147,1,0.00,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54728,2,0.00,364,199,0.03,0.0,0.01,0.74,0.0,148,...,0.0,smtp,908,4,0.75,0.0,0.0,0,0,0
54729,1,0.00,1,255,0.02,0.0,0.00,0.00,0.0,1,...,0.0,shell,67,1,0.00,0.0,0.0,0,0,0
54730,4,0.00,2698,218,0.01,0.0,0.00,0.95,0.0,208,...,0.0,http,304,4,0.00,0.0,0.0,0,0,0
54731,5,0.00,4281,21,0.00,0.0,0.05,1.00,0.0,255,...,0.0,http,309,5,0.00,0.0,0.0,0,0,0


In [758]:
df_val = df_val.drop(['label'],axis=1)
df_val = df_val.reset_index().drop(['index'], axis=1)
df_val

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,serror_rate,service,src_bytes,srv_count,srv_diff_host_rate,srv_rerror_rate,srv_serror_rate,su_attempted,urgent,wrong_fragment
0,2,0.0,0,67,0.00,0.00,1.00,1.00,0.00,171,...,0.0,domain_u,36,4,0.75,0.0,0.0,0,0,0
1,20,0.0,4673,255,0.00,0.00,0.00,1.00,0.00,255,...,0.0,http,325,20,0.00,0.0,0.0,0,0,0
2,1,0.0,335,138,0.05,0.00,0.01,0.62,0.00,85,...,0.0,smtp,743,2,1.00,0.0,0.0,0,0,0
3,3,0.0,29055,168,0.00,0.00,0.01,1.00,0.00,255,...,0.0,http,209,3,0.00,0.0,0.0,0,0,0
4,1,0.0,0,113,0.04,0.00,0.35,0.35,0.02,39,...,0.0,ftp_data,567,1,0.00,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13652,274,0.0,70,255,0.01,0.00,0.00,1.00,0.00,254,...,0.0,domain_u,43,274,0.00,0.0,0.0,0,0,0
13653,74,0.0,134,255,0.01,0.00,0.00,1.00,0.00,254,...,0.0,domain_u,45,154,0.01,0.0,0.0,0,0,0
13654,11,0.0,944,255,0.00,0.00,0.00,1.00,0.00,255,...,0.0,http,199,13,0.15,0.0,0.0,0,0,0
13655,2,0.0,0,140,0.04,0.02,0.06,0.06,0.00,9,...,0.0,ftp_data,12983,2,0.00,0.0,0.0,0,0,0


In [759]:
X_train = df_train[common_features_l2]
X_train

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,18,0,0,0,0.06,52,1,0.02,0.17,0,255
1,20,0,0,0,0.00,255,1,0.00,0.00,0,255
2,274,0,0,0,0.00,255,0,1.00,0.00,0,255
3,5,0,0,0,0.02,79,0,0.70,0.00,0,129
4,1,0,0,0,0.00,255,0,0.90,0.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...
54728,4,0,0,0,0.00,199,1,0.01,0.75,0,148
54729,1,0,0,0,0.00,255,0,0.00,0.00,0,1
54730,4,0,0,0,0.00,218,1,0.00,0.00,0,208
54731,5,0,0,0,0.05,21,1,0.05,0.00,0,255


In [760]:
X_validate = df_val[common_features_l2]
X_validate

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,4,0,0,0,0.01,67,0,1.00,0.75,0,171
1,20,0,0,0,0.00,255,1,0.00,0.00,0,255
2,2,0,0,0,0.00,138,1,0.01,1.00,0,85
3,3,0,0,0,0.02,168,1,0.01,0.00,0,255
4,1,0,0,0,0.00,113,1,0.35,0.00,0,39
...,...,...,...,...,...,...,...,...,...,...,...
13652,274,0,0,0,0.00,255,0,0.00,0.00,0,254
13653,154,0,0,0,0.00,255,0,0.00,0.01,0,254
13654,13,0,0,0,0.00,255,1,0.00,0.15,0,255
13655,2,0,0,0,0.00,140,0,0.06,0.00,0,9


In [761]:
df_minmax = scaler2.fit_transform(X_train)
X_train = pd.DataFrame(df_minmax, columns=X_train.columns)
X_train

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,0.035225,0.0,0.0,0.0,0.06,0.203922,1.0,0.02,0.17,0.0,1.000000
1,0.039139,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.00,0.0,1.000000
2,0.536204,0.0,0.0,0.0,0.00,1.000000,0.0,1.00,0.00,0.0,1.000000
3,0.009785,0.0,0.0,0.0,0.02,0.309804,0.0,0.70,0.00,0.0,0.505882
4,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.90,0.00,0.0,0.003922
...,...,...,...,...,...,...,...,...,...,...,...
54728,0.007828,0.0,0.0,0.0,0.00,0.780392,1.0,0.01,0.75,0.0,0.580392
54729,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,0.003922
54730,0.007828,0.0,0.0,0.0,0.00,0.854902,1.0,0.00,0.00,0.0,0.815686
54731,0.009785,0.0,0.0,0.0,0.05,0.082353,1.0,0.05,0.00,0.0,1.000000


In [762]:
df_minmax = scaler2.transform(X_validate)
X_validate = pd.DataFrame(df_minmax, columns=X_validate.columns)
X_validate

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,0.007828,0.0,0.0,0.0,0.01,0.262745,0.0,1.00,0.75,0.0,0.670588
1,0.039139,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.00,0.0,1.000000
2,0.003914,0.0,0.0,0.0,0.00,0.541176,1.0,0.01,1.00,0.0,0.333333
3,0.005871,0.0,0.0,0.0,0.02,0.658824,1.0,0.01,0.00,0.0,1.000000
4,0.001957,0.0,0.0,0.0,0.00,0.443137,1.0,0.35,0.00,0.0,0.152941
...,...,...,...,...,...,...,...,...,...,...,...
13652,0.536204,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,0.996078
13653,0.301370,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.01,0.0,0.996078
13654,0.025440,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.15,0.0,1.000000
13655,0.003914,0.0,0.0,0.0,0.00,0.549020,0.0,0.06,0.00,0.0,0.035294


In [763]:
# perform One-hot encoding for the train set
label_enc = ohe2.fit_transform(df_train[categorical_features])
label_enc.toarray()
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_train = pd.concat([X_train, df_enc], axis=1)
X_train

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,...,flag_OTH,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.035225,0.0,0.0,0.0,0.06,0.203922,1.0,0.02,0.17,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.039139,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.536204,0.0,0.0,0.0,0.00,1.000000,0.0,1.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.009785,0.0,0.0,0.0,0.02,0.309804,0.0,0.70,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.90,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54728,0.007828,0.0,0.0,0.0,0.00,0.780392,1.0,0.01,0.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
54729,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
54730,0.007828,0.0,0.0,0.0,0.00,0.854902,1.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
54731,0.009785,0.0,0.0,0.0,0.05,0.082353,1.0,0.05,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [764]:
# perform One-hot encoding for the validation set
label_enc = ohe2.transform(df_val[categorical_features])
label_enc.toarray()
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_validate = pd.concat([X_validate, df_enc], axis=1)
X_validate

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,...,flag_OTH,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.007828,0.0,0.0,0.0,0.01,0.262745,0.0,1.00,0.75,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.039139,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.003914,0.0,0.0,0.0,0.00,0.541176,1.0,0.01,1.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.005871,0.0,0.0,0.0,0.02,0.658824,1.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.001957,0.0,0.0,0.0,0.00,0.443137,1.0,0.35,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13652,0.536204,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13653,0.301370,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13654,0.025440,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.15,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
13655,0.003914,0.0,0.0,0.0,0.00,0.549020,0.0,0.06,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [765]:
# do the same for test set
df_test = df_test[df_test['label'].isin(normal_list+u2r_attacks+r2l_attacks)]

y_test = np.array([0 if x=='normal' else 1 for x in df_test['label']])
df_test = df_test.drop(['label'],axis=1)
df_test = df_test.reset_index().drop(['index'], axis=1)
df_test

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,134,86,0.61,0.04,0.61,0.02,0.00,0.00,0.00,0.00
1,0,tcp,http,SF,267,14515,0,0,0,0,...,155,255,1.00,0.00,0.01,0.03,0.01,0.00,0.00,0.00
2,0,tcp,smtp,SF,1022,387,0,0,0,0,...,255,28,0.11,0.72,0.00,0.00,0.00,0.00,0.72,0.04
3,0,tcp,telnet,SF,129,174,0,0,0,0,...,255,255,1.00,0.00,0.00,0.00,0.01,0.01,0.02,0.02
4,0,tcp,http,SF,327,467,0,0,0,0,...,151,255,1.00,0.00,0.01,0.03,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12658,0,tcp,http,SF,274,1623,0,0,0,0,...,92,255,1.00,0.00,0.01,0.04,0.00,0.00,0.00,0.00
12659,0,tcp,http,SF,280,6087,0,0,0,0,...,5,255,1.00,0.00,0.20,0.04,0.00,0.00,0.00,0.00
12660,0,tcp,smtp,SF,794,333,0,0,0,0,...,100,141,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00
12661,0,tcp,http,SF,317,938,0,0,0,0,...,197,255,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00


In [766]:
X_test = df_test[common_features_l2] 
X_test

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,1,0,0,0,0.02,134,0,0.61,0.00,0,86
1,4,0,0,0,0.03,155,1,0.01,0.00,0,255
2,3,0,0,0,0.00,255,1,0.00,1.00,0,28
3,1,0,0,0,0.00,255,0,0.00,0.00,0,255
4,47,0,0,0,0.03,151,1,0.01,0.04,0,255
...,...,...,...,...,...,...,...,...,...,...,...
12658,1,0,0,0,0.04,92,1,0.01,0.00,0,255
12659,3,0,0,0,0.04,5,1,0.20,0.00,0,255
12660,1,0,0,0,0.01,100,1,0.01,0.00,0,141
12661,11,0,0,0,0.01,197,1,0.01,0.18,0,255


In [767]:
df_minmax = scaler2.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
X_test

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,0.001957,0.0,0.0,0.0,0.02,0.525490,0.0,0.61,0.00,0.0,0.337255
1,0.007828,0.0,0.0,0.0,0.03,0.607843,1.0,0.01,0.00,0.0,1.000000
2,0.005871,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,1.00,0.0,0.109804
3,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,1.000000
4,0.091977,0.0,0.0,0.0,0.03,0.592157,1.0,0.01,0.04,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
12658,0.001957,0.0,0.0,0.0,0.04,0.360784,1.0,0.01,0.00,0.0,1.000000
12659,0.005871,0.0,0.0,0.0,0.04,0.019608,1.0,0.20,0.00,0.0,1.000000
12660,0.001957,0.0,0.0,0.0,0.01,0.392157,1.0,0.01,0.00,0.0,0.552941
12661,0.021526,0.0,0.0,0.0,0.01,0.772549,1.0,0.01,0.18,0.0,1.000000


In [768]:
label_enc = ohe2.transform(df_test.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)
X_test

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,...,flag_OTH,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.001957,0.0,0.0,0.0,0.02,0.525490,0.0,0.61,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.007828,0.0,0.0,0.0,0.03,0.607843,1.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.005871,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,1.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.091977,0.0,0.0,0.0,0.03,0.592157,1.0,0.01,0.04,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12658,0.001957,0.0,0.0,0.0,0.04,0.360784,1.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12659,0.005871,0.0,0.0,0.0,0.04,0.019608,1.0,0.20,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12660,0.001957,0.0,0.0,0.0,0.01,0.392157,1.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12661,0.021526,0.0,0.0,0.0,0.01,0.772549,1.0,0.01,0.18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [769]:
print('Shape of the train set: ', X_train.shape)
print('Shape of its target: ', y_train.shape)
print('Shape of the test set: ', X_test.shape)
print('Shape of its target: ', y_test.shape)

Shape of the train set:  (54733, 51)
Shape of its target:  (54733,)
Shape of the test set:  (12663, 51)
Shape of its target:  (12663,)


In [770]:
# Under sampling the train set for l2
sm = under_sam(sampling_strategy=1)
X_train, y_train = sm.fit_resample(X_train,y_train)

## Export the datasets
Train set has been scaled, one hot encoded, undersampled
Test set has been scaled and one hot encoded

In [771]:
# Export the dataset for training layer 2
if EXPORT_DATASETS:
    X_train.to_csv('NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l2.txt', index=False)
    np.save('NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l2_targets', y_train)
    X_validate.to_csv('NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l2.txt', index=False)
    np.save('NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l2_targets', y_train)

In [772]:
from sklearn import tree

# Principal Component Analysis
pca_r2l_u2r = PCA(n_components=0.95)
X_train_r2l_u2r = pca_r2l_u2r.fit_transform(X_train)
X_test_r2l_u2r = pca_r2l_u2r.transform(X_test)
X_validate_r2l_u2r = pca_r2l_u2r.transform(X_validate)

# Try also Decision Trees
# r2l_u2r_classifier = tree.DecisionTreeClassifier()

# Support Vector Machine for layer l2
r2l_u2r_classifier = SVC(C=0.1, gamma=0.01, kernel='rbf')
r2l_u2r_classifier.fit(X_train_r2l_u2r, y_train)
predicted = r2l_u2r_classifier.predict(X_test_r2l_u2r)

In [773]:
if EXPORT_PCA:
    # save the pca transformed as well as the transformer
    joblib.dump(X_test_r2l_u2r, 'NSL-KDD Encoded Datasets/pca_transformed/pca_test2.pkl')
    joblib.dump(X_train_r2l_u2r, 'NSL-KDD Encoded Datasets/pca_transformed/pca_train2.pkl')
    joblib.dump(X_train_r2l_u2r, 'NSL-KDD Encoded Datasets/pca_transformed/pca_validate2.pkl')
    joblib.dump(pca_r2l_u2r, 'NSL-KDD Encoded Datasets/pca_transformed/layer2_transformer.pkl')

In [774]:
print('Metrics for layer 2:')
print('Confusion matrix: [TP FN / FP TN]\n', confusion_matrix(y_test,predicted))
print('Accuracy = ', accuracy_score(y_test,predicted))
print('F1 Score = ', f1_score(y_test,predicted))
print('Precision = ', precision_score(y_test,predicted))
print('Recall = ', recall_score(y_test,predicted))
print('Matthew corr = ', matthews_corrcoef(y_test,predicted))
print('Shape of the training set: ', X_train_r2l_u2r.shape)

Metrics for layer 2:
Confusion matrix: [TP FN / FP TN]
 [[9079  632]
 [1530 1422]]
Accuracy =  0.8292663665797995
F1 Score =  0.5681182580902916
Precision =  0.6923076923076923
Recall =  0.4817073170731707
Matthew corr =  0.4778587906703902
Shape of the training set:  (1624, 13)


### Export the classifiers

In [775]:
if EXPORT_MODELS:
    with open('Models/NSL_l1_classifier.pkl', "wb") as f:
        pickle.dump(dos_probe_classifier, f)
    with open('Models/NSL_l2_classifier.pkl', "wb") as f:
        pickle.dump(r2l_u2r_classifier, f)

# Testing

In [776]:
df_test1 = copy.deepcopy(df_test_original)
df_test2 = copy.deepcopy(df_test_original)
y_test_real = np.array([0 if x=='normal' else 1 for x in df_test1['label']])
df_test_original

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71,mscan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,333,0,0,0,0,...,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal
22540,0,tcp,http,SF,317,938,0,0,0,0,...,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal
22541,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back
22542,0,udp,domain_u,SF,42,42,0,0,0,0,...,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal


In [777]:
X_test = df_test1[common_features_l1]
X_test

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0,229,0.0,1.0,1.0,0.04,0.06,0.00,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0,136,0.0,1.0,1.0,0.01,0.06,0.00,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,0,1,0.0,0.0,0.0,1.00,0.00,0.00,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0,1,0.0,0.0,0.0,1.00,0.00,1.00,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,0,1,0.0,1.0,0.5,1.00,0.00,0.75,86,0.31,0.17,0.03,0.02,0.00,0.0,0.71,0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,1,1,0.0,0.0,0.0,1.00,0.00,0.00,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22540,1,2,0.0,0.0,0.0,1.00,0.00,0.18,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22541,1,5,0.0,0.0,0.0,1.00,0.00,0.20,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22542,0,4,0.0,0.0,0.0,1.00,0.00,0.33,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [778]:
df_minmax = scaler1.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
label_enc = ohe.transform(df_test1.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)

X_test_layer1 = pca_dos_probe.transform(X_test)
print('Test set shape for layer 1: ', X_test_layer1.shape)

Test set shape for layer 1:  (22544, 28)


In [779]:
X_test = df_test2[common_features_l2] 
X_test

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,10,0,0,0,0.00,255,0,0.00,0.00,0,10
1,1,0,0,0,0.00,255,0,0.00,0.00,0,1
2,1,0,0,0,0.02,134,0,0.61,0.00,0,86
3,65,0,0,0,0.28,3,0,1.00,1.00,0,57
4,8,0,0,0,0.02,29,0,0.03,0.75,0,86
...,...,...,...,...,...,...,...,...,...,...,...
22539,1,0,0,0,0.01,100,1,0.01,0.00,0,141
22540,11,0,0,0,0.01,197,1,0.01,0.18,0,255
22541,10,0,0,2,0.00,255,1,0.00,0.20,0,255
22542,6,0,0,0,0.00,255,0,0.00,0.33,0,252


In [780]:
df_minmax = scaler2.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
label_enc = ohe2.transform(df_test2.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)

X_test_layer2 = pca_r2l_u2r.transform(X_test)
print('Test set shape for layer 2: ', X_test_layer2.shape)
print('Type of X_test_layer1: ', type(X_test_layer1))
print('Type of X_test_layer1: ', type(X_test_layer2))

Test set shape for layer 2:  (22544, 13)
Type of X_test_layer1:  <class 'numpy.ndarray'>
Type of X_test_layer1:  <class 'numpy.ndarray'>


In [781]:
# same classifiers obtained above
classifier1 = dos_probe_classifier
classifier2 = r2l_u2r_classifier

In [782]:
result = []
for i in range(X_test_layer2.shape[0]):
    layer1 = classifier1.predict(X_test_layer1[i].reshape(1, -1))[0]
    if layer1 == 1:
        result.append(layer1)
    else:
        layer2 = classifier2.predict(X_test_layer2[i].reshape(1, -1))[0]
        if layer2 == 1:
            result.append(layer2)
        else:
            result.append(0)
            
result = np.array(result)

In [783]:
# the results may vary
# C=0.1, gamma=0.01
print('Results for the layer 2 (SVM):')
print(confusion_matrix(y_test_real,result))
print('Accuracy = ', accuracy_score(y_test_real,result))
print('F1 Score = ', f1_score(y_test_real,result))
print('Precision = ', precision_score(y_test_real,result))
print('Recall = ', recall_score(y_test_real,result))
print('Matthew corr = ', matthews_corrcoef(y_test_real,result))

Results for the layer 2 (SVM):
[[ 8109  1602]
 [  906 11927]]
Accuracy =  0.88875088715401
F1 Score =  0.904863060465822
Precision =  0.8815877004952325
Recall =  0.9294007636561988
Matthew corr =  0.7727159938785936


### Export the test sets

In [784]:
if EXPORT_DATASETS:
    column_names = [f'PC{i}' for i in range(1, X_test_layer1.shape[1] + 1)]
    X1_test = pd.DataFrame(data=X_test_layer1, columns=column_names)
    X1_test.to_csv('NSL-KDD Encoded Datasets/before_pca/X_test_l1.txt', index=False)
    
    column_names = [f'PC{i}' for i in range(1, X_test_layer2.shape[1] + 1)]
    X2_test = pd.DataFrame(data=X_test_layer2, columns=column_names)
    X2_test.to_csv('NSL-KDD Encoded Datasets/before_pca/X_test_l2.txt', index=False)
    
    np.save('NSL-KDD Encoded Datasets/before_pca/y_test', y_test_real)

### evaluate seen and unseen attack categories

In [785]:
# load testset
df_test = pd.read_csv('NSL-KDD Original Datasets\KDDTest+.txt', sep=",", header=None)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
y_test = df_test['label']
df_test = df_test.drop(['num_outbound_cmds'],axis=1)
df_test_original = df_test

In [786]:
if EXPORT_DATASETS:
    df_test_original.to_csv('NSL-KDD Encoded Datasets/before_pca/KDDTest+', index=False)
    np.save('NSL-KDD Encoded Datasets/before_pca/KDDTest+_targets', y_test)
    
df_test_original

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71,mscan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,333,0,0,0,0,...,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal
22540,0,tcp,http,SF,317,938,0,0,0,0,...,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal
22541,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back
22542,0,udp,domain_u,SF,42,42,0,0,0,0,...,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal


In [787]:
new_attack = []
for i in df_test_original['label'].value_counts().index.tolist()[1:]:
    if i not in df_train_original['label'].value_counts().index.tolist()[1:]:
        new_attack.append(i)
        
new_attack.sort()
new_attack

['apache2',
 'httptunnel',
 'mailbomb',
 'mscan',
 'named',
 'processtable',
 'ps',
 'saint',
 'sendmail',
 'snmpgetattack',
 'snmpguess',
 'sqlattack',
 'udpstorm',
 'worm',
 'xlock',
 'xsnoop',
 'xterm']

In [788]:
index_of_new_attacks = []

for i in range(len(df_test_original)):
    if df_test_original['label'][i] in new_attack:
        index_of_new_attacks.append(df_test_original.index[i])

In [789]:
len(index_of_new_attacks)

3750

In [790]:
new_attack.append('normal')
new_attack

['apache2',
 'httptunnel',
 'mailbomb',
 'mscan',
 'named',
 'processtable',
 'ps',
 'saint',
 'sendmail',
 'snmpgetattack',
 'snmpguess',
 'sqlattack',
 'udpstorm',
 'worm',
 'xlock',
 'xsnoop',
 'xterm',
 'normal']

In [791]:
index_of_old_attacks = []

for i in range(len(df_test_original)):
    if df_test_original['label'][i] not in new_attack:
        index_of_old_attacks.append(df_test_original.index[i])

In [792]:
len(index_of_old_attacks)

9083

In [793]:
print('Number of new attacks in the test set: ', result[index_of_new_attacks].shape[0])
print('Number of new attacks detected by the classifiers: ', result[index_of_new_attacks].sum())
print('Proportion of new attacks detected: ', result[index_of_new_attacks].sum()/result[index_of_new_attacks].shape[0])

Number of new attacks in the test set:  3750
Number of new attacks detected by the classifiers:  3409
Proportion of new attacks detected:  0.9090666666666667


In [794]:
print('Number of old attacks in the test set: ', result[index_of_old_attacks].shape[0])
print('Number of old attacks detected by the classifiers: ', result[index_of_old_attacks].sum())
print('Proportion of old attacks detected: ', result[index_of_old_attacks].sum()/result[index_of_old_attacks].shape[0])

Number of old attacks in the test set:  9083
Number of old attacks detected by the classifiers:  8518
Proportion of old attacks detected:  0.9377958824177034


### Evaluate single attack types

In [795]:
# load test set
df_test = pd.read_csv('NSL-KDD Original Datasets/KDDTest+.txt', sep=",", header=None)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
y_test = df_test['label']
df_test = df_test.drop(['num_outbound_cmds'],axis=1)
df_test_original = df_test
df = df_test_original

dos_index = df.index[(df['label'].isin(dos_attacks))].tolist()
probe_index = df.index[(df['label'].isin(probe_attacks))].tolist()
r2l_index = df.index[(df['label'].isin(r2l_attacks))].tolist()
u2r_index = df.index[(df['label'].isin(u2r_attacks))].tolist()

print('Evaluation split into single attack type:')
print("Number of dos attacks: ", result[dos_index].shape[0])
print("Number of detected attacks: ", result[dos_index].sum())
print("Ratio of detection: ", result[dos_index].sum()/result[dos_index].shape[0])

print("Number of probe attacks: ", result[probe_index].shape[0])
print("Number of detected attacks: ", result[probe_index].sum())
print("Ratio of detection: ", result[probe_index].sum()/result[probe_index].shape[0])

print("Number of r2l attacks: ", result[r2l_index].shape[0])
print("Number of detected attacks: ", result[r2l_index].sum())
print("Ratio of detection: ", result[r2l_index].sum()/result[r2l_index].shape[0])

print("Number of u2r attacks: ", result[u2r_index].shape[0])
print("Number of detected attacks: ", result[u2r_index].sum())
print("Ratio of detection: ", result[u2r_index].sum()/result[u2r_index].shape[0])

Evaluation split into single attack type:
Number of dos attacks:  7460
Number of detected attacks:  6880
Ratio of detection:  0.9222520107238605
Number of probe attacks:  2421
Number of detected attacks:  2212
Ratio of detection:  0.9136720363486163
Number of r2l attacks:  2885
Number of detected attacks:  2768
Ratio of detection:  0.9594454072790295
Number of u2r attacks:  67
Number of detected attacks:  67
Ratio of detection:  1.0


In [796]:
# Export one hot encoders
if EXPORT_ENCODERS:
    joblib.dump(ohe, 'NSL-KDD Files/one_hot_encoders/ohe1.pkl')
    joblib.dump(ohe2, 'NSL-KDD Files/one_hot_encoders/ohe2.pkl')