In [476]:
import numpy as np 
import pandas as pd
import copy

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler as under_sam

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score

# ICFS function
Takes a dataframe as parameter and saves to file all the features necessary to describe DoS+Probe and U2R+R2L

In [477]:
def get_most_correlated_features(x, y, threshold):
    y['target'] = y['target'].astype(int)

    for p in x.columns:
        x[p] = x[p].astype(float)

    # Ensure y is a DataFrame for consistency
    if isinstance(y, pd.Series):
        y = pd.DataFrame(y, columns=['target'])

    # Calculate the Pearson's correlation coefficients between features and the target variable(s)
    corr_matrix = x.corrwith(y['target'])

    # Select features with correlations above the threshold
    selected_features = x.columns[corr_matrix.abs() > threshold].tolist()

    return selected_features

In [478]:
def compute_set_difference(df1, df2):
    # Create a new DataFrame containing the set difference of the two DataFrames.
    df_diff = df1[~df1.index.isin(df2.index)]
    # Return the DataFrame.
    return df_diff

In [479]:
def perform_icfs(x_train):
    # now ICFS only on the numerical features
    num_train = copy.deepcopy(x_train)
    del num_train['protocol_type']
    del num_train['service']
    del num_train['flag']

    target = pd.DataFrame()
    target['target'] = np.array([1 if x != 'normal' else 0 for x in num_train['label']])
    num_train = pd.concat([num_train, target], axis=1)

    # These are how attacks are categorized in the trainset
    dos_list = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop']
    probe_list = ['ipsweep', 'portsweep', 'satan', 'nmap']
    u2r_list = ['loadmodule', 'perl', 'rootkit', 'buffer_overflow']
    r2l_list = ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy', 'warezclient', 'warezmaster']
    normal = ['normal']

    # useful sub-sets
    x_normal = num_train[num_train['label'].isin(normal)]
    x_u2r = num_train[num_train['label'].isin(u2r_list)]
    x_r2l = num_train[num_train['label'].isin(r2l_list)]
    x_dos = num_train[num_train['label'].isin(dos_list)]
    x_probe = num_train[num_train['label'].isin(probe_list)]

    # start the ICFS with l1

    # features for dos
    dos = copy.deepcopy(num_train)
    del dos['target']
    y = np.array([1 if x in dos_list else 0 for x in dos['label']])
    y_dos = pd.DataFrame(y, columns=['target'])
    del dos['label']
    dos_all = get_most_correlated_features(dos, y_dos, 0.1)
    print(dos_all)

    # features for probe
    probe = copy.deepcopy(num_train)
    del probe['target']
    y = np.array([1 if x in probe_list else 0 for x in probe['label']])
    y_probe = pd.DataFrame(y, columns=['target'])
    del probe['label']
    probe_all = get_most_correlated_features(probe, y_probe, 0.1)
    print(probe_all)

    # intersect for the optimal features
    set_dos = set(dos_all)
    set_probe = set(probe_all)

    comm_features_l1 = set_probe & set_dos

    print('common features to train l1: ', comm_features_l1)

    # now l2 needs the features to describe the difference between rare attacks and normal traffic

    # features for u2r
    u2r = pd.concat([x_u2r, x_normal], axis=0)
    del u2r['target']
    y = np.array([1 if x in u2r_list else 0 for x in u2r['label']])
    y_u2r = pd.DataFrame(y, columns=['target'])
    del u2r['label']
    u2r_all = get_most_correlated_features(u2r, y_u2r, 0.01)
    print(u2r_all)

    # features for r2l
    r2l = pd.concat([x_r2l, x_normal], axis=0)
    del r2l['target']
    y = np.array([1 if x in r2l_list else 0 for x in r2l['label']])
    y_r2l = pd.DataFrame(y, columns=['target'])
    del r2l['label']
    r2l_all = get_most_correlated_features(r2l, y_r2l, 0.01)
    print(r2l_all)

    # intersect for the optimal features
    set_r2l = set(r2l_all)
    set_u2r = set(u2r_all)

    comm_features_l2 = set_r2l & set_u2r
    # print('Common features to train l2: ', len(common_features_l2), common_features_l2)

    with open('NSL-KDD Outputs/NSL_features_l1.txt', 'w') as g:
        for a, x in enumerate(comm_features_l1):
            if a < len(comm_features_l1) - 1:
                g.write(x + ',' + '\n')
            else:
                g.write(x)

    # read the common features from file
    with open('NSL-KDD Outputs/NSL_features_l2.txt', 'w') as g:
        for a, x in enumerate(comm_features_l2):
            if a < len(comm_features_l2) - 1:
                g.write(x + ',' + '\n')
            else:
                g.write(x)

# 'main function'

In [480]:
# loading the train set
df_train = pd.read_csv('NSL-KDD_datasets/KDDTrain+.txt', sep=",", header=None)
df_train = df_train[df_train.columns[:-1]]  # tags column
titles = pd.read_csv('NSL-KDD_datasets/Field Names.csv', header=None)
label = pd.Series(['label'], index=[41])
titles = pd.concat([titles[0], label])
df_train.columns = titles.to_list()
df_train = df_train.drop(['num_outbound_cmds'],axis=1)
df_train_original = df_train
df_train_original

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,...,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune
125969,8,udp,private,SF,105,145,0,0,0,0,...,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,...,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal
125971,0,tcp,klogin,S0,0,0,0,0,0,0,...,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune


In [481]:
# load test set
df_test = pd.read_csv('NSL-KDD_datasets/KDDTest+.csv', sep=",", header=None)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
df_test = df_test.drop(['num_outbound_cmds'],axis=1)
df_test_original = df_test
df_test_original

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71,mscan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,0,tcp,smtp,SF,794,333,0,0,0,0,...,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal
22539,0,tcp,http,SF,317,938,0,0,0,0,...,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal
22540,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back
22541,0,udp,domain_u,SF,42,42,0,0,0,0,...,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal


## finished loading dataset

In [482]:
# It is possible to compute the ICFS again

# perform_icfs(df_test_original)

## dos + probe classifier

In [483]:
# list of attacks in train and only in test
dos_probe_list = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'worm', 'ipsweep', 'nmap', 'portsweep', 'satan']
dos_probe_test = ['apache2', 'mailbomb', 'processtable', 'udpstorm', 'mscan', 'saint']
u2r_r2l_list = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
                'snmpguess', 'spy', 'warezclient', 'buffer_overflow', 'loadmodule', 'rootkit', 'perl']
u2r_r2l_test = ['httptunnel', 'named', 'sendmail', 'snmpgetattack', 'xlock', 'xsnoop', 'ps', 'xterm', 'sqlattack']
normal_list = ['normal']
categorical_features = ['protocol_type', 'service', 'flag']

# load the features obtained with ICFS
with open('NSL-KDD Outputs/NSL_features_l1.txt', 'r') as f:
    common_features_l1 = f.read().split(',')

with open('NSL-KDD Outputs/NSL_features_l2.txt', 'r') as f:
    common_features_l2 = f.read().split(',')

In [484]:
df_train = copy.deepcopy(df_train_original)
df_test = copy.deepcopy(df_test_original)

In [485]:
y_train = np.array([1 if x in dos_probe_list else 0 for x in df_train['label']])

df_train = df_train.drop(['label'],axis=1)
df_train = df_train.reset_index().drop(['index'], axis=1)
df_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00
1,0,udp,other,SF,146,0,0,0,0,0,...,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0,tcp,private,S0,0,0,0,0,0,0,...,255,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,0,tcp,http,SF,232,8153,0,0,0,0,...,30,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,...,255,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00
125969,8,udp,private,SF,105,145,0,0,0,0,...,255,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,...,255,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00
125971,0,tcp,klogin,S0,0,0,0,0,0,0,...,255,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00


In [486]:
X_train = df_train[common_features_l1] 

X_train

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0,2,0.0,0.0,0.0,1.00,0.00,0.00,25,0.17,0.03,0.17,0.00,0.00,0.00,0.00,0.05
1,0,13,0.0,0.0,0.0,0.08,0.15,0.00,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0,123,1.0,0.0,0.0,0.05,0.07,0.00,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,1,5,0.2,0.0,0.0,1.00,0.00,0.00,255,1.00,0.00,0.03,0.04,0.03,0.01,0.01,0.00
4,1,30,0.0,0.0,0.0,1.00,0.00,0.09,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,184,1.0,0.0,0.0,0.14,0.06,0.00,25,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00
125969,0,2,0.0,0.0,0.0,1.00,0.00,0.00,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
125970,1,1,0.0,0.0,0.0,1.00,0.00,0.00,30,0.12,0.06,0.00,0.00,0.72,0.00,0.00,0.01
125971,0,144,1.0,0.0,0.0,0.06,0.05,0.00,8,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00


Define tools for scaling and encoding

In [487]:
# 2 one hot encoder, one for the features of layer1 and one for the features of layer2
ohe = OneHotEncoder(handle_unknown='ignore')
ohe2 = OneHotEncoder(handle_unknown='ignore')
scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()

In [488]:
df_minmax = scaler1.fit_transform(X_train)
X_train = pd.DataFrame(df_minmax, columns=X_train.columns)
X_train

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.00,0.098039,0.17,0.03,0.17,0.00,0.00,0.00,0.00,0.05
1,0.0,0.025440,0.0,0.0,0.0,0.08,0.15,0.00,0.003922,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0.0,0.240705,1.0,0.0,0.0,0.05,0.07,0.00,0.101961,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00
3,1.0,0.009785,0.2,0.0,0.0,1.00,0.00,0.00,1.000000,1.00,0.00,0.03,0.04,0.03,0.01,0.01,0.00
4,1.0,0.058708,0.0,0.0,0.0,1.00,0.00,0.09,1.000000,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,0.360078,1.0,0.0,0.0,0.14,0.06,0.00,0.098039,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00
125969,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.00,0.956863,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
125970,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.117647,0.12,0.06,0.00,0.00,0.72,0.00,0.00,0.01
125971,0.0,0.281800,1.0,0.0,0.0,0.06,0.05,0.00,0.031373,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00


In [489]:
# perform One-hot encoding
label_enc = ohe.fit_transform(df_train.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(['protocol_type', 'service', 'flag'])
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_train = pd.concat([X_train, df_enc], axis=1)
X_train

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.00,0.098039,0.17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.025440,0.0,0.0,0.0,0.08,0.15,0.00,0.003922,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.240705,1.0,0.0,0.0,0.05,0.07,0.00,0.101961,0.10,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.009785,0.2,0.0,0.0,1.00,0.00,0.00,1.000000,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.058708,0.0,0.0,0.0,1.00,0.00,0.09,1.000000,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,0.360078,1.0,0.0,0.0,0.14,0.06,0.00,0.098039,0.10,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
125969,0.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.00,0.956863,0.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
125970,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.117647,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
125971,0.0,0.281800,1.0,0.0,0.0,0.06,0.05,0.00,0.031373,0.03,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [490]:
# do the same for testset
y_test = np.array([1 if x in dos_probe_list else 0 for x in df_test['label']])

df_test = df_test.drop(['label'],axis=1)
df_test = df_test.reset_index().drop(['index'], axis=1)
df_test

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,private,REJ,0,0,0,0,0,0,...,255,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0,tcp,private,REJ,0,0,0,0,0,0,...,255,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,134,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,3,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,29,86,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,0,tcp,smtp,SF,794,333,0,0,0,0,...,100,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22539,0,tcp,http,SF,317,938,0,0,0,0,...,197,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22540,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22541,0,udp,domain_u,SF,42,42,0,0,0,0,...,255,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [491]:
X_test = df_test[common_features_l1]

X_test

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0,229,0.0,1.0,1.0,0.04,0.06,0.00,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0,136,0.0,1.0,1.0,0.01,0.06,0.00,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,0,1,0.0,0.0,0.0,1.00,0.00,0.00,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0,1,0.0,0.0,0.0,1.00,0.00,1.00,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,0,1,0.0,1.0,0.5,1.00,0.00,0.75,86,0.31,0.17,0.03,0.02,0.00,0.0,0.71,0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,1,1,0.0,0.0,0.0,1.00,0.00,0.00,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22539,1,2,0.0,0.0,0.0,1.00,0.00,0.18,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22540,1,5,0.0,0.0,0.0,1.00,0.00,0.20,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22541,0,4,0.0,0.0,0.0,1.00,0.00,0.33,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [492]:
df_minmax = scaler1.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
X_test

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_srv_rerror_rate,dst_host_rerror_rate
0,0.0,0.448141,0.0,1.0,1.0,0.04,0.06,0.00,0.039216,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0.0,0.266145,0.0,1.0,1.0,0.01,0.06,0.00,0.003922,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.337255,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.223529,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,0.0,0.001957,0.0,1.0,0.5,1.00,0.00,0.75,0.337255,0.31,0.17,0.03,0.02,0.00,0.0,0.71,0.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.552941,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22539,1.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.18,1.000000,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22540,1.0,0.009785,0.0,0.0,0.0,1.00,0.00,0.20,1.000000,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22541,0.0,0.007828,0.0,0.0,0.0,1.00,0.00,0.33,0.988235,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [493]:
label_enc = ohe.transform(df_test.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(['protocol_type', 'service', 'flag'])
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)
X_test

Unnamed: 0,logged_in,count,serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_srv_count,dst_host_same_srv_rate,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.448141,0.0,1.0,1.0,0.04,0.06,0.00,0.039216,0.04,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.266145,0.0,1.0,1.0,0.01,0.06,0.00,0.003922,0.00,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.337255,0.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.001957,0.0,0.0,0.0,1.00,0.00,1.00,0.223529,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.001957,0.0,1.0,0.5,1.00,0.00,0.75,0.337255,0.31,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,1.0,0.001957,0.0,0.0,0.0,1.00,0.00,0.00,0.552941,0.72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22539,1.0,0.003914,0.0,0.0,0.0,1.00,0.00,0.18,1.000000,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22540,1.0,0.009785,0.0,0.0,0.0,1.00,0.00,0.20,1.000000,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22541,0.0,0.007828,0.0,0.0,0.0,1.00,0.00,0.33,0.988235,0.99,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [494]:
print('Shape of the whole train set: ', X_train.shape)
print('Shape of its targets: ', y_train.shape)
print('Shape of the whole test set: ', X_test.shape)
print('Shape of its targets: ', y_test.shape)

Shape of the whole train set:  (125973, 101)
Shape of its targets:  (125973,)
Shape of the whole test set:  (22543, 101)
Shape of its targets:  (22543,)


Principal Component Analysis

In [495]:
pca_dos_probe = PCA(n_components=0.95)
X_train_dos_probe = pca_dos_probe.fit_transform(X_train)
X_test_dos_probe = pca_dos_probe.transform(X_test)

X_test_dos_probe

array([[ 0.83410819,  2.2059958 , -0.55079972, ..., -0.05333572,
         0.04164095, -0.01025754],
       [ 0.84699705,  2.21066552, -0.55583237, ..., -0.03596376,
         0.0526911 , -0.018039  ],
       [-0.48456228,  0.10601136,  0.42527954, ...,  0.06688046,
         0.13865028, -0.09777938],
       ...,
       [-1.27085174, -0.37990559, -0.58694472, ...,  0.00409918,
        -0.00656146,  0.00504324],
       [-0.9296366 ,  0.02170432,  1.29802046, ...,  0.03326369,
         0.02234817, -0.01488583],
       [ 0.51291238,  1.95147839, -0.46075733, ...,  0.11666366,
        -0.14854388,  0.03021052]])

Building the classifier for the layer1

In [496]:
dos_probe_classifier = GaussianNB()
dos_probe_classifier.fit(X_train_dos_probe, y_train)
predicted = dos_probe_classifier.predict(X_test_dos_probe)

In [497]:
print('Metrics for layer 1:')
print('Confusion matrix: [TP FP / FN TN]\n', confusion_matrix(y_test,predicted))
print('Accuracy = ', accuracy_score(y_test,predicted))
print('F1 Score = ', f1_score(y_test,predicted))
print('Precision = ', precision_score(y_test,predicted))
print('Recall = ', recall_score(y_test,predicted))
print('Shape of the train set for l1: ', X_train_dos_probe.shape)

Metrics for layer 1:
Confusion matrix: [TP FP / FN TN]
 [[9914 5780]
 [ 558 6291]]
Accuracy =  0.7188484230137958
F1 Score =  0.6650105708245244
Precision =  0.5211664319443294
Recall =  0.9185282522996058
Shape of the train set for l1:  (125973, 28)


## r2l + u2r classifier

In [498]:
df_train = copy.deepcopy(df_train_original)
df_test = copy.deepcopy(df_test_original)

# load targeted attacks (Normal + r2l + u2r)
df_train = df_train[df_train['label'].isin(normal_list + u2r_r2l_list + u2r_r2l_test)]

y_train = np.array([0 if x=='normal' else 1 for x in df_train['label']])
df_train = df_train.drop(['label'],axis=1)
df_train = df_train.reset_index().drop(['index'], axis=1)
df_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,150,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00
1,0,udp,other,SF,146,0,0,0,0,0,...,255,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00
2,0,tcp,http,SF,232,8153,0,0,0,0,...,30,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01
3,0,tcp,http,SF,199,420,0,0,0,0,...,255,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,0,tcp,http,SF,287,2251,0,0,0,0,...,8,219,1.00,0.00,0.12,0.03,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68385,0,tcp,smtp,SF,2233,365,0,0,0,0,...,1,2,1.00,0.00,1.00,1.00,0.00,0.00,0.00,0.00
68386,0,tcp,http,SF,359,375,0,0,0,0,...,3,255,1.00,0.00,0.33,0.04,0.33,0.00,0.00,0.00
68387,8,udp,private,SF,105,145,0,0,0,0,...,255,244,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00
68388,0,tcp,smtp,SF,2231,384,0,0,0,0,...,255,30,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00


In [499]:
X_train = df_train[common_features_l2] 

X_train

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,2,0,0,0,0.00,150,0,0.17,0.00,0,25
1,1,0,0,0,0.00,255,0,0.88,0.00,0,1
2,5,0,0,0,0.04,30,1,0.03,0.00,0,255
3,32,0,0,0,0.00,255,1,0.00,0.09,0,255
4,7,0,0,0,0.03,8,1,0.12,0.43,0,219
...,...,...,...,...,...,...,...,...,...,...,...
68385,1,0,0,0,1.00,1,1,1.00,0.00,0,2
68386,11,0,0,0,0.04,3,1,0.33,0.18,0,255
68387,2,0,0,0,0.00,255,0,0.01,0.00,0,244
68388,1,0,0,0,0.00,255,1,0.00,0.00,0,30


In [500]:
df_minmax = scaler2.fit_transform(X_train)
X_train = pd.DataFrame(df_minmax, columns=X_train.columns)
X_train

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,0.003914,0.0,0.0,0.0,0.00,0.588235,0.0,0.17,0.00,0.0,0.098039
1,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.88,0.00,0.0,0.003922
2,0.009785,0.0,0.0,0.0,0.04,0.117647,1.0,0.03,0.00,0.0,1.000000
3,0.062622,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.09,0.0,1.000000
4,0.013699,0.0,0.0,0.0,0.03,0.031373,1.0,0.12,0.43,0.0,0.858824
...,...,...,...,...,...,...,...,...,...,...,...
68385,0.001957,0.0,0.0,0.0,1.00,0.003922,1.0,1.00,0.00,0.0,0.007843
68386,0.021526,0.0,0.0,0.0,0.04,0.011765,1.0,0.33,0.18,0.0,1.000000
68387,0.003914,0.0,0.0,0.0,0.00,1.000000,0.0,0.01,0.00,0.0,0.956863
68388,0.001957,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.00,0.0,0.117647


In [501]:
# perform One-hot encoding
label_enc = ohe2.fit_transform(df_train.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_train = pd.concat([X_train, df_enc], axis=1)
X_train

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,...,flag_OTH,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.003914,0.0,0.0,0.0,0.00,0.588235,0.0,0.17,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.88,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.009785,0.0,0.0,0.0,0.04,0.117647,1.0,0.03,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.062622,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.09,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.013699,0.0,0.0,0.0,0.03,0.031373,1.0,0.12,0.43,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68385,0.001957,0.0,0.0,0.0,1.00,0.003922,1.0,1.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
68386,0.021526,0.0,0.0,0.0,0.04,0.011765,1.0,0.33,0.18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
68387,0.003914,0.0,0.0,0.0,0.00,1.000000,0.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
68388,0.001957,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [502]:
# do the same for test set
df_test = df_test[df_test['label'].isin(normal_list + u2r_r2l_list + u2r_r2l_test)]

y_test = np.array([0 if x=='normal' else 1 for x in df_test['label']])
df_test = df_test.drop(['label'],axis=1)
df_test = df_test.reset_index().drop(['index'], axis=1)
df_test

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,134,86,0.61,0.04,0.61,0.02,0.00,0.00,0.00,0.00
1,0,tcp,http,SF,267,14515,0,0,0,0,...,155,255,1.00,0.00,0.01,0.03,0.01,0.00,0.00,0.00
2,0,tcp,smtp,SF,1022,387,0,0,0,0,...,255,28,0.11,0.72,0.00,0.00,0.00,0.00,0.72,0.04
3,0,tcp,telnet,SF,129,174,0,0,0,0,...,255,255,1.00,0.00,0.00,0.00,0.01,0.01,0.02,0.02
4,0,tcp,http,SF,327,467,0,0,0,0,...,151,255,1.00,0.00,0.01,0.03,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12657,0,tcp,http,SF,274,1623,0,0,0,0,...,92,255,1.00,0.00,0.01,0.04,0.00,0.00,0.00,0.00
12658,0,tcp,http,SF,280,6087,0,0,0,0,...,5,255,1.00,0.00,0.20,0.04,0.00,0.00,0.00,0.00
12659,0,tcp,smtp,SF,794,333,0,0,0,0,...,100,141,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00
12660,0,tcp,http,SF,317,938,0,0,0,0,...,197,255,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00


In [503]:
X_test = df_test[common_features_l2] 

X_test

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,1,0,0,0,0.02,134,0,0.61,0.00,0,86
1,4,0,0,0,0.03,155,1,0.01,0.00,0,255
2,3,0,0,0,0.00,255,1,0.00,1.00,0,28
3,1,0,0,0,0.00,255,0,0.00,0.00,0,255
4,47,0,0,0,0.03,151,1,0.01,0.04,0,255
...,...,...,...,...,...,...,...,...,...,...,...
12657,1,0,0,0,0.04,92,1,0.01,0.00,0,255
12658,3,0,0,0,0.04,5,1,0.20,0.00,0,255
12659,1,0,0,0,0.01,100,1,0.01,0.00,0,141
12660,11,0,0,0,0.01,197,1,0.01,0.18,0,255


In [504]:
df_minmax = scaler2.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
X_test

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,dst_host_srv_count
0,0.001957,0.0,0.0,0.0,0.02,0.525490,0.0,0.61,0.00,0.0,0.337255
1,0.007828,0.0,0.0,0.0,0.03,0.607843,1.0,0.01,0.00,0.0,1.000000
2,0.005871,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,1.00,0.0,0.109804
3,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,1.000000
4,0.091977,0.0,0.0,0.0,0.03,0.592157,1.0,0.01,0.04,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
12657,0.001957,0.0,0.0,0.0,0.04,0.360784,1.0,0.01,0.00,0.0,1.000000
12658,0.005871,0.0,0.0,0.0,0.04,0.019608,1.0,0.20,0.00,0.0,1.000000
12659,0.001957,0.0,0.0,0.0,0.01,0.392157,1.0,0.01,0.00,0.0,0.552941
12660,0.021526,0.0,0.0,0.0,0.01,0.772549,1.0,0.01,0.18,0.0,1.000000


In [505]:
label_enc = ohe2.transform(df_test.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)
X_test

Unnamed: 0,srv_count,urgent,root_shell,hot,dst_host_srv_diff_host_rate,dst_host_count,logged_in,dst_host_same_src_port_rate,srv_diff_host_rate,num_shells,...,flag_OTH,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.001957,0.0,0.0,0.0,0.02,0.525490,0.0,0.61,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.007828,0.0,0.0,0.0,0.03,0.607843,1.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.005871,0.0,0.0,0.0,0.00,1.000000,1.0,0.00,1.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.001957,0.0,0.0,0.0,0.00,1.000000,0.0,0.00,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.091977,0.0,0.0,0.0,0.03,0.592157,1.0,0.01,0.04,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12657,0.001957,0.0,0.0,0.0,0.04,0.360784,1.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12658,0.005871,0.0,0.0,0.0,0.04,0.019608,1.0,0.20,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12659,0.001957,0.0,0.0,0.0,0.01,0.392157,1.0,0.01,0.00,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12660,0.021526,0.0,0.0,0.0,0.01,0.772549,1.0,0.01,0.18,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [506]:
print('Shape of the train set: ', X_train.shape)
print('Shape of its target: ', y_train.shape)
print('Shape of the test set: ', X_test.shape)
print('Shape of its target: ', y_test.shape)

Shape of the train set:  (68390, 51)
Shape of its target:  (68390,)
Shape of the test set:  (12662, 51)
Shape of its target:  (12662,)


In [507]:
# Under sampling
sm = under_sam(sampling_strategy=1)
X_train, y_train = sm.fit_resample(X_train,y_train)

# Principal Component Analysis
pca_r2l_u2r = PCA(n_components=0.95)
X_train_r2l_u2r = pca_r2l_u2r.fit_transform(X_train)
X_test_r2l_u2r = pca_r2l_u2r.transform(X_test)

# Support Vector Machine for layer l2
r2l_u2r_classifier = SVC(C=0.1, gamma=0.01, kernel='rbf')
r2l_u2r_classifier.fit(X_train_r2l_u2r, y_train)
predicted = r2l_u2r_classifier.predict(X_test_r2l_u2r)

X_test_r2l_u2r

array([[ 0.55577522,  0.47412283,  0.23420784, ..., -0.04849503,
        -0.45364394,  0.03274658],
       [-0.77800619, -0.65535163,  0.28122548, ..., -0.03093364,
        -0.01077156, -0.01079978],
       [-0.34280687,  0.05276766, -0.49241585, ...,  0.08883103,
         0.15092085, -0.15896617],
       ...,
       [-0.2342936 , -0.11056261, -0.19259369, ..., -0.00324799,
        -0.07764877, -0.00196972],
       [-0.84970721, -0.63065645,  0.23408752, ..., -0.02856895,
        -0.00448939, -0.02101307],
       [-1.0554249 ,  1.49542775,  0.61299925, ...,  0.03134584,
         0.02014733, -0.00195427]])

In [508]:
print('Metrics for layer 2:')
print('Confusion matrix: [TP FP / FN TN]\n', confusion_matrix(y_test,predicted))
print('Accuracy = ', accuracy_score(y_test,predicted))
print('F1 Score = ', f1_score(y_test,predicted))
print('Precision = ', precision_score(y_test,predicted))
print('Recall = ', recall_score(y_test,predicted))
print('Matthew corr = ', matthews_corrcoef(y_test,predicted))
print('Shape of the training set: ', X_train_r2l_u2r.shape)

Metrics for layer 2:
Confusion matrix: [TP FP / FN TN]
 [[9056  654]
 [1455 1497]]
Accuracy =  0.8334386352866846
F1 Score =  0.586713697824809
Precision =  0.6959553695955369
Recall =  0.5071138211382114
Matthew corr =  0.4951557904452991
Shape of the training set:  (2094, 13)


## Testing

In [509]:
df_test1 = copy.deepcopy(df_test_original)
df_test2 = copy.deepcopy(df_test_original)
y_test_real = np.array([0 if x=='normal' else 1 for x in df_test1['label']])

In [510]:
X_test = df_test1[common_features_l1]

df_minmax = scaler1.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
label_enc = ohe.transform(df_test1.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(['protocol_type', 'service', 'flag'])
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)

X_test_layer1 = pca_dos_probe.transform(X_test)
print('Test set shape for layer 1: ', X_test_layer1.shape)

Test set shape for layer 1:  (22543, 28)


In [511]:
X_test = df_test2[common_features_l2] 

df_minmax = scaler2.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
label_enc = ohe2.transform(df_test2.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe2.get_feature_names_out(['protocol_type', 'service', 'flag'])
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)

X_test_layer2 = pca_r2l_u2r.transform(X_test)
print('Test set shape for layer 2: ', X_test_layer2.shape)

Test set shape for layer 2:  (22543, 13)


In [512]:
# same classifiers obtained above
classifier1 = dos_probe_classifier
classifier2 = r2l_u2r_classifier

In [513]:
result = []
# 22
for i in range(22543):
    layer1 = classifier1.predict(X_test_layer1[i].reshape(1, -1))[0]
    if layer1 == 1:
        result.append(layer1)
    else:
        layer2 = classifier2.predict(X_test_layer2[i].reshape(1, -1))[0]
        if layer2 == 1:
            result.append(layer2)
        else:
            result.append(0)
            
result = np.array(result)

In [514]:
# the results may vary
# C=0.1, gamma=0.01
print(confusion_matrix(y_test_real,result))
print('Accuracy = ', accuracy_score(y_test_real,result))
print('F1 Score = ', f1_score(y_test_real,result))
print('Precision = ', precision_score(y_test_real,result))
print('Recall = ', recall_score(y_test_real,result))
print('Matthew corr = ', matthews_corrcoef(y_test_real,result))

[[ 8138  1572]
 [  901 11932]]
Accuracy =  0.8902985405669166
F1 Score =  0.9061016820442722
Precision =  0.8835900473933649
Recall =  0.9297903841658225
Matthew corr =  0.7758618318844038


### evaluate seen and unseen attack categories

In [515]:
# load testset
df_test = pd.read_csv('NSL-KDD_datasets\KDDTest+.txt', sep=",", header=None)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
y_test = df_test['label']
df_test = df_test.drop(['num_outbound_cmds'],axis=1)
df_test_original = df_test

In [516]:
new_attack = []
for i in df_test_original['label'].value_counts().index.tolist()[1:]:
    if i not in df_train_original['label'].value_counts().index.tolist()[1:]:
        new_attack.append(i)
        
new_attack.sort()
new_attack

['apache2',
 'httptunnel',
 'mailbomb',
 'mscan',
 'named',
 'processtable',
 'ps',
 'saint',
 'sendmail',
 'snmpgetattack',
 'snmpguess',
 'sqlattack',
 'udpstorm',
 'worm',
 'xlock',
 'xsnoop',
 'xterm']

In [517]:
index_of_new_attacks = []

for i in range(len(df_test_original)):
    if df_test_original['label'][i] in new_attack:
        index_of_new_attacks.append(df_test_original.index[i])

In [518]:
len(index_of_new_attacks)

3750

In [519]:
new_attack.append('normal')
new_attack

['apache2',
 'httptunnel',
 'mailbomb',
 'mscan',
 'named',
 'processtable',
 'ps',
 'saint',
 'sendmail',
 'snmpgetattack',
 'snmpguess',
 'sqlattack',
 'udpstorm',
 'worm',
 'xlock',
 'xsnoop',
 'xterm',
 'normal']

In [520]:
index_of_old_attacks = []

for i in range(len(df_test_original)):
    if df_test_original['label'][i] not in new_attack:
        index_of_old_attacks.append(df_test_original.index[i])

In [521]:
len(index_of_old_attacks)

9083

In [522]:
print(result[index_of_new_attacks].shape[0])
print(result[index_of_new_attacks].sum())
print(result[index_of_new_attacks].sum()/result[index_of_new_attacks].shape[0])

IndexError: index 22543 is out of bounds for axis 0 with size 22543

In [None]:
print(result[index_of_old_attacks].shape[0])
print(result[index_of_old_attacks].sum())
print(result[index_of_old_attacks].sum()/result[index_of_old_attacks].shape[0])

### evaluate each attack type

In [None]:
# load testset
df_test = pd.read_csv('../data/KDDTest+.txt', sep=",", header=None)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
y_test = df_test['label']
df_test = df_test.drop(['num_outbound_cmds'],axis=1)
df_test_original = df_test
df = df_test_original

dos_index = df.index[(df['label'] == 'apache2') | (df['label']=='back') 
                     | (df['label'] == 'land') | (df['label'] == 'mailbomb') 
                     | (df['label'] == 'neptune') | (df['label'] == 'pod') 
                     | (df['label'] == 'processtable') | (df['label'] == 'smurf') 
                     | (df['label'] == 'teardrop') | (df['label'] == 'udpstorm') 
                     | (df['label'] == 'worm')].tolist()

probe_index = df.index[(df['label'] == 'ipsweep') | (df['label']=='mscan') 
                       | (df['label']=='nmap') | (df['label'] == 'portsweep') 
                       | (df['label'] == 'saint') | (df['label'] == 'satan')].tolist()

r2l_index = df.index[(df['label'] == 'ftp_write') | (df['label']=='guess_passwd') 
                     | (df['label'] == 'httptunnel') | (df['label'] == 'imap') 
                     | (df['label'] == 'multihop') | (df['label'] == 'named') 
                     | (df['label'] == 'phf') | (df['label'] == 'sendmail') 
                     | (df['label'] == 'snmpgetattack') | (df['label'] == 'snmpguess') 
                     | (df['label'] == 'warezmaster') | (df['label'] == 'xlock') 
                     | (df['label'] == 'xsnoop')].tolist()

u2r_index = df.index[(df['label'] == 'buffer_overflow') | (df['label']=='loadmodule') 
                       | (df['label']=='perl') | (df['label'] == 'ps') 
                       | (df['label'] == 'rootkit') | (df['label'] == 'sqlattack')
                       | (df['label'] == 'xterm')].tolist()

print("full sample:", result[dos_index].shape[0])
print("detected sample:", result[dos_index].sum())
print("percent", result[dos_index].sum()/result[dos_index].shape[0])

print(result[probe_index].shape[0])
print(result[probe_index].sum())
print(result[probe_index].sum()/result[probe_index].shape[0])

print(result[r2l_index].shape[0])
print(result[r2l_index].sum())
print(result[r2l_index].sum()/result[r2l_index].shape[0])

print(result[u2r_index].shape[0])
print(result[u2r_index].sum())
print(result[u2r_index].sum()/result[u2r_index].shape[0])