In [1]:
import numpy as np 
import pandas as pd
import copy
import pickle
import joblib

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from imblearn.under_sampling import RandomUnderSampler as under_sam

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

### ICFS function
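Takes the training dataframe as a parameter and writes to file the feature lists selected for the two detection layers: the features shared by DoS and Probe (layer 1) and the features shared by U2R and R2L (layer 2).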

In [2]:
def pearson_correlated_features(x, y, threshold):
    """Select the columns of ``x`` whose Pearson correlation with the target exceeds ``threshold``.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; every column must be convertible to float.
    y : pandas.DataFrame, pandas.Series or array-like
        Target values. A DataFrame must contain a ``'target'`` column; a
        Series or array-like is used as the target directly.
    threshold : float
        A feature is kept when ``abs(correlation) > threshold``.

    Returns
    -------
    list
        Names of the selected columns, in the column order of ``x``.

    Raises
    ------
    ValueError
        If ``y`` does not hold exactly one value per row of ``x``.

    Notes
    -----
    Rows of ``x`` and ``y`` are paired by position. ``corrwith`` itself aligns
    on index labels, which silently pairs the wrong rows (or none at all) when
    ``x`` is a filtered subset and ``y`` carries a fresh 0..n-1 index.
    Neither ``x`` nor ``y`` is modified.
    """
    target_values = y['target'] if isinstance(y, pd.DataFrame) else y

    # Put the target on x's index so that corrwith's label alignment becomes
    # a plain positional pairing.
    target = pd.Series(np.asarray(target_values).ravel(), index=x.index).astype(int)

    # astype returns a converted copy, leaving the caller's frame untouched.
    features = x.astype(float)

    correlations = features.corrwith(target)

    # NaN correlations (e.g. constant columns) compare False and are dropped.
    keep = (correlations.abs() > threshold).to_numpy()

    return features.columns[keep].tolist()

In [3]:
def compute_set_difference(df1, df2):
    """Return the rows of ``df1`` whose index labels do not occur in the index of ``df2``."""
    also_in_df2 = df1.index.isin(df2.index)
    return df1[~also_in_df2]

In [4]:
def _icfs_target(labels, positive_labels):
    """Return a one-column ``'target'`` frame: 1 where the label is in ``positive_labels``, else 0."""
    positives = set(positive_labels)
    return pd.DataFrame({'target': [1 if name in positives else 0 for name in labels]})


def _icfs_select(features, labels, attack_list, threshold, keep_labels=None):
    """Run the Pearson selection for one attack family, optionally on a label-filtered subset."""
    if keep_labels is not None:
        row_mask = labels.isin(keep_labels).to_numpy()
        features, labels = features[row_mask], labels[row_mask]
    # Give the subset a fresh 0..n-1 index so it lines up with the freshly
    # built target frame, whether rows are paired by label or by position.
    subset = features.reset_index(drop=True)
    return pearson_correlated_features(subset, _icfs_target(labels, attack_list), threshold)


def _icfs_common(first, second):
    """Return the items of ``first`` that also occur in ``second``, in the order of ``first``."""
    allowed = set(second)
    return [name for name in first if name in allowed]


def _icfs_write(path, features):
    """Write the feature names to ``path`` as a single comma-separated line."""
    with open(path, 'w') as handle:
        handle.write(','.join(features))


def perform_icfs(x_train, output_dir='NSL-KDD Files'):
    """Compute the ICFS feature lists for both detection layers and write them to disk.

    Layer 1 keeps the numerical features correlated (|r| > 0.1) with BOTH the
    DoS and the Probe indicator over the whole train set. Layer 2 keeps the
    features correlated (|r| > 0.01) with BOTH the U2R and the R2L indicator,
    each computed only on that attack family plus normal traffic.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Train set containing a ``'label'`` column, the categorical columns
        ``'protocol_type'``, ``'service'``, ``'flag'`` and numerical features.
        It is not modified.
    output_dir : str, optional
        Directory that receives ``NSL_features_l1.txt`` and
        ``NSL_features_l2.txt``. The directory must already exist.

    Notes
    -----
    * Each file is one comma-separated line, so ``f.read().split(',')``
      returns clean column names.
    * Feature order follows the column order of ``x_train``, which makes the
      output deterministic (set iteration order is not).
    * NOTE(review): the feature-loading cell of this notebook reads from
      'KB Process/Required Files/'; pass that directory as ``output_dir`` (or
      copy the files) if the new lists should be picked up. Confirm the
      intended workflow.
    """
    # ICFS only looks at the numerical features.
    numeric = x_train.drop(columns=['protocol_type', 'service', 'flag'])
    labels = numeric['label']
    features = numeric.drop(columns=['label'])

    # These are how attacks are categorized in the trainset
    dos_list = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop']
    probe_list = ['ipsweep', 'portsweep', 'satan', 'nmap']
    u2r_list = ['loadmodule', 'perl', 'rootkit', 'buffer_overflow']
    r2l_list = ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy', 'warezclient', 'warezmaster']
    normal = ['normal']

    # Layer 1: DoS and Probe are each contrasted with every other record.
    dos_all = _icfs_select(features, labels, dos_list, 0.1)
    print(dos_all)

    probe_all = _icfs_select(features, labels, probe_list, 0.1)
    print(probe_all)

    comm_features_l1 = _icfs_common(dos_all, probe_all)
    print('common features to train l1: ', comm_features_l1)

    # Layer 2: the rare attacks are contrasted with normal traffic only.
    u2r_all = _icfs_select(features, labels, u2r_list, 0.01, keep_labels=u2r_list + normal)
    print(u2r_all)

    r2l_all = _icfs_select(features, labels, r2l_list, 0.01, keep_labels=r2l_list + normal)
    print(r2l_all)

    comm_features_l2 = _icfs_common(r2l_all, u2r_all)

    # Persist both feature lists.
    _icfs_write(f'{output_dir}/NSL_features_l1.txt', comm_features_l1)
    _icfs_write(f'{output_dir}/NSL_features_l2.txt', comm_features_l2)

# Main implementation

In [5]:
# Read the 20% subset of the NSL-KDD train set (the file has no header row)
df_train20 = pd.read_csv('KB Process/NSL-KDD Original Datasets/KDDTrain+_20Percent.txt', sep=",", header=None)
# Drop the last column of the file (tags column)
df_train20 = df_train20.iloc[:, :-1]

# Column names: first column of the field-names file followed by 'label'
titles = pd.read_csv('KB Process/NSL-KDD Original Datasets/Field Names.csv', header=None)
label = pd.Series(['label'], index=[41])
titles = pd.concat([titles[0], label])
df_train20.columns = titles.to_list()

df_train20 = df_train20.drop(columns=['num_outbound_cmds'])
df_train_original = df_train20

# df_train_original.to_csv('KB Process/NSL-KDD Original Datasets/KDDTrain+20_percent_with_labels.txt', index=False)

df_train_original

FileNotFoundError: [Errno 2] No such file or directory: 'KB Process/NSL-KDD Original Datasets/KDDTrain+_20Percent.txt'

In [None]:
# Read the full NSL-KDD train set (the file has no header row)
df_train = pd.read_csv('KB Process/NSL-KDD Original Datasets/KDDTrain+.txt', sep=",", header=None)
# Drop the last column of the file (tags column)
df_train = df_train.iloc[:, :-1]

# Column names: first column of the field-names file followed by 'label'
titles = pd.read_csv('KB Process/NSL-KDD Original Datasets/Field Names.csv', header=None)
label = pd.Series(['label'], index=[41])
titles = pd.concat([titles[0], label])
df_train.columns = titles.to_list()

df_train = df_train.drop(columns=['num_outbound_cmds'])
df_train_original = df_train

# df_train_original.to_csv('KB Process/NSL-KDD Original Datasets/KDDTrain+_with_labels.txt', index=False)

df_train_original

In [None]:
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

# EDA plots: how many train records fall into each major attack category

data = copy.deepcopy(df_train_original)

# Separate features and labels
features = data.drop('label', axis=1)
labels = data['label']

# Map every individual attack label to its major category;
# any label that is not listed counts as normal traffic.
attack_category = {}
for category, members in (
    ("dos", ["back", "land", "neptune", "pod", "smurf", "teardrop"]),
    ("probe", ["ipsweep", "nmap", "portsweep", "satan"]),
    ("r2l", ["ftp_write", "guess_passwd", "imap", "multihop", "phf", "spy", "warezmaster", "warezclient"]),
    ("u2r", ["buffer_overflow", "loadmodule", "perl", "rootkit"]),
):
    attack_category.update(dict.fromkeys(members, category))
attacks = [attack_category.get(item, "normal") for item in labels]

df_hist = copy.deepcopy(features)
df_hist['attack_cat'] = attacks
df_hist['labels'] = labels

# The categories are strings, so count them and draw one bar per category.
# A numeric histogram with an arbitrary bin count places the bars between
# the tick labels instead of on them.
category_counts = df_hist['attack_cat'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(category_counts.index, category_counts.values, color='skyblue')
ax.set_xlabel('Attack category')
ax.set_ylabel('Number of records')
ax.set_title('NSL-KDD train set: records per attack category')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
fig.savefig('Attacks histogram')
plt.show()

In [None]:
# Bar chart of the individual attack labels in the train set
df_hist = copy.deepcopy(features)
df_hist['attack_cat'] = attacks
df_hist['labels'] = labels

# One bar per label: counting the strings keeps every bar centred on its tick,
# which a 60-bin numeric histogram over the label positions does not.
label_counts = df_hist['labels'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(label_counts.index, label_counts.values, color='red')
ax.set_xlabel('Label')
ax.set_ylabel('Number of records')
ax.set_title('NSL-KDD train set: records per label')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
fig.savefig('Sub-attacks histogram')
plt.show()

In [None]:
# Drop the categorical features into a NEW frame: deleting the columns from
# `features` in place makes this cell fail with a KeyError when it is re-run.
numeric_features = features.drop(columns=['flag', 'protocol_type', 'service'], errors='ignore')

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)

# Apply PCA; the seed keeps the projection identical between runs if
# scikit-learn selects a randomized solver for this shape.
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(scaled_features)

# Create a DataFrame for visualization. Labels are attached by position so
# they match the rows of pca_result regardless of the index of `labels`.
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'])
pca_df['label'] = labels.to_numpy()
pca_df['attacks'] = attacks

# Plot the 2D PCA visualization, one colour per major category
plt.figure(figsize=(12, 8))
colors = {'normal': 'blue', 'dos': 'orange', 'probe': 'green', 'u2r': 'red', 'r2l': 'purple'}

for attack_class, color in colors.items():
    subset = pca_df[pca_df['attacks'] == attack_class]
    plt.scatter(subset['PC1'], subset['PC2'], label=attack_class, color=color, alpha=0.7)

plt.title('2D PCA Visualization of NSL-KDD Attack Classes + Normal')
plt.xlabel('Principal Component 1 (PC1)')
plt.ylabel('Principal Component 2 (PC2)')
plt.tight_layout()
plt.legend()
plt.savefig('PCA Visualization')
plt.show()

In [None]:
# Same projection as above, restricted to the four attack categories
fig, ax = plt.subplots(figsize=(12, 8))
colors = {'dos': 'orange', 'probe': 'green', 'u2r': 'red', 'r2l': 'purple'}

for attack_class, color in colors.items():
    is_class = pca_df['attacks'] == attack_class
    ax.scatter(pca_df.loc[is_class, 'PC1'], pca_df.loc[is_class, 'PC2'],
               label=attack_class, color=color, alpha=0.7)

ax.set_title('2D PCA Visualization of NSL-KDD Attack Classes')
ax.set_xlabel('Principal Component 1 (PC1)')
ax.set_ylabel('Principal Component 2 (PC2)')
fig.tight_layout()
ax.legend()
fig.savefig('PCA Visualization no normal')
plt.show()

In [None]:
# Read the NSL-KDD test set (the file has no header row)
df_test = pd.read_csv('KB Process/NSL-KDD Original Datasets/KDDTest+.txt', sep=",", header=None)
df_test_ = df_test.sort_index(axis=1)
# Drop the last column, then reuse the column names built for the train set
df_test = df_test.iloc[:, :-1]
df_test.columns = titles.to_list()
df_test = df_test.drop(columns=['num_outbound_cmds'])
df_test_original = df_test

# df_test_original.to_csv('KB Process/NSL-KDD Original Datasets/KDDTest+.txt', index=False)

df_test_original

### Execution Parameters

In [None]:
# Export switches: set one to 1 to make the matching `if EXPORT_...:` cell write its files
EXPORT_MODELS = 0     # pickled layer-1 / layer-2 classifiers
EXPORT_DATASETS = 0   # encoded train / validation / test sets
EXPORT_PCA = 0        # PCA-transformed arrays and the fitted PCA objects
EXPORT_ENCODERS = 0   # fitted one-hot encoders and scalers

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

### Perform ICFS if needed

In [None]:
# It is possible to compute the ICFS again

# perform_icfs(df_train_original)

# DoS + Probe classifier (NBC)

In [None]:
# list of single attacks 
dos_attacks = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'worm', 'apache2', 'mailbomb', 'processtable', 'udpstorm']
probe_attacks = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']
r2l_attacks = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
                'snmpguess', 'spy', 'warezclient', 'httptunnel', 'named', 'sendmail', 'snmpgetattack', 'xlock', 'xsnoop']
u2r_attacks = ['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm'] 

# list of attack classes split according to detection layer
dos_probe_list = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'ipsweep', 'nmap', 'portsweep', 'satan']
dos_probe_test = ['apache2', 'mailbomb', 'processtable', 'udpstorm', 'mscan', 'saint']
u2r_r2l_list = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
                'snmpguess', 'spy', 'warezclient', 'buffer_overflow', 'loadmodule', 'rootkit', 'perl']
u2r_r2l_test = ['httptunnel', 'named', 'sendmail', 'snmpgetattack', 'xlock', 'xsnoop', 'ps', 'xterm', 'sqlattack']
normal_list = ['normal']
categorical_features = ['protocol_type', 'service', 'flag']


def _read_feature_list(path):
    """Read a comma-separated feature file and return the clean feature names.

    Surrounding whitespace and newlines are stripped and empty entries are
    dropped, so both a one-line file and a one-name-per-line file (with or
    without a trailing newline) give names usable as DataFrame column keys.
    """
    with open(path, 'r') as f:
        return [name.strip() for name in f.read().split(',') if name.strip()]


# load the features obtained with ICFS for both layer 1 and layer 2
common_features_l1 = _read_feature_list('KB Process/Required Files/NSL_features_l1.txt')
common_features_l2 = _read_feature_list('KB Process/Required Files/NSL_features_l2.txt')

# NOTE(review): a later cell rebinds df_train_original to the 80% split, so
# re-running this cell after that one copies the already-split frame.
df_train_and_validate = copy.deepcopy(df_train_original)
df_test = copy.deepcopy(df_test_original)

In [None]:
# Hold out 20% of the labelled data as the validation set used by BOTH layers
df_train_original, df_val_original = train_test_split(df_train_and_validate, test_size=0.2, random_state=42)

# Working copies for layer 1
df_train = copy.deepcopy(df_train_original)
df_val = copy.deepcopy(df_val_original)

# Binary targets: 1 for a DoS or Probe record, 0 for everything else
y_train = np.array([int(x in (dos_attacks+probe_attacks)) for x in df_train['label']])
y_val = np.array([int(x in (dos_attacks+probe_attacks)) for x in df_val['label']])

# Whole train set without the label column, re-indexed from 0
df_train = df_train.drop(columns=['label']).reset_index(drop=True)
df_train

In [None]:
# Whole validation set without the label column, re-indexed from 0
df_val = df_val.drop(columns=['label']).reset_index(drop=True)

df_val

In [None]:
# Layer-1 processing starts here: keep only the ICFS features selected for layer 1
X_train = df_train.loc[:, common_features_l1]
X_train

In [None]:
# Same layer-1 feature selection for the validation set
X_validate = df_val.loc[:, common_features_l1]
X_validate

In [None]:
# One scaler / one-hot-encoder pair per layer, because each layer is fitted on its own data.
# handle_unknown='ignore' makes the encoders accept categories not seen during fitting.
scaler1 = MinMaxScaler()
ohe = OneHotEncoder(handle_unknown='ignore')

scaler2 = MinMaxScaler()
ohe2 = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Fit the layer-1 scaler on the train features and keep the result as a DataFrame
df_minmax = scaler1.fit_transform(X_train)
X_train = pd.DataFrame(data=df_minmax, columns=list(X_train.columns))

X_train

In [None]:
# Apply the already-fitted layer-1 scaler to the validation features
df_minmax_val = scaler1.transform(X_validate)
X_validate = pd.DataFrame(data=df_minmax_val, columns=list(X_validate.columns))

X_validate

In [None]:
# Fit the layer-1 one-hot encoder on the train categoricals and append the dense columns
label_enc = ohe.fit_transform(df_train[categorical_features])
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(label_enc.toarray(), columns=new_labels)
X_train = pd.concat([X_train, df_enc], axis=1)

df_train.iloc[:, 1:4]

In [None]:
# Encode the validation categoricals with the fitted layer-1 encoder and append the dense columns
label_enc = ohe.transform(df_val[categorical_features])
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(label_enc.toarray(), columns=new_labels)
X_validate = pd.concat([X_validate, df_enc], axis=1)

X_validate

In [None]:
# Test set for layer 1: binary target first, then drop the label and re-index from 0
y_test = np.array([int(x in (dos_attacks+probe_attacks)) for x in df_test['label']])

df_test = df_test.drop(columns=['label']).reset_index(drop=True)
df_test

In [None]:
# Keep only the layer-1 ICFS features of the test set
X_test = df_test.loc[:, common_features_l1]

X_test

In [None]:
# Apply the fitted layer-1 scaler to the test features
df_minmax = scaler1.transform(X_test)
X_test = pd.DataFrame(data=df_minmax, columns=list(X_test.columns))
X_test

In [None]:
# Encode the test categoricals with the fitted layer-1 encoder.
# The columns are selected by name, exactly as for the train and validation
# sets, instead of relying on them sitting at positions 1..3.
label_enc = ohe.transform(df_test[categorical_features])
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)
X_test

In [None]:
# Report the shapes of the layer-1 feature matrices and their targets
for caption, array in (
    ('Shape of the whole train set: ', X_train),
    ('Shape of its targets: ', y_train),
    ('Shape of the whole test set: ', X_test),
    ('Shape of its targets: ', y_test),
):
    print(caption, array.shape)

In [None]:
# Write the encoded layer-1 train / validation sets and their targets (only when enabled)
if EXPORT_DATASETS:
    for frame, csv_path in (
        (X_train, 'KB Process/NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l1.txt'),
        (X_validate, 'KB Process/NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l1.txt'),
    ):
        frame.to_csv(csv_path, index=False)
    for npy_path, targets in (
        ('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l1_targets', y_train),
        ('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l1_targets', y_val),
    ):
        np.save(npy_path, targets)

### Principal Component Analysis

In [None]:
# Fit PCA on the layer-1 train set (n_components=0.95), then project the other splits with it
pca_dos_probe = PCA(n_components=0.95)
X_train_dos_probe = pca_dos_probe.fit_transform(X_train)
X_validate_dos_probe = pca_dos_probe.transform(X_validate)
X_test_dos_probe = pca_dos_probe.transform(X_test)

# X_train = X_train.sort_index(axis=1)
X_train_dos_probe.shape

In [None]:
if EXPORT_PCA:
    # Persist the PCA-projected splits together with the fitted transformer
    for payload, target_path in (
        (X_test_dos_probe, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/KDDTest+_l1_pca.pkl'),
        (X_train_dos_probe, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/KDDTrain+_l1_pca.pkl'),
        (X_validate_dos_probe, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/KDDValidate+_l1_pca.pkl'),
        (pca_dos_probe, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/layer1_pca_transformer.pkl'),
    ):
        joblib.dump(payload, target_path)

### Building the classifier for the layer1

In [None]:
# Alternative: Random Forest
# dos_probe_classifier = RandomForestClassifier(n_estimators=100, criterion='gini')

# Layer 1 is a Gaussian Naive Bayes model trained on the PCA-projected train set
dos_probe_classifier = GaussianNB().fit(X_train_dos_probe, y_train)
predicted = dos_probe_classifier.predict(X_test_dos_probe)
# class_probabilities = dos_probe_classifier.predict_proba(X_test_dos_probe)

In [None]:
# Layer-1 test metrics.
# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns, with labels sorted ascending (0 = negative, 1 = positive),
# so the layout is [[TN, FP], [FN, TP]].
print('Metrics for layer 1:')
print('Confusion matrix: [TN FP / FN TP]\n', confusion_matrix(y_test,predicted))
print('Accuracy = ', accuracy_score(y_test,predicted))
print('F1 Score = ', f1_score(y_test,predicted))
print('Precision = ', precision_score(y_test,predicted))
print('Recall = ', recall_score(y_test,predicted))
print('Shape of the train set for l1: ', X_train_dos_probe.shape)

# R2L+U2R classifier

In [None]:
# Fresh copies of the three splits for layer 2
df_train = copy.deepcopy(df_train_original)
df_val = copy.deepcopy(df_val_original)
df_test = copy.deepcopy(df_test_original)

# Layer 2 only sees the targeted records (Normal + r2l + u2r)
df_train = df_train[df_train['label'].isin(normal_list+u2r_attacks+r2l_attacks)]
df_val = df_val[df_val['label'].isin(normal_list+u2r_attacks+r2l_attacks)]

# Binary targets: 1 for a U2R or R2L record, 0 otherwise
y_train = np.array([int(x in (u2r_attacks+r2l_attacks)) for x in df_train['label']])
y_val = np.array([int(x in (u2r_attacks+r2l_attacks)) for x in df_val['label']])

df_train = df_train.drop(columns=['label']).reset_index(drop=True)
df_train

In [None]:
# Layer-2 validation set without the label column, re-indexed from 0
df_val = df_val.drop(columns=['label']).reset_index(drop=True)
df_val

In [None]:
# Keep only the ICFS features selected for layer 2
X_train = df_train.loc[:, common_features_l2]
X_train

In [None]:
# Same layer-2 feature selection for the validation set
X_validate = df_val.loc[:, common_features_l2]
X_validate

In [None]:
# Fit the layer-2 scaler on the train features and keep the result as a DataFrame
df_minmax = scaler2.fit_transform(X_train)
X_train = pd.DataFrame(data=df_minmax, columns=list(X_train.columns))
X_train

In [None]:
# Apply the fitted layer-2 scaler to the validation features
df_minmax = scaler2.transform(X_validate)
X_validate = pd.DataFrame(data=df_minmax, columns=list(X_validate.columns))
X_validate

In [None]:
# Fit the layer-2 one-hot encoder on the train categoricals and append the dense columns
label_enc = ohe2.fit_transform(df_train[categorical_features])
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(label_enc.toarray(), columns=new_labels)
X_train = pd.concat([X_train, df_enc], axis=1)
X_train

In [None]:
# Encode the validation categoricals with the fitted layer-2 encoder and append the dense columns
label_enc = ohe2.transform(df_val[categorical_features])
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(label_enc.toarray(), columns=new_labels)
X_validate = pd.concat([X_validate, df_enc], axis=1)
X_validate

In [None]:
# Test set for layer 2: keep Normal + u2r + r2l records only
df_test = df_test[df_test['label'].isin(normal_list+u2r_attacks+r2l_attacks)]

# Every remaining record that is not 'normal' is a positive
y_test = np.array([int(x != 'normal') for x in df_test['label']])
df_test = df_test.drop(columns=['label']).reset_index(drop=True)
df_test

In [None]:
# Keep only the layer-2 ICFS features of the test set
X_test = df_test.loc[:, common_features_l2]
X_test

In [None]:
# Apply the fitted layer-2 scaler to the test features
df_minmax = scaler2.transform(X_test)
X_test = pd.DataFrame(data=df_minmax, columns=list(X_test.columns))
X_test

In [None]:
# Encode the test categoricals with the fitted layer-2 encoder.
# The columns are selected by name, exactly as for the train and validation
# sets, instead of relying on them sitting at positions 1..3.
label_enc = ohe2.transform(df_test[categorical_features])
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)
X_test

In [None]:
# Report the shapes of the layer-2 feature matrices and their targets
for caption, array in (
    ('Shape of the train set: ', X_train),
    ('Shape of its target: ', y_train),
    ('Shape of the test set: ', X_test),
    ('Shape of its target: ', y_test),
):
    print(caption, array.shape)

In [None]:
# Under-sample the majority class of the layer-2 train set down to a 1:1 ratio.
# The fixed random_state makes the sampled rows, and so every layer-2 metric
# computed afterwards, reproducible between runs.
sm = under_sam(sampling_strategy=1, random_state=42)
X_train, y_train = sm.fit_resample(X_train,y_train)

## Export the datasets
The train set has been scaled, one-hot encoded and undersampled.
The validation and test sets have been scaled and one-hot encoded.

In [None]:
# Export the dataset for training layer 2
if EXPORT_DATASETS:
    X_train.to_csv('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l2.txt', index=False)
    np.save('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDTrain+_l2_targets', y_train)
    X_validate.to_csv('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l2.txt', index=False)
    # The validation targets are y_val; saving y_train here would store the
    # train targets under the validation file name.
    np.save('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDValidate+_l2_targets', y_val)

In [None]:
from sklearn import tree

# Fit PCA on the layer-2 train set (n_components=0.95), then project the other splits with it
pca_r2l_u2r = PCA(n_components=0.95)
X_train_r2l_u2r = pca_r2l_u2r.fit_transform(X_train)
X_validate_r2l_u2r = pca_r2l_u2r.transform(X_validate)
X_test_r2l_u2r = pca_r2l_u2r.transform(X_test)

# Alternative: Decision Trees
# r2l_u2r_classifier = tree.DecisionTreeClassifier()

# Layer 2 is an RBF-kernel Support Vector Machine
r2l_u2r_classifier = SVC(kernel='rbf', C=0.1, gamma=0.01, probability=True)
r2l_u2r_classifier.fit(X_train_r2l_u2r, y_train)
predicted = r2l_u2r_classifier.predict(X_test_r2l_u2r)

In [None]:
if EXPORT_PCA:
    # save the pca transformed as well as the transformer
    joblib.dump(X_test_r2l_u2r, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/KDDTest+_l2_pca.pkl')
    joblib.dump(X_train_r2l_u2r, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/KDDTrain+_l2_pca.pkl')
    # The validation file must hold the projected VALIDATION set, not a
    # second copy of the train projection.
    joblib.dump(X_validate_r2l_u2r, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/KDDValidate+_l2_pca.pkl')
    joblib.dump(pca_r2l_u2r, 'KB Process/NSL-KDD Encoded Datasets/pca_transformed/layer2_pca_transformer.pkl')

In [None]:
# Layer-2 test metrics.
# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns, with labels sorted ascending (0 = negative, 1 = positive),
# so the layout is [[TN, FP], [FN, TP]].
print('Metrics for layer 2:')
print('Confusion matrix: [TN FP / FN TP]\n', confusion_matrix(y_test,predicted))
print('Accuracy = ', accuracy_score(y_test,predicted))
print('F1 Score = ', f1_score(y_test,predicted))
print('Precision = ', precision_score(y_test,predicted))
print('Recall = ', recall_score(y_test,predicted))
print('Matthew corr = ', matthews_corrcoef(y_test,predicted))
print('Shape of the training set: ', X_train_r2l_u2r.shape)

### Export the classifiers

In [None]:
if EXPORT_MODELS:
    # Pickle each trained classifier to its own file
    for model_path, model in (
        ('KB Process/Models/From notebook/NSL_l1_classifier.pkl', dos_probe_classifier),
        ('KB Process/Models/From notebook/NSL_l2_classifier.pkl', r2l_u2r_classifier),
    ):
        with open(model_path, "wb") as f:
            pickle.dump(model, f)

# Testing

In [None]:
# Two independent copies of the full test set, one per layer
df_test1 = copy.deepcopy(df_test_original)
df_test2 = copy.deepcopy(df_test_original)
# Ground truth for the combined detector: 0 for 'normal', 1 for any other label
y_test_real = np.array([int(x != 'normal') for x in df_test1['label']])
df_test_original

In [None]:
# Layer-1 ICFS features of the full test set
X_test = df_test1.loc[:, common_features_l1]
X_test

In [None]:
# Run the full test set through the fitted layer-1 pipeline: scale -> one-hot -> PCA
df_minmax = scaler1.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
# Select the categorical columns by name, as done when the encoder was
# fitted, instead of relying on them sitting at positions 1..3.
label_enc = ohe.transform(df_test1[categorical_features])
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)

X_test_layer1 = pca_dos_probe.transform(X_test)
print('Test set shape for layer 1: ', X_test_layer1.shape)

In [None]:
# Layer-2 ICFS features of the full test set
X_test = df_test2.loc[:, common_features_l2]
X_test

In [None]:
# Run the full test set through the fitted layer-2 pipeline: scale -> one-hot -> PCA
df_minmax = scaler2.transform(X_test)
X_test = pd.DataFrame(df_minmax, columns=X_test.columns)
# Select the categorical columns by name, as done when the encoder was
# fitted, instead of relying on them sitting at positions 1..3.
label_enc = ohe2.transform(df_test2[categorical_features])
new_labels = ohe2.get_feature_names_out(categorical_features)
df_enc = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
X_test = pd.concat([X_test, df_enc], axis=1)

X_test_layer2 = pca_r2l_u2r.transform(X_test)
print('Test set shape for layer 2: ', X_test_layer2.shape)
print('Type of X_test_layer1: ', type(X_test_layer1))
# The second line reports the layer-2 array, so it is labelled accordingly.
print('Type of X_test_layer2: ', type(X_test_layer2))

In [None]:
# Reuse the two classifiers trained above as the stages of the cascade
classifier1, classifier2 = dos_probe_classifier, r2l_u2r_classifier

In [None]:
# Two-layer cascade: a record is flagged (1) when layer 1 flags it; otherwise
# layer 2 decides. Predictions are made in batches rather than with one
# predict() call per row, which gives the same labels far faster.
layer1_pred = np.asarray(classifier1.predict(X_test_layer1))
result = (layer1_pred == 1).astype(int)

# Only the records that layer 1 let through are passed on to layer 2.
passed_layer1 = np.flatnonzero(layer1_pred != 1)
if passed_layer1.size:
    layer2_pred = np.asarray(classifier2.predict(X_test_layer2[passed_layer1]))
    result[passed_layer1] = (layer2_pred == 1).astype(int)

In [None]:
# the results may vary
# C=0.1, gamma=0.01
# Metrics of the cascade output `result` against the full-test ground truth
print('Results for the layer 2 (SVM):')
print(confusion_matrix(y_test_real,result))
for caption, metric in (
    ('Accuracy = ', accuracy_score),
    ('F1 Score = ', f1_score),
    ('Precision = ', precision_score),
    ('Recall = ', recall_score),
    ('Matthew corr = ', matthews_corrcoef),
):
    print(caption, metric(y_test_real,result))

### Export the test sets

In [None]:
if EXPORT_DATASETS:
    # Write each PCA-projected test matrix as a CSV with columns PC1..PCk
    for projected, csv_path in (
        (X_test_layer1, 'KB Process/NSL-KDD Encoded Datasets/before_pca/X_test_l1.txt'),
        (X_test_layer2, 'KB Process/NSL-KDD Encoded Datasets/before_pca/X_test_l2.txt'),
    ):
        column_names = [f'PC{i}' for i in range(1, projected.shape[1] + 1)]
        pd.DataFrame(data=projected, columns=column_names).to_csv(csv_path, index=False)

    # Ground truth shared by both layers
    np.save('KB Process/NSL-KDD Encoded Datasets/before_pca/y_test', y_test_real)

### evaluate seen and unseen attack categories

In [None]:
# load testset
# Forward slashes only: the former '...Datasets\KDDTest+.txt' contained the
# invalid escape sequence '\K' and does not resolve on non-Windows systems.
df_test = pd.read_csv('KB Process/NSL-KDD Original Datasets/KDDTest+.txt', sep=",", header=None)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
y_test = df_test['label']
df_test = df_test.drop(['num_outbound_cmds'],axis=1)

df_test_original = df_test

In [None]:
# Scatter plot of the first two numeric test-set features, one series per label
from collections import Counter
from matplotlib import pyplot
from numpy import where

X, y = copy.deepcopy(df_test), copy.deepcopy(y_test)

counter = Counter(y)

# X is a DataFrame that also holds string columns, so it has to be indexed
# positionally with .iloc on its numeric part; X[row_ix, 0] is not valid
# DataFrame indexing and raises.
numeric_X = X.select_dtypes(include='number')
x_name, y_name = numeric_X.columns[:2]

# scatter plot of examples by class label
# (the loop variable is not called `label`, so the global of that name survives)
for attack_label, _ in counter.items():
    row_ix = where(y == attack_label)[0]
    pyplot.scatter(numeric_X.iloc[row_ix, 0], numeric_X.iloc[row_ix, 1], label=str(attack_label))

pyplot.xlabel(x_name)
pyplot.ylabel(y_name)
pyplot.legend()
pyplot.show()

In [None]:
# Optionally write the labelled test set and its label column, then display the frame
if EXPORT_DATASETS:
    np.save('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDTest+_targets', y_test)
    df_test_original.to_csv('KB Process/NSL-KDD Encoded Datasets/before_pca/KDDTest+', index=False)

df_test_original

In [None]:
# Attack labels that occur in the test set but not in the train split.
# 'normal' is excluded by name instead of by assuming it is the most frequent
# label in both sets, and the train labels are collected once, not per test label.
# NOTE(review): df_train_original is the 80% split at this point, so a rare
# attack that fell entirely into the validation split also counts as new;
# confirm that this is intended.
train_labels = set(df_train_original['label'].unique())
test_labels = set(df_test_original['label'].unique())

new_attack = sorted(test_labels - train_labels - {'normal'})
new_attack

In [None]:
# Index labels of the test records whose label is one of the new attacks
is_new_attack = df_test_original['label'].isin(new_attack)
index_of_new_attacks = df_test_original.index[is_new_attack].tolist()

In [None]:
# Number of test records whose label is in new_attack
len(index_of_new_attacks)

In [None]:
# From here on new_attack also contains 'normal', so that "not in new_attack"
# selects exactly the attacks already seen in training.
# The membership guard keeps the list free of duplicates when the cell is re-run.
if 'normal' not in new_attack:
    new_attack.append('normal')
new_attack

In [None]:
# Index labels of the test records whose label is NOT in new_attack
is_old_attack = ~df_test_original['label'].isin(new_attack)
index_of_old_attacks = df_test_original.index[is_old_attack].tolist()

In [None]:
# Number of test records whose label is not in new_attack
len(index_of_old_attacks)

In [None]:
# Cascade predictions restricted to the new-attack records
new_attack_hits = result[index_of_new_attacks]
print('Number of new attacks in the test set: ', new_attack_hits.shape[0])
print('Number of new attacks detected by the classifiers: ', new_attack_hits.sum())
print('Proportion of new attacks detected: ', new_attack_hits.sum()/new_attack_hits.shape[0])

In [None]:
# Cascade predictions restricted to the old-attack records
old_attack_hits = result[index_of_old_attacks]
print('Number of old attacks in the test set: ', old_attack_hits.shape[0])
print('Number of old attacks detected by the classifiers: ', old_attack_hits.sum())
print('Proportion of old attacks detected: ', old_attack_hits.sum()/old_attack_hits.shape[0])

### Evaluate single attack types

In [None]:
# Reload the test set so that its index matches the positions in `result`
df_test = pd.read_csv('KB Process/NSL-KDD Original Datasets/KDDTest+.txt', sep=",", header=None)
df_test = df_test.iloc[:, :-1]
df_test.columns = titles.to_list()
y_test = df_test['label']
df_test = df_test.drop(columns=['num_outbound_cmds'])
df_test_original = df_test
df = df_test_original

# Index labels of the records belonging to each major attack category
dos_index = df.index[df['label'].isin(dos_attacks)].tolist()
probe_index = df.index[df['label'].isin(probe_attacks)].tolist()
r2l_index = df.index[df['label'].isin(r2l_attacks)].tolist()
u2r_index = df.index[df['label'].isin(u2r_attacks)].tolist()

print('Evaluation split into single attack type:')
for count_caption, attack_index in (
    ("Number of dos attacks: ", dos_index),
    ("Number of probe attacks: ", probe_index),
    ("Number of r2l attacks: ", r2l_index),
    ("Number of u2r attacks: ", u2r_index),
):
    # Cascade predictions for this category only
    hits = result[attack_index]
    print(count_caption, hits.shape[0])
    print("Number of detected attacks: ", hits.sum())
    print("Ratio of detection: ", hits.sum()/hits.shape[0])

    # Blank line between categories, none after the last one
    if attack_index is not u2r_index:
        print()

In [None]:
# Persist the fitted one-hot encoders and scalers of both layers
if EXPORT_ENCODERS:
    for fitted, target_path in (
        (ohe, 'KB Process/Required Files/one_hot_encoders/OneHotEncoder_l1.pkl'),
        (ohe2, 'KB Process/Required Files/one_hot_encoders/OneHotEncoder_l2.pkl'),
        (scaler1, 'KB Process/Required Files/scalers/Scaler_l1.pkl'),
        (scaler2, 'KB Process/Required Files/scalers/Scaler_l2.pkl'),
    ):
        joblib.dump(fitted, target_path)