In [78]:
import pandas as pd
import numpy as np
import copy
from sklearn.model_selection import train_test_split

In [79]:
# loading the train set
df_train = pd.read_csv('EvalResources/KDDTrain+.txt', sep=",", header=None, skipinitialspace = True)
df_train = df_train[df_train.columns[:-1]]  # tags column
titles = pd.read_csv('EvalResources/Field Names.csv', header=None, skipinitialspace = True)
label = pd.Series(['label'], index=[41])
titles = pd.concat([titles[0], label])
df_train.columns = titles.to_list()
df_train = df_train.drop(['num_outbound_cmds'],axis=1)
df_train_original = df_train

# load test set
df_test = pd.read_csv('EvalResources/KDDTest+.txt', sep=",", header=None, skipinitialspace = True)
df_test_ = df_test.sort_index(axis=1)
df_test = df_test[df_test.columns[:-1]]
df_test.columns = titles.to_list()
df_test = df_test.drop(['num_outbound_cmds'],axis=1)
df_test_original = df_test

In [80]:
# list of single attacks 
dos_attacks = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'worm', 'apache2', 'mailbomb', 'processtable', 'udpstorm']
probe_attacks = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']
r2l_attacks = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
                'snmpguess', 'spy', 'warezclient', 'httptunnel', 'named', 'sendmail', 'snmpgetattack', 'xlock', 'xsnoop']
u2r_attacks = ['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm'] 

# list of attack classes split according to detection layer
dos_probe_list = ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'ipsweep', 'nmap', 'portsweep', 'satan']
dos_probe_test = ['apache2', 'mailbomb', 'processtable', 'udpstorm', 'mscan', 'saint']
u2r_r2l_list = ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster',
                'snmpguess', 'spy', 'warezclient', 'buffer_overflow', 'loadmodule', 'rootkit', 'perl']
u2r_r2l_test = ['httptunnel', 'named', 'sendmail', 'snmpgetattack', 'xlock', 'xsnoop', 'ps', 'xterm', 'sqlattack']
normal_list = ['normal']
categorical_features = ['protocol_type', 'service', 'flag']

In [81]:
df_train_and_validate = copy.deepcopy(df_train_original)
df_test = copy.deepcopy(df_test_original)

In [82]:
# Save all the targets for the dataset
y_test_l1 = [1 if x in (dos_attacks+probe_attacks) else 0 for x in df_test['label']]
y_test_l2 = [1 if x in (u2r_attacks+r2l_attacks) else 0 for x in df_test['label']]

In [83]:
# split in test and validation set for BOTH layers
df_train_original, df_val_original = train_test_split(df_train_and_validate, test_size=0.2, random_state=42)

In [84]:
# LAYER 1 ONLY!

# dataframes specifically for layer 1
df_train = copy.deepcopy(df_train_original)
df_val = copy.deepcopy(df_val_original)
df_test = copy.deepcopy(df_test_original)

# set the target variables accordingly
y_train = np.array([1 if x in (dos_attacks+probe_attacks) else 0 for x in df_train['label']])
y_val = np.array([1 if x in (dos_attacks+probe_attacks) else 0 for x in df_val['label']])
y_test = np.array([1 if x in (dos_attacks+probe_attacks) else 0 for x in df_test['label']])

In [85]:
# this dataframe contains the whole train set 
df_train = df_train.drop(['label'],axis=1)
df_train = df_train.reset_index().drop(['index'], axis=1)

In [86]:
# this dataframe contains the whole validation set
df_val = df_val.drop(['label'],axis=1)
df_val = df_val.reset_index().drop(['index'], axis=1)

In [87]:
# this dataframe contains the whole validation set
df_test = df_test.drop(['label'],axis=1)
df_test = df_test.reset_index().drop(['index'], axis=1)

In [88]:
from sklearn.preprocessing import OneHotEncoder

# one-hot encoder for the features of layer1
ohe = OneHotEncoder(handle_unknown='ignore')

In [89]:
# perform One-hot encoding for the train set
label_enc = ohe.fit_transform(df_train[categorical_features])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc_train = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)

In [90]:
# perform One-hot encoding for the validation set
label_enc = ohe.transform(df_val[categorical_features])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc_val = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)

In [91]:
# perform One-hot encoding for the test set
label_enc = ohe.transform(df_test.iloc[:,1:4])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc_test = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)

In [92]:
# remove categorical features from the datasets
df_train.drop(columns=categorical_features, inplace=True)
df_val.drop(columns=categorical_features, inplace=True)
df_test.drop(columns=categorical_features, inplace=True)

In [93]:
from sklearn.preprocessing import MinMaxScaler

scaler1 = MinMaxScaler()

In [94]:
# scaling the train set for layer1
df_minmax_train = scaler1.fit_transform(df_train)
x_train = pd.DataFrame(df_minmax_train, columns=df_train.columns)

In [95]:
# scaling the validation set for layer1
df_minmax_val = scaler1.transform(df_val)
x_val = pd.DataFrame(df_minmax_val, columns=df_val.columns)

In [96]:
# scaling the test set for layer1
df_minmax_test = scaler1.transform(df_test)
x_test = pd.DataFrame(df_minmax_test, columns=df_test.columns)

In [97]:
# Check if there are any NaN values in the entire DataFrame
if x_test.isna().any().any():
    print("DataFrame has NaN values")
else:
    print("DataFrame does not have NaN values")

DataFrame does not have NaN values


x_train = pd.concat([x_train, df_enc_train], axis=1)
x_val = pd.concat([x_val, df_enc_val], axis=1)
x_test = pd.concat([x_test, df_enc_test], axis=1)

### Now we can perform features selection

In [98]:
import matplotlib.pyplot as plt


## Feature importances with Random Forest

import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

forest = RandomForestClassifier()
forest.fit(x_train, y_train)

start_time = time.time()
result = permutation_importance(
    forest, x_test, y_test, n_repeats=100, random_state=42, n_jobs=-1
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

forest_importances = pd.Series(result.importances_mean, index=x_train.columns)

# Sort the feature importances
forest_importances_sorted = forest_importances.sort_values(ascending=False)

# Select the top n important features
n_top_features = 15
top_features = forest_importances_sorted.head(n_top_features)

# Print the top features
print("Top", n_top_features, "features: ")
print(top_features)

print(top_features.index.to_list)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

## Using Recursive Feature Elimination method

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

forest = RandomForestClassifier()
forest.fit(x_train, y_train)

# Perform Recursive Feature Elimination (RFE)
n_selected_features = 12
selector_rfe = RFE(estimator=forest, n_features_to_select=n_selected_features, step=1)
selector_rfe.fit(x_train, y_train)

# Get selected features
selected_features_rfe = x_train.columns[selector_rfe.support_]

# Print selected features
print("Selected features using RFE:")
print(selected_features_rfe)

Selected features using RFE:
Index(['logged_in', 'count', 'same_srv_rate', 'diff_srv_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate',
       'dst_host_rerror_rate'],
      dtype='object')


In [100]:
# LAYER 2 ONLY!

# dataframes specifically for layer 2
df_train = copy.deepcopy(df_train_original)
df_val = copy.deepcopy(df_val_original)
df_test = copy.deepcopy(df_test_original)

In [101]:
# set the target variables accordingly
y_train = np.array([1 if x in (u2r_attacks+r2l_attacks) else 0 for x in df_train['label']])
y_val = np.array([1 if x in (u2r_attacks+r2l_attacks) else 0 for x in df_val['label']])
y_test = np.array([1 if x in (u2r_attacks+r2l_attacks) else 0 for x in df_test['label']])

In [102]:
# this dataframe contains the whole train set 
df_train = df_train.drop(['label'], axis=1)
df_train = df_train.reset_index().drop(['index'], axis=1)

In [103]:
# this dataframe contains the whole validation set
df_val = df_val.drop(['label'], axis=1)
df_val = df_val.reset_index().drop(['index'], axis=1)

In [104]:
# this dataframe contains the whole validation set
df_test = df_test.drop(['label'], axis=1)
df_test = df_test.reset_index().drop(['index'], axis=1)

In [105]:
# one-hot encoder for the features of layer2
ohe = OneHotEncoder(handle_unknown='ignore')

In [106]:
# perform One-hot encoding for the train set
label_enc = ohe.fit_transform(df_train[categorical_features])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc_train = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)

In [107]:
# perform One-hot encoding for the validation set
label_enc = ohe.transform(df_val[categorical_features])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc_val = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
# perform One-hot encoding for the test set

In [108]:
label_enc = ohe.transform(df_test.iloc[:, 1:4])
label_enc.toarray()
new_labels = ohe.get_feature_names_out(categorical_features)
df_enc_test = pd.DataFrame(data=label_enc.toarray(), columns=new_labels)
# remove categorical features from the datasets

In [109]:
df_train.drop(columns=categorical_features, inplace=True)
df_val.drop(columns=categorical_features, inplace=True)
df_test.drop(columns=categorical_features, inplace=True)

In [110]:
scaler1 = MinMaxScaler()

In [111]:
# scaling the train set for layer1
df_minmax_train = scaler1.fit_transform(df_train)
x_train = pd.DataFrame(df_minmax_train, columns=df_train.columns)

In [112]:
# scaling the validation set for layer1
df_minmax_val = scaler1.transform(df_val)
x_val = pd.DataFrame(df_minmax_val, columns=df_val.columns)

In [113]:
# scaling the test set for layer1
df_minmax_test = scaler1.transform(df_test)
x_test = pd.DataFrame(df_minmax_test, columns=df_test.columns)
# Check if there are any NaN values in the entire DataFrame

In [114]:
if x_test.isna().any().any():
    print("DataFrame has NaN values")
else:
    print("DataFrame does not have NaN values")

DataFrame does not have NaN values


x_train = pd.concat([x_train, df_enc_train], axis=1)
x_val = pd.concat([x_val, df_enc_val], axis=1)
x_test = pd.concat([x_test, df_enc_test], axis=1)

In [115]:
# Feature importances with Random Forest

forest = RandomForestClassifier()
forest.fit(x_train, y_train)

start_time = time.time()
result = permutation_importance(
    forest, x_test, y_test, n_repeats=100, random_state=42, n_jobs=-1
)
elapsed_time = time.time() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

forest_importances = pd.Series(result.importances_mean, index=x_train.columns)

# Sort the feature importances
forest_importances_sorted = forest_importances.sort_values(ascending=False)

# Select the top n important features
n_top_features = 11
top_features = forest_importances_sorted.head(n_top_features)

# Print the top features
print("Top", n_top_features, "features: ")
print(top_features)

print(top_features.index.to_list)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

NameError: name 'time' is not defined