# Deep Learning

We follow the same preprocessing steps as before. This time, we additionally use the following:

#### Preprocessing
- Data augmentation
  - Synthetic Minority Oversampling and Edited Nearest Neighbours (SMOTEENN)
  - Adaptive Synthetic oversampling (ADASYN)

#### Algorithms
- CNN + RNN

In [2]:
### Suppress warnings in tensorflow output
# TF_CPP_MIN_LOG_LEVEL is consumed by TensorFlow's native logging when the
# library is loaded, so it must be set *before* the first `import tensorflow`.
# Setting it after the import does not silence the start-up messages.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings('ignore')
from tensorflow import get_logger
# Python-side TensorFlow logger: only errors and above
get_logger().setLevel('ERROR')
###
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from collections import Counter
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from sklearn import svm
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout, Conv1D, MaxPooling1D, Flatten
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Load the raw flow records from the CSV export into a DataFrame
raw_csv_path = '../../../../Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Show how many rows carry each label value.
# Note: the column name really does start with a space (' Label').
label_column = df[' Label']
counts = label_column.value_counts()
print(counts)

In [None]:
# Preprocessing: discard every row that contains a NaN or an infinite value

# Rows in which at least one column is NaN
nan_row_mask = df.isna().any(axis=1)
nan_rows_indices = df.index[nan_row_mask].tolist()
nan_rows_count = len(nan_rows_indices)
print("Number of rows with at least one NaN value:", nan_rows_count)

# np.isinf fails on the string label column, so look at every column except the last
X = df.iloc[:, :-1]

# Rows in which at least one of the remaining columns is infinite
inf_row_mask = np.isinf(X).any(axis=1)
inf_rows_indices = X.index[inf_row_mask].tolist()
inf_rows_count = len(inf_rows_indices)
print("Number of rows with at least one inf value:", inf_rows_count)

# Union of both index lists; a set, so a row that is both NaN and inf is listed once
rows_to_drop = set(nan_rows_indices).union(inf_rows_indices)

# Build a new frame without the offending rows; `df` itself is not modified
df2 = df.drop(index=rows_to_drop)

In [None]:
# Collapse every label that is not 'BENIGN' into a single 'MALICIOUS' class
is_benign = df2[' Label'] == 'BENIGN'
df2[' Label'] = df2[' Label'].mask(~is_benign, 'MALICIOUS')

# Show the resulting two-class distribution
label_counts = df2[' Label'].value_counts()
print(label_counts)

In [None]:
# Make the labels numerical (BENIGN -> 0, MALICIOUS -> 1).
# `replace` on a string column only produces an integer column if pandas
# silently downcasts the object result, and that downcasting is deprecated.
# Cast explicitly so the labels are int64 regardless of the pandas version.
# Re-running the cell stays safe: already-numeric labels pass through
# `replace` unchanged.
label_to_int = {'BENIGN': 0, 'MALICIOUS': 1}
df2[' Label'] = df2[' Label'].replace(label_to_int).astype('int64')

In [None]:
# Separate the feature columns (all but the last) from the label column (the last)
n_feature_columns = df2.shape[1] - 1
X_original = df2.iloc[:, :n_feature_columns]
y_original = df2.iloc[:, n_feature_columns]

In [None]:
# Rescale every feature to the range [0, 1]. The result is a numpy array, not a DataFrame.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down -- confirm that using test-set min/max here is acceptable.
scaler = MinMaxScaler()
scaler.fit(X_original)
X_scaled = scaler.transform(X_original)

# Sanity check: global maximum, then global minimum, of the scaled array
print(np.max(X_scaled))
print(np.min(X_scaled))

In [None]:
# Tally the numeric labels; the totals should match the 'BENIGN' / 'MALICIOUS'
# counts printed before the labels were made numerical
label_tally = np.unique(y_original, return_counts=True)
unique_values, counts = label_tally

for value, count in zip(*label_tally):
    print(f"Value: {value}, Count: {count}")

We can see that the dataset is imbalanced. Hence, we apply the data augmentation techniques listed above (SMOTEENN and ADASYN) to the training data.

In [None]:
# Count the rows of the positive class. Label 1 is MALICIOUS (BENIGN is 0), so
# the selected rows are the malicious ones and the variable is named accordingly.
malicious_indices = y_original.index[y_original == 1].tolist()
# .loc makes explicit that the lookup is by index label, not by position
print(len(y_original.loc[malicious_indices]))

In [None]:
# Total number of labelled rows after cleaning
y_original.shape[0]

In [None]:
# Hold out 20% of the rows as the test set. With shuffle=False the row order is
# kept and the split is a plain cut.
# NOTE(review): random_state has no effect while shuffle=False, and an
# unshuffled, unstratified cut of an imbalanced dataset may leave very few
# positive rows on one side -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_original,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

# Row counts: full data, training part, test part (features first, then labels)
for part in (X_scaled, y_original, X_train, y_train, X_test, y_test):
    print(len(part))

In [None]:
# # Data augmentation technique one: SMOTEENN
# print('Total data before augmentation %s' % Counter(y_original))
# sme = SMOTEENN()
# X_augmented_smoteenn, y_augmented_smoteenn = sme.fit_resample(X_train, y_train)
# print('Total Data after SMOTEENN augmentation %s' % Counter(y_augmented_smoteenn))

# # Data augmentation technique one: ADASYN
# adasyn = ADASYN()
# X_augmented_ada, y_augmented_ada = adasyn.fit_resample(X_train, y_train)
# print('Total Data after ADASYN augmentation %s' % Counter(y_augmented_smoteenn))


In [None]:
# Build the RNN model
# base_model = Sequential(
#     [
#         Conv1D(32, kernel_size=5, activation='relu', input_shape=(X_train.shape[1], 1)),
#         MaxPooling1D(pool_size=4),
#         Conv1D(16, kernel_size=3, activation='relu'),
#         MaxPooling1D(pool_size=2),
#         SimpleRNN(100, activation='relu'),
#         Dropout(0.5),
#         Dense(1, activation='sigmoid')
#     ]
# )



# base_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# # Train the model
# base_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# # Evaluate the model
# base_y_pred_prob = base_model.predict(X_test)
# base_y_pred = (base_y_pred_prob > 0.5).astype(int)

# accuracy = accuracy_score(y_test, base_y_pred)
# precision = precision_score(y_test, base_y_pred)
# recall = recall_score(y_test, base_y_pred)
# f1 = f1_score(y_test, base_y_pred)

# print("Accuracy:", accuracy)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)

# print(classification_report(y_test, base_y_pred, target_names=["Begign","Malicious"]))
# disp = ConfusionMatrixDisplay.from_predictions(y_test, base_y_pred)
# plt.show()


In [None]:
# Resample the training split only; X_test / y_test are not touched here.
# The "before" count uses y_train because that is what both resamplers receive,
# so the before/after numbers are directly comparable.
print(f'Training data before augmentation {Counter(y_train)}')

# Data augmentation technique one: SMOTEENN
# A fixed random_state makes the generated samples reproducible across runs.
sme = SMOTEENN(random_state=42)
X_augmented_smoteenn, y_augmented_smoteenn = sme.fit_resample(X_train, y_train)
print(f'Total data after SMOTEENN augmentation {Counter(y_augmented_smoteenn)}')

# Data augmentation technique two: ADASYN
adasyn = ADASYN(random_state=42)
X_augmented_ada, y_augmented_ada = adasyn.fit_resample(X_train, y_train)
print(f'Total data after ADASYN augmentation {Counter(y_augmented_ada)}')

In [None]:
def build_cnn_rnn_model(n_features):
    """Return a compiled Conv1D -> SimpleRNN -> Dense binary classifier.

    Parameters
    ----------
    n_features : int
        Number of feature columns per sample. Each sample is presented to the
        network as a sequence of shape (n_features, 1).

    Returns
    -------
    Sequential
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        accuracy as the tracked metric. The single sigmoid output is the
        predicted probability of class 1.
    """
    model = Sequential(
        [
            Conv1D(32, kernel_size=5, activation='relu', input_shape=(n_features, 1)),
            # Deeper conv/pooling stack, currently disabled:
            # MaxPooling1D(pool_size=4),
            # Conv1D(16, kernel_size=3, activation='relu'),
            # MaxPooling1D(pool_size=2),
            SimpleRNN(100, activation='relu'),
            Dropout(0.5),
            Dense(1, activation='sigmoid'),
        ]
    )
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# One independent model per augmentation technique. Both share the same
# architecture, so they come from a single factory instead of copy-pasted code.
sme_model = build_cnn_rnn_model(X_train.shape[1])
ada_model = build_cnn_rnn_model(X_train.shape[1])

# # Evaluate the models
# sme_y_pred_prob = sme_model.predict(X_test)
# sme_y_pred = (sme_y_pred_prob > 0.5).astype(int)

# accuracy = accuracy_score(y_test, sme_y_pred)
# precision = precision_score(y_test, sme_y_pred)
# recall = recall_score(y_test, sme_y_pred)
# f1 = f1_score(y_test, sme_y_pred)

# print("Accuracy:", accuracy)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)

# print(classification_report(y_test, base_y_pred, target_names=["Begign","Malicious"]))
# disp = ConfusionMatrixDisplay.from_predictions(y_test, base_y_pred)
# plt.show()


In [None]:
# Train the SMOTEENN model on the SMOTEENN-resampled training data.
# Fitting on X_train / y_train would ignore the augmentation entirely.
#
# Keras takes the validation_split fraction from the *end* of the arrays,
# before any shuffling. The row order produced by the resampler is not
# guaranteed to be class-mixed at the tail, so shuffle once up front with a
# fixed seed.
sme_order = np.random.default_rng(42).permutation(len(y_augmented_smoteenn))
X_fit_sme = np.asarray(X_augmented_smoteenn)[sme_order]
y_fit_sme = np.asarray(y_augmented_smoteenn)[sme_order]
sme_model.fit(X_fit_sme, y_fit_sme, epochs=3, batch_size=32, validation_split=0.2)

# Evaluate on the untouched (non-augmented) test split
sme_y_pred_prob = sme_model.predict(X_test)
sme_y_pred = (sme_y_pred_prob > 0.5).astype(int)  # threshold the sigmoid output at 0.5

accuracy = accuracy_score(y_test, sme_y_pred)
precision = precision_score(y_test, sme_y_pred)
recall = recall_score(y_test, sme_y_pred)
f1 = f1_score(y_test, sme_y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# labels=[0, 1] keeps the report aligned with target_names even if one class
# happens to be absent from both y_test and the predictions
print(classification_report(y_test, sme_y_pred, labels=[0, 1], target_names=["Benign", "Malicious"]))
disp = ConfusionMatrixDisplay.from_predictions(y_test, sme_y_pred)
plt.show()

In [None]:
# Train the ADASYN model on the ADASYN-resampled training data.
# Fitting on X_train / y_train would ignore the augmentation entirely.
#
# Keras takes the validation_split fraction from the *end* of the arrays,
# before any shuffling. The row order produced by the resampler is not
# guaranteed to be class-mixed at the tail, so shuffle once up front with a
# fixed seed.
ada_order = np.random.default_rng(42).permutation(len(y_augmented_ada))
X_fit_ada = np.asarray(X_augmented_ada)[ada_order]
y_fit_ada = np.asarray(y_augmented_ada)[ada_order]
ada_model.fit(X_fit_ada, y_fit_ada, epochs=3, batch_size=32, validation_split=0.2)

# Evaluate on the untouched (non-augmented) test split.
# Predictions must come from ada_model itself; reusing the SMOTEENN model or
# its probabilities would just repeat the SMOTEENN results.
ada_y_pred_prob = ada_model.predict(X_test)
ada_y_pred = (ada_y_pred_prob > 0.5).astype(int)  # threshold the sigmoid output at 0.5

accuracy = accuracy_score(y_test, ada_y_pred)
precision = precision_score(y_test, ada_y_pred)
recall = recall_score(y_test, ada_y_pred)
f1 = f1_score(y_test, ada_y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# labels=[0, 1] keeps the report aligned with target_names even if one class
# happens to be absent from both y_test and the predictions
print(classification_report(y_test, ada_y_pred, labels=[0, 1], target_names=["Benign", "Malicious"]))
disp = ConfusionMatrixDisplay.from_predictions(y_test, ada_y_pred)
plt.show()

In [None]:
# Select best model and perform feature selection on it to check if it improves performance

In [None]:
# Hyperparameter tuning?