In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.io import arff

In [None]:
# Locations of the NSL-KDD ARFF files; only the two "small" paths are loaded below.
largeDataSet = './data/raw/KDDTrain.arff'
smallDataSet = './data/raw/KDDTrain_20Percent.arff'
largeTestData = './data/raw/KDDTest.arff'
smallTestData = './data/raw/KDDTest-21.arff'

# arff.loadarff yields a (records, metadata) pair for each file.
data, meta = arff.loadarff(smallDataSet)
test_data, test_meta = arff.loadarff(smallTestData)

# Wrap the parsed records in DataFrames used by the rest of the notebook.
df, df_test = pd.DataFrame(data), pd.DataFrame(test_data)

# Preview the first three training rows.
df.head(3)

### Dataset Information
- Dataset: NSL-KDD
- Source: https://www.kaggle.com/datasets/hassan06/nslkdd

- Description: The dataset is a modified version of the NSL-KDD dataset, which is a subset of the original KDD'99 dataset. The NSL-KDD train and test sets contain 125,973 and 22,544 records, respectively. They were created by applying the following two steps to the original dataset:
    - Duplicate free: The duplicate records were removed from the original KDD'99 dataset.
    - Binary: The multiclass attacks in the dataset were converted to binary by combining all the attack types into a single attack type, and the normal records remained unchanged.

In [None]:
# Basic shape and dtype facts about the training frame.
samples, features = df.shape
value_types = ', '.join(str(dtype) for dtype in df.dtypes.unique())
total_missing = df.isnull().sum().sum()

# Report the figures gathered above.
print('Dataset Information: ')
print('--------------------')
print(f"Dataset has {features} features and {samples} samples.")
print(f"Dataset Value types: {value_types}")
print(f"Dataset has {total_missing} missing values.")
print("===========================================")

# Per-column dtype and non-null summary.
df.info()

### Data Cleaning : Handling & Imputing Missing Values

In this stage we ensure there are no missing / null values in the dataset.

In [None]:
# Count the null entries in each column of the training frame.
missingValues = df.isna().sum()
print(f"Missing Values in every Column: \n{missingValues}")

In [None]:
# Summary statistics over every column (include='all'), used to eyeball anomalies.
descriptive_stats = df.describe(include='all')
print(f"Descriptive Statistics: \n{descriptive_stats}")


Since there are no missing values in the dataset, we don't need to handle / impute missing values.
Also, since every feature is complete, we don't need to drop any feature.

## Outliers and Removing the Outliers
We'll find the outliers in the dataset and remove them if necessary.

In [None]:
# Create a pair plot with 'hue' parameter for coloring based on the 'class' column
# sns.pairplot(df[["protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "class"]], hue='class', diag_kind='kde')
# plt.tight_layout()
# plt.show()

Now, as there are outliers in our dataset, we will remove them. We will use the Z-score method to detect and remove the outliers. The Z-score is the signed number of standard deviations by which an observation lies above or below the mean of the values being measured.

In [None]:
from scipy.stats import zscore

# Rows with any |z-score| above this cut-off are treated as outliers.
zscore_threshold = 3

# Absolute z-scores over the int64/float64 columns only (categorical columns are skipped).
numeric_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)
z_scores = np.abs(zscore(df[numeric_columns]))

# True for every row in which at least one column exceeds the threshold.
outlier_mask = (z_scores > zscore_threshold).any(axis=1)

# Keep an untouched copy, then retain only the rows that are not flagged.
df_original = df.copy()
df = df_original.loc[~outlier_mask]

# Row/column counts before and after filtering.
print("Dataset Shape before removing outliers:", df_original.shape)
print("Dataset Shape after removing outliers:", df.shape)


In [None]:
# Absolute z-scores over the int64/float64 columns of the test frame.
# NOTE(review): this filters rows out of the evaluation set using statistics of the
# test set itself, so reported test metrics cover only the retained rows - confirm intended.
numeric_columns_test = list(df_test.select_dtypes(include=['int64', 'float64']).columns)
z_scores_test = np.abs(zscore(df_test[numeric_columns_test]))

# True for every test row in which at least one column exceeds the threshold.
outlier_mask_test = (z_scores_test > zscore_threshold).any(axis=1)

# Keep an untouched copy, then retain only the rows that are not flagged.
df_test_original = df_test.copy()
df_test = df_test_original.loc[~outlier_mask_test]

# Row/column counts before and after filtering.
print("Test Dataset Shape before removing outliers:", df_test_original.shape)
print("Test Dataset Shape after removing outliers:", df_test.shape)


### Feature Scaling

After removing the outliers, we'll now scale our datasets. We'll consider both MinMaxScaler and StandardScaler to see which one performs better.

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Split columns by dtype. The last object column is taken to be the target and is
# therefore left out of the categorical feature list.
object_columns = list(df.select_dtypes(include=['object']).columns)
categorical_columns = object_columns[:-1]
numeric_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)

# One-hot encode the categorical features, dropping the first level of each.
encoded_columns = pd.get_dummies(df[categorical_columns], drop_first=True)

# Min-max scale the numeric features. StandardScaler (imported at the top of this
# cell) is the alternative to compare: swap the class on the next line to try it.
scaler = MinMaxScaler()
scaled_columns = scaler.fit_transform(df[numeric_columns])

# Rebuild a labelled frame from the scaled array. Reusing df.index keeps the rows
# aligned with the one-hot columns when the two parts are concatenated.
scaled_df = pd.DataFrame(scaled_columns, columns=numeric_columns, index=df.index)
df_scaled = pd.concat([scaled_df, encoded_columns], axis=1)

# Preserve the unscaled frame (it still holds the target column), then rebind `df`.
df_not_scaled = df.copy()
df = df_scaled
df.head(3)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Separate categorical and numeric columns of the test frame.
# The last object column is taken to be the target and is excluded.
categorical_columns = df_test.select_dtypes(include=['object']).columns.tolist()[:-1]
numeric_columns = df_test.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Scale with the scaler that was fitted on the TRAINING data in the previous cell.
# Fitting a second scaler on the test set would map train and test onto different
# scales (and leak test statistics into preprocessing). Because the training min/max
# are used, scaled test values may fall slightly outside [0, 1]; that is expected.
scaled_columns = scaler.transform(df_test[numeric_columns])
scaled_df_test = pd.DataFrame(scaled_columns, columns=numeric_columns, index=df_test.index)

# At this point `df` is the scaled training frame, so every column of it that is not
# numeric is a one-hot column produced from the training data.
train_dummy_columns = [col for col in df.columns if col not in numeric_columns]

# One-hot encode WITHOUT drop_first and align to the training dummy columns instead:
#   - the baseline level dropped during training is dropped here too (it is simply
#     absent from train_dummy_columns), even if it is not the first level in the test set;
#   - levels that never occur in the test set become all-zero columns;
#   - levels never seen during training are discarded.
encoded_columns = (
    pd.get_dummies(df_test[categorical_columns])
    .reindex(columns=train_dummy_columns, fill_value=0)
    .astype(df[train_dummy_columns].dtypes.to_dict())
)

# Combine scaled numeric columns with the aligned one-hot columns.
df_test_scaled = pd.concat([scaled_df_test, encoded_columns], axis=1)
assert list(df_test_scaled.columns) == list(df.columns), \
    "train and test feature columns differ after preprocessing"

df_test_not_scaled = df_test.copy()
df_test = df_test_scaled
df_test.head(3)

Correlation analysis -
feature selection -
handling class imbalance -
dimensionality reduction


### Correlation Analysis
We'll use the Pearson Correlation method to find the correlation between the features. We'll remove the features which are highly correlated with each other. We'll try to keep the features which are highly correlated with the target variable.

In [None]:
# Pairwise Pearson correlations between the columns of `df`.
correlation_matrix = df.corr(method='pearson')

# Prepare the canvas, the seaborn theme and a blue-to-red diverging colour map.
plt.figure(figsize=(12, 10))
sns.set(style="white")
cmap = sns.diverging_palette(240, 10, as_cmap=True)

# Draw the heatmap without per-cell annotations, colour scale centred on zero.
heatmap_options = dict(cmap=cmap, annot=False, fmt=".2f", square=True, center=0, linewidths=0.5)
ax = sns.heatmap(correlation_matrix, **heatmap_options)

# Tilt the x tick labels for readability; keep the y tick labels horizontal.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, verticalalignment='center')

# Label the colour bar attached to the heatmap mesh.
cbar = ax.collections[0].colorbar
cbar.set_label('Correlation Strength')

plt.title("Pearson Correlation Heatmap")
plt.show()

In [None]:
# Pairwise Pearson correlations between the columns of `df_test`.
correlation_matrix = df_test.corr(method='pearson')

# Prepare the canvas, the seaborn theme and a blue-to-red diverging colour map.
plt.figure(figsize=(12, 10))
sns.set(style="white")
cmap = sns.diverging_palette(240, 10, as_cmap=True)

# Draw the heatmap without per-cell annotations, colour scale centred on zero.
ax = sns.heatmap(
    correlation_matrix,
    cmap=cmap,
    annot=False,
    fmt=".2f",
    square=True,
    center=0,
    linewidths=0.5,
)

# Tilt the x tick labels for readability; keep the y tick labels horizontal.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, verticalalignment='center')

# Label the colour bar attached to the heatmap mesh.
cbar = ax.collections[0].colorbar
cbar.set_label('Correlation Strength')

plt.title("Pearson Correlation Heatmap")
plt.show()

Now, We'll remove some features that are not useful for our model. We'll remove the following features:

In [None]:
# Remove

In [None]:
# Remove From test

### Feature Reduction
We'll use PCA to reduce the dimensionality of our dataset. We'll use the elbow method to determine the number of components to use.

In [None]:
from sklearn.decomposition import PCA

# Number of principal components to retain.
feature_to_keep = 75

# `df` holds features only at this point (the target column was excluded during
# encoding and scaling), so it is passed to PCA as-is without dropping anything.
unprocessed = df

# Fit PCA on the training features and project them in a single step.
pca = PCA(n_components=feature_to_keep)
preprocessed = pca.fit_transform(unprocessed)

print(f'Original shape: {unprocessed.shape}\nafter PCA: {preprocessed.shape}')

In [None]:

# `df_test` holds features only at this point (the target column was excluded during
# encoding and scaling), so nothing needs to be dropped.
# Align the test columns to the exact columns and order the training PCA was fitted
# on: columns absent from the test frame are filled with 0, and test-only columns
# are discarded.
unprocessed_test = df_test.reindex(columns=unprocessed.columns, fill_value=0)

# The test set must be projected with the PCA fitted on the TRAINING data.
# Fitting a second PCA on the test set produces components in a different basis,
# so a model trained on `preprocessed` would receive unrelated features at test time.
# `feature_to_keep_test` and `pca_test` are kept as aliases so existing references work.
feature_to_keep_test = feature_to_keep
pca_test = pca

# Project (do not re-fit) the test data onto the training components.
preprocessed_test = pca_test.transform(unprocessed_test)

print(f'Original Test shape: {unprocessed_test.shape}\nafter PCA: {preprocessed_test.shape}')

### Data Splitting
Now, we will split the data into training and testing sets. We will use 80% of the data for training and 20% for testing.

In [None]:
from sklearn.model_selection import train_test_split


# The labels come from the unscaled frame, which still carries the 'class' column.
target = df_not_scaled.loc[:, 'class']

# 80/20 split of the PCA-projected features, with a fixed seed for reproducibility.
split_parts = train_test_split(preprocessed, target, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes of the four resulting pieces.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)



## Training the Model

### Naive Bayes Classifier
We'll start by training a Naive Bayes Classifier.

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


# Binary-encode the class label for Naive Bayes: 0 = normal, 1 = any attack.
# The dtype guard makes this cell safe to re-run: without it, a second run would
# compare the already-encoded 0/1 integers against b'normal' and relabel every row as 1.
if df_not_scaled['class'].dtype == object:
    df_not_scaled['class'] = (df_not_scaled['class'] != b'normal').astype('int64')

# Split the PCA-projected features and encoded labels into training and validation sets.
target = df_not_scaled['class']
X_train, X_test, y_train, y_test = train_test_split(preprocessed, target, test_size=0.2, random_state=42)


# Initialize and train the Naive Bayes model.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

# Predict on the held-out split, then compute accuracy and the confusion matrix.
y_pred = naive_bayes_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Precision, recall and F1-score per class (0 -> "normal", 1 -> "attack").
report = classification_report(y_test, y_pred, target_names=["normal", "attack"])

print("================= Naive-Bayes Classifier=================\n")

print(f"Accuracy of Naive Bayes Classifier: {accuracy*100:.2f}%\n")
print(f"Confusion Matrix:")
print(conf_matrix)
print()
print("Classification Report:")
print(report)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Binary-encode the test class label: 0 = normal, 1 = any attack.
# The dtype guard makes this cell safe to re-run: without it, a second run would
# compare the already-encoded 0/1 integers against b'normal' and relabel every row as 1.
if df_test_not_scaled['class'].dtype == object:
    df_test_not_scaled['class'] = (df_test_not_scaled['class'] != b'normal').astype('int64')

# Features and labels of the separate test file (no further splitting is done here).
target_test = df_test_not_scaled['class']
X_test_final = preprocessed_test
y_test_final = target_test

# Initialize the Naive Bayes model and train it on the training split.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

# Predict on the preprocessed test data and compute the accuracy.
y_pred_test = naive_bayes_model.predict(X_test_final)
accuracy_test = accuracy_score(y_test_final, y_pred_test)

# Confusion matrix plus per-class precision, recall and F1-score.
conf_matrix_test = confusion_matrix(y_test_final, y_pred_test)
report_test = classification_report(y_test_final, y_pred_test, target_names=["normal", "attack"])

print("================= Naive-Bayes Classifier - Test Data =================\n")
print(f"Accuracy of Naive Bayes Classifier on Test Data: {accuracy_test*100:.2f}%\n")
print(f"Confusion Matrix on Test Data:")
print(conf_matrix_test)
print()
print("Classification Report on Test Data:")
print(report_test)


In [None]:

from sklearn.neighbors import KNeighborsClassifier

# Build a 5-nearest-neighbours classifier and fit it on the training split.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict on the held-out split.
y_pred = knn_model.predict(X_test)

# Evaluation: confusion matrix, accuracy and per-class precision/recall/F1.
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=["normal", "attack"])

print("================= K-Nearest Neighbors (KNN) Classifier =================\n")

print(f"Accuracy of KNN Classifier: {accuracy*100:.2f}%\n")
print("Confusion Matrix:")
print(conf_matrix)
print()
print("Classification Report:")
print(report)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Build a 5-nearest-neighbours classifier and fit it on the training split.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict on the preprocessed data from the separate test file.
y_pred_test = knn_model.predict(X_test_final)

# Evaluation: confusion matrix, accuracy and per-class precision/recall/F1.
conf_matrix_test = confusion_matrix(y_test_final, y_pred_test)
accuracy_test = accuracy_score(y_test_final, y_pred_test)
report_test = classification_report(y_test_final, y_pred_test, target_names=["normal", "attack"])

print("================= K-Nearest Neighbors (KNN) Classifier - Test Data =================\n")
print(f"Accuracy of KNN Classifier on Test Data: {accuracy_test*100:.2f}%\n")
print("Confusion Matrix on Test Data:")
print(conf_matrix_test)
print()
print("Classification Report on Test Data:")
print(report_test)
