<a href="https://colab.research.google.com/github/HananAlfares/Enhanced-Intrusion-Detection-for-IoT/blob/main/Enhanced_Intrusion_Detection_for_IoT_Neural_Network_based_Imbalanced_Data_Handling_and_Feature_Reduction_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab runtime (this cell only works on Colab).
drive.mount("/content/drive/")
# Set working directory (consider running this again after mounting)
# so that the relative CSV path used by the loading cell resolves.
cwd = "/content/drive/My Drive/Intrusion analysis and incident management/Bot_IoT"
os.chdir(cwd)

Mounted at /content/drive/


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the preprocessed Bot-IoT table from the current working directory.
data = pd.read_csv(filepath_or_buffer="Bot_IoT_preprocessed.csv")

In [None]:
#multi-label classification
#each sample can belong to multiple classes simultaneously

In [None]:
#A multi-output neural network is a type of neural network that can produce multiple outputs for a single input.
#In this specific case, the neural network has multiple output nodes, each corresponding to a different label or category.

In [None]:
# Preview the first rows to sanity-check the loaded columns and values.
data.head()

Unnamed: 0,pkSeqID,stime,flgs,proto,saddr,sport,daddr,dport,pkts,bytes,...,srate,drate,attack,combined_DoS_HTTP,combined_DoS_TCP,combined_Normal_Normal,combined_Reconnaissance_OS_Fingerprint,combined_Reconnaissance_Service_Scan,combined_Theft_Data_Exfiltration,combined_Theft_Keylogging
0,1,1526344000.0,0,0,-5129129635686255861,,-8013749477516327964,,4,240,...,0.000836,0.000836,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,1526344000.0,0,5,-6185723984693521753,139.0,-2415109182496690282,36390.0,10,680,...,0.002751,0.002751,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3,1526344000.0,0,6,-1869885841190239628,51838.0,7736881533495019271,123.0,2,180,...,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,4,1526344000.0,0,0,-2415109182496690282,,-6185723984693521753,,10,510,...,0.002751,0.002751,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5,1526344000.0,0,6,-5248272586555793995,58999.0,-5129129635686255861,53.0,4,630,...,0.001755,0.001755,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# List every column name; the split cell below uses 'attack' as the boundary
# between feature columns and target columns.
print(data.columns)

Index(['pkSeqID', 'stime', 'flgs', 'proto', 'saddr', 'sport', 'daddr', 'dport',
       'pkts', 'bytes', 'state', 'ltime', 'seq', 'dur', 'mean', 'stddev',
       'smac', 'dmac', 'sum', 'min', 'max', 'soui', 'doui', 'sco', 'dco',
       'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'srate', 'drate',
       'attack', 'combined_DoS_HTTP', 'combined_DoS_TCP',
       'combined_Normal_Normal', 'combined_Reconnaissance_OS_Fingerprint',
       'combined_Reconnaissance_Service_Scan',
       'combined_Theft_Data_Exfiltration', 'combined_Theft_Keylogging'],
      dtype='object')


In [None]:
# Split data into X (features) and y (targets).
# Label-based `.loc` slices include BOTH endpoints, so `:'attack'` also selects
# the 'attack' column. Leaving it in X would hand the model one of its own
# targets as an input (label leakage), so it is dropped from the features.
X = data.loc[:, :'attack'].drop(columns='attack')  # every column before 'attack'
y = data.loc[:, 'attack':]  # 'attack' plus all category/subcategory indicator columns

In [None]:
# Compute class weights based on inverse class frequency.
# The weights are derived from the pooled 0/1 values of all target columns
# ('attack' plus the 'combined_*' indicator columns); the data has no 'category' column.
from sklearn.utils.class_weight import compute_class_weight

# 'y' is the DataFrame holding all target columns.
# Select them in one step: 'attack' first, then the 'combined_*' indicator
# columns. A single column selection yields the same frame as concatenating the
# columns one by one, without re-copying the growing frame on every iteration.
target_columns = [
    'attack',
    'combined_DoS_HTTP', 'combined_DoS_TCP', 'combined_Normal_Normal',
    'combined_Reconnaissance_OS_Fingerprint', 'combined_Reconnaissance_Service_Scan',
    'combined_Theft_Data_Exfiltration', 'combined_Theft_Keylogging',
]
y_concatenated = y[target_columns]

In [None]:
# Pool every entry of the target frame into one flat vector and derive
# 'balanced' (inverse-frequency) weights for the distinct label values in it.
pooled_labels = np.ravel(y_concatenated)
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(pooled_labels),
    y=pooled_labels,
)


In [None]:
# Show the computed weights; they are ordered like the sorted unique label
# values passed as `classes` when the weights were computed.
print(class_weights)


[0.66650637 2.00144404]


In [None]:
#Class 0 has a weight of approximately 0.67 (below 1), indicating that it is overrepresented (the majority value) compared to class 1.
#Class 1 has a weight of approximately 2.00 (above 1), indicating that it is underrepresented (the minority value) compared to class 0.

In [None]:
#One-hot encoding converts categorical variables into binary vectors,
# where each class is represented by a binary indicator column.
# If the original categorical variable had hierarchical or nested classes,
# one-hot encoding could lead to fewer classes after the transformation.

In [None]:
# Impute NaN values with the mean
from sklearn.impute import SimpleImputer
# Fill missing feature values with the mean of their column.
# NOTE(review): the imputer is fitted on the full feature matrix, before the
# train/test split done in a later cell, so test rows contribute to the fill
# values -- consider fitting on the training split only.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [None]:
# Normalize features
from sklearn.preprocessing import MinMaxScaler

# Scale the data using MinMaxScaler (its default maps each feature to [0, 1]).
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# done in a later cell, so test-set minima/maxima influence the scaling --
# consider fitting on the training split only.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [None]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#We build a multi-output neural network model using Keras Sequential API,
#where the output layer has sigmoid activation for multi-label classification.

In [None]:
# Build multi-output neural network model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fully connected network for multi-label classification.
# `Dense` requires the layer width (`units`) as its first argument; without it
# the constructor raises a TypeError. 256 continues the halving pattern of the
# layers below (256 -> 128 -> 64 -> 32) and can be tuned like any other width.
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),  # First hidden layer, sized to the feature count
    Dense(128, activation='relu'),  # Additional hidden layer
    Dense(64, activation='relu'),   # Additional hidden layer
    Dense(32, activation='relu'),   # Additional hidden layer
    Dense(y_train.shape[1], activation='sigmoid')  # One sigmoid unit per target column (multi-label output)
])

In [None]:
#Output Layer: The output layer consists of multiple nodes, each representing a different label or category.
#In this case, there is one output node per target column (y_train.shape[1]): the binary 'attack' flag
#plus the seven 'combined_*' category/subcategory indicator columns.

In [None]:
#The activation functions used in the hidden layers (typically ReLU) introduce non-linearity to the model,
#enabling it to learn complex patterns in the data. The output layer uses the sigmoid activation function,
#which is suitable for multi-label classification tasks as it produces probabilities for each label independently.

In [None]:
# Compile the model with a class-weighted binary cross-entropy.
# `loss_weights` in `compile` holds one scalar coefficient per model *output*;
# this model has a single output tensor, so the two per-class weights do not
# belong there. They are applied inside the loss function instead.
import tensorflow as tf

# class_weights follows the sorted unique label values: [weight for 0, weight for 1].
negative_weight, positive_weight = (float(w) for w in class_weights)


def weighted_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy with every label entry scaled by its class weight.

    Args:
        y_true: Tensor of 0/1 targets, one column per label.
        y_pred: Tensor of predicted probabilities with the same shape.

    Returns:
        Tensor with one loss value per sample (mean over the label axis).
    """
    y_true = tf.cast(y_true, y_pred.dtype)
    # Clip so that log() never receives exactly 0 or 1.
    eps = tf.keras.backend.epsilon()
    y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
    per_entry = -(y_true * tf.math.log(y_pred)
                  + (1.0 - y_true) * tf.math.log(1.0 - y_pred))
    # Positive entries get the weight of class 1, negative entries that of class 0.
    entry_weights = y_true * positive_weight + (1.0 - y_true) * negative_weight
    return tf.reduce_mean(entry_weights * per_entry, axis=-1)


# 'binary_accuracy' is named explicitly: with a custom loss the generic
# 'accuracy' shortcut may resolve to categorical accuracy, which is not
# meaningful for independent per-label sigmoid outputs.
model.compile(optimizer='adam', loss=weighted_binary_crossentropy, metrics=['binary_accuracy'])

In [None]:
# Train the model
# NOTE(review): the held-out test split is also passed as validation data, so
# the per-epoch validation metrics are measured on the same rows that the
# evaluation cells use later.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x79360e2bfee0>

In [None]:
# Score the test features; the sigmoid output layer yields one value in [0, 1]
# per target column for every row.
y_pred = model.predict(X_test)




In [None]:
# Turn the per-label scores into hard 0/1 decisions: a label counts as
# predicted only when its score is strictly above the cut-off.
threshold = 0.5
above_threshold = y_pred > threshold
y_pred_binary = above_threshold.astype(int)


In [None]:
# Evaluate the metrics using their weighted averages.
# With average='weighted', each label's score is weighted by its support (the number of true
# instances of that label), not by the class weights computed earlier,
# which is especially important when dealing with class imbalance.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Compare the thresholded predictions with the true multi-label targets.
# Precision, recall and F1 are averaged over the labels, weighted by each
# label's support.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred_binary)
precision, recall, f1 = (
    scorer(y_test, y_pred_binary, **weighted)
    for scorer in (precision_score, recall_score, f1_score)
)

for caption, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(caption, value)


Accuracy: 0.99997
Precision: 0.9999907026299295
Recall: 0.9999854893274003
F1 Score: 0.9999874245059415


In [None]:
from sklearn.metrics import classification_report
# Per-label precision/recall/F1 and support; the row numbers in the report are
# the positions of the target columns in y_test.
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    998529
           1       1.00      1.00      1.00      5828
           2       1.00      1.00      1.00    628024
           3       1.00      1.00      1.00      1471
           4       1.00      1.00      1.00     71660
           5       1.00      1.00      1.00    292734
           6       0.34      0.71      0.46        17
           7       0.98      0.91      0.94       266

   micro avg       1.00      1.00      1.00   1998529
   macro avg       0.91      0.95      0.93   1998529
weighted avg       1.00      1.00      1.00   1998529
 samples avg       1.00      1.00      1.00   1998529



In [None]:
import matplotlib.pyplot as plt

# Bar chart of the class weights (class_weights is a list or array with one weight per class).
class_positions = range(len(class_weights))

fig, ax = plt.subplots()
ax.bar(class_positions, class_weights)
ax.set_xlabel('Class')
ax.set_ylabel('Weight')
ax.set_title('Class Weights')
ax.set_xticks(class_positions)  # One tick per class index
plt.show()


In [None]:
#try to compute classification report
from sklearn.metrics import precision_score, classification_report
# Calculate classification report
# NOTE(review): this repeats the classification_report call from the earlier
# evaluation cell on the same inputs; here the text is kept in `report` and
# printed under a heading.
report = classification_report(y_test, y_pred_binary)
print("Classification Report:")
print(report)


In [None]:
 #Since ROC curve and ROC AUC are typically used for binary classification, you'll need to compute them for each class
 #separately in a one-vs-rest manner.

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# One-vs-rest ROC analysis: every target column is treated as its own binary problem.
# y_test is a pandas DataFrame (it comes from splitting `y`), so NumPy-style
# indexing such as y_test[:, i] raises InvalidIndexError. Both inputs are
# converted to plain arrays before any column is sliced.
y_true = np.asarray(y_test)
y_score = np.asarray(y_pred)  # predicted probabilities from model.predict(X_test)

# Compute ROC curve and ROC AUC score for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(y_true.shape[1]):  # Iterate over each class
    fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_score[:, i])
    roc_auc[i] = roc_auc_score(y_true[:, i], y_score[:, i])

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))
for i in range(y_true.shape[1]):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} (AUC = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


InvalidIndexError: (slice(None, None, None), 0)

In [None]:
# Hyperparameter tuning:
# increase the number of hidden layers

In [None]:
#However, if you're interested in understanding the impact of class weights on model performance,
#you can compare the ROC curve with and without class weights to observe any differences.
# This comparison can provide insights into how class weights affect the model's ability to
# discriminate between positive and negative classes.





