In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Read the CSV into a DataFrame and preview its first rows.
gnidd = pd.read_csv(filepath_or_buffer='/content/sample_data/5gnidd.csv')
print(gnidd.head(), end="\n\n")

# Snapshot the column names and their count before any column is removed.
unique_columns_before = list(gnidd.columns)
num_columns_before = gnidd.shape[1]
print(f"Number of columns before dropping: {num_columns_before}")
print("Unique columns before dropping:", unique_columns_before, sep="\n")


# Describing dataset-
# The summary has to be printed explicitly: a bare expression that is not the
# last statement of a notebook cell is evaluated and its value thrown away.
print(gnidd.describe())
print()


# Columns that are removed from the frame before any further processing.
unnecessary_columns = [
    'Unnamed: 0', 'RunTime', 'Min', 'Max',
    'sTos', 'dTos', 'sDSb', 'dDSb',
    'sHops', 'dHops', 'SrcWin', 'DstWin',
    'sVid', 'dVid', 'SrcTCPBase', 'DstTCPBase',
    'TcpRtt', 'SynAck', 'AckDat',
]

gnidd = gnidd.drop(columns=unnecessary_columns)

# Report what is left after the drop.
unique_columns_after = list(gnidd.columns)
num_columns_after = gnidd.shape[1]
print(f"Number of columns after dropping: {num_columns_after}")
print("Unique columns after dropping:", unique_columns_after, sep="\n")


# Remove every row that contains at least one missing value.
gnidd = gnidd.dropna(axis=0, how='any')
print(gnidd, end="\n\n")


# Count the rows that are exact duplicates of an earlier row.
print(gnidd.duplicated().sum(), end="\n\n")


# Dropping duplicate rows (the first occurrence of each row is kept).
gnidd = gnidd.drop_duplicates(keep='first')
print(gnidd.shape)
print(gnidd.head(), end="\n\n")


# Features (X) are everything except the three label columns; the target (Y)
# is the 'Attack Type' column.
X = gnidd.drop(columns=['Label', 'Attack Type', 'Attack Tool'])
Y = gnidd.loc[:, 'Attack Type']


# One-hot encode the categorical feature columns.
categorical_cols = ['Proto', 'Cause', 'State']
X = pd.get_dummies(data=X, columns=categorical_cols)


# Performing Feature Selection: ANOVA Method
from sklearn.feature_selection import SelectKBest, f_classif

# Number of top-scoring features to keep.
k_best = 10
anova_selector = SelectKBest(f_classif, k=k_best)
X_selected = anova_selector.fit_transform(X, Y)

# Print the columns that are observed (every candidate the selector scored).
observed_columns = X.columns.tolist()
print("Observed columns:")
print(observed_columns)


# Print the selected features. The positions and names are resolved once and
# reused below instead of being recomputed.
selected_feature_indices = anova_selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]
print("Selected Features:")
print(selected_feature_names)


# Converting selected features to a DataFrame.
# X's row index is reused on purpose: dropna()/drop_duplicates() above do not
# reset the index, so a fresh 0..n-1 index here would no longer line up with
# X, Y or gnidd in any later join or label-based lookup.
X_selected_df = pd.DataFrame(
    X_selected,
    columns=selected_feature_names,
    index=X.index,
)

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Numeric columns that are standardised below.
col_to_scale = ['Seq','Dur','Mean','Sum','TotPkts','SrcPkts','DstPkts','TotBytes',
                'SrcBytes','DstBytes','Offset','sMeanPktSz','dMeanPktSz','Load','SrcLoad',
                'DstLoad','Loss','SrcLoss','DstLoss','pLoss','Rate','SrcRate','DstRate',
                'SrcGap','DstGap','sTtl','dTtl'
                ]

numeric_data = gnidd[col_to_scale]
# Min-Max Scaling:-
# NOTE(review): this scaler is created but never fitted or applied in this cell.
min_max_sclaing = MinMaxScaler()

# Applying Z-Score Normalization:-
standard_scaling = StandardScaler()
# Build the scaled frame in one step and keep numeric_data's row index, so the
# scaled values stay aligned with gnidd (whose index is not reset after
# dropna()/drop_duplicates()) instead of being renumbered 0..n-1.
# NOTE(review): standard_scaled_data is not referenced again in this cell.
standard_scaled_data = pd.DataFrame(
    standard_scaling.fit_transform(numeric_data),
    columns=col_to_scale,
    index=numeric_data.index,
)

# Converting dataframe to a numpy array:-
# A single float dtype is requested explicitly. When get_dummies() produces
# bool indicator columns, a plain to_numpy() on the mixed float/bool frame
# yields an object array, which the Keras model below cannot turn into a tensor.
# NOTE(review): X here is the full one-hot encoded feature matrix; neither
# X_selected nor standard_scaled_data is fed to the model in this cell.
X = X.to_numpy(dtype="float32")

# Fit the encoder on the target column and replace each label by its integer id.
label_encoder = LabelEncoder().fit(Y)
Y = label_encoder.transform(Y)

# Expand the integer ids into one-hot rows, one column per known class.
numerical_classes = label_encoder.classes_.size
Y = tf.keras.utils.to_categorical(Y, num_classes=numerical_classes)
print('Encoded Labels: ', Y, end="\n\n")

# Hold out 30% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Append a trailing axis of length 1 so every sample is shaped
# (n_features, 1), which is the per-sample shape handed to the LSTM below.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]
input_shape = X_train.shape[1:]

# Seed Python, NumPy and TensorFlow before any weights are created so that
# weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Building our RNN model:-
# One LSTM layer over the (n_features, 1) input, a ReLU hidden layer, and a
# softmax output with one unit per encoded class.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape=input_shape),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(numerical_classes, activation='softmax')
])

# Model Compilation:-
# categorical_crossentropy matches the one-hot encoded targets built earlier.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Fit the network; validation_split=0.3 holds out 30% of the training data
# for per-epoch validation.
epochs = 500
batch_size = 128
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.3,
)

# Score the fitted model on the held-out test set.
loss, accuracy = model.evaluate(X_test, Y_test, batch_size=batch_size)
print(f'Test loss is: {loss:.4f}, Test accuracy is: {accuracy:.4f}')

# Predict class probabilities, then collapse both the predictions and the
# one-hot targets to the index of their largest entry.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = Y_test.argmax(axis=1)

# Map the class indices back to the original labels.
y_pred_labels = label_encoder.inverse_transform(y_pred_classes)
y_test_labels = label_encoder.inverse_transform(y_test_classes)

# Overall accuracy on the decoded labels.
accuracy = accuracy_score(y_true=y_test_labels, y_pred=y_pred_labels)
print(f'Accuracy is: {accuracy:.4f}')

# Text report of the per-class scores.
classification_rep = classification_report(y_true=y_test_labels, y_pred=y_pred_labels)
print('Classification Report is:', classification_rep, sep='\n')

# Confusion matrix of true labels against predicted labels.
confusion_mat = confusion_matrix(y_true=y_test_labels, y_pred=y_pred_labels)
print('Confusion Matrix is:', confusion_mat, sep='\n')

# Printing predicted values, followed by the true values.
print(y_pred_labels, y_test_labels, sep='\n')

from sklearn.metrics import precision_score, recall_score, f1_score
import time

# Model Evaluation (timed):-
# The interval measured here wraps model.evaluate(), so it is an evaluation
# time, not a training time.
start_time = time.time()
loss, accuracy = model.evaluate(X_test, Y_test, batch_size=batch_size)
end_time = time.time()
evaluation_time = end_time - start_time
# Kept under its original name in case a later cell still reads it.
training_time = evaluation_time

print(f'Test loss is: {loss:.4f}, Test accuracy is: {accuracy:.4f}')
print(f'Evaluation time: {evaluation_time:.2f} seconds')

# Time a single prediction pass over the test set.
start_time = time.time()
y_pred = model.predict(X_test)
end_time = time.time()
prediction_time = end_time - start_time

# Collapse probabilities / one-hot rows to class indices.
y_pred_classes = y_pred.argmax(axis=1)
y_test_classes = Y_test.argmax(axis=1)

# Calculate precision, recall, F1-score for each class.
# The scores are computed over the FULL test set with average=None, which
# returns one value per class. Restricting the data to the rows whose true
# class is i (as was done before) hides every false positive coming from
# other classes, so precision would always come out as 1.0.
class_names = label_encoder.classes_
class_ids = np.arange(len(class_names))

# zero_division=0 reports 0.0 (instead of warning) for a class that has no
# true or no predicted samples in the test set.
per_class_precision = precision_score(
    y_test_classes, y_pred_classes, labels=class_ids, average=None, zero_division=0
)
per_class_recall = recall_score(
    y_test_classes, y_pred_classes, labels=class_ids, average=None, zero_division=0
)
per_class_f1 = f1_score(
    y_test_classes, y_pred_classes, labels=class_ids, average=None, zero_division=0
)

for class_label, precision, recall, f1 in zip(
    class_names, per_class_precision, per_class_recall, per_class_f1
):
    print(f"Class: {class_label}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("-" * 40)

print(f'Prediction time: {prediction_time:.2f} seconds')

   Unnamed: 0  Seq       Dur   RunTime      Mean       Sum       Min  \
0           0    1  0.000000  0.000000  0.000000  0.000000  0.000000   
1           1    2  0.000000  0.000000  0.000000  0.000000  0.000000   
2           2    3  4.998020  4.998020  4.998020  4.998020  4.998020   
3           3    4  4.998037  4.998037  4.998037  4.998037  4.998037   
4           4    5  4.999453  4.999453  4.999453  4.999453  4.999453   

        Max Proto  sTos  ...  sVid dVid SrcTCPBase  DstTCPBase  TcpRtt  \
0  0.000000  icmp   0.0  ...   NaN  NaN        NaN         NaN     0.0   
1  0.000000  icmp   0.0  ...   NaN  NaN        NaN         NaN     0.0   
2  4.998020   udp   0.0  ...   NaN  NaN        NaN         NaN     0.0   
3  4.998037   udp   0.0  ...   NaN  NaN        NaN         NaN     0.0   
4  4.999453   udp   0.0  ...   NaN  NaN        NaN         NaN     0.0   

   SynAck  AckDat   Label  Attack Type  Attack Tool  
0     0.0     0.0  Benign       Benign       Benign  
1     0.0     

  f = msb / msw


Observed columns:
['Seq', 'Dur', 'Mean', 'Sum', 'sTtl', 'dTtl', 'TotPkts', 'SrcPkts', 'DstPkts', 'TotBytes', 'SrcBytes', 'DstBytes', 'Offset', 'sMeanPktSz', 'dMeanPktSz', 'Load', 'SrcLoad', 'DstLoad', 'Loss', 'SrcLoss', 'DstLoss', 'pLoss', 'SrcGap', 'DstGap', 'Rate', 'SrcRate', 'DstRate', 'Proto_tcp', 'Cause_Start', 'Cause_Status', 'State_CON', 'State_FIN', 'State_RST']
Selected Features:
Index(['sTtl', 'TotPkts', 'SrcPkts', 'DstPkts', 'SrcBytes', 'sMeanPktSz',
       'Cause_Start', 'Cause_Status', 'State_CON', 'State_RST'],
      dtype='object')
Encoded Labels:  [[1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.

With a batch size of 50 (smaller) and 100 epochs (more), an 80:20 train/test split, and a validation split of 0.2, accuracy = 100%.