In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import  train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Dataset Loading

In [24]:
df = pd.read_csv("reduced_dataset.csv")
df.head()

Unnamed: 0,Dst Port,Flow Duration,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Std,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,...,Fwd IAT Min,Bwd IAT Min,Fwd Header Len,Fwd Pkts/s,ACK Flag Cnt,Pkt Size Avg,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Label
0,445,21035444,1,0.0,0.0,0.0,0.237694,5258861.0,5130278.201409,12015355.0,...,0.0,3004536.0,32,0.047539,0,0.0,8192,0,0,Benign
1,53,414,1,46.0,46.0,0.0,4830.917874,414.0,0.0,414.0,...,0.0,0.0,8,2415.458937,0,77.0,-1,-1,0,Benign
2,51832,57,2,0.0,0.0,0.0,35087.719298,57.0,0.0,57.0,...,57.0,0.0,40,35087.719298,1,0.0,260,-1,0,Benign
3,53,2749,1,49.0,49.0,0.0,727.537286,2749.0,0.0,2749.0,...,0.0,0.0,8,363.768643,0,131.0,-1,-1,0,Benign
4,63974,1308,5,935.0,935.0,418.144712,5351.681957,218.0,283.818252,742.0,...,3.0,742.0,124,3822.629969,0,179.714286,65535,32768,1,Benign


In [31]:
labels = list(df["Label"].unique())
labels

['Benign',
 'Label',
 'Infilteration',
 'DDOS attack-LOIC-UDP',
 'DDOS attack-HOIC',
 'DoS attacks-SlowHTTPTest',
 'DoS attacks-Hulk',
 'Bot',
 'FTP-BruteForce',
 'SSH-Bruteforce',
 'DoS attacks-GoldenEye',
 'DoS attacks-Slowloris',
 'Brute Force -Web',
 'Brute Force -XSS',
 'SQL Injection',
 'DDoS attacks-LOIC-HTTP']

In [32]:
df.isnull().sum()

Dst Port             0
Flow Duration        0
Tot Fwd Pkts         0
TotLen Fwd Pkts      0
Fwd Pkt Len Max      0
Fwd Pkt Len Std      0
Flow Pkts/s          0
Flow IAT Mean        0
Flow IAT Std         0
Flow IAT Max         0
Flow IAT Min         0
Fwd IAT Tot          0
Fwd IAT Mean         0
Fwd IAT Max          0
Fwd IAT Min          0
Bwd IAT Min          0
Fwd Header Len       0
Fwd Pkts/s           0
ACK Flag Cnt         0
Pkt Size Avg         0
Init Fwd Win Byts    0
Init Bwd Win Byts    0
Fwd Act Data Pkts    0
Label                0
dtype: int64

In [33]:
df['Label'].value_counts().sum()

3931539

# Data PreProcessing

In [34]:
df_cleaned = df[df['Label'] != 'Label']

In [35]:
unique_labels = df_cleaned['Label'].unique()

label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

df_cleaned['Label'] = df_cleaned['Label'].map(label_mapping)

print("Label Mapping:", label_mapping)

Label Mapping: {'Benign': 0, 'Infilteration': 1, 'DDOS attack-LOIC-UDP': 2, 'DDOS attack-HOIC': 3, 'DoS attacks-SlowHTTPTest': 4, 'DoS attacks-Hulk': 5, 'Bot': 6, 'FTP-BruteForce': 7, 'SSH-Bruteforce': 8, 'DoS attacks-GoldenEye': 9, 'DoS attacks-Slowloris': 10, 'Brute Force -Web': 11, 'Brute Force -XSS': 12, 'SQL Injection': 13, 'DDoS attacks-LOIC-HTTP': 14}


In [36]:
X = df_cleaned.drop(columns = ['Label'])
y = df_cleaned["Label"]

In [37]:
X.shape, y.shape

((3931480, 23), (3931480,))

In [38]:
y.value_counts()

Label
0     1183245
3      686012
14     576191
5      461912
6      286191
7      193360
8      187589
1      161934
4      139890
9       41508
10      10990
2        1730
11        611
12        230
13         87
Name: count, dtype: int64

In [39]:
X = pd.DataFrame(X).apply(pd.to_numeric, errors='coerce').values

print("Infinities in X:", np.isinf(X).sum())

threshold = 1e+100
print("Large values in X:", (np.abs(X) > threshold).sum())

Infinities in X: 9659
Large values in X: 9659


In [40]:
X[np.isinf(X)] = np.nan

print("Infinities in X_train:", np.isinf(X).sum())

print("Large values in X_train:", (np.abs(X) > threshold).sum())

Infinities in X_train: 0
Large values in X_train: 0


In [41]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Data Splitting & Scaling

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [43]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3145184, 23), (786296, 23), (3145184,), (786296,))

In [44]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [45]:
Xi_train = X_train_scaled.reshape(len(X_train_scaled), X_train_scaled.shape[1], 1)
Xi_test = X_test_scaled.reshape(len(X_test_scaled), X_test_scaled.shape[1], 1)

In [46]:
Xi_train.shape, Xi_test.shape, y_train.shape, y_test.shape

((3145184, 23, 1), (786296, 23, 1), (3145184,), (786296,))

# Model Building

In [47]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Conv1D,MaxPooling1D,Flatten,BatchNormalization

In [48]:
model = Sequential()
model.add(Conv1D(filters=64,kernel_size=(3),activation="relu",padding="same",input_shape=(22, 1)))
model.add(BatchNormalization())
model.add(Conv1D(filters=64,kernel_size=(3),activation="relu",padding="same"))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=3,strides=2,padding="same"))
model.add(Conv1D(filters=64,kernel_size=(3),activation="relu",padding="same"))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=3,strides=2,padding="same"))
model.add(Flatten())
model.add(Dense(64,activation="relu"))
model.add(Dense(64,activation="relu"))
model.add(Dense(22,activation="softmax"))

In [49]:
model.summary()

# Model Training, Testing & Evaluation

In [50]:
def train_and_eval(X_train_data, y_train_data, X_test_data, y_test_data, scaled_data):
    model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])
    with tf.device('/GPU:0'):
        history = model.fit(X_train_data, y_train_data, epochs = 10, batch_size = 512, validation_split = 0.2)
        print('\n\n')
        scores = model.evaluate(X_test_data, y_test_data)
        print('\n\n')
        y_res_pred = model.predict(scaled_data)
    predicted_labels = [tf.argmax(pred).numpy() for pred in y_res_pred]
    print('\n\n')
    print(classification_report(y_test, predicted_labels))

In [52]:
train_and_eval(Xi_train, y_train, Xi_test, y_test, X_test_scaled)

Epoch 1/10
[1m4915/4915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 27ms/step - accuracy: 0.9324 - loss: 0.1762 - val_accuracy: 0.8319 - val_loss: 0.6046
Epoch 2/10
[1m4915/4915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 26ms/step - accuracy: 0.9358 - loss: 0.1675 - val_accuracy: 0.9357 - val_loss: 0.1724
Epoch 3/10
[1m4915/4915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 26ms/step - accuracy: 0.9376 - loss: 0.1618 - val_accuracy: 0.9372 - val_loss: 0.1641
Epoch 4/10
[1m4915/4915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 26ms/step - accuracy: 0.9377 - loss: 0.1604 - val_accuracy: 0.9373 - val_loss: 0.1632
Epoch 5/10
[1m4915/4915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 26ms/step - accuracy: 0.9383 - loss: 0.1595 - val_accuracy: 0.9343 - val_loss: 0.1674
Epoch 6/10
[1m4915/4915[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 26ms/step - accuracy: 0.9386 - loss: 0.1575 - val_accuracy: 0.9374 - val_loss: 0.162

# Model Training, Testing & Evaluation (Random forest)

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
rf = RandomForestClassifier()
rf.fit(X_train_scaled, y_train)

In [55]:
y_pred = rf.predict(X_test_scaled)

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.34      1.00      0.51    236563
           1       0.75      0.01      0.02     32229
           2       0.91      0.62      0.74       353
           3       1.00      0.24      0.39    137297
           4       0.00      0.00      0.00     28087
           5       0.88      0.05      0.09     92139
           6       1.00      0.42      0.59     57199
           7       0.00      0.00      0.00     38905
           8       1.00      0.50      0.67     37209
           9       1.00      0.46      0.63      8194
          10       1.00      0.66      0.80      2189
          11       1.00      0.25      0.40       128
          12       1.00      0.47      0.64        58
          13       0.00      0.00      0.00        21
          14       0.92      0.01      0.03    115725

    accuracy                           0.41    786296
   macro avg       0.72      0.31      0.37    786296
weighted avg       0.68   

In [57]:
rf.fit(X_train, y_train)

In [58]:
y_pred_02 = rf.predict(X_test)

In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.34      1.00      0.51    236563
           1       0.75      0.01      0.02     32229
           2       0.91      0.62      0.74       353
           3       1.00      0.24      0.39    137297
           4       0.00      0.00      0.00     28087
           5       0.88      0.05      0.09     92139
           6       1.00      0.42      0.59     57199
           7       0.00      0.00      0.00     38905
           8       1.00      0.50      0.67     37209
           9       1.00      0.46      0.63      8194
          10       1.00      0.66      0.80      2189
          11       1.00      0.25      0.40       128
          12       1.00      0.47      0.64        58
          13       0.00      0.00      0.00        21
          14       0.92      0.01      0.03    115725

    accuracy                           0.41    786296
   macro avg       0.72      0.31      0.37    786296
weighted avg       0.68   