In [2]:
 #%tensorflow_version 2.x
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout, Activation, Lambda, Flatten
from keras.layers import Conv1D, Dense, Dropout, MaxPooling1D, LSTM
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from keras import callbacks
from keras.callbacks import CSVLogger
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger
import numpy as np
import csv
import pandas as pd
import os
import glob
from sklearn.utils import resample
np.random.seed(0)

In [3]:

dataset_path = "/kaggle/input/be-ma-dataset"

# Get a list of all CSV files in the directory
csv_files = glob.glob(os.path.join(dataset_path, "*.csv"))

df_list = []

# Load each file and append it to the list
for file in csv_files:
    df = pd.read_csv(file)
    df_list.append(df)

# Merge all datasets into one DataFrame
full_dataset = pd.concat(df_list, ignore_index=True)

# Display label distribution before balancing
print("Original Label Distribution:\n", full_dataset["Label"].value_counts())

# Separate features (X) and labels (y)
y = full_dataset['Label']  # Extract the Label column as target
X = full_dataset.drop(['Label'], axis=1)  # Drop the Label column from features

# Remove non-numeric columns
non_numeric_columns = X.select_dtypes(include=['object']).columns.tolist()
X = X.drop(columns=non_numeric_columns)

# Combine X and y again for balancing
full_dataset_numeric = X.copy()
full_dataset_numeric['Label'] = y

# Separate benign and malicious samples
benign = full_dataset_numeric[full_dataset_numeric['Label'] == 0]
malicious = full_dataset_numeric[full_dataset_numeric['Label'] == 1]

# Downsample malicious to match benign count
malicious_downsampled = resample(
    malicious, 
    replace=False,  # No replacement to avoid duplicates
    n_samples=len(benign),  # Match benign count
    random_state=42  # For reproducibility
)

# Concatenate the balanced dataset
balanced_dataset = pd.concat([benign, malicious_downsampled])

# Shuffle the dataset
balanced_dataset = balanced_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and labels again
y_balanced = balanced_dataset['Label']
X_balanced = balanced_dataset.drop(['Label'], axis=1)

# Normalize the data
#X_balanced_normalized = (X_balanced - X_balanced.min()) / (X_balanced.max() - X_balanced.min())

print("Balanced Dataset Shape:", balanced_dataset.shape)
print("Balanced Label Distribution:\n", y_balanced.value_counts())

# Show the first few rows of normalized data
#print(X_balanced_normalized.head())


Original Label Distribution:
 Label
1    6889325
0      50674
Name: count, dtype: int64
Balanced Dataset Shape: (101348, 74)
Balanced Label Distribution:
 Label
1    50674
0    50674
Name: count, dtype: int64


In [4]:
balanced_df=X_balanced.drop(['Unnamed: 0', ' Source IP', ' Destination IP'], axis=1)
print(balanced_df.columns.tolist())


[' Source Port', ' Destination Port', ' Protocol', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' SYN Flag Count', ' RST Flag Count', ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count', ' Down/Up Ratio', ' Average Packet 

In [5]:
balanced_df.head()

Unnamed: 0,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,...,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound,Timestamp
0,37014,63652,17,107520,4,0,1438.0,0.0,389.0,330.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1541239000.0
1,55026,443,6,117947,4,1,24.0,0.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1541235000.0
2,61203,80,6,55048,1,1,6.0,6.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1541246000.0
3,45920,44189,17,1,2,0,802.0,0.0,401.0,401.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1543665000.0
4,62247,443,6,1,2,0,37.0,0.0,31.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1541259000.0


In [6]:
print(len(balanced_df.columns.tolist()))


70


In [7]:
y.nunique()


2

In [8]:
# Ensure y is taken from the balanced dataset
y = pd.get_dummies(y_balanced)  # Convert categorical labels to one-hot encoding
y.head()
print(balanced_df.shape, y.shape)

(101348, 70) (101348, 2)


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(balanced_df, y, test_size=0.25, random_state=0)
y_train = np.array(y_train)
X_train = (X_train - X_train.min()) / (X_train.max() - X_train.min())
X_test = (X_test - X_test.min()) / (X_test.max() - X_test.min())
print(X_train.shape, y_train.shape)

(76011, 70) (76011, 2)


In [10]:
y_train = np.array(y_train)
y_test = np.array(y_test)
X_train = np.array(X_train)
X_test = np.array(X_test)
X_train = np.reshape(X_train, (X_train.shape[0],X_train.shape[1],1))
X_test = np.reshape(X_test, (X_test.shape[0],X_test.shape[1],1))

In [11]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(76011, 70, 1) (76011, 2)
(25337, 70, 1) (25337, 2)


In [12]:
print(np.isnan(X_train).sum(), np.isinf(X_train).sum()) 
print(np.isnan(X_test).sum(), np.isinf(X_test).sum())# Check for NaNs or infs
print(np.isnan(y_train).sum(), np.isinf(y_train).sum())


2148 0
664 0
0 0


In [13]:
X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)
y_train = np.nan_to_num(y_train)


In [14]:
print(X_train.min(), X_train.max())
X_train = np.clip(X_train, 1e-8, 1.0)  # Avoid exact zeros
X_test = np.clip(X_test, 1e-8, 1.0)

0.0 1.0


In [15]:
y_train = y_train.astype(int)  # Convert boolean to integer (0 and 1)
y_test = y_test.astype(int)
print("Unique labels in y_train:", np.unique(y_train))


Unique labels in y_train: [0 1]


In [16]:
print('xtrain={}, ytrain={}, xtest={}, ytest={}'.format(X_train.shape,y_train.shape,X_test.shape,y_test.shape))

xtrain=(76011, 70, 1), ytrain=(76011, 2), xtest=(25337, 70, 1), ytest=(25337, 2)


In [18]:
print(y_train.shape)
print(y_test.shape)


(76011, 2)
(25337, 2)


In [22]:
model = Sequential()
model.add(Conv1D(64, 3, activation="relu", input_shape=(70,1)))
model.add(Conv1D(32, 3, activation="relu"))
model.add(Dropout(rate=0.3))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation="relu"))  # Add a fully connected layer
model.add(Dropout(0.5))
model.add(Dense(2, activation="softmax"))  # Output layer with 2 units (for classification)

In [23]:
from keras.optimizers import SGD

# Define the optimizer
sgd = SGD(learning_rate=0.009, momentum=0.9, nesterov=True)

model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=10, validation_data=(X_test, y_test))

Epoch 1/5
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 5ms/step - accuracy: 0.9872 - loss: 0.0354 - val_accuracy: 0.9992 - val_loss: 0.0042
Epoch 2/5
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9993 - loss: 0.0045 - val_accuracy: 0.9997 - val_loss: 0.0024
Epoch 3/5
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9994 - loss: 0.0026 - val_accuracy: 0.9997 - val_loss: 0.0020
Epoch 4/5
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9997 - loss: 0.0024 - val_accuracy: 0.9996 - val_loss: 0.0026
Epoch 5/5
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9997 - loss: 0.0024 - val_accuracy: 0.9997 - val_loss: 0.0018


<keras.src.callbacks.history.History at 0x7ed3fe29b0d0>

In [30]:
from tensorflow.keras.callbacks import EarlyStopping
m = Sequential()
m.add(Conv1D(128, 3, activation="relu", input_shape=(70,1)))
m.add(Conv1D(64, 3, activation="relu"))
m.add(MaxPooling1D(pool_size=2))
model.add(Dropout(rate=0.3))
#m.add(LSTM(500))
m.add(Flatten())
m.add(Dense(2))


m.compile(loss="mean_absolute_error", optimizer="adam", metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='val_loss',  # Watch validation loss
    patience=5,  # Stop if val_loss doesn’t improve for 5 epochs
    restore_best_weights=True  # Roll back to best model
)

m.fit(X_train, y_train, epochs=20, batch_size=10, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/20
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5ms/step - accuracy: 0.9875 - loss: 0.0363 - val_accuracy: 0.9996 - val_loss: 0.0267
Epoch 2/20
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5ms/step - accuracy: 0.9993 - loss: 0.0080 - val_accuracy: 0.9996 - val_loss: 0.0112
Epoch 3/20
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5ms/step - accuracy: 0.9997 - loss: 0.0061 - val_accuracy: 0.9996 - val_loss: 0.0113
Epoch 4/20
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 5ms/step - accuracy: 0.9996 - loss: 0.0052 - val_accuracy: 0.9996 - val_loss: 0.0099
Epoch 5/20
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5ms/step - accuracy: 0.9997 - loss: 0.0046 - val_accuracy: 0.9996 - val_loss: 0.0064
Epoch 6/20
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5ms/step - accuracy: 0.9997 - loss: 0.0042 - val_accuracy: 0.9996 - val_loss: 0.0097
Epoch 7/20

<keras.src.callbacks.history.History at 0x7ed3fc951a50>

In [34]:
m2 = Sequential()
m2.add(Conv1D(64, 3, activation="relu", input_shape=(70,1)))
m2.add(Dropout(0.3))
m2.add(Conv1D(32, 3, activation="relu"))
m2.add(MaxPooling1D(pool_size=2))
m2.add(Flatten())
m2.add(Dense(2))


m2.compile(loss="mean_absolute_error", optimizer="adam", metrics=['accuracy'])
early_stopping = EarlyStopping(
    monitor='val_loss',  # Watch validation loss
    patience=5,  # Stop if val_loss doesn’t improve for 5 epochs
    restore_best_weights=True  # Roll back to best model
)

m2.fit(X_train, y_train, epochs=10, batch_size=10, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/10
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 4ms/step - accuracy: 0.9829 - loss: 0.0543 - val_accuracy: 0.9985 - val_loss: 0.0132
Epoch 2/10
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step - accuracy: 0.9980 - loss: 0.0089 - val_accuracy: 0.9996 - val_loss: 0.0157
Epoch 3/10
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4ms/step - accuracy: 0.9996 - loss: 0.0064 - val_accuracy: 0.9996 - val_loss: 0.0084
Epoch 4/10
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4ms/step - accuracy: 0.9995 - loss: 0.0054 - val_accuracy: 0.9996 - val_loss: 0.0220
Epoch 5/10
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9996 - loss: 0.0047 - val_accuracy: 0.9996 - val_loss: 0.0206
Epoch 6/10
[1m7602/7602[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 5ms/step - accuracy: 0.9996 - loss: 0.0044 - val_accuracy: 0.9996 - val_loss: 0.0204
Epoch 7/10

<keras.src.callbacks.history.History at 0x7ed3fce7b580>

In [35]:
y_pred = m2.predict(X_test)

[1m792/792[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step


In [36]:
a = (y_pred > 0.5)
b = (y_test > 0.5) 

In [37]:
b=np.argmax(y_test, axis=1)

In [38]:
a=np.argmax(y_pred, axis=1)
a[2]

1

In [39]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

print('Convolution Neural Network')

print('Accuracy: %f' % (accuracy_score(a, b)*100))
print("Confusion Matrix =\n", metrics.confusion_matrix(b, a, labels=None, 
                                              sample_weight=None))
print("Recall =", metrics.recall_score(b, a, labels=None, 
                                             pos_label=1, average='weighted', 
                                             sample_weight=None))
print("Classification Report =\n", metrics.classification_report(b, a, 
                                                                 labels=None, 
                                                                 target_names=None, 
                                                                 sample_weight=None, 
                                                                 digits=2, 
                                                                 output_dict=False))

print("F1 Score = ",f1_score(a, b, average='macro'))

Convolution Neural Network
Accuracy: 99.960532
Confusion Matrix =
 [[12546     3]
 [    7 12781]]
Recall = 0.9996053202825906
Classification Report =
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     12549
           1       1.00      1.00      1.00     12788

    accuracy                           1.00     25337
   macro avg       1.00      1.00      1.00     25337
weighted avg       1.00      1.00      1.00     25337

F1 Score =  0.9996052863272954
