In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline


In [2]:

# Load the dataset
cleaned_data = pd.read_csv('cleaned_dataset_full.csv')
print(cleaned_data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61238 entries, 0 to 61237
Data columns (total 84 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0    Source IP                    61238 non-null  int64  
 1    Source Port                  61238 non-null  int64  
 2    Destination IP               61238 non-null  int64  
 3    Destination Port             61238 non-null  int64  
 4    Protocol                     61238 non-null  int64  
 5    Flow Duration                61238 non-null  int64  
 6    Total Fwd Packets            61238 non-null  int64  
 7    Total Backward Packets       61238 non-null  int64  
 8   Total Length of Fwd Packets   61238 non-null  float64
 9    Total Length of Bwd Packets  61238 non-null  float64
 10   Fwd Packet Length Max        61238 non-null  float64
 11   Fwd Packet Length Min        61238 non-null  float64
 12   Fwd Packet Length Mean       61238 non-null  float64
 13   

In [3]:

# Get the label counts
label_counts = cleaned_data[' Label'].value_counts()
print("Label counts:")
print(label_counts)


Label counts:
 Label
BENIGN    51240
Syn        9998
Name: count, dtype: int64


In [4]:

# Separate features and labels
X = cleaned_data.drop(' Label', axis=1)
y = cleaned_data[' Label']


In [5]:

# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)


In [6]:

# Split the dataset into numeric and categorical columns
numeric_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Create transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [7]:

# Preprocess the data
X_preprocessed = preprocessor.fit_transform(X)


In [8]:

# Normalize the preprocessed features
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X_preprocessed)


In [9]:

# Reshape the data for CNN
feature_count = X_normalized.shape[1]
side_length = int(np.ceil(np.sqrt(feature_count)))
X_reshaped = np.zeros((X_normalized.shape[0], side_length, side_length, 1))

for i in range(X_normalized.shape[0]):
    X_reshaped[i, :, :, 0] = np.pad(X_normalized[i], (0, side_length**2 - feature_count)).reshape(side_length, side_length)


In [10]:

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_categorical, test_size=0.2, random_state=42)


In [11]:

# Define the CNN model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(side_length, side_length, 1)),
    MaxPooling2D(2, 2),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')  # 2 classes: normal and attack
])


  super().__init__(


In [12]:

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [13]:

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9680 - loss: 0.0842 - val_accuracy: 0.9995 - val_loss: 0.0044
Epoch 2/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9990 - loss: 0.0046 - val_accuracy: 0.9998 - val_loss: 0.0020
Epoch 3/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9991 - loss: 0.0026 - val_accuracy: 0.9998 - val_loss: 0.0018
Epoch 4/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9995 - loss: 0.0020 - val_accuracy: 0.9998 - val_loss: 0.0015
Epoch 5/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9998 - loss: 0.0013 - val_accuracy: 0.9998 - val_loss: 0.0016
Epoch 6/10
[1m1531/1531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9996 - loss: 0.0026 - val_accuracy: 0.9998 - val_loss: 0.0013
Epoch 7/10
[1m1

In [14]:

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy}")


[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9999 - loss: 0.0014
Test accuracy: 0.9998366832733154


In [15]:

# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)


[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [16]:

# Print classification report
print("Classification Report:")
print(classification_report(y_true, y_pred_classes, target_names=label_encoder.classes_))


Classification Report:
              precision    recall  f1-score   support

      BENIGN       1.00      1.00      1.00     10198
         Syn       1.00      1.00      1.00      2050

    accuracy                           1.00     12248
   macro avg       1.00      1.00      1.00     12248
weighted avg       1.00      1.00      1.00     12248



In [17]:

# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred_classes)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


NameError: name 'sns' is not defined

<Figure size 1000x800 with 0 Axes>

In [None]:

# Plot training history
plt.figure(figsize=(10, 5))


In [None]:

# Plot training & validation accuracy values
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')


In [None]:

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')

plt.tight_layout()
plt.show()