In [13]:
import os

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential

# =============================
# 🔹 Load Dataset
# =============================
file_path = 'dataset_sdn.csv'
df = pd.read_csv(file_path, encoding='latin1')

# Column to predict. Defined before the feature lists so that it can be
# excluded from both of them by name instead of by a repeated literal.
target = 'label'

# Identify numeric and categorical features.
# A column counts as categorical when pandas stores it with object dtype ('O');
# everything else is treated as numeric. The target is removed up front so it
# can never end up inside X, even if the label column is stored as strings.
feature_columns = [column for column in df.columns if column != target]
numerical_features = [column for column in feature_columns if df[column].dtype != 'O']
print("The number of numerical features is", len(numerical_features), "and they are : \n", numerical_features)
categorical_features = [column for column in feature_columns if df[column].dtype == 'O']
print("The number of categorical features is", len(categorical_features), "and they are : \n", categorical_features)

# Separate features and target variable.
# Column order in X is: all numeric columns first, then all categorical ones.
X = df[numerical_features + categorical_features]
y = df[target]

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# =============================
# 🔹 Handle Missing Values
# =============================
# Numeric columns: learn the per-column mean on the training split only,
# then apply that same fill value to both splits.
num_imputer = SimpleImputer(strategy='mean').fit(X_train[numerical_features])
X_train_numeric = num_imputer.transform(X_train[numerical_features])
X_test_numeric = num_imputer.transform(X_test[numerical_features])

# Categorical columns: learn the most frequent value on the training split only,
# then apply that same fill value to both splits.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(X_train[categorical_features])
X_train_categorical = cat_imputer.transform(X_train[categorical_features])
X_test_categorical = cat_imputer.transform(X_test[categorical_features])

# =============================
# 🔹 Standardize Numeric Features
# =============================
# The scaler's statistics come from the training split; the test split is
# only transformed with them, never used for fitting.
scaler = StandardScaler().fit(X_train_numeric)
X_train_numeric_scaled = scaler.transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

# =============================
# 🔹 Encode Categorical Features
# =============================
# One-hot encode into a dense array. handle_unknown='ignore' means categories
# that appear only in the test split do not raise during transform.
vectorizer = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
).fit(X_train_categorical)
X_train_cat_encoded = vectorizer.transform(X_train_categorical)
X_test_cat_encoded = vectorizer.transform(X_test_categorical)

# =============================
# 🔹 Combine Processed Features
# =============================
# Join column-wise: scaled numeric columns first, one-hot columns after.
X_train_final = np.concatenate([X_train_numeric_scaled, X_train_cat_encoded], axis=1)
X_test_final = np.concatenate([X_test_numeric_scaled, X_test_cat_encoded], axis=1)

# Add a trailing axis of length 1 so each array becomes 3-D:
# (n_samples, n_features) -> (n_samples, n_features, 1)
X_train_final = X_train_final[:, :, np.newaxis]
X_test_final = X_test_final[:, :, np.newaxis]

# Convert the target Series to plain NumPy arrays
y_train, y_test = y_train.values, y_test.values

# =============================
# 🔹 Build CNN Model
# =============================
# Layers are applied in the order listed.
model = Sequential([
    # 1D convolution over the feature axis, followed by 2x down-sampling
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_final.shape[1], 1)),
    MaxPooling1D(pool_size=2),

    # Collapse the (steps, filters) map into a flat vector for the dense layers
    Flatten(),

    # Fully connected layers, each followed by dropout for regularization
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),

    # Single sigmoid unit: binary classification (DDoS or not)
    Dense(1, activation='sigmoid'),
])

# Compile the model with a binary cross-entropy loss and track accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary
model.summary()

# =============================
# 🔹 Train CNN Model
# =============================
# The test split is passed as validation data, so the per-epoch validation
# metrics are computed on the same rows that are used for the final evaluation.
history = model.fit(
    x=X_train_final,
    y=y_train,
    batch_size=32,  # Adjust the batch size
    epochs=10,  # Adjust the number of epochs
    verbose=1,
    validation_data=(X_test_final, y_test),
)

# =============================
# 🔹 Evaluate CNN Model
# =============================
# Make Predictions
y_pred_prob = model.predict(X_test_final)
# Threshold the sigmoid output at 0.5 and flatten the result so the predicted
# labels are 1-D, matching the shape of y_test.
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of CNN Model: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted
# class), so the four-way unpack below cannot fail when one class is absent.
# Assumes the label column is encoded as 0/1, as the sigmoid output implies.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()
print(f"TP: {TP}, TN: {TN}, FP: {FP}, FN: {FN}")

# =============================
# 🔹 Save Model, Scaler & Vectorizer
# =============================
# Create the output directory first so a fresh checkout does not fail on the
# first write, after training has already finished.
os.makedirs('Modells', exist_ok=True)

model.save('Modells/CNN_ddos_model.h5')  # Save the CNN model
joblib.dump(scaler, 'Modells/CNN_ddos_scaler.pkl')  # Save the scaler
joblib.dump(vectorizer, 'Modells/CNN_ddos_vectorizer.pkl')  # Save the vectorizer
# NOTE(review): num_imputer and cat_imputer are fitted earlier in this script
# but are not persisted here; inference on data with missing values would need them.

print("CNN model, scaler, and vectorizer saved.")

The number of numerical features is 19 and they are : 
 ['dt', 'switch', 'pktcount', 'bytecount', 'dur', 'dur_nsec', 'tot_dur', 'flows', 'packetins', 'pktperflow', 'byteperflow', 'pktrate', 'Pairflow', 'port_no', 'tx_bytes', 'rx_bytes', 'tx_kbps', 'rx_kbps', 'tot_kbps']
The number of categorical features is 3 and they are : 
 ['src', 'dst', 'Protocol']
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_8 (Conv1D)           (None, 57, 64)            256       
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 28, 64)           0         
 1D)                                                             
                                                                 
 flatten_4 (Flatten)         (None, 1792)              0         
                                                                 
 dense_8 (Dense)             