In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Cropping1D, Dropout, Conv1D, MaxPool1D, UpSampling1D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import RMSprop, Adam

# Load and preprocess data
def load_data(filename):
    """Read one CSV shard and collapse its fine-grained labels into coarse categories.

    Args:
        filename: path of the CSV file to read; it must contain a 'label' column.

    Returns:
        The loaded DataFrame with every known attack/benign label in 'label'
        replaced by its category name. Labels that are not listed below are
        left unchanged.
    """
    # Coarse category -> the raw labels that belong to it.
    category_members = {
        'DDoS': [
            'DDoS-ICMP_Flood', 'DDoS-UDP_Flood', 'DDoS-TCP_Flood', 'DDoS-PSHACK_Flood',
            'DDoS-SYN_Flood', 'DDoS-RSTFINFlood', 'DDoS-SynonymousIP_Flood', 'DDoS-ICMP_Fragmentation',
            'DDoS-UDP_Fragmentation', 'DDoS-ACK_Fragmentation', 'DDoS-HTTP_Flood', 'DDoS-SlowLoris',
        ],
        'DoS': ['DoS-UDP_Flood', 'DoS-TCP_Flood', 'DoS-SYN_Flood', 'DoS-HTTP_Flood'],
        'Recon': ['Recon-HostDiscovery', 'Recon-OSScan', 'Recon-PortScan', 'Recon-PingSweep', 'VulnerabilityScan'],
        'Mirai': ['Mirai-greeth_flood', 'Mirai-udpplain', 'Mirai-greip_flood'],
        'Spoofing': ['MITM-ArpSpoofing', 'DNS_Spoofing'],
        'BruteForce': ['DictionaryBruteForce'],
        'Web-based': [
            'BrowserHijacking', 'XSS', 'Uploading_Attack', 'SqlInjection',
            'CommandInjection', 'Backdoor_Malware',
        ],
        'BENIGN': ['BenignTraffic'],
    }
    # Flatten to the raw-label -> category lookup that Series.replace expects.
    label_replacements = {
        raw_label: category
        for category, raw_labels in category_members.items()
        for raw_label in raw_labels
    }

    frame = pd.read_csv(filename)
    frame['label'] = frame['label'].replace(label_replacements)
    return frame
# Read the first five CSV shards (part-00000 ... part-00004) and stack them into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each shard
# keeps its own 0-based row labels, so index values repeat across shards.
shard_paths = [
    f"/kaggle/input/cic-iot-2023/part-0000{i}-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv"
    for i in range(0, 5)
]
dataset = pd.concat([load_data(path) for path in shard_paths], ignore_index=True)

In [None]:
# Print pandas' structural summary of the combined frame (columns, dtypes, memory usage).
dataset.info()

In [None]:
# Row count per coarse traffic category (labels were collapsed in load_data), most frequent first.
dataset.value_counts('label')

In [None]:
# Learn the category -> integer mapping from the label column, then overwrite the
# column with the encoded values. The fitted encoder is kept for reuse on other data.
label_encoder = LabelEncoder().fit(dataset['label'])
dataset['label'] = label_encoder.transform(dataset['label'])

# Same per-class counts as before, now keyed by the integer codes.
dataset.value_counts('label')

In [None]:
# Compute the pairwise correlation matrix over the columns of `dataset`.
# The label column has already been integer-encoded at this point, so it takes part too.
corr_matrix = dataset.corr()

In [None]:
# Visualize correlation matrix
# Full column-by-column heatmap; cell annotations are switched off.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
import seaborn as sns
# Tall heatmap of a slice of the correlation matrix: the first 46 rows against the
# columns from position 46 onward.
# NOTE(review): presumably this isolates feature-vs-label correlations, which assumes
# 46 feature columns followed by the label column - TODO confirm.
label_corr_fig, label_corr_ax = plt.subplots(figsize=(10, 15))
heat = sns.heatmap(corr_matrix.iloc[:46, 46:], ax=label_corr_ax)

In [None]:
# Feature matrix: every column except the encoded target.
X = dataset.drop(columns='label')

In [None]:
# Summary statistics for every feature column.
# `describe` has to be called: a bare `X.describe` only displays the bound-method repr.
X.describe()

In [None]:
# Target vector: the integer-encoded traffic category for each row.
y = dataset['label']

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
# 'balanced' weights each class inversely to how often it occurs in y.
present_classes = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=present_classes, y=y)
class_weights_dict = {cls: weight for cls, weight in zip(present_classes, class_weights)}

# Print class weights to understand the distribution
print("Class Weights:", class_weights_dict)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.metrics import CategoricalAccuracy

# Normalize the feature data
# Standardise every feature column with StandardScaler. Swap in MinMaxScaler() here
# to try min-max scaling instead.
# NOTE(review): the scaler is fit on all of X before the train/validation split in the
# next cell, so validation rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
normalized_features = scaler.transform(X)

In [None]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(normalized_features, y, test_size=0.2, random_state=42)

In [None]:
# Hold-out test shard: a CSV part that is not among the five loaded into `dataset`.
test = load_data("/kaggle/input/cic-iot-2023/part-00016-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv")

In [None]:
# Scale the test features with the statistics the scaler learned when it was fit on X.
# Use transform, not fit_transform: refitting here would standardise the test shard with
# its own mean/std, putting it on a different scale than the data the model was trained on.
X_test = scaler.transform(test.drop(columns='label'))

In [None]:
# Encode the test labels with the mapping already learned from the training labels.
# transform (not fit_transform) keeps the class indices identical to the ones the model
# is trained on, and raises if the test shard contains a category the encoder never saw
# instead of silently renumbering the classes.
test['label'] = label_encoder.transform(test['label'])
y_test = test['label']
test.value_counts('label')

In [None]:
# Append a trailing length-1 axis so the test matrix becomes (rows, columns, 1),
# the same per-sample layout the model's Input layer declares.
X_test_expanded = X_test.reshape(-1, X_test.shape[1], 1)

In [None]:
from tensorflow.keras.utils import to_categorical
# Convert labels to categorical
# Pin the one-hot width to the number of encoded classes in the full label vector `y`.
# Left to infer it, to_categorical uses max(label) + 1 of whatever it is handed, so a
# split that happens to miss the highest class index would come out narrower than the other.
num_classes = int(y.max()) + 1
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_val_cat = to_categorical(y_val, num_classes=num_classes)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
# Define the learning rate schedule function
def lr_schedule(epoch, lr):
    """Return the learning rate to use for `epoch`.

    The rate is multiplied by 0.9 on every even-numbered epoch except epoch 0;
    on all other epochs the incoming rate is returned unchanged.

    Args:
        epoch: zero-based epoch index.
        lr: learning rate currently in effect.
    """
    decay_now = epoch != 0 and epoch % 2 == 0
    return lr * 0.9 if decay_now else lr  # Reduce learning rate by 10%

# Create callbacks
# lr_scheduler: passes each epoch's learning rate through lr_schedule and applies the result.
lr_scheduler = LearningRateScheduler(lr_schedule)
# early_stopping: watch val_loss, stop after 5 epochs without improvement, and restore
# the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
import numpy as np
from keras_tuner import HyperModel
from kerastuner.tuners import RandomSearch
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, MaxPool1D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1
from tensorflow.keras.metrics import CategoricalAccuracy
from sklearn.metrics import accuracy_score

class EncodedCNNHyperModel(HyperModel):
    """Keras Tuner search space for a 1-D CNN classifier over the scaled feature vectors.

    Tuned hyperparameters:
        num_layers   -- number of Conv1D/BatchNorm/MaxPool/Dropout stages (1 to 3)
        filters_<i>  -- Conv1D filters in stage i (32 to 128, step 32)
        dropout_<i>  -- dropout rate after stage i (0.2 to 0.5, step 0.1)
        units        -- width of the dense layer before the softmax (64 to 128, step 32)

    Reads the notebook globals X_train (input length) and y_train_cat (number of
    output classes), so both must exist before a model is built.
    """

    def build(self, hp):
        """Build and compile one candidate model for the hyperparameters in `hp`."""
        model = Sequential()
        # Each sample is a sequence of X_train.shape[1] feature values with a single channel.
        model.add(Input(shape=(X_train.shape[1], 1)))
        for i in range(hp.Int('num_layers', 1, 3)):
            model.add(Conv1D(filters=hp.Int('filters_' + str(i), 32, 128, step=32),
                             kernel_size=3, activation='relu'))
            model.add(BatchNormalization())
            model.add(MaxPool1D(pool_size=2))
            model.add(Dropout(hp.Float('dropout_' + str(i), 0.2, 0.5, step=0.1)))
        model.add(Flatten())
        model.add(Dense(units=hp.Int('units', 64, 128, step=32), activation='relu', kernel_regularizer=l1(0.01)))
        # Size the softmax from the one-hot target width so the output always matches the
        # labels passed to fit(); counting the distinct values in y_train comes up short
        # whenever a class index below the maximum is missing from the training split.
        model.add(Dense(y_train_cat.shape[1], activation='softmax'))
        model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=[CategoricalAccuracy()])
        return model
# Specify a batch size
batch_size = 128

# Add the trailing channel axis once so both splits have the (samples, steps, 1) layout
# declared by the model's Input layer.
X_train_seq = np.expand_dims(X_train, axis=-1)
X_val_seq = np.expand_dims(X_val, axis=-1)

# Setup Keras Tuner
# Random search over the EncodedCNNHyperModel space: up to 10 hyperparameter sets, each
# trained twice, compared on validation categorical accuracy.
tuner = RandomSearch(
    EncodedCNNHyperModel(),
    objective='val_categorical_accuracy',
    max_trials=10,
    executions_per_trial=2,
    directory='model_tuning',
    project_name='NetworkTrafficClassification',
)

# Search for the best model hyperparameters using the encoded features
search_options = dict(
    validation_data=(X_val_seq, y_val_cat),
    batch_size=batch_size,  # Explicitly set batch size
    epochs=10,
    callbacks=[early_stopping, lr_scheduler],
)
tuner.search(x=X_train_seq, y=y_train_cat, **search_options)

# Keep the single best model found by the search.
best_models = tuner.get_best_models(num_models=1)
best_model = best_models[0]

In [None]:
# Continue training the best tuned model, now with class weights applied to the loss.
# The scheduler callback is called `lr_scheduler` where it is created; the name
# `learning_rate_scheduler` used here before was never defined and raised NameError.
history = best_model.fit(
    np.expand_dims(X_train, axis=-1), y_train_cat,
    epochs=20,
    batch_size=batch_size,
    validation_data=(np.expand_dims(X_val, axis=-1), y_val_cat),
    callbacks=[early_stopping, lr_scheduler],
    class_weight=class_weights_dict
)

In [None]:
# Plot the training history
def plot_training_history(history):
    """Plot accuracy and loss curves for a finished training run.

    Draws a 1x2 figure: training vs. validation accuracy on the left and
    training vs. validation loss on the right, one point per epoch.

    Args:
        history: object whose ``history`` attribute maps the keys
            'categorical_accuracy', 'val_categorical_accuracy', 'loss' and
            'val_loss' to per-epoch sequences.

    Raises:
        KeyError: if any of the four series is missing from ``history.history``.
    """
    # The training curve comes from the un-prefixed key; only the validation series
    # carries the 'val_' prefix. Reading the 'val_' key for both drew the same curve twice.
    acc = history.history['categorical_accuracy']
    val_acc = history.history['val_categorical_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

    plt.figure(figsize=(14, 5))

    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, acc, 'bo', label='Training accuracy')
    plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.show()

# Draw the accuracy and loss curves recorded by the final best_model.fit run.
plot_training_history(history)

In [None]:
# Predict and evaluate on validation set
# Collapse both the predicted class probabilities and the one-hot targets to class indices.
y_val_pred = best_model.predict(X_val[..., np.newaxis])
y_val_pred_classes = y_val_pred.argmax(axis=1)
y_val_true_classes = y_val_cat.argmax(axis=1)

In [None]:
# Fraction of validation rows whose predicted class index equals the true class index.
classification_accuracy = accuracy_score(y_val_true_classes, y_val_pred_classes)
print(f"Classification accuracy on validation set: {classification_accuracy}")

In [None]:
# Evaluate the model on the test dataset
# The model is compiled with categorical_crossentropy, which expects one-hot targets,
# so the integer-encoded y_test is expanded to the model's output width first.
y_test_cat = to_categorical(y_test, num_classes=best_model.output_shape[-1])
test_loss, test_accuracy = best_model.evaluate(X_test_expanded, y_test_cat)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Predict on the test dataset
y_test_pred = best_model.predict(X_test_expanded)
y_test_pred_classes = np.argmax(y_test_pred, axis=1)
# y_test already holds one integer class index per row, so it is used directly;
# np.argmax(y_test, axis=1) fails on this 1-D series because it has no second axis.
y_test_true_classes = y_test.to_numpy()

# Calculate classification accuracy
classification_accuracy = accuracy_score(y_test_true_classes, y_test_pred_classes)
print(f"Classification accuracy on test set: {classification_accuracy}")

In [None]:
#best_model.save('CNN_Model_MinMaxScaler.keras')