**IMPORTS**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
from tensorflow.keras.utils import plot_model
from sklearn.metrics import RocCurveDisplay
from tensorflow.keras.layers import (
    Input,
    Embedding,
    Attention,
    LayerNormalization,
    Dense,
)
from sklearn import tree
from tensorflow import keras
from tensorflow.keras import models, layers
import warnings

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
)
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, auc

warnings.filterwarnings("ignore")

**LOADING AND PREPROCESSING DATASET**

In [None]:
# Read the CSV located next to the notebook and report its (rows, columns) shape.
path = './clean_data.csv'
df = pd.read_csv(filepath_or_buffer=path, encoding='utf-8')
print(f"Data Shape: {df.shape}")

In [None]:
# Model input is the 'Sentence' column, the target is the 'Label' column.
X, y = df['Sentence'], df['Label']
print(f"{X.shape} {y.shape}")

In [None]:
import nltk

# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# Bag-of-words counts: drop English stop words, terms that appear in fewer
# than 2 documents (min_df) and terms that appear in more than 80% of them (max_df).
vectorizer = CountVectorizer(min_df=2, max_df=0.8, stop_words=stopwords.words('english'))

# Read the text from `df` instead of from `X`: the original `X = f(X.values)`
# form overwrote its own input, so running this cell a second time failed
# (after the first run `X` is an ndarray and has no `.values`).
# astype('U') coerces every entry to a unicode string before vectorising.
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
X = vectorizer.fit_transform(df['Sentence'].values.astype('U')).toarray()

In [None]:
# Hold out 20% of the samples for testing (80/20 split). A fixed random_state
# makes the split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Module-level result stores. The first four are filled by the training
# helpers, keyed by model name; the last three are only initialised here.
# The generator yields a fresh dict per name, so no two names share an object.
(
    f1_dict,
    precision_dict,
    recall_dict,
    accuracy_dict,
    train_accuracy,
    validation_accuracy,
    test_accuracy,
) = (dict() for _ in range(7))

## Support functions

In [None]:
# Function to plot the history graphs of the training and validation curves during training
def plot_history(history):
    """Plot the per-epoch training loss and, when recorded, the validation loss.

    Args:
        history: Object whose ``history`` attribute is a dict holding a
            ``'loss'`` sequence and optionally a ``'val_loss'`` sequence
            (the object returned by ``model.fit`` in this notebook).
    """
    history_dict = history.history
    train_loss = history_dict['loss']        # Training loss over epochs
    val_loss = history_dict.get('val_loss')  # Validation loss; None if fit() ran without validation data
    epochs = range(1, len(train_loss) + 1)   # 1-based epoch numbers for the x axis

    plt.plot(epochs, train_loss, 'b', label='Training error')
    if val_loss is not None:
        # Only the colour keyword is given: combining a 'b' format string with
        # color="orange" defines the colour twice.
        plt.plot(epochs, val_loss, color="orange", label='Validation error')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

def plot_historyAcc(history):
    """Plot the per-epoch training accuracy and, when recorded, the validation accuracy.

    Args:
        history: Object whose ``history`` attribute is a dict holding an
            ``'accuracy'`` sequence and optionally a ``'val_accuracy'``
            sequence (the object returned by ``model.fit`` in this notebook).
    """
    history_dict = history.history
    train_acc = history_dict['accuracy']        # Training accuracy over epochs
    val_acc = history_dict.get('val_accuracy')  # Validation accuracy; None if fit() ran without validation data
    epochs = range(1, len(train_acc) + 1)       # 1-based epoch numbers for the x axis

    plt.plot(epochs, train_acc, 'b', label='Training accuracy')
    if val_acc is not None:
        # Only the colour keyword is given: combining a 'b' format string with
        # color="orange" defines the colour twice.
        plt.plot(epochs, val_acc, color="orange", label='Validation accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

# Function to plot the confusion matrix
def plot_confusion_matrix(conf_matrix):
    """Draw ``conf_matrix`` as a shaded grid with every cell's value printed in it.

    Args:
        conf_matrix: 2-D array indexed as ``[row, column]``; rows are drawn on
            the y axis ('Actual Value') and columns on the x axis
            ('Predicted Value').
    """
    figure, axes = plt.subplots(figsize=(7.5, 7.5))
    axes.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)

    n_rows = conf_matrix.shape[0]
    n_cols = conf_matrix.shape[1]
    # Walk the cells in row-major order and write each count at its centre.
    for row, col in np.ndindex(n_rows, n_cols):
        axes.text(
            x=col,
            y=row,
            s=conf_matrix[row, col],
            va='center',
            ha='center',
            size='xx-large',
        )

    axes.set_xlabel('Predicted Value', fontsize=18)
    axes.set_ylabel('Actual Value', fontsize=18)
    axes.set_title('Confusion Matrix', fontsize=18)
    plt.show()

def plot_roc_auc(model, X_test, y_test):
    """Plot the ROC curve of ``model`` on the given test data.

    Args:
        model: Object with a ``predict`` method returning positive-class
            scores. Its ``name`` attribute is used as the display label when
            present and non-empty; otherwise the class name is used.
        X_test: Inputs passed unchanged to ``model.predict``.
        y_test: True binary labels passed to ``roc_curve``.
    """
    scores = np.asarray(model.predict(X_test))

    # Accept every common output layout:
    #   (n, k>1) -> column 1 holds the positive-class score
    #   (n, 1)   -> single score column
    #   (n,)     -> already flat (indexing shape[1] would raise here)
    if scores.ndim > 1 and scores.shape[1] > 1:
        scores = scores[:, 1]
    scores = scores.ravel()

    # Compute ROC curve
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    # One label for both legend and title. The title previously read
    # `model.name` directly, which fails for objects without that attribute.
    model_label = getattr(model, "name", None) or model.__class__.__name__

    # Plot ROC curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{model_label} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Classifier (AUC = 0.5)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {model_label}')
    plt.legend(loc='lower right')
    plt.show()

def plot_report(y_test, y_pred):
    """Show the confusion matrix for the predictions, then print the per-class report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
    """
    plot_confusion_matrix(confusion_matrix(y_test, y_pred))

    class_names = ["Non-Intrusion", "Intrusion"]
    report_text = classification_report(y_test, y_pred, target_names=class_names)
    print(report_text)

## Deep Learning

In [None]:
def train_dl_model(
    model_name,
    X_train,
    y_train,
    X_test,
    y_test,
    af="sigmoid",
    epochs=10,
    dense=64,
    learning_rate=0.01,
):
    """Build, train and evaluate a small Keras binary classifier.

    The scores are printed, stored in the module-level ``f1_dict``,
    ``precision_dict``, ``recall_dict`` and ``accuracy_dict`` under
    ``model_name``, and the loss/accuracy curves, ROC curve and confusion
    matrix are plotted.

    Args:
        model_name: ``"CNN"`` (two 1x1 Conv1D layers) or ``"RNN"`` (one SimpleRNN layer).
        X_train, X_test: Feature arrays; reshaped to ``(samples, 1, n_features)``.
        y_train, y_test: Binary labels.
        af: Activation of the single output unit.
            NOTE(review): the output layer has one unit, so ``"softmax"``
            always yields 1.0 there; confirm that this is intended by callers.
        epochs: Number of training epochs.
        dense: Number of SimpleRNN units (not used by the CNN).
        learning_rate: Adam learning rate.

    Raises:
        ValueError: If ``model_name`` is neither ``"CNN"`` nor ``"RNN"``.
    """
    # Fail early with a clear message; previously an unknown name surfaced
    # later as an unbound `model` variable.
    if model_name not in ("CNN", "RNN"):
        raise ValueError(
            f"Invalid model name: {model_name!r} (expected 'CNN' or 'RNN')"
        )

    # Take the feature width from the data instead of a hard-coded vocabulary
    # size, so the function keeps working when the vectoriser output changes.
    n_features = X_train.shape[-1]
    X_train_dl = X_train.reshape(-1, 1, n_features)
    X_test_dl = X_test.reshape(-1, 1, n_features)
    train_shape = X_train_dl.shape[1:]

    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    if model_name == "CNN":
        model = models.Sequential(name="CNN")
        model.add(layers.Conv1D(32, 1, activation="relu", input_shape=train_shape))
        model.add(layers.Conv1D(32, 1, activation="relu"))
        model.add(layers.Flatten())
        model.add(layers.Dense(1, activation=af))
        model.compile(
            optimizer=opt,
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=["accuracy"],
        )
    else:  # "RNN"
        model = models.Sequential(name="RNN")
        model.add(
            layers.SimpleRNN(units=dense, activation="relu", input_shape=train_shape)
        )
        model.add(layers.Dense(units=1, activation=af))
        model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])

    # NOTE(review): the test split doubles as the validation set here, so the
    # "validation" curves are computed on the same data as the final scores.
    history = model.fit(
        X_train_dl,
        y_train,
        batch_size=32,
        epochs=epochs,
        validation_data=(X_test_dl, y_test),
    )

    # Turn the single output score per sample into a hard 0/1 label.
    y_pred = np.round(model.predict(X_test_dl).flatten())

    # scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
    # precision and recall trade places, so the order matters for the dicts below.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Accuracy of {model_name} on test set : {accuracy}")
    print(f"F1 Score of {model_name} on test set : {f1}")

    # Record the scores under the model's name for the final comparison plot
    f1_dict[f"{model_name}"] = f1
    precision_dict[f"{model_name}"] = precision
    recall_dict[f"{model_name}"] = recall
    accuracy_dict[f"{model_name}"] = accuracy

    #! Plotting
    plot_history(history)
    plot_historyAcc(history)
    plot_roc_auc(model, X_test_dl, y_test)
    plot_report(y_test, y_pred)

## Machine learning

In [None]:
def train_ml_model(model_name, X_train, y_train, X_test, y_test):
    """Fit a scikit-learn classifier, score it on the test split and plot its ROC curve.

    The scores are printed and stored in the module-level ``f1_dict``,
    ``precision_dict``, ``recall_dict`` and ``accuracy_dict`` under
    ``model_name``.

    Args:
        model_name: One of ``"LogisticRegression"``, ``"RandomForest"``,
            ``"SVM"``, ``"NaiveBayes"`` or ``"DecisionTree"``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        None. An unknown ``model_name`` prints "Invalid model name" and
        returns without training anything.
    """
    # Name -> zero-argument factory, so only the requested estimator is built.
    model_factories = {
        "LogisticRegression": LogisticRegression,
        "RandomForest": RandomForestClassifier,
        "SVM": lambda: SVC(gamma="auto"),
        "NaiveBayes": GaussianNB,
        "DecisionTree": tree.DecisionTreeClassifier,
    }
    if model_name not in model_factories:
        print("Invalid model name")
        return None

    model = model_factories[model_name]()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
    # precision and recall trade places, so the order matters for the dicts below.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Accuracy of {model_name} on test set : {accuracy}")
    print(f"F1 Score of {model_name} on test set : {f1}")
    f1_dict[f"{model_name}"] = f1
    precision_dict[f"{model_name}"] = precision
    recall_dict[f"{model_name}"] = recall
    accuracy_dict[f"{model_name}"] = accuracy

    #!Plotting
    RocCurveDisplay.from_estimator(model, X_test, y_test)
    plt.show()

# Main

In [None]:
# Classical models to evaluate; each name must be one train_ml_model recognises.
list_model_ml = [
    "LogisticRegression",
    "RandomForest",
    "SVM",
    "NaiveBayes",
    "DecisionTree",
]
# 80/20 split with a fixed seed so that the reported scores are reproducible
# and every model in the list is evaluated on the same held-out samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for model in list_model_ml:
    train_ml_model(model, X_train, y_train, X_test, y_test)

In [None]:
list_model_dl = [
    "CNN",
    "RNN",
]
#! Select the experiment scenario to run here (1 -> 5)
scenarios = 1

# scenario -> (test_size, keyword overrides passed to train_dl_model)
scenario_settings = {
    1: (0.2, {"dense": 256}),  # adjust dense (only the RNN branch reads it)
    # NOTE(review): the models end in a single output unit, so "softmax"
    # always outputs 1.0 there; confirm this scenario is intended.
    2: (0.2, {"af": "softmax", "epochs": 20}),
    3: (0.2, {}),  # defaults
    4: (0.1, {}),  # defaults, 90/10 split
    5: (0.1, {"epochs": 20}),  # adjust epochs, 90/10 split
}
# Any other value falls back to the 80/20 split with default hyper-parameters.
test_size, dl_kwargs = scenario_settings.get(scenarios, (0.2, {}))

# Split once, with a fixed seed, before the loop. The split used to be redrawn
# at random for every model, so CNN and RNN were scored on different test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)
for model in list_model_dl:
    train_dl_model(model, X_train, y_train, X_test, y_test, **dl_kwargs)

# FINAL PLOT OF MODEL PERFORMANCE

In [None]:
# Assemble the score table: one row per metric, one column per model name
# (the keys of the score dicts).
metrics = ['F1_Score', 'Precision', 'Recall', 'Accuracy']
keys2 = (f1_dict, precision_dict, recall_dict, accuracy_dict)
data = pd.DataFrame(list(keys2), index=metrics)
data

In [None]:
# Bar chart of the score table: metrics along the x axis, one bar per model.
# The legend is anchored just outside the upper-right corner of the axes.
result = data.plot.bar(rot=0, figsize=(15, 7))
result.legend(loc='upper left', bbox_to_anchor=(1, 1.02));