# Environment & Libraries Setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from mlxtend.classifier import StackingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import GaussianNB
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, average_precision_score
from google.colab import drive
from imblearn.pipeline import Pipeline
from collections import Counter
import joblib
import gradio as gr

## Mounting Google Drive


In [None]:
# Mount Google Drive under /content/drive; the dataset CSVs and the saved
# model files used below are read from / written to paths beneath this mount.
# force_remount=True asks Colab to remount even if a mount is already present.
drive.mount('/content/drive', force_remount=True)

# Data Loading and Preprocessing

In [None]:
# Load the three dataset parts from Google Drive
df1 = pd.read_csv('/content/drive/MyDrive/Magisterka/Dataset/l1-nondoh.csv')
df2 = pd.read_csv('/content/drive/MyDrive/Magisterka/Dataset/l2-benign.csv')
df3 = pd.read_csv('/content/drive/MyDrive/Magisterka/Dataset/l2-malicious.csv')

# Keep only the columns present in all three files. The columns are taken in
# df1's order instead of going through set(): the iteration order of a set of
# strings is not stable between interpreter sessions, which would make the
# feature order (and every model fitted on it) change from run to run.
common_cols = [col for col in df1.columns if col in df2.columns and col in df3.columns]
df1 = df1[common_cols]
df2 = df2[common_cols]
df3 = df3[common_cols]

# Concatenate the dataframes into a single dataframe
df = pd.concat([df1, df2, df3], ignore_index=True)

# info() writes its summary itself; it has to be called, not printed as an attribute
df.info()

# Remove duplicate rows and reset index
df = df.drop_duplicates().reset_index(drop=True)

# Shuffle the combined dataframe (fixed seed for reproducibility)
shuffled_df = df.sample(frac=1, random_state=42)
df = shuffled_df.reset_index(drop=True)

# Drop high-cardinality or identifier columns that are not useful for modeling;
# columns that are not present are skipped instead of raising
drop_cols = ['SourceIP', 'DestinationIP', 'SourcePort', 'DestinationPort', 'TimeStamp']
df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# Handle missing values by dropping rows with any missing values
df = df.dropna(axis=0)

# Separate features (X) and the target label (y)
X = df.drop('Label', axis=1)
y = df['Label']

In [None]:
# Work on a copy so that `df` keeps the original string labels
df_02 = df.copy()

# Learn the class names first, then replace the 'Label' column with their integer codes
le = LabelEncoder()
le.fit(df_02['Label'])
df_02['Label'] = le.transform(df_02['Label'])

# Preview only the first rows instead of printing the entire large dataframe
print(df_02.head())

In [None]:
# Use the integer-encoded 'Label' column of df_02 as the target variable y
# (this replaces the string-valued y taken from df in the loading cell)
y = df_02['Label']

# Bare expression: as the last statement of the cell it displays y in the notebook
y

In [None]:
# Pair every original class name with the integer code the encoder assigned to it
encoded_values = le.transform(le.classes_)
label_mapping = {name: code for name, code in zip(le.classes_, encoded_values)}
print("Label mapping:", label_mapping)

# Data Analysis

## Dataset Overview

In [None]:
# Column dtypes, non-null counts and memory usage
print(df.info())

# Summary statistics of the numerical columns
print(df.describe())

# Bar chart of the number of samples per class, used to check for class imbalance
ax = sns.countplot(x='Label', data=df)
ax.set_title('Distribution of Target Classes')
plt.show()

## Feature Analysis & Correlation



In [None]:
# Every column except the target is treated as a feature
features = [name for name in df.columns if name != 'Label']

# One histogram per feature, drawn on a shared grid of subplots
hist_options = dict(bins=30, figsize=(18, 18), color='#4287f5', grid=False)
df[features].hist(**hist_options)
plt.tight_layout()
# y > 1 places the overall title above the subplot grid
plt.suptitle('Feature Histograms', fontsize=20, y=1.02)
plt.show()

## Correlation Matrix and Heatmap

In [None]:
# Pairwise correlations of all columns of df_02 (features plus the encoded label)
corr = df_02.corr()

# Draw the matrix as a heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(corr, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

# Feature Engineering

### Raw Data Split

In [None]:
# Target: the encoded label column; features: every other column of df_02
y = df_02['Label']
X = df_02.drop(columns='Label')

In [None]:
# Split the data into training (80%) and testing (20%) parts.
# random_state=42 fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified, so class proportions in the two
# parts may differ slightly — confirm this is intended for the imbalanced labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Oversampling (SMOTE)

In [None]:
# Oversample with SMOTE; only the training split is resampled
oversampler = SMOTE(random_state=42)
resampled = oversampler.fit_resample(X_train, y_train)
X_train_oversampled, y_train_oversampled = resampled

# Class distribution before and after oversampling
counts_before = Counter(y_train)
counts_after = Counter(y_train_oversampled)
print("Original training set shape:", counts_before)
print("Resampled training set shape:", counts_after)

## Undersampling

In [None]:
# Randomly undersample the training split (default sampling strategy)
undersampler = RandomUnderSampler(random_state=42)
resampled = undersampler.fit_resample(X_train, y_train)
X_train_downsampled, y_train_downsampled = resampled

# Class distribution before and after undersampling
counts_before = Counter(y_train)
counts_after = Counter(y_train_downsampled)
print("Original training set shape:", counts_before)
print("Resampled training set shape:", counts_after)

## Hybrid SMOTE + undersampler

In [None]:
# SMOTE brings classes 0 and 1 to 250000 samples each
oversampler = SMOTE(sampling_strategy={0: 250000, 1: 250000}, random_state=42)

# Random undersampling brings class 2 to 250000 samples
undersampler = RandomUnderSampler(sampling_strategy={2: 250000}, random_state=42)

# Chain both steps: undersampling runs first, then oversampling
resampling_steps = [
    ('undersample', undersampler),
    ('oversample', oversampler),
]
pipeline = Pipeline(steps=resampling_steps)

# Resample the training split only
X_train_balanced, y_train_balanced = pipeline.fit_resample(X_train, y_train)

# Class distribution before and after the hybrid resampling
counts_before = Counter(y_train)
counts_after = Counter(y_train_balanced)
print("Original training set shape:", counts_before)
print("Resampled training set shape:", counts_after)


## Standard Scaler

### Standard Scaler Split

In [None]:
df_scaled = X.copy()

In [None]:
# Split with reproducibility
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(df_scaled[features], y, test_size=0.2, random_state=42)

In [None]:
# Apply Standard Scaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std  = scaler.transform(X_test)

# Machine Learning - classification models training

## Models: Random Forest, Naive Bayes, Logistic Regression, XGBoost

In [None]:
def mapping_confusion_matrix(y_test, y_pred, model, label):
    """Plot a confusion-matrix heatmap with human-readable class names.

    Args:
        y_test: True integer class labels (0, 1 or 2).
        y_pred: Predicted integer class labels.
        model: Fitted estimator; only its class name is used, for the title.
        label: Free-text description of the data variant, shown in the title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Integer code -> class name
    label_mapping = {0: 'Benign', 1: 'Malicious', 2: 'NonDoH'}

    # Class ids and their names in the same (ascending) order
    class_ids = sorted(label_mapping)
    labels = [label_mapping[i] for i in class_ids]

    # Pin the matrix rows/columns to the known class ids: without `labels=` a
    # class that is absent from both y_test and y_pred is dropped from the
    # matrix, which would no longer line up with the three tick labels.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"Confusion Matrix - {model.__class__.__name__} - {label}")
    plt.tight_layout()
    plt.show()

In [None]:
def train_and_evaluate(X_train, y_train, X_test, y_test, model, label):
    """Fit `model` on the training data and report its test-set performance.

    Prints the data-variant label, the model class name and the
    classification report, then plots the confusion matrix through
    `mapping_confusion_matrix`.

    Args:
        X_train, y_train: Training features and labels passed to `model.fit`.
        X_test, y_test: Test features and labels used for evaluation.
        model: Estimator exposing `fit` and `predict`; it is fitted in place.
        label: Free-text description of the data variant being evaluated.
    """
    model_name = model.__class__.__name__

    # Fit in place, then predict the held-out samples
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Textual part of the evaluation
    report = classification_report(y_test, y_pred)
    print(f"Label: {label}")
    print(f"Model: {model_name}")
    print("Classification Report:")
    print(report)

    # Graphical part of the evaluation
    print("Confusion Matrix:")
    mapping_confusion_matrix(y_test, y_pred, model, label)


In [None]:
# Model list
models = [
    RandomForestClassifier(random_state=42),
    GaussianNB(),
    LogisticRegression(max_iter=1000, random_state=42),
    XGBClassifier()
]

In [None]:
# Train and evaluate all models
for model in models:
    train_and_evaluate(X_train, y_train, X_test, y_test, model, "Raw Data")
    train_and_evaluate(X_train_std, y_train_std,  X_test_std, y_test_std, model, "Standard Scaler")
    train_and_evaluate(X_train_oversampled, y_train_oversampled, X_test, y_test, model, "SMOTE Oversampling")
    train_and_evaluate(X_train_downsampled, y_train_downsampled, X_test, y_test, model, "Downsampling")
    train_and_evaluate(X_train_balanced, y_train_balanced, X_test, y_test, model, "Hybrid SMOTE+downsampling")


# Machine learning - Neural Networks

## Feedforward Neural Network (MLP)


### Raw data

In [None]:
y_onehot = to_categorical(y_train, num_classes=3)

In [None]:
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_onehot, epochs=70, batch_size=1024, validation_split=0.1)

In [None]:
y_test_onehot = to_categorical(y_test, num_classes=3)

In [None]:

loss, acc = model.evaluate(X_test, y_test_onehot)
print("Test accuracy:", acc)

In [None]:
# Class probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
# Integer labels recovered from the one-hot test targets
y_true = y_test_onehot.argmax(axis=1)

# Classification report first, then the confusion matrix
print(classification_report(y_true, y_pred))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix with generic "Class i" tick labels
tick_labels = [f"Class {i}" for i in range(3)]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

In [None]:
# Class probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Flatten the true labels only when they are stored as a 2-D array
if y_test.ndim == 2:
    y_true = y_test.ravel()
else:
    y_true = y_test

# Class names in label-encoding order
class_names = ['Benign', 'Malicious', 'NonDoH']

# Classification report with readable class names, then the confusion matrix
print(classification_report(y_true, y_pred, target_names=class_names))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

## Standard Scaler

In [None]:
y_onehot = to_categorical(y_train_std, num_classes=3)

In [None]:
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_std.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')  # 3 classes → softmax output
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train_std, y_onehot, epochs=70, batch_size=1024, validation_split=0.1) #we increased from 20 to 70

In [None]:
y_test_onehot = to_categorical(y_test_std, num_classes=3)

In [None]:
# Evaluate the model
loss, acc = model.evaluate(X_test_std, y_test_onehot)
print("Test accuracy:", acc)

In [None]:
# Softmax probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test_std)
y_pred = y_pred_probs.argmax(axis=1)
# Integer labels recovered from the one-hot test targets
y_true = y_test_onehot.argmax(axis=1)

# Classification report first, then the confusion matrix
print(classification_report(y_true, y_pred))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix with generic "Class i" tick labels
tick_labels = [f"Class {i}" for i in range(3)]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

In [None]:
# Softmax probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test_std)
y_pred = y_pred_probs.argmax(axis=1)

# Flatten the true labels only when they are stored as a 2-D array
if y_test.ndim == 2:
    y_true = y_test.ravel()
else:
    y_true = y_test

# Class names in label-encoding order
class_names = ['Benign', 'Malicious', 'NonDoH']

# Classification report with readable class names, then the confusion matrix
print(classification_report(y_true, y_pred, target_names=class_names))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

## Downsampling

In [None]:
y_onehot = to_categorical(y_train_downsampled, num_classes=3)

In [None]:
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_downsampled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')  # 3 classes → softmax output
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train_downsampled, y_onehot, epochs=70, batch_size=1024, validation_split=0.1) #we increased from 20 to 70

In [None]:
y_test_onehot = to_categorical(y_test, num_classes=3)

In [None]:
# Evaluate the model
loss, acc = model.evaluate(X_test, y_test_onehot)
print("Test accuracy:", acc)

In [None]:
# Class probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
# Integer labels recovered from the one-hot test targets
y_true = y_test_onehot.argmax(axis=1)

# Classification report first, then the confusion matrix
print(classification_report(y_true, y_pred))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix with generic "Class i" tick labels
tick_labels = [f"Class {i}" for i in range(3)]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

In [None]:
# Class probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Flatten the true labels only when they are stored as a 2-D array
if y_test.ndim == 2:
    y_true = y_test.ravel()
else:
    y_true = y_test

# Class names in label-encoding order
class_names = ['Benign', 'Malicious', 'NonDoH']

# Classification report with readable class names, then the confusion matrix
print(classification_report(y_true, y_pred, target_names=class_names))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

## Oversampling

In [None]:
y_onehot = to_categorical(y_train_oversampled, num_classes=3)

In [None]:
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_oversampled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')  # 3 classes → softmax output
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_train_oversampled, y_onehot, epochs=70, batch_size=1024, validation_split=0.1) #we increased from 20 to 70

In [None]:
y_test_onehot = to_categorical(y_test, num_classes=3)

In [None]:
# Evaluate the model
loss, acc = model.evaluate(X_test, y_test_onehot)
print("Test accuracy:", acc)

In [None]:
# Class probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
# Integer labels recovered from the one-hot test targets
y_true = y_test_onehot.argmax(axis=1)

# Classification report first, then the confusion matrix
print(classification_report(y_true, y_pred))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix with generic "Class i" tick labels
tick_labels = [f"Class {i}" for i in range(3)]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

In [None]:
# Class probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Flatten the true labels only when they are stored as a 2-D array
if y_test.ndim == 2:
    y_true = y_test.ravel()
else:
    y_true = y_test

# Class names in label-encoding order
class_names = ['Benign', 'Malicious', 'NonDoH']

# Classification report with readable class names, then the confusion matrix
print(classification_report(y_true, y_pred, target_names=class_names))
cm = confusion_matrix(y_true, y_pred)

# Heatmap of the confusion matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

# RNN

## Raw Data

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_rnn = np.asarray(X_train, dtype=np.float32).reshape(X_train.shape[0], 1, X_train.shape[1])
X_test_rnn  = np.asarray(X_test,  dtype=np.float32).reshape(X_test.shape[0],  1, X_test.shape[1])

# Labels as integers (sparse_categorical_crossentropy takes class indices)
y_train_int = np.asarray(y_train, dtype=np.int32)
y_test_int  = np.asarray(y_test,  dtype=np.int32)

# RNN model. Sequential is referenced through tf.keras so that this cell cannot
# be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.SimpleRNN(
        128,
        activation='tanh',
        input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])
    ),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_rnn, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_rnn, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_rnn, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict and report
y_pred_probs = model.predict(X_test_rnn, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = y_test_int

print("\nClassification report:\n")
print(classification_report(y_true, y_pred, digits=4))

# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred)
class_names = [f"Class {i}" for i in range(3)]

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix (SimpleRNN)")
plt.tight_layout()
plt.show()



## Standard Scaler

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features).
# The feature count is taken from the scaled arrays themselves.
X_train_rnn = np.asarray(X_train_std, dtype=np.float32).reshape(X_train_std.shape[0], 1, X_train_std.shape[1])
X_test_rnn  = np.asarray(X_test_std,  dtype=np.float32).reshape(X_test_std.shape[0],  1, X_test_std.shape[1])

# Labels as integers, taken from the same split as the scaled features
y_train_int = np.asarray(y_train_std, dtype=np.int32)
y_test_int  = np.asarray(y_test_std,  dtype=np.int32)

# RNN model. Sequential is referenced through tf.keras so that this cell cannot
# be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.SimpleRNN(
        128,
        activation='tanh',
        input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])
    ),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_rnn, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_rnn, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_rnn, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict & report
y_pred_probs = model.predict(X_test_rnn, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = y_test_int

print("\nClassification report:\n")
print(classification_report(y_true, y_pred, digits=4))

# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred)
class_names = [f"Class {i}" for i in range(3)]

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix (SimpleRNN)")
plt.tight_layout()
plt.show()


## SMOTE

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_rnn = np.asarray(X_train_oversampled, dtype=np.float32).reshape(X_train_oversampled.shape[0], 1, X_train_oversampled.shape[1])
X_test_rnn  = np.asarray(X_test,  dtype=np.float32).reshape(X_test.shape[0],  1, X_test.shape[1])

# Labels as integers: oversampled labels for training, original labels for testing
y_train_int = np.asarray(y_train_oversampled, dtype=np.int32)
y_test_int  = np.asarray(y_test,  dtype=np.int32)

# RNN model. Sequential is referenced through tf.keras so that this cell cannot
# be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.SimpleRNN(
        128,
        activation='tanh',
        input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])
    ),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_rnn, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_rnn, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_rnn, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict & report
y_pred_probs = model.predict(X_test_rnn, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = y_test_int

print("\nClassification report:\n")
print(classification_report(y_true, y_pred, digits=4))

# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred)
class_names = [f"Class {i}" for i in range(3)]

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix (SimpleRNN)")
plt.tight_layout()
plt.show()

## Downsampling

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_rnn = np.asarray(X_train_downsampled, dtype=np.float32).reshape(X_train_downsampled.shape[0], 1, X_train_downsampled.shape[1])
X_test_rnn  = np.asarray(X_test,  dtype=np.float32).reshape(X_test.shape[0],  1, X_test.shape[1])

# Labels as integers: downsampled labels for training, original labels for testing
y_train_int = np.asarray(y_train_downsampled, dtype=np.int32)
y_test_int  = np.asarray(y_test,  dtype=np.int32)

# RNN model. Sequential is referenced through tf.keras so that this cell cannot
# be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.SimpleRNN(
        128,
        activation='tanh',
        input_shape=(X_train_rnn.shape[1], X_train_rnn.shape[2])
    ),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_rnn, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_rnn, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_rnn, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict & report
y_pred_probs = model.predict(X_test_rnn, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = y_test_int

print("\nClassification report:\n")
print(classification_report(y_true, y_pred, digits=4))

# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred)
class_names = [f"Class {i}" for i in range(3)]
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix (SimpleRNN)")
plt.tight_layout()
plt.show()

# LSTM

## Raw Data

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_lstm = np.asarray(X_train, dtype=np.float32).reshape(X_train.shape[0], 1, X_train.shape[1])
X_test_lstm  = np.asarray(X_test,  dtype=np.float32).reshape(X_test.shape[0],  1, X_test.shape[1])

# Labels as integers (sparse_categorical_crossentropy takes class indices)
y_train_int = np.asarray(y_train, dtype=np.int32)
y_test_int  = np.asarray(y_test,  dtype=np.int32)

# LSTM model. Sequential is referenced through tf.keras so that this cell
# cannot be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.LSTM(128, activation='tanh', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_lstm, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_lstm, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_lstm, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict & report
y_pred_probs = model.predict(X_test_lstm, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)

y_true = y_test_int

cm = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred, digits=4))

class_names = [f"Class {i}" for i in range(3)]

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()


## Standard Scaler

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_lstm = np.asarray(X_train_std, dtype=np.float32).reshape(X_train_std.shape[0], 1, X_train_std.shape[1])
X_test_lstm  = np.asarray(X_test_std,  dtype=np.float32).reshape(X_test_std.shape[0],  1, X_test_std.shape[1])

# Labels as integers, taken from the same split as the scaled features
y_train_int = np.asarray(y_train_std, dtype=np.int32)
y_test_int  = np.asarray(y_test_std,  dtype=np.int32)

# LSTM model. Sequential is referenced through tf.keras so that this cell
# cannot be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.LSTM(128, activation='tanh', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_lstm, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_lstm, y_test_int),
    verbose=1
)


In [None]:
# Loss and accuracy of the trained model on the test tensors
test_metrics = model.evaluate(X_test_lstm, y_test_int, verbose=0)
loss, acc = test_metrics
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Class probabilities -> predicted class index per sample
y_pred_probs = model.predict(X_test_lstm, verbose=0)
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_test_int

# Confusion matrix and classification report (4 decimal digits)
cm = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred, digits=4))

# Heatmap of the confusion matrix with generic "Class i" tick labels
class_names = [f"Class {i}" for i in range(3)]
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()


## Downsampling

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_lstm = np.asarray(X_train_downsampled, dtype=np.float32).reshape(X_train_downsampled.shape[0], 1, X_train_downsampled.shape[1])
X_test_lstm  = np.asarray(X_test,  dtype=np.float32).reshape(X_test.shape[0],  1, X_test.shape[1])

# Labels as integers: downsampled labels for training, original labels for testing
y_train_int = np.asarray(y_train_downsampled, dtype=np.int32)
y_test_int  = np.asarray(y_test,  dtype=np.int32)

# LSTM model. Sequential is referenced through tf.keras so that this cell
# cannot be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.LSTM(128, activation='tanh', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_lstm, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_lstm, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_lstm, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict & report
y_pred_probs = model.predict(X_test_lstm, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)

y_true = y_test_int

cm = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred, digits=4))

class_names = [f"Class {i}" for i in range(3)]

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()


## Oversampling

In [None]:
# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_lstm = np.asarray(X_train_oversampled, dtype=np.float32).reshape(X_train_oversampled.shape[0], 1, X_train_oversampled.shape[1])
X_test_lstm  = np.asarray(X_test,  dtype=np.float32).reshape(X_test.shape[0],  1, X_test.shape[1])

# Labels as integers: oversampled labels for training, original labels for testing
y_train_int = np.asarray(y_train_oversampled, dtype=np.int32)
y_test_int  = np.asarray(y_test,  dtype=np.int32)

# LSTM model. Sequential is referenced through tf.keras so that this cell
# cannot be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.LSTM(128, activation='tanh', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_lstm, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_lstm, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_lstm, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict & report
y_pred_probs = model.predict(X_test_lstm, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)

y_true = y_test_int

cm = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred, digits=4))

class_names = [f"Class {i}" for i in range(3)]

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): this cell trains on the same SMOTE-oversampled data as the
# previous LSTM cell. It may have been intended for the hybrid
# (X_train_balanced / y_train_balanced) data — confirm before relying on it.

# Add a length-1 time axis so the data has shape (samples, timesteps=1, features)
X_train_lstm = np.asarray(X_train_oversampled, dtype=np.float32).reshape(X_train_oversampled.shape[0], 1, X_train_oversampled.shape[1])
X_test_lstm  = np.asarray(X_test,  dtype=np.float32).reshape(X_test.shape[0],  1, X_test.shape[1])

# Labels as integers: oversampled labels for training, original labels for testing
y_train_int = np.asarray(y_train_oversampled, dtype=np.int32)
y_test_int  = np.asarray(y_test,  dtype=np.int32)

# LSTM model. Sequential is referenced through tf.keras so that this cell
# cannot be broken by a notebook variable named `models`.
model = tf.keras.Sequential([
    layers.LSTM(128, activation='tanh', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])),
    layers.Dense(64, activation='relu'),
    layers.Dense(3, activation='softmax')
])

# Adam is reached through the imported `optimizers` module; the bare name
# `Adam` is never imported in this notebook.
model.compile(optimizer=optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
history = model.fit(
    X_train_lstm, y_train_int,
    epochs=20,
    batch_size=32,
    validation_data=(X_test_lstm, y_test_int),
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test_lstm, y_test_int, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {acc:.4f}")

# Predict & report
y_pred_probs = model.predict(X_test_lstm, verbose=0)
y_pred = np.argmax(y_pred_probs, axis=1)

y_true = y_test_int

cm = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred, digits=4))

class_names = [f"Class {i}" for i in range(3)]

# Plot confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()


# Anomaly Detection - hybrid approach + GUI

In [None]:
# Fit the Isolation Forest on the training features only
iso = IsolationForest(contamination=0.05, random_state=42)
iso.fit(X_train)

# Copies of both splits with the Isolation Forest decision score appended
# as an extra 'iso_score' column
X_train_aug = X_train.assign(iso_score=iso.decision_function(X_train))
X_test_aug = X_test.assign(iso_score=iso.decision_function(X_test))

# Persist the augmented feature column names (original features + iso_score)
feature_list = X_train_aug.columns.tolist()
joblib.dump(feature_list, "/content/drive/MyDrive/Dataset/Model/features.pkl")

# Random Forest trained on the augmented features, with balanced class weights
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train_aug, y_train)

# Evaluate on the augmented test split
y_pred = clf.predict(X_test_aug)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print(report)
print(matrix)


In [None]:
# Predictions of the hybrid classifier on the augmented test split
y_true = y_test
y_pred = clf.predict(X_test_aug)

# Confusion matrix and classification report
cm = confusion_matrix(y_true, y_pred)
print(classification_report(y_true, y_pred))

# One "Class i" tick label per distinct true label
n_classes = len(set(y_true))
tick_labels = [f"Class {i}" for i in range(n_classes)]

# Heatmap of the confusion matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix - Random Forest with Isolation Forest feature")
fig.tight_layout()
plt.show()

In [None]:
# Persist both fitted models to Google Drive so the prediction UI can load them.
# NOTE(review): the target folder must already exist on the mounted drive — confirm.
joblib.dump(iso, "/content/drive/MyDrive/Dataset/Model/isolation_forest.pkl")
joblib.dump(clf, "/content/drive/MyDrive/Dataset/Model/rf_classifier.pkl")

In [None]:
# Restore the fitted Isolation Forest and Random Forest from Google Drive
iso = joblib.load("/content/drive/MyDrive/Dataset/Model/isolation_forest.pkl")
clf = joblib.load("/content/drive/MyDrive/Dataset/Model/rf_classifier.pkl")

# Integer class code -> human-readable class name
label_map = {
    0: "Benign",
    1: "Malicious",
    2: "NonDoH",
}

# Feature columns selected from an uploaded CSV, in the order they are passed to the models
train_features = [
    'PacketLengthSkewFromMode',
    'FlowSentRate',
    'PacketTimeVariance',
    'ResponseTimeTimeStandardDeviation',
    'FlowBytesReceived',
    'ResponseTimeTimeCoefficientofVariation',
    'PacketLengthMode',
    'PacketTimeMedian',
    'PacketTimeMode',
    'PacketLengthVariance',
    'ResponseTimeTimeMedian',
    'PacketLengthCoefficientofVariation',
    'PacketTimeMean',
    'PacketTimeSkewFromMedian',
    'PacketTimeCoefficientofVariation',
    'ResponseTimeTimeSkewFromMedian',
    'ResponseTimeTimeSkewFromMode',
    'PacketLengthMedian',
    'PacketTimeSkewFromMode',
    'PacketLengthMean',
    'ResponseTimeTimeMean',
    'FlowBytesSent',
    'Duration',
    'PacketLengthSkewFromMedian',
    'PacketTimeStandardDeviation',
    'ResponseTimeTimeVariance',
    'ResponseTimeTimeMode',
    'FlowReceivedRate',
    'PacketLengthStandardDeviation',
]

# Function to handle uploaded CSV
def predict_csv(file_path):
    """Classify every row of an uploaded CSV with the hybrid model.

    The Isolation Forest decision score is appended to the features as an
    ``iso_score`` column and the Random Forest predicts on the result.

    Args:
        file_path: Path of the CSV file to score. It must contain every
            feature column the models were fitted on; extra columns are ignored.

    Returns:
        A DataFrame with a single ``Prediction`` column holding the class
        name (looked up in ``label_map``) for each input row.

    Raises:
        KeyError: If a required feature column is missing from the CSV.
    """
    # Read CSV
    X_new = pd.read_csv(file_path)

    # Use the column order the Isolation Forest was actually fitted with when
    # it recorded one (scikit-learn rejects input whose feature names are in a
    # different order); fall back to the hard-coded list otherwise.
    expected_features = list(getattr(iso, "feature_names_in_", train_features))

    # .copy() gives an independent frame, so adding a column below does not
    # act on a slice of the uploaded data (avoids SettingWithCopyWarning).
    X_new = X_new[expected_features].copy()

    # Add iso_score from Isolation Forest as the last feature column
    X_new["iso_score"] = iso.decision_function(X_new)

    # Predict integer classes and translate them to class names
    preds = [label_map[p] for p in clf.predict(X_new)]

    # Return as dataframe for nice output
    result = pd.DataFrame({"Prediction": preds})
    print(X_new)
    print(result)
    return result

# Gradio UI
iface = gr.Interface(
    fn=predict_csv,
    inputs=gr.File(type="filepath", file_types=[".csv"]),
    outputs=gr.Dataframe(),
    title="Hybrid Anomaly Detection + Classification",
    description="Upload X_test.csv to get predictions (NonDoH, Benign, Malicious)."
)

iface.launch(share=True)