In [2]:
import numpy as np

In [None]:
import sys
import os

# Add the correct path to the SafeML Implementation_in_Python folder
sys.path.insert(0, r'C:\Users\G800613RTS\Desktop\Anomaly\SafeML\Implementation_in_Python')

# Alternative relative path (if running from the Anomaly folder)
# sys.path.insert(0, './SafeML/Implementation_in_Python')

# Verify the path exists
safeml_path = r'C:\Users\G800613RTS\Desktop\Anomaly\SafeML\Implementation_in_Python'
if os.path.exists(safeml_path):
    print(f"✅ SafeML path found: {safeml_path}")
else:
    print(f"❌ SafeML path NOT found: {safeml_path}")
    print("Current working directory:", os.getcwd())
    print("Available directories:", os.listdir('.'))

# Importing local modules (statistical distance measures)
from CVM_Distance import CVM_Dist as Cramer_Von_Mises_Dist
from Anderson_Darling_Distance import Anderson_Darling_Dist
from Kolmogorov_Smirnov_Distance import Kolmogorov_Smirnov_Dist
from KuiperDistance import Kuiper_Dist
from WassersteinDistance import Wasserstein_Dist
from DTS_Distance import DTS_Dist # Combo of Anderson_Darling and CVM distance.

print("✅ All SafeML modules imported successfully!")

✅ SafeML path found: C:\Users\G800613RTS\Desktop\Det_Ano_IM\SafeML\Implementation_in_Python
✅ All SafeML modules imported successfully!


In [4]:
import tensorflow as tf

In [5]:
%matplotlib inline
import os, sys # For accessing Python Modules in the System Path (for accessing the Statistical Measures modules)
# See: https://stackoverflow.com/a/39311677
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import pandas as pd # For DataFrames, Series, and reading csv data in.
import seaborn as sns # Graphing, built ontop of MatPlot for ease-of-use and nicer diagrams.
import matplotlib.pyplot as plt # MatPlotLib for graphing data visually. Seaborn more likely to be used.
import numpy as np # For manipulating arrays and changing data into correct formats for certain libraries
import sklearn # For Machine Learning algorithms

from sklearn.decomposition import PCA # For PCA dimensionality reduction technique
from sklearn.preprocessing import StandardScaler # For scaling to unit scale, before PCA application
from sklearn.preprocessing import LabelBinarizer # For converting categorical data into numeric, for modeling stage
from sklearn.model_selection import StratifiedKFold # For optimal train_test splitting, for model input data
from sklearn.model_selection import train_test_split # For basic dataset splitting
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors ML classifier (default n. of neighbors = 5)

from sklearn.metrics import accuracy_score # For getting the accuracy of a model's predictions
from sklearn.metrics import classification_report # Various metrics for model performance
from sklearn.neural_network import MLPClassifier # For Neural Network classifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

# TensorFlow/Keras imports for DNN model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

In [6]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    # Fix: Use axis parameter instead of positional argument
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep]

In [7]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """
    Custom confusion matrix plotting function using matplotlib and seaborn
    """
    # Create confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Create the plot
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=['Normal (0)', 'Anomaly (1)'], 
                yticklabels=['Normal (0)', 'Anomaly (1)'],
                cbar_kws={'label': 'Count'})
    
    plt.title(title, fontsize=14, fontweight='bold')
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.tight_layout()
    plt.show()
    
    # Print confusion matrix statistics
    tn, fp, fn, tp = cm.ravel()
    total = cm.sum()
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    print(f"📊 Confusion Matrix Statistics:")
    print(f"   True Negatives (TN):  {tn:4d}")
    print(f"   False Positives (FP): {fp:4d}")
    print(f"   False Negatives (FN): {fn:4d}")
    print(f"   True Positives (TP):  {tp:4d}")
    print(f"   Accuracy:    {accuracy:.4f}")
    print(f"   Precision:   {precision:.4f}")
    print(f"   Recall:      {recall:.4f}")
    print(f"   Specificity: {specificity:.4f}")
    print("-" * 40)
    
    return cm

In [8]:
def train_and_predict_DNN_model(X_train, X_test, y_train, y_test):
    """
    Train a Deep Neural Network using TensorFlow/Keras with early stopping.
    
    Parameters:
    -----------
    X_train, X_test : pandas.DataFrame or numpy.array
        Training and test feature data
    y_train, y_test : pandas.Series or numpy.array
        Training and test labels
    
    Returns:
    --------
    tuple : (predictions, accuracy, model, scaler)
        Model predictions, accuracy score, trained model, and fitted scaler
    """
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.metrics import AUC
    from tensorflow.keras.callbacks import EarlyStopping
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import accuracy_score
    import numpy as np
    
    # Scale the data for neural network
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
   
    
    # Build the DNN model
    dnn_model = Sequential([
        Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    
    # Compile the model
    dnn_model.compile(
        optimizer='adam', 
        loss='binary_crossentropy', 
        metrics=['accuracy', AUC()]
    )
    
    # Add Early Stopping callback
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=0,  # Reduced verbosity for cleaner output
        min_delta=0.001
    )
    
    # Train the model with scaled data and early stopping
    history = dnn_model.fit(
        X_train_scaled, y_train,
        epochs=50,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0  # Reduced verbosity for cleaner output
    )
    print(f"\n✅ Training completed! Stopped at epoch {len(history.history['loss'])}")
    
    # Make predictions
    pred_y_prob = dnn_model.predict(X_test_scaled, verbose=0)
    pred_y = (pred_y_prob > 0.5).astype(int).flatten()
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, pred_y)
    
    # Return predictions, accuracy, trained model, and fitted scaler
    return pred_y, accuracy, dnn_model, scaler

In [9]:
def get_X_train_and_test_data_for_given_label(labels, label_index, pred_y, X_train, X_test, y_train, y_test):
    """
    Get training and test data for a specific label/class.
    Works with numpy arrays from train_test_split.
    
    Parameters:
    -----------
    labels : array
        Unique labels in the dataset
    label_index : int
        Index of the label to filter for
    pred_y : array
        Predicted labels
    X_train, X_test : numpy.ndarray
        Training and test feature data (numpy arrays)
    y_train, y_test : numpy.ndarray
        Training and test labels (numpy arrays)
    
    Returns:
    --------
    tuple : (X_train_for_label, X_test_for_label) as pandas DataFrames
    """
    # Since X_train, X_test, y_train are numpy arrays, use boolean indexing
    train_mask = y_train == labels[label_index]
    test_mask = pred_y == labels[label_index]
    
    # Filter the data using boolean masks
    X_train_loc_for_label = X_train[train_mask]
    X_test_loc_for_label = X_test[test_mask]
    
    # Convert to pandas DataFrames with the correct feature names
    # These are the 10 selected features from your df_new
    feature_columns = ['dst_port','flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'pkt_len_mean', 'pkt_len_std', 'fwd_iat_tot', 'syn_flag_cnt',
       'pkt_size_avg', 'fwd_seg_size_avg']
    
    X_train_loc_for_label = pd.DataFrame(X_train_loc_for_label, columns=feature_columns)
    X_test_loc_for_label = pd.DataFrame(X_test_loc_for_label, columns=feature_columns)
    
    return X_train_loc_for_label, X_test_loc_for_label

In [10]:
def get_statistical_dist_measures_for_class_result(accuracy, X_train_L, X_test_L):
    """
    Calculate statistical distance measures for a specific class result including accuracy.
    Used during training phase to build the statistical distance database.
    
    Parameters:
    -----------
    accuracy : float
        Model accuracy for this training run
    X_train_L : pandas.DataFrame
        Training data for a specific class/label
    X_test_L : pandas.DataFrame  
        Test data for the same class/label
    
    Returns:
    --------
    dict : Dictionary containing accuracy and statistical distance measures
    """
    # Check if either dataset is empty
    if len(X_train_L) == 0 or len(X_test_L) == 0:
        print(f"Warning: Empty dataset detected. Train size: {len(X_train_L)}, Test size: {len(X_test_L)}")
        return {'Accuracy': accuracy,
                'Anderson_Darling_dist': np.nan,
                'CVM_dist': np.nan,
                'DTS_dist': np.nan,
                'Kolmogorov_Smirnov_dist': np.nan,
                'Kuiper_dist': np.nan,
                'Wasserstein_dist': np.nan}
    
    num_of_features = len(X_train_L.columns)
    
    # Instantiate empty arrays with large enough size, to hold statistical distance data
    CVM_distances = np.zeros(num_of_features)
    Anderson_Darling_distances = np.zeros(num_of_features)
    Kolmogorov_Smirnov_distances = np.zeros(num_of_features)
    Kuiper_distances = np.zeros(num_of_features)
    Wasserstein_distances = np.zeros(num_of_features)
    DTS_distances = np.zeros(num_of_features)

    for i in range(0, num_of_features):
        # iloc[:, i] allows selection of the ith feature in the Pandas dataframe
        # Calling the methods from the imported Python modules (see import section at top of notebook)
        CVM_distances[i] = Cramer_Von_Mises_Dist(X_train_L.iloc[:, i], X_test_L.iloc[:, i])
        Anderson_Darling_distances[i] = Anderson_Darling_Dist(X_train_L.iloc[:, i], X_test_L.iloc[:, i])
        Kolmogorov_Smirnov_distances[i] = Kolmogorov_Smirnov_Dist(X_train_L.iloc[:, i], X_test_L.iloc[:, i])
        Kuiper_distances[i] = Kuiper_Dist(X_train_L.iloc[:, i], X_test_L.iloc[:, i])
        Wasserstein_distances[i] = Wasserstein_Dist(X_train_L.iloc[:, i], X_test_L.iloc[:, i])
        DTS_distances[i] = DTS_Dist(X_train_L.iloc[:, i], X_test_L.iloc[:, i])
        
    # Computing mean/ average, to get ECDF distance of full dataset. Float64 to keep accuracy high.
    # See: https://numpy.org/doc/stable/reference/generated/numpy.mean.html
    CVM_distance = np.mean(CVM_distances, dtype=np.float64)
    Anderson_Darling_distance = np.mean(Anderson_Darling_distances, dtype=np.float64)
    Kolmogorov_Smirnov_distance = np.mean(Kolmogorov_Smirnov_distances, dtype=np.float64)
    Kuiper_distance = np.mean(Kuiper_distances, dtype=np.float64)
    Wasserstein_distance = np.mean(Wasserstein_distances, dtype=np.float64)
    DTS_distance = np.mean(DTS_distances, dtype=np.float64)
    
    # Returning dictionary, for efficient and fast DataFrame creation. Returns mean for each distance.
    # See https://stackoverflow.com/a/17496530. Fast way to 'append' to dataframe for results table.
    # PRESERVE THE ORDERING
    return {'Accuracy': accuracy,
            'Anderson_Darling_dist': Anderson_Darling_distance,
            'CVM_dist': CVM_distance,
            'DTS_dist':DTS_distance,
            'Kolmogorov_Smirnov_dist':Kolmogorov_Smirnov_distance,
            'Kuiper_dist': Kuiper_distance,
            'Wasserstein_dist': Wasserstein_distance}

Reading Dataset

In [None]:
import pandas as pd
import os

# Define the folder path where DOS files are located
dos_folder_path = r'C:\Users\G800613RTS\Desktop\Anomaly\ddos'

# List of DOS files
dos_files = ['dos.csv', 'dos1.csv', 'dos2.csv', 'dos3.csv', 'dos4.csv', 
             'dos5.csv', 'dos6.csv', 'dos7.csv', 'dos8.csv']

# Specific destination ports to filter for
target_ports = [8008, 8009, 8443, 9000, 443]

# Verify the folder exists
if not os.path.exists(dos_folder_path):
    print(f"❌ DOS folder not found: {dos_folder_path}")
    print("Please verify the folder path exists")
else:
    print(f"✅ DOS folder found: {dos_folder_path}")
    print(f"Files in folder: {os.listdir(dos_folder_path)}")

# Read all files and filter for label=1 AND specific destination ports
filtered_data = []

for file in dos_files:
    try:
        # Construct full file path
        file_path = os.path.join(dos_folder_path, file)
        
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            continue
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        print(f"📁 Reading {file}: {len(df)} total rows")
        
        # Check if required columns exist
        if 'Label' not in df.columns:
            print(f"⚠️ No 'Label' column found in {file}")
            continue
        if 'dst_port' not in df.columns:
            print(f"⚠️ No 'dst_port' column found in {file}")
            continue
        
        # Filter rows where label equals 1 AND dst_port is in target_ports
        filtered_df = df[(df['Label'] == 1) & (df['dst_port'].isin(target_ports))]
        
        # Add to our list if we have data
        if len(filtered_df) > 0:
            filtered_data.append(filtered_df)
            print(f"✅ Loaded {file}: {len(filtered_df)} rows with label=1 and target ports")
        else:
            print(f"⚠️ No data found in {file} with label=1 and target ports")
        
    except FileNotFoundError:
        print(f"❌ File {file} not found at path: {os.path.join(dos_folder_path, file)}")
    except Exception as e:
        print(f"❌ Error reading {file}: {e}")

# Combine all filtered data into one DataFrame
if filtered_data:
    combined_df = pd.concat(filtered_data, ignore_index=True)
    print(f"\n🎯 Total rows with label=1 and target ports: {len(combined_df)}")
    
    # Analyze destination ports BEFORE sampling
    print("\n" + "="*50)
    print("ORIGINAL DESTINATION PORT ANALYSIS")
    print("="*50)
    
    dst_port_counts = combined_df['dst_port'].value_counts()
    print("Original distribution:")
    print(dst_port_counts)
    
    # === APPLY CUSTOM SAMPLING BY PORT ===
    print("\n" + "="*50)
    print("APPLYING CUSTOM SAMPLING")
    print("="*50)
    
    sampled_data = []
    
    for port in combined_df['dst_port'].unique():
        port_data = combined_df[combined_df['dst_port'] == port]
        original_count = len(port_data)
        
        if port == 9000:
            # Take 30% of port 9000 data
            sample_fraction = 0.3
            sampled_port_data = port_data.sample(frac=sample_fraction, random_state=42)
            print(f"Port {port}: {original_count} → {len(sampled_port_data)} rows (30%)")
            
        elif port == 8443:
            # Take 30% of port 8443 data
            sample_fraction = 0.3
            sampled_port_data = port_data.sample(frac=sample_fraction, random_state=42)
            print(f"Port {port}: {original_count} → {len(sampled_port_data)} rows (30%)")
            
        elif port == 8008:
            # Take 30% of port 8008 data
            sample_fraction = 0.3
            sampled_port_data = port_data.sample(frac=sample_fraction, random_state=42)
            print(f"Port {port}: {original_count} → {len(sampled_port_data)} rows (30%)")
            
        else:
            # Keep all data for other ports (8009, 443)
            sampled_port_data = port_data
            print(f"Port {port}: {original_count} → {len(sampled_port_data)} rows (100%)")
        
        sampled_data.append(sampled_port_data)
    
    # Combine sampled data
    combined_df = pd.concat(sampled_data, ignore_index=True)
    
    # Analyze destination ports AFTER sampling
    print("\n" + "="*50)
    print("FINAL DESTINATION PORT ANALYSIS")
    print("="*50)
    
    # Get unique destination ports
    unique_dst_ports = combined_df['dst_port'].unique()
    print(f"Number of unique destination ports: {len(unique_dst_ports)}")
    
    # Show all unique destination ports (sorted)
    print(f"\nUnique destination ports found:")
    print(sorted(unique_dst_ports))
    
    # Show distribution of target ports after sampling
    print(f"\nFinal port distribution:")
    dst_port_counts_final = combined_df['dst_port'].value_counts()
    print(dst_port_counts_final)
    
    # Show port statistics
    print(f"\nDestination port statistics:")
    print(f"Min port: {combined_df['dst_port'].min()}")
    print(f"Max port: {combined_df['dst_port'].max()}")
    print(f"Mean port: {combined_df['dst_port'].mean():.2f}")
    
    # Show percentage distribution of target ports
    print(f"\nFinal ports percentage distribution:")
    dst_port_percentages = combined_df['dst_port'].value_counts(normalize=True) * 100
    for port, percentage in dst_port_percentages.items():
        print(f"Port {port}: {percentage:.2f}%")
    
    # Summary of changes
    print(f"\n" + "="*50)
    print("SAMPLING SUMMARY")
    print("="*50)
    print(f"Total rows after sampling: {len(combined_df)}")
    
    # Show which target ports were found vs not found
    found_ports = set(unique_dst_ports)
    missing_ports = set(target_ports) - found_ports
    
    if missing_ports:
        print(f"Target ports NOT found in data: {sorted(missing_ports)}")
    else:
        print(f"All target ports found in data!")
        
else:
    print("❌ No data found with the specified criteria")
    combined_df = pd.DataFrame()

# Store the filtered and sampled data for further use
df_data = combined_df

print(f"\n📊 Final dataset shape: {df_data.shape}")
if len(df_data) > 0:
    print(f"📊 Dataset columns: {list(df_data.columns)}")
    print(f"📊 Label distribution: {df_data['Label'].value_counts().to_dict()}")

✅ DOS folder found: C:\Users\G800613RTS\Desktop\Det_Ano_IM\ddos
Files in folder: ['dos.csv', 'dos1.csv', 'dos2.csv', 'dos3.csv', 'dos4.csv', 'dos5.csv', 'dos6.csv', 'dos7.csv', 'dos8.csv']
📁 Reading dos.csv: 15662 total rows
✅ Loaded dos.csv: 15502 rows with label=1 and target ports
📁 Reading dos1.csv: 15736 total rows
✅ Loaded dos1.csv: 15520 rows with label=1 and target ports
📁 Reading dos2.csv: 77 total rows
✅ Loaded dos2.csv: 4 rows with label=1 and target ports
📁 Reading dos3.csv: 70 total rows
✅ Loaded dos3.csv: 4 rows with label=1 and target ports
📁 Reading dos4.csv: 37 total rows
✅ Loaded dos4.csv: 1 rows with label=1 and target ports
📁 Reading dos5.csv: 590 total rows
✅ Loaded dos5.csv: 501 rows with label=1 and target ports
📁 Reading dos6.csv: 2413 total rows
✅ Loaded dos6.csv: 2309 rows with label=1 and target ports
📁 Reading dos7.csv: 947 total rows
✅ Loaded dos7.csv: 893 rows with label=1 and target ports
📁 Reading dos8.csv: 2011 total rows
✅ Loaded dos8.csv: 1947 rows wit

In [None]:
import pandas as pd
import os

# Define the folder path where session/output files are located
session_folder_path = r'C:\Users\G800613RTS\Desktop\Anomaly\normal'

# List of all session/output CSV files
session_files = ['output.csv', 'output1.csv', 'session3.csv', 'session5.csv', 'session7.csv', 
                 'session8.csv', 'session9.csv', 'session10.csv', 'session12.csv', 'session13.csv', 'session15.csv']

# Verify the folder exists
if not os.path.exists(session_folder_path):
    print(f"❌ Session folder not found: {session_folder_path}")
    print("Please verify the folder path exists")
else:
    print(f"✅ Session folder found: {session_folder_path}")
    print(f"Files in folder: {os.listdir(session_folder_path)}")

dfs_session = []

# Load all session files into a list
for file_name in session_files:
    try:
        # Construct full file path
        file_path = os.path.join(session_folder_path, file_name)
        
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            continue
        
        # Read the CSV file
        df = pd.read_csv(file_path)
        dfs_session.append(df)
        print(f"✅ Successfully loaded {file_name} with {len(df)} rows and {len(df.columns)} columns")
        
    except FileNotFoundError:
        print(f"❌ File {file_name} not found at path: {os.path.join(session_folder_path, file_name)}")
    except Exception as e:
        print(f"❌ Error loading {file_name}: {e}")

# Check if any files were loaded successfully
if dfs_session:
    # Concatenate all session DataFrames into a single DataFrame
    df_session_data = pd.concat(dfs_session, axis=0, ignore_index=True)
    
    print("\n" + "="*70)
    print("SESSION/OUTPUT FILES CONCATENATION RESULTS:")
    print("="*70)
    print(f"✅ Total files processed: {len(dfs_session)}")
    print(f"📊 Concatenated dataset shape: {df_session_data.shape}")
    print(f"📈 Total rows: {len(df_session_data):,}")
    print(f"📋 Total columns: {len(df_session_data.columns)}")
    
    # Show which files were successfully loaded
    print(f"\n📁 Successfully loaded files:")
    for i, file_name in enumerate([f for f in session_files if any(f in str(df) for df in dfs_session)]):
        print(f"   {i+1}. {file_name}")
    
    # Show basic dataset info
    if len(df_session_data) > 0:
        print(f"\n📊 Dataset Overview:")
        print(f"   Column names: {list(df_session_data.columns)}")
        
        # Check for Label column and show distribution if it exists
        if 'Label' in df_session_data.columns:
            label_counts = df_session_data['Label'].value_counts()
            print(f"   Label distribution:")
            for label, count in label_counts.items():
                print(f"     Label {label}: {count:,} rows ({count/len(df_session_data)*100:.1f}%)")
        else:
            print("   No 'Label' column found in the data")
            
else:
    print("❌ No session files were loaded successfully")
    df_session_data = pd.DataFrame()

print("\n" + "="*70)

✅ Session folder found: C:\Users\G800613RTS\Desktop\Det_Ano_IM\normal
Files in folder: ['output.csv', 'output1.csv', 'session10.csv', 'session12.csv', 'session13.csv', 'session15.csv', 'session3.csv', 'session5.csv', 'session7.csv', 'session8.csv', 'session9.csv']
✅ Successfully loaded output.csv with 9605 rows and 83 columns
✅ Successfully loaded output1.csv with 4611 rows and 83 columns
✅ Successfully loaded session3.csv with 4611 rows and 83 columns
✅ Successfully loaded session5.csv with 1066 rows and 83 columns
✅ Successfully loaded session7.csv with 2686 rows and 83 columns
✅ Successfully loaded session8.csv with 3754 rows and 83 columns
✅ Successfully loaded session9.csv with 3808 rows and 83 columns
✅ Successfully loaded session10.csv with 1892 rows and 83 columns
✅ Successfully loaded session12.csv with 3150 rows and 83 columns
✅ Successfully loaded session13.csv with 4690 rows and 83 columns
✅ Successfully loaded session15.csv with 282 rows and 83 columns

SESSION/OUTPUT FILE

In [19]:
df_combined = pd.concat([df_data, df_session_data], axis=0, ignore_index=True)

In [20]:
# Concatenate both datasets
df_combined = pd.concat([df_data, df_session_data], axis=0, ignore_index=True)

print("="*70)
print("COMBINED DATASET RESULTS:")
print("="*70)

print(f"df_data (DOS files) shape: {df_data.shape}")
print(f"df_session_data (Session files) shape: {df_session_data.shape}")
print(f"Combined dataset shape: {df_combined.shape}")

print(f"\nTotal rows in combined dataset: {len(df_combined)}")
print(f"Total columns in combined dataset: {len(df_combined.columns)}")

print("\n" + "="*50 + "\n")

# Show label distribution
print("LABEL DISTRIBUTION:")
label_counts = df_combined['Label'].value_counts().sort_index()
print(label_counts)

print("\nLABEL PERCENTAGE DISTRIBUTION:")
label_percentages = df_combined['Label'].value_counts(normalize=True).sort_index() * 100
print(f"Label 0 (Normal/Benign): {label_percentages[0]:.2f}%")
print(f"Label 1 (Anomaly/Attack): {label_percentages[1]:.2f}%")

print("\n" + "="*50 + "\n")

print("DETAILED BREAKDOWN:")
print(f"Rows with Label 0 (Normal): {label_counts[0]:,}")
print(f"Rows with Label 1 (Anomaly): {label_counts[1]:,}")
print(f"Total rows: {label_counts.sum():,}")

print("\n" + "="*50 + "\n")

print("DATASET SOURCES:")
print(f"From DOS files (mostly anomalies): {len(df_data):,} rows")
print(f"From Session files (normal traffic): {len(df_session_data):,} rows")
print(f"Combined total: {len(df_combined):,} rows")

COMBINED DATASET RESULTS:
df_data (DOS files) shape: (11678, 83)
df_session_data (Session files) shape: (40155, 83)
Combined dataset shape: (51833, 83)

Total rows in combined dataset: 51833
Total columns in combined dataset: 83


LABEL DISTRIBUTION:
Label
0    40155
1    11678
Name: count, dtype: int64

LABEL PERCENTAGE DISTRIBUTION:
Label 0 (Normal/Benign): 77.47%
Label 1 (Anomaly/Attack): 22.53%


DETAILED BREAKDOWN:
Rows with Label 0 (Normal): 40,155
Rows with Label 1 (Anomaly): 11,678
Total rows: 51,833


DATASET SOURCES:
From DOS files (mostly anomalies): 11,678 rows
From Session files (normal traffic): 40,155 rows
Combined total: 51,833 rows


In [21]:
df= df_combined

In [22]:
df_combined.shape

(51833, 83)

In [17]:
df_combined=df_combined.drop(columns=['src_ip', 'dst_ip', 'timestamp'])

In [23]:
df_combined.shape

(51833, 83)

In [24]:
df= df.drop(columns=['src_ip', 'dst_ip', 'timestamp'])

In [26]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['Label']= encoder.fit_transform(df['Label'])

In [27]:
df['Label'].value_counts().sum

<bound method Series.sum of Label
0    40155
1    11678
Name: count, dtype: int64>

In [28]:
# Check for NaNs
nan_mask = df.isna()
print("NaNs in DataFrame:\n", df[nan_mask].sum())

# Check for infinities
inf_mask = df.isin([np.inf, -np.inf])
print("Infs in DataFrame:\n", df[inf_mask].sum())

NaNs in DataFrame:
 src_port            0.0
dst_port            0.0
protocol            0.0
flow_duration       0.0
flow_byts_s         0.0
                   ... 
subflow_fwd_pkts    0.0
subflow_bwd_pkts    0.0
subflow_fwd_byts    0.0
subflow_bwd_byts    0.0
Label               0.0
Length: 80, dtype: float64
Infs in DataFrame:
 src_port            0.0
dst_port            0.0
protocol            0.0
flow_duration       0.0
flow_byts_s         0.0
                   ... 
subflow_fwd_pkts    0.0
subflow_bwd_pkts    0.0
subflow_fwd_byts    0.0
subflow_bwd_byts    0.0
Label               0.0
Length: 80, dtype: float64


In [29]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace infinities with NaN
df.fillna(0, inplace=True)  # Replace NaNs with 0

In [30]:
df.isnull().sum()

src_port            0
dst_port            0
protocol            0
flow_duration       0
flow_byts_s         0
                   ..
subflow_fwd_pkts    0
subflow_bwd_pkts    0
subflow_fwd_byts    0
subflow_bwd_byts    0
Label               0
Length: 80, dtype: int64

In [31]:
df=df.astype(int)
df

Unnamed: 0,src_port,dst_port,protocol,flow_duration,flow_byts_s,flow_pkts_s,fwd_pkts_s,bwd_pkts_s,tot_fwd_pkts,tot_bwd_pkts,...,fwd_blk_rate_avg,bwd_blk_rate_avg,fwd_seg_size_avg,bwd_seg_size_avg,cwr_flag_count,subflow_fwd_pkts,subflow_bwd_pkts,subflow_fwd_byts,subflow_bwd_byts,Label
0,59274,8009,6,0,218055,574,332,241,11,8,...,0,0,249,557,0,11,8,2748,4462,1
1,59286,8009,6,0,225340,593,343,250,11,8,...,0,0,249,557,0,11,8,2748,4462,1
2,63489,8009,6,0,249818,617,326,290,9,8,...,0,0,278,546,0,9,8,2508,4370,1
3,63491,8009,6,0,370123,914,484,430,9,8,...,0,0,278,546,0,9,8,2508,4370,1
4,59300,8009,6,0,218277,696,403,293,11,8,...,0,0,135,557,0,11,8,1495,4462,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51828,50557,20002,17,0,0,0,0,0,2,0,...,0,0,415,0,0,2,0,830,0,0
51829,443,46848,17,6,563,4,2,1,17,11,...,0,0,140,104,0,17,11,2393,1151,0
51830,53139,443,17,0,16088,64,32,32,4,4,...,0,0,393,105,0,4,4,1575,423,0
51831,58071,443,17,4,158621,192,70,122,336,585,...,120045,3028042,86,1247,0,336,585,29165,729942,0


In [32]:
X = df.drop('Label',axis=1)
y = df['Label']

In [33]:
X, y

(       src_port  dst_port  protocol  flow_duration  flow_byts_s  flow_pkts_s  \
 0         59274      8009         6              0       218055          574   
 1         59286      8009         6              0       225340          593   
 2         63489      8009         6              0       249818          617   
 3         63491      8009         6              0       370123          914   
 4         59300      8009         6              0       218277          696   
 ...         ...       ...       ...            ...          ...          ...   
 51828     50557     20002        17              0            0            0   
 51829       443     46848        17              6          563            4   
 51830     53139       443        17              0        16088           64   
 51831     58071       443        17              4       158621          192   
 51832       443     50738         6              0            0            0   
 
        fwd_pkts_s  bwd_pk

In [34]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [35]:
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
# Impute missing values (replace NaNs with the mean)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Determine the number of columns (features) in your DataFrame
num_columns = df.shape[1]

# Set an appropriate value for k (less than or equal to the number of columns)
k = min(10, num_columns)  # Adjust this as needed

# Initialize SelectKBest with the scoring function
k_best = SelectKBest(score_func=f_classif, k=k)

# Fit and transform the imputed data to select the top 10 features
X_new = k_best.fit_transform(X_imputed, y)

In [36]:
# Get the boolean mask of selected features
selected_features_mask = k_best.get_support()
selected_features_mask 

array([ True, False,  True, False, False, False, False, False, False,
       False, False, False,  True, False,  True, False, False, False,
       False, False, False, False,  True,  True, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False])

In [37]:
elected_feature_names = X.columns[selected_features_mask]

In [38]:
elected_feature_names

Index(['src_port', 'protocol', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'pkt_len_mean', 'pkt_len_std', 'fwd_iat_tot', 'syn_flag_cnt',
       'pkt_size_avg', 'fwd_seg_size_avg'],
      dtype='object')

In [39]:
new_columns=['dst_port','flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'pkt_len_mean', 'pkt_len_std', 'fwd_iat_tot', 'syn_flag_cnt',
       'pkt_size_avg', 'fwd_seg_size_avg']

In [40]:
df_new=X[new_columns]
df_new

Unnamed: 0,dst_port,flow_duration,fwd_pkt_len_max,fwd_pkt_len_mean,pkt_len_mean,pkt_len_std,fwd_iat_tot,syn_flag_cnt,pkt_size_avg,fwd_seg_size_avg
0,8009,0,1842,249,379,577,0,3,379,249
1,8009,0,1842,249,379,577,0,3,379,249
2,8009,0,1798,278,404,595,0,3,404,278
3,8009,0,1798,278,404,595,0,3,404,278
4,8009,0,589,135,313,468,0,3,313,135
...,...,...,...,...,...,...,...,...,...,...
51828,20002,0,415,415,415,0,0,0,415,415
51829,46848,6,278,140,126,66,6,0,126,140
51830,443,0,706,393,249,266,0,0,249,393
51831,443,4,191,86,824,587,4,0,824,86


In [41]:
df_new['label']=df['Label']
df_new['label']

0        1
1        1
2        1
3        1
4        1
        ..
51828    0
51829    0
51830    0
51831    0
51832    0
Name: label, Length: 51833, dtype: int32

In [42]:
X1 = df_new[new_columns].values  # Get the 10 selected features
y1 = df_new['label'].values 

In [43]:
X1, y1

(array([[ 8009,     0,  1842, ...,     3,   379,   249],
        [ 8009,     0,  1842, ...,     3,   379,   249],
        [ 8009,     0,  1798, ...,     3,   404,   278],
        ...,
        [  443,     0,   706, ...,     0,   249,   393],
        [  443,     4,   191, ...,     0,   824,    86],
        [50738,     0,    72, ...,     0,    72,    72]]),
 array([1, 1, 1, ..., 0, 0, 0]))

In [44]:
X1.shape

(51833, 10)

Modelling Stage

SafeML

Running the Statistical distance measure algorithms

In [45]:
# Firstly, creating the final 2D-array (Pandas Dataframe) which will be used to store the Results
# PRESERVE THE ORDERING
results_column_names = ['Accuracy', 'Anderson_Darling_dist', 'CVM_dist',
                                     'DTS_dist', 'Kolmogorov_Smirnov_dist','Kuiper_dist', 'Wasserstein_dist']
# Creating the empty Dataframe for Results
df_results = pd.DataFrame(columns = results_column_names)
# Can copy this dataframe for future results tables e.g. for each class/ label
df_results

Unnamed: 0,Accuracy,Anderson_Darling_dist,CVM_dist,DTS_dist,Kolmogorov_Smirnov_dist,Kuiper_dist,Wasserstein_dist


Code for each permutation

In [46]:
from tqdm import tqdm
import pickle
from sklearn.model_selection import train_test_split

# Set up variables
labels = df['Label'].unique()
number_of_classes = len(labels)
list_of_lists_results = [[] for _ in range(number_of_classes)]

print("🚀 Starting single DNN training...")

# 1. Split data (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.33, random_state=42, stratify=y
)

# 2. Train DNN model and get predictions
pred_y, accuracy, trained_model, scaler = train_and_predict_DNN_model(X_train, X_test, y_train, y_test)
print(f"✅ Model trained. Accuracy: {accuracy:.4f}")

🚀 Starting single DNN training...

✅ Training completed! Stopped at epoch 13
✅ Model trained. Accuracy: 0.9980


In [47]:
# Count predictions for each class
import numpy as np
pred_counts = np.bincount(pred_y)
total_predictions = len(pred_y)

print(f"Total test samples: {total_predictions}")
print(f"Predictions breakdown:")
for class_idx, count in enumerate(pred_counts):
    percentage = (count / total_predictions) * 100
    class_name = "Normal" if class_idx == 0 else "Anomaly"
    print(f"   Class {class_idx} ({class_name}): {count:,} samples ({percentage:.1f}%)")


Total test samples: 17105
Predictions breakdown:
   Class 0 (Normal): 13,273 samples (77.6%)
   Class 1 (Anomaly): 3,832 samples (22.4%)


In [48]:
# FIXED: Sort labels to ensure [0, 1] order
labels = sorted(df['Label'].unique())  # This ensures [0, 1] instead of [1, 0]
number_of_classes = len(labels)
list_of_lists_results = [[] for _ in range(number_of_classes)]

In [49]:

desired_samples = {0: 13273, 1: 3832}  
# 3. Loop over each label/class for ECDF statistical distance measures
for current_label in range(number_of_classes):
    X_train_loc_for_label, X_test_loc_for_label = get_X_train_and_test_data_for_given_label(
        labels, current_label, pred_y, X_train, X_test, y_train, y_test
    )
    print(f"   Training data shape for Class {current_label}: {X_train_loc_for_label.shape}")
    print(f"   Test data shape for Class {current_label}: {X_test_loc_for_label.shape}")
    print(f"   Training samples: {len(X_train_loc_for_label)} rows")
    print(f"   Test samples: {len(X_test_loc_for_label)} rows")
    print(f"   Features: {X_train_loc_for_label.shape[1] if len(X_train_loc_for_label) > 0 else 'N/A'}")

    # --- FORCE TEST DATA TO DESIRED SIZE ---
    n_desired = desired_samples.get(current_label, len(X_test_loc_for_label))
    if len(X_test_loc_for_label) > n_desired:
        X_test_loc_for_label = X_test_loc_for_label.sample(n=n_desired, random_state=42)
    elif len(X_test_loc_for_label) < n_desired:
        print(f"⚠️ Not enough samples for Class {current_label}: requested {n_desired}, available {len(X_test_loc_for_label)}. Oversampling with replacement.")
        X_test_loc_for_label = X_test_loc_for_label.sample(n=n_desired, replace=True, random_state=42)

    # Show the number of samples after forcing the size
    print(f"👉 After sampling: Class {current_label} test set has {len(X_test_loc_for_label)} samples (target: {n_desired})")

    dict_result_row = get_statistical_dist_measures_for_class_result(
        accuracy, X_train_loc_for_label, X_test_loc_for_label
    )
    list_of_lists_results[current_label].append(dict_result_row)

# 4. Save the trained model and scaler
trained_model.save("best_dnn_model_final.h5")
with open("best_scaler_final.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\n" + "="*60)
print("🏆 MODEL SAVED!")
print("="*60)
print(f"✅ Model saved as: best_dnn_model_final.h5")
print(f"✅ Scaler saved as: best_scaler_final.pkl")
print(f"📊 Accuracy achieved: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("="*60)

   Training data shape for Class 0: (26904, 10)
   Test data shape for Class 0: (13273, 10)
   Training samples: 26904 rows
   Test samples: 13273 rows
   Features: 10
👉 After sampling: Class 0 test set has 13273 samples (target: 13273)
   Training data shape for Class 1: (7824, 10)
   Test data shape for Class 1: (3832, 10)
   Training samples: 7824 rows
   Test samples: 3832 rows
   Features: 10
👉 After sampling: Class 1 test set has 3832 samples (target: 3832)

🏆 MODEL SAVED!
✅ Model saved as: best_dnn_model_final.h5
✅ Scaler saved as: best_scaler_final.pkl
📊 Accuracy achieved: 0.9980 (99.80%)


In [50]:
# Access specific dataframe by index e.g. class 1 dataframe->index 1 mapping
result_dataframes = []

for dict_result_list in list_of_lists_results:
    result_dataframes.append(pd.DataFrame(dict_result_list, columns = results_column_names))

In [51]:
# Print first dataframe result table, for class 0
result_dataframes[0].head()

Unnamed: 0,Accuracy,Anderson_Darling_dist,CVM_dist,DTS_dist,Kolmogorov_Smirnov_dist,Kuiper_dist,Wasserstein_dist
0,0.998012,0.025108,1.569996,0.284515,0.007785,0.010759,10.844416


In [52]:
result_dataframes[0].to_excel("Class0.xlsx")

In [53]:
# Print second dataframe result table, for class 1
result_dataframes[1].head(8)

Unnamed: 0,Accuracy,Anderson_Darling_dist,CVM_dist,DTS_dist,Kolmogorov_Smirnov_dist,Kuiper_dist,Wasserstein_dist
0,0.998012,0.006444,0.119378,0.294817,0.011688,0.011707,3.220949


In [54]:
result_dataframes[1].to_excel("Class1.xlsx")

Plotting the SafeML Results

In [55]:
def predict_realtime_with_pretrained_model(
    data_path="real.csv",
    model_path="best_dnn_model_final.h5",
    scaler_path="best_scaler_final.pkl"
):
    """
    Predicts using a pretrained DNN model and scaler on real-time data.

    Args:
        data_path (str): Path to the real-time CSV data.
        model_path (str): Path to the pretrained Keras model (.h5).
        scaler_path (str): Path to the pretrained scaler (.pkl).

    Returns:
        dict: Prediction results and statistics.
    """
    import pandas as pd
    import numpy as np
    import tensorflow as tf
    import pickle
    import os
    from sklearn.preprocessing import LabelEncoder

    print(f"📥 Loading real-time data from {data_path} ...")
    if not os.path.exists(data_path):
        print(f"❌ Data file not found: {data_path}")
        return None

    df_realtime = pd.read_csv(data_path)
    print(f"✅ Loaded {len(df_realtime)} samples from {data_path}")
    print(f"   Original shape: {df_realtime.shape}")

    # Save true labels if available for comparison
    y_true = None
    if ' Label' in df_realtime.columns:
        encoder = LabelEncoder()
        y_true = encoder.fit_transform(df_realtime[' Label'])
        df_realtime = df_realtime.drop(' Label', axis=1)
        print("   True labels saved for comparison")

    # Step 2: Select ONLY the new_columns (same 10 features as training)
    print("\n🎯 Step 2: Selecting the same 10 features as training...")
    new_columns = ['dst_port','flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'pkt_len_mean', 'pkt_len_std', 'fwd_iat_tot', 'syn_flag_cnt',
       'pkt_size_avg', 'fwd_seg_size_avg']
    missing_cols = [col for col in new_columns if col not in df_realtime.columns]
    if missing_cols:
        print(f"❌ Missing required columns: {missing_cols}")
        return None

    df_realtime_selected = df_realtime[new_columns].copy()
    print(f"✅ Selected same 10 features: {df_realtime_selected.shape}")

    # Step 3: Apply same preprocessing as training
    print("\n🔧 Step 3: Applying same preprocessing as training...")
    print("   Handling NaNs and infinities...")
    df_realtime_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_realtime_selected.fillna(0, inplace=True)
    print("   Converting to integer type...")
    df_realtime_selected = df_realtime_selected.astype(int)
    print(f"✅ Preprocessed data shape: {df_realtime_selected.shape}")

    # Step 4: Load pretrained scaler and apply scaling
    print("\n🤖 Step 4: Loading pretrained scaler and applying scaling...")
    if not os.path.exists(scaler_path):
        print(f"❌ Scaler file not found: {scaler_path}")
        return None
    with open(scaler_path, "rb") as f:
        scaler_loaded = pickle.load(f)
    X_realtime_scaled = scaler_loaded.transform(df_realtime_selected)
    print(f"✅ Applied saved scaler to real-time data")
    print(f"   Scaled data shape: {X_realtime_scaled.shape}")

    # Step 5: Load pretrained model and make predictions
    print("\n🔮 Step 5: Loading pretrained model and making predictions...")
    if not os.path.exists(model_path):
        print(f"❌ Model file not found: {model_path}")
        return None
    model = tf.keras.models.load_model(model_path)
    print(f"✅ Loaded trained model from {model_path}")
    print(f"   Model input shape: {model.input_shape}")
    print(f"   Real-time data shape: {X_realtime_scaled.shape}")

    pred_prob = model.predict(X_realtime_scaled, verbose=0)
    pred_labels = (pred_prob > 0.5).astype(int).flatten()

    normal_count = np.sum(pred_labels == 0)
    anomaly_count = np.sum(pred_labels == 1)
    print(f"✅ Predictions completed:")
    print(f"   Normal traffic (Class 0): {normal_count} samples ({normal_count/len(pred_labels)*100:.1f}%)")
    print(f"   Anomaly traffic (Class 1): {anomaly_count} samples ({anomaly_count/len(pred_labels)*100:.1f}%)")

    results = {
        "total_samples": len(df_realtime_selected),
        "normal_count": normal_count,
        "anomaly_count": anomaly_count,
        "normal_percentage": normal_count / len(pred_labels) * 100,
        "anomaly_percentage": anomaly_count / len(pred_labels) * 100,
        "predictions": pred_labels,
        "prediction_probabilities": pred_prob.flatten(),
        "y_true": y_true
    }
    return results

#Example usage:
results = predict_realtime_with_pretrained_model(
    data_path="real.csv",
    model_path="best_dnn_model_final.h5",
    scaler_path="best_scaler_final.pkl"
)

📥 Loading real-time data from real.csv ...
✅ Loaded 25916 samples from real.csv
   Original shape: (25916, 83)

🎯 Step 2: Selecting the same 10 features as training...
✅ Selected same 10 features: (25916, 10)

🔧 Step 3: Applying same preprocessing as training...
   Handling NaNs and infinities...
   Converting to integer type...
✅ Preprocessed data shape: (25916, 10)

🤖 Step 4: Loading pretrained scaler and applying scaling...
✅ Applied saved scaler to real-time data
   Scaled data shape: (25916, 10)

🔮 Step 5: Loading pretrained model and making predictions...
✅ Loaded trained model from best_dnn_model_final.h5
   Model input shape: (None, 10)
   Real-time data shape: (25916, 10)
✅ Predictions completed:
   Normal traffic (Class 0): 20089 samples (77.5%)
   Anomaly traffic (Class 1): 5827 samples (22.5%)


In [56]:
import pandas as pd
import numpy as np

# 1. Run your prediction function and get results
results = predict_realtime_with_pretrained_model(
    data_path="real.csv",
    model_path="best_dnn_model_final.h5",
    scaler_path="best_scaler_final.pkl"
)

# 2. Prepare the DataFrame with the same 10 features as used in prediction
df_realtime = pd.read_csv("real.csv")
new_columns = ['dst_port','flow_duration', 'fwd_pkt_len_max', 'fwd_pkt_len_mean',
       'pkt_len_mean', 'pkt_len_std', 'fwd_iat_tot', 'syn_flag_cnt',
       'pkt_size_avg', 'fwd_seg_size_avg']
df_realtime_selected = df_realtime[new_columns].copy()
df_realtime_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
df_realtime_selected.fillna(0, inplace=True)
df_realtime_selected = df_realtime_selected.astype(int)

# 3. Get predicted labels
pred_labels = results['predictions']

# 4. Select indices for each class
idx_class0 = np.where(pred_labels == 0)[0]
idx_class1 = np.where(pred_labels == 1)[0]

# 5. Sample the required number of samples for each class
desired_samples = {0: 13273, 1: 3832} 
np.random.seed(42)
if len(idx_class0) >= desired_samples[0]:
    idx_class0_sampled = np.random.choice(idx_class0, desired_samples[0], replace=False)
else:
    idx_class0_sampled = np.random.choice(idx_class0, desired_samples[0], replace=True)
if len(idx_class1) >= desired_samples[1]:
    idx_class1_sampled = np.random.choice(idx_class1, desired_samples[1], replace=False)
else:
    idx_class1_sampled = np.random.choice(idx_class1, desired_samples[1], replace=True)

# 6. Concatenate the sampled indices
idx_final = np.concatenate([idx_class0_sampled, idx_class1_sampled])
X_test_for_metrics = df_realtime_selected.iloc[idx_final].reset_index(drop=True)

# 7. For the training data, use your existing X_train (should be the same 10 features, already preprocessed)
#    If X_train is not a DataFrame, convert it:
if not isinstance(X_train, pd.DataFrame):
    X_train_df = pd.DataFrame(X_train, columns=new_columns)
else:
    X_train_df = X_train

# 8. Calculate metrics for each class using your function
metrics_results = []
for label, n_samples in desired_samples.items():
    # Training data for this class
    X_train_L = X_train_df[y_train == label]
    # Test data for this class
    X_test_L = X_test_for_metrics.iloc[
        np.where(pred_labels[idx_final] == label)[0]
    ]
    # Print number of samples in train and test for this class
    print(f"Class {label}: Train samples = {len(X_train_L)}, Test samples = {len(X_test_L)}")
    # Calculate metrics (set accuracy=1)
    metrics = get_statistical_dist_measures_for_class_result(
        accuracy=1.0, X_train_L=X_train_L, X_test_L=X_test_L
    )
    print(f"Class {label} metrics:", metrics)
    metrics_results.append(metrics)

📥 Loading real-time data from real.csv ...
✅ Loaded 25916 samples from real.csv
   Original shape: (25916, 83)

🎯 Step 2: Selecting the same 10 features as training...
✅ Selected same 10 features: (25916, 10)

🔧 Step 3: Applying same preprocessing as training...
   Handling NaNs and infinities...
   Converting to integer type...
✅ Preprocessed data shape: (25916, 10)

🤖 Step 4: Loading pretrained scaler and applying scaling...
✅ Applied saved scaler to real-time data
   Scaled data shape: (25916, 10)

🔮 Step 5: Loading pretrained model and making predictions...
✅ Loaded trained model from best_dnn_model_final.h5
   Model input shape: (None, 10)
   Real-time data shape: (25916, 10)
✅ Predictions completed:
   Normal traffic (Class 0): 20089 samples (77.5%)
   Anomaly traffic (Class 1): 5827 samples (22.5%)
Class 0: Train samples = 26904, Test samples = 13273
Class 0 metrics: {'Accuracy': 1.0, 'Anderson_Darling_dist': 0.012788796898042557, 'CVM_dist': 0.6782712861767471, 'DTS_dist': 0.22

In [57]:
# Convert metrics_results to the required dictionaries for each class
realtime_distances_class0 = {
    'Anderson_Darling_dist': metrics_results[0]['Anderson_Darling_dist'],
    'CVM_dist': metrics_results[0]['CVM_dist'],
    'DTS_dist': metrics_results[0]['DTS_dist'],
    'Kolmogorov_Smirnov_dist': metrics_results[0]['Kolmogorov_Smirnov_dist'],
    'Kuiper_dist': metrics_results[0]['Kuiper_dist'],
    'Wasserstein_dist': metrics_results[0]['Wasserstein_dist']
}
realtime_distances_class1 = {
    'Anderson_Darling_dist': metrics_results[1]['Anderson_Darling_dist'],
    'CVM_dist': metrics_results[1]['CVM_dist'],
    'DTS_dist': metrics_results[1]['DTS_dist'],
    'Kolmogorov_Smirnov_dist': metrics_results[1]['Kolmogorov_Smirnov_dist'],
    'Kuiper_dist': metrics_results[1]['Kuiper_dist'],
    'Wasserstein_dist': metrics_results[1]['Wasserstein_dist']
}

In [58]:
import pandas as pd

# Load training results
df_train_class0 = pd.read_excel("Class0.xlsx")
df_train_class1 = pd.read_excel("Class1.xlsx")

# Get all training distances (as arrays)
train0 = df_train_class0.iloc[0]  # If only one row, or use .mean() for average
train1 = df_train_class1.iloc[0]

# Get real-time results (from your previous metrics_results)
realtime0 = metrics_results[0]
realtime1 = metrics_results[1]

distance_names = [
    'Anderson_Darling_dist', 'CVM_dist', 'DTS_dist',
    'Kolmogorov_Smirnov_dist', 'Kuiper_dist', 'Wasserstein_dist'
]

print("\nClass 0:")
for d in distance_names:
    train_val = train0[d]
    real_val = realtime0[d]
    ratio = real_val / train_val if train_val != 0 else float('inf')
    print(f"{d}: training = {train_val:.6f} | realtime = {real_val:.6f} | ratio = {ratio:.2f}")

print("\nClass 1:")
for d in distance_names:
    train_val = train1[d]
    real_val = realtime1[d]
    ratio = real_val / train_val if train_val != 0 else float('inf')
    print(f"{d}: training = {train_val:.6f} | realtime = {real_val:.6f} | ratio = {ratio:.2f}")


Class 0:
Anderson_Darling_dist: training = 0.025108 | realtime = 0.012789 | ratio = 0.51
CVM_dist: training = 1.569996 | realtime = 0.678271 | ratio = 0.43
DTS_dist: training = 0.284515 | realtime = 0.225784 | ratio = 0.79
Kolmogorov_Smirnov_dist: training = 0.007785 | realtime = 0.003745 | ratio = 0.48
Kuiper_dist: training = 0.010759 | realtime = 0.005282 | ratio = 0.49
Wasserstein_dist: training = 10.844416 | realtime = 10.033164 | ratio = 0.93

Class 1:
Anderson_Darling_dist: training = 0.006444 | realtime = 0.005085 | ratio = 0.79
CVM_dist: training = 0.119378 | realtime = 0.077364 | ratio = 0.65
DTS_dist: training = 0.294817 | realtime = 0.276289 | ratio = 0.94
Kolmogorov_Smirnov_dist: training = 0.011688 | realtime = 0.006717 | ratio = 0.57
Kuiper_dist: training = 0.011707 | realtime = 0.007309 | ratio = 0.62
Wasserstein_dist: training = 3.220949 | realtime = 2.419690 | ratio = 0.75


In [59]:
def estimate_accuracy_from_distance_ratios(training_data_path_class0, training_data_path_class1, 
                                         realtime_distances_class0, realtime_distances_class1):
    """
    Estimate accuracy based on distance ratios between training and real-time data.
    
    Parameters:
    -----------
    training_data_path_class0 : str
        Path to Class0.xlsx file
    training_data_path_class1 : str  
        Path to Class1.xlsx file
    realtime_distances_class0 : dict
        Real-time distances for class 0
    realtime_distances_class1 : dict
        Real-time distances for class 1
    
    Returns:
    --------
    tuple : (estimated_accuracy, detailed_analysis)
    """
    try:
        print("📊 Loading training data and calculating accuracy estimation...")
        
        # Load training data
        df_class0 = pd.read_excel(training_data_path_class0)
        df_class1 = pd.read_excel(training_data_path_class1)
        
        # Get best accuracy from each class (assuming first row or max accuracy)
        if 'Accuracy' in df_class0.columns:
            best_accuracy_class0 = df_class0['Accuracy'].max()
            best_idx_class0 = df_class0['Accuracy'].idxmax()
        else:
            best_accuracy_class0 = df_class0.iloc[0, 0]  # Assume first column is accuracy
            best_idx_class0 = 0
            
        if 'Accuracy' in df_class1.columns:
            best_accuracy_class1 = df_class1['Accuracy'].max()
            best_idx_class1 = df_class1['Accuracy'].idxmax()
        else:
            best_accuracy_class1 = df_class1.iloc[0, 0]  # Assume first column is accuracy
            best_idx_class1 = 0
        
        # Get best training distances
        train_distances_class0 = df_class0.iloc[best_idx_class0]
        train_distances_class1 = df_class1.iloc[best_idx_class1]
        
        print(f"Best Class 0 accuracy: {best_accuracy_class0:.4f} ({best_accuracy_class0*100:.2f}%)")
        print(f"Best Class 1 accuracy: {best_accuracy_class1:.4f} ({best_accuracy_class1*100:.2f}%)")
        
        # Distance measures
        distance_measures = [
            'Anderson_Darling_dist', 'CVM_dist', 'DTS_dist',
            'Kolmogorov_Smirnov_dist', 'Kuiper_dist', 'Wasserstein_dist'
        ]
        
        # Calculate ratios and similarities for each class
        class0_analysis = analyze_class_distances(
            train_distances_class0, realtime_distances_class0, 
            distance_measures, "Class 0", best_accuracy_class0
        )
        
        class1_analysis = analyze_class_distances(
            train_distances_class1, realtime_distances_class1, 
            distance_measures, "Class 1", best_accuracy_class1
        )
        
        # Calculate overall similarity score
        overall_similarity = calculate_weighted_similarity(class0_analysis, class1_analysis)
        
        # Estimate final accuracy
        overall_best_accuracy = max(best_accuracy_class0, best_accuracy_class1)
        estimated_accuracy = estimate_final_accuracy(overall_similarity, overall_best_accuracy)
        
        # Prepare detailed results
        detailed_analysis = {
            'class0_analysis': class0_analysis,
            'class1_analysis': class1_analysis,
            'overall_similarity': overall_similarity,
            'best_accuracy_class0': best_accuracy_class0,
            'best_accuracy_class1': best_accuracy_class1,
            'overall_best_accuracy': overall_best_accuracy,
            'estimated_accuracy': estimated_accuracy
        }
        
        print(f"\n🎯 Final Results:")
        print(f"Overall similarity score: {overall_similarity:.2f}%")
        print(f"Estimated accuracy: {estimated_accuracy:.4f} ({estimated_accuracy*100:.2f}%)")
        
        return estimated_accuracy, detailed_analysis
        
    except Exception as e:
        print(f"❌ Error in accuracy estimation: {e}")
        import traceback
        traceback.print_exc()
        return 0.5, {}

def analyze_class_distances(train_distances, realtime_distances, distance_measures, class_name, class_accuracy):
    """
    Analyze distances for a specific class and calculate similarity.
    """
    print(f"\n📏 {class_name} Distance Analysis:")
    print(f"{'Measure':<25} {'Training':<12} {'Real-time':<12} {'Ratio':<8} {'Similarity %':<12}")
    print("-" * 80)
    
    similarities = []
    valid_measures = 0
    
    for measure in distance_measures:
        try:
            if measure in train_distances.index and measure in realtime_distances:
                train_val = train_distances[measure]
                real_val = realtime_distances[measure]
                
                if pd.isna(train_val) or train_val == 0:
                    continue
                
                ratio = real_val / train_val
                
                # Calculate similarity based on ratio
                # Since ratio < 1 means real-time distances are smaller (better similarity)
                if ratio <= 1.0:
                    similarity = 100 * (2 - ratio)  # Higher similarity for lower ratios
                    if similarity > 100:
                        similarity = 100
                else:
                    similarity = 100 / ratio  # Lower similarity for higher ratios
                
                similarities.append(similarity)
                valid_measures += 1
                
                print(f"{measure:<25} {train_val:<12.6f} {real_val:<12.6f} {ratio:<8.2f} {similarity:<12.2f}")
                
        except Exception as e:
            print(f"Error processing {measure}: {e}")
            continue
    
    avg_similarity = np.mean(similarities) if similarities else 0
    print(f"\nAverage similarity for {class_name}: {avg_similarity:.2f}%")
    
    return {
        'similarities': similarities,
        'average_similarity': avg_similarity,
        'valid_measures': valid_measures,
        'class_accuracy': class_accuracy
    }

def calculate_weighted_similarity(class0_analysis, class1_analysis):
    """
    Calculate weighted overall similarity based on both classes.
    """
    total_weighted_similarity = 0
    total_weight = 0
    
    # Weight by number of valid measures and class accuracy
    for analysis in [class0_analysis, class1_analysis]:
        if analysis['valid_measures'] > 0:
            weight = analysis['valid_measures'] * analysis['class_accuracy']
            total_weighted_similarity += analysis['average_similarity'] * weight
            total_weight += weight
    
    if total_weight > 0:
        overall_similarity = total_weighted_similarity / total_weight
    else:
        overall_similarity = 50.0  # Default if no valid measures
    
    return overall_similarity

def estimate_final_accuracy(similarity_score, best_training_accuracy):
    """
    Estimate final accuracy based on similarity score.
    """
    # Define confidence levels and retention factors
    if similarity_score >= 95.0:
        retention_factor = 0.98
        confidence = "EXCELLENT"
    elif similarity_score >= 90.0:
        retention_factor = 0.95
        confidence = "VERY_HIGH"
    elif similarity_score >= 85.0:
        retention_factor = 0.90
        confidence = "HIGH"
    elif similarity_score >= 80.0:
        retention_factor = 0.85
        confidence = "GOOD"
    elif similarity_score >= 75.0:
        retention_factor = 0.80
        confidence = "MODERATE"
    elif similarity_score >= 70.0:
        retention_factor = 0.75
        confidence = "FAIR"
    else:
        retention_factor = 0.65
        confidence = "LOW"
    
    estimated_accuracy = best_training_accuracy * retention_factor
    
    print(f"\nConfidence level: {confidence}")
    print(f"Retention factor: {retention_factor:.2f}")
    
    return estimated_accuracy

In [60]:
estimated_accuracy, analysis = estimate_accuracy_from_distance_ratios(
        training_data_path_class0="Class0.xlsx",
        training_data_path_class1="Class1.xlsx",
        realtime_distances_class0=realtime_distances_class0,
        realtime_distances_class1=realtime_distances_class1
    )

📊 Loading training data and calculating accuracy estimation...
Best Class 0 accuracy: 0.9980 (99.80%)
Best Class 1 accuracy: 0.9980 (99.80%)

📏 Class 0 Distance Analysis:
Measure                   Training     Real-time    Ratio    Similarity %
--------------------------------------------------------------------------------
Anderson_Darling_dist     0.025108     0.012789     0.51     100.00      
CVM_dist                  1.569996     0.678271     0.43     100.00      
DTS_dist                  0.284515     0.225784     0.79     100.00      
Kolmogorov_Smirnov_dist   0.007785     0.003745     0.48     100.00      
Kuiper_dist               0.010759     0.005282     0.49     100.00      
Wasserstein_dist          10.844416    10.033164    0.93     100.00      

Average similarity for Class 0: 100.00%

📏 Class 1 Distance Analysis:
Measure                   Training     Real-time    Ratio    Similarity %
--------------------------------------------------------------------------------
Ande

In [61]:
def safeml_confidence_assessment_both_classes(best_training_accuracy, estimated_accuracy, distance_analysis):
    """
    Assess confidence using SafeML methodology based on BOTH Class 0 and Class 1 statistical distances.
    Uses comprehensive analysis of all distance measures for decision making.
    
    Parameters:
    -----------
    best_training_accuracy : float
        Best accuracy achieved during training
    estimated_accuracy : float
        Estimated accuracy based on both classes' distances
    distance_analysis : dict
        Detailed analysis of distance differences for both classes
    
    Returns:
    --------
    tuple : (decision, confidence_level, accuracy_difference)
    """
    # Calculate accuracy difference
    accuracy_difference = abs(best_training_accuracy - estimated_accuracy)
    overall_similarity = distance_analysis.get('overall_similarity', 0.0)
    overall_diff_percent = 100 - overall_similarity
    
    print(f"\n" + "="*80)
    print(f"SafeML Confidence Assessment (Both Classes):")
    print(f"   Best training accuracy: {best_training_accuracy:.4f} ({best_training_accuracy*100:.2f}%)")
    print(f"   Estimated accuracy: {estimated_accuracy:.4f} ({estimated_accuracy*100:.2f}%)")
    print(f"   Accuracy difference: {accuracy_difference:.4f} ({accuracy_difference*100:.2f}%)")
    print(f"   Overall similarity score: {overall_similarity:.2f}%")
    print(f"   Overall distance difference: {overall_diff_percent:.2f}%")
    
    # Multi-criteria decision making
    decision_factors = []
    
    # Factor 1: Overall similarity threshold
    if overall_similarity >= 90.0:
        similarity_decision = "AUTONOMOUS"
        similarity_confidence = "HIGH"
        decision_factors.append(("High Similarity", "AUTONOMOUS", "HIGH"))
    elif overall_similarity >= 75.0:
        similarity_decision = "AUTONOMOUS"
        similarity_confidence = "MODERATE"
        decision_factors.append(("Moderate Similarity", "AUTONOMOUS", "MODERATE"))
    else:
        similarity_decision = "HUMAN_INTERVENTION"
        similarity_confidence = "LOW"
        decision_factors.append(("Low Similarity", "HUMAN_INTERVENTION", "LOW"))
    
    # Factor 2: Accuracy drop threshold
    accuracy_drop_percent = (accuracy_difference / best_training_accuracy) * 100
    if accuracy_drop_percent <= 5.0:
        accuracy_decision = "AUTONOMOUS"
        accuracy_confidence = "HIGH"
        decision_factors.append(("Small Accuracy Drop", "AUTONOMOUS", "HIGH"))
    elif accuracy_drop_percent <= 15.0:
        accuracy_decision = "AUTONOMOUS"
        accuracy_confidence = "MODERATE"
        decision_factors.append(("Moderate Accuracy Drop", "AUTONOMOUS", "MODERATE"))
    else:
        accuracy_decision = "HUMAN_INTERVENTION"
        accuracy_confidence = "LOW"
        decision_factors.append(("Large Accuracy Drop", "HUMAN_INTERVENTION", "LOW"))
    
    # Factor 3: Class-specific analysis
    class0_analysis = distance_analysis.get('class0_analysis', {})
    class1_analysis = distance_analysis.get('class1_analysis', {})
    
    # Analyze Class 0
    if class0_analysis and class0_analysis.get('valid_measures', 0) > 0:
        avg_similarity_class0 = class0_analysis['average_similarity']
        if avg_similarity_class0 >= 90.0:
            class0_decision = "AUTONOMOUS"
            class0_conf = "HIGH"
        elif avg_similarity_class0 >= 75.0:
            class0_decision = "AUTONOMOUS"
            class0_conf = "MODERATE"
        else:
            class0_decision = "HUMAN_INTERVENTION"
            class0_conf = "LOW"
        decision_factors.append(("Class 0 Analysis", class0_decision, class0_conf))
    
    # Analyze Class 1
    if class1_analysis and class1_analysis.get('valid_measures', 0) > 0:
        avg_similarity_class1 = class1_analysis['average_similarity']
        if avg_similarity_class1 >= 90.0:
            class1_decision = "AUTONOMOUS"
            class1_conf = "HIGH"
        elif avg_similarity_class1 >= 75.0:
            class1_decision = "AUTONOMOUS"
            class1_conf = "MODERATE"
        else:
            class1_decision = "HUMAN_INTERVENTION"
            class1_conf = "LOW"
        decision_factors.append(("Class 1 Analysis", class1_decision, class1_conf))
    
    print(f"\nDecision Factors Analysis:")
    for factor_name, factor_decision, factor_confidence in decision_factors:
        print(f"   {factor_name:<25}: {factor_decision:<20} ({factor_confidence} confidence)")
    
    # Final decision based on majority vote and severity
    autonomous_votes = sum(1 for _, decision, _ in decision_factors if decision == "AUTONOMOUS")
    intervention_votes = sum(1 for _, decision, _ in decision_factors if decision == "HUMAN_INTERVENTION")
    
    # High confidence votes have more weight
    high_conf_autonomous = sum(1 for _, decision, conf in decision_factors 
                              if decision == "AUTONOMOUS" and conf == "HIGH")
    high_conf_intervention = sum(1 for _, decision, conf in decision_factors 
                                if decision == "HUMAN_INTERVENTION" and conf == "HIGH")
    
    # Conservative approach: if any high-confidence intervention vote, lean towards intervention
    if high_conf_intervention > 0:
        final_decision = "HUMAN_INTERVENTION"
        final_confidence = "LOW"
        primary_reason = "High-confidence factors indicate potential issues"
    elif high_conf_autonomous >= 2:
        final_decision = "AUTONOMOUS"
        final_confidence = "HIGH"
        primary_reason = "Multiple high-confidence factors support autonomous operation"
    elif autonomous_votes > intervention_votes:
        final_decision = "AUTONOMOUS"
        final_confidence = "MODERATE"
        primary_reason = "Majority of factors support autonomous operation"
    else:
        final_decision = "HUMAN_INTERVENTION"
        final_confidence = "LOW"
        primary_reason = "Majority of factors indicate potential issues"
    
    # Determine actions and messages
    if final_decision == "AUTONOMOUS":
        if final_confidence == "HIGH":
            message = "All indicators support autonomous operation - System highly reliable"
            action = "Continue automatic anomaly detection with normal monitoring"
        else:
            message = "System can operate autonomously with increased monitoring"
            action = "Continue automatic detection with enhanced logging and periodic checks"
    else:
        if final_confidence == "LOW":
            message = "Multiple indicators suggest reliability issues - Human intervention required"
            action = "Alert security team for immediate manual review and validation"
        else:
            message = "Some reliability concerns detected - Recommend human oversight"
            action = "Continue with manual validation of anomaly detections"
    
    print(f"\n🎯 Final SafeML Decision:")
    print(f"   Decision: {final_decision}")
    print(f"   Confidence Level: {final_confidence}")
    print(f"   Primary Reason: {primary_reason}")
    print(f"   Message: {message}")
    print(f"   Recommended Action: {action}")
    
    # Additional metrics for transparency
    print(f"\n📊 Decision Metrics:")
    print(f"   Autonomous votes: {autonomous_votes}/{len(decision_factors)}")
    print(f"   Intervention votes: {intervention_votes}/{len(decision_factors)}")
    print(f"   High-confidence autonomous: {high_conf_autonomous}")
    print(f"   High-confidence intervention: {high_conf_intervention}")
    print(f"   Accuracy drop: {accuracy_drop_percent:.1f}%")
    print(f"   Overall similarity: {overall_similarity:.1f}%")
    
    print(f"\n📋 Decision Basis: Comprehensive analysis of both Class 0 and Class 1 statistical distances")
    print("="*80)
    
    return final_decision, final_confidence, accuracy_difference


In [62]:
best_training_accuracy =analysis['overall_best_accuracy']
    
    # Step 3: Make SafeML decision
decision, confidence_level, accuracy_difference = safeml_confidence_assessment_both_classes(
        best_training_accuracy=best_training_accuracy,
        estimated_accuracy=estimated_accuracy,
        distance_analysis=analysis
    )


SafeML Confidence Assessment (Both Classes):
   Best training accuracy: 0.9980 (99.80%)
   Estimated accuracy: 0.9781 (97.81%)
   Accuracy difference: 0.0200 (2.00%)
   Overall similarity score: 100.00%
   Overall distance difference: -0.00%

Decision Factors Analysis:
   High Similarity          : AUTONOMOUS           (HIGH confidence)
   Small Accuracy Drop      : AUTONOMOUS           (HIGH confidence)
   Class 0 Analysis         : AUTONOMOUS           (HIGH confidence)
   Class 1 Analysis         : AUTONOMOUS           (HIGH confidence)

🎯 Final SafeML Decision:
   Decision: AUTONOMOUS
   Confidence Level: HIGH
   Primary Reason: Multiple high-confidence factors support autonomous operation
   Message: All indicators support autonomous operation - System highly reliable
   Recommended Action: Continue automatic anomaly detection with normal monitoring

📊 Decision Metrics:
   Autonomous votes: 4/4
   Intervention votes: 0/4
   High-confidence autonomous: 4
   High-confidence interve