In [1]:
import pandas as pd
import numpy as np
import random
from sklearn import metrics
from sklearn.metrics import (roc_auc_score, precision_score, average_precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error, roc_curve, auc, classification_report,auc,confusion_matrix,matthews_corrcoef)
from sklearn import logger
from sklearn.datasets import make_blobs,make_multilabel_classification
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KernelCenterer,LabelEncoder, MinMaxScaler, Normalizer, QuantileTransformer, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, StandardScaler
from sklearn.manifold import TSNE
import time
from sklearn.metrics import confusion_matrix,classification_report
import scipy as sp
from scipy.linalg import svd,null_space
import os    
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.cluster import KMeans,AgglomerativeClustering,SpectralClustering
from sklearn.mixture import GaussianMixture
from scipy.sparse import csr_matrix as sp
import math
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cdist
from mpl_toolkits.mplot3d import Axes3D  # Import 3D plotting tools
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture


In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.cluster import KMeans
from scipy.linalg import null_space
from scipy.spatial.distance import cdist
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import cProfile
import pstats

class GMM_NFST:
    def __init__(self, type="SemiSup", npd=None, null_point_X=None, alpha=None):
        if type != "OC" and alpha is not None:
            if not isinstance(alpha, (int, float)) or alpha <= 0:
                raise ValueError("Alpha must be a positive number when type is not 'OC'")
        
        self.type = type
        self.npd = npd if npd is not None else np.array([])
        self.nullpoint_X = null_point_X if null_point_X is not None else np.array([])
        self.centroids = None
        self.kmeans = None

    def calculate_NPD(self, X_train, y_train):
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        X_train, y_train = self.cluster_kmeans(X_train, y_train)
        X = X_train.T
        c = len(np.unique(y_train))
        d, N = X.shape

        print(f"N = {N}, d = {d}, c = {c}")

        mean_total = np.mean(X, axis=1, keepdims=True)
        P_t = X - mean_total
        P_w = np.zeros_like(X)
        for i in range(c):
            class_indices = y_train == i
            class_mean = np.mean(X[:, class_indices], axis=1, keepdims=True)
            P_w[:, class_indices] = X[:, class_indices] - class_mean

        S_w = np.dot(P_w, P_w.T) / N
        S_t = np.dot(P_t, P_t.T) / N

        print(f"P_w: d x N = {P_w.shape}")
        print(f"P_t: d x N = {P_t.shape}")
        print(f"S_w: d x d = {S_w.shape}")
        print(f"S_t: d x d = {S_t.shape}")

        U, S, Vt = np.linalg.svd(P_t, full_matrices=False)
        Q = U
        B = null_space(Q.T @ S_w @ Q)
        W = Q @ B

        print(f"Q: d x r = {Q.shape}")
        print(f"B: r x L = {B.shape}")
        print(f"W: d x L = {W.shape}")

        return W

    def distance_vector(self, null_point_X, null_point_Y):
        norm_X = np.sum(null_point_X**2, axis=1)
        norm_Y = np.sum(null_point_Y**2, axis=1)
        dot_product = np.dot(null_point_Y, null_point_X.T)
        distance = np.sqrt(np.maximum(norm_Y[:, np.newaxis] + norm_X[np.newaxis, :] - 2 * dot_product, 0))
        return distance

    def fit(self, X_train, y_train):
        self.npd = self.calculate_NPD(X_train, y_train)
        self.nullpoint_X = np.dot(X_train, self.npd)
        # Project centroids to null space
        if self.centroids is not None:
            self.centroids = np.dot(self.centroids, self.npd)

    def predict(self, X_test):
        null_point_X_t = np.dot(X_test, self.npd)
        distest = self.distance_vector(self.nullpoint_X, null_point_X_t)
        scores = np.amin(distest, axis=1)
        return scores

    def cluster_kmeans(self, X_train, y_train):
        print("Starting K-Means clustering...")
        initial_k = len(np.unique(y_train))
        self.kmeans = KMeans(n_clusters=initial_k, random_state=42, n_init=10)
        cluster_labels = self.kmeans.fit_predict(X_train)
        self.centroids = self.kmeans.cluster_centers_
        sorted_indices = np.argsort(cluster_labels)
        sorted_data = X_train[sorted_indices]
        sorted_labels = cluster_labels[sorted_indices]
        print(f"Final number of clusters: {len(np.unique(sorted_labels))}")
        return sorted_data, sorted_labels

    def predict_proba(self, X_test):
        if self.centroids is None or self.npd is None:
            raise ValueError("Model must be fitted before calling predict_proba.")

        # Project test points to null space
        X_test_null = np.dot(X_test, self.npd)
        # Compute distances from test points to projected centroids
        distances = cdist(X_test_null, self.centroids, metric='euclidean')
        min_distances = np.min(distances, axis=1)
        closest_centroid_idx = np.argmin(distances, axis=1)

        # Compute distances between all pairs of centroids in null space
        centroid_distances = cdist(self.centroids, self.centroids, metric='euclidean')
        np.fill_diagonal(centroid_distances, np.inf)
        min_centroid_distances = np.min(centroid_distances, axis=1)

        proba = np.zeros(X_test.shape[0])
        for i in range(X_test.shape[0]):
            d_test = min_distances[i]
            closest_centroid = closest_centroid_idx[i]
            d_centroids = min_centroid_distances[closest_centroid]
            threshold = d_centroids / 2
            ratio = d_test / threshold
            proba[i] = 1 / (1 + np.exp(ratio - 1))

        return proba

    def predict_label(self, X_test, threshold=0.5):
        proba = self.predict_proba(X_test)
        return (proba >= threshold).astype(int)

def preprocess_data_noise(train_data, test_data, noise_percentage=0):
    print("..............................Data Overview................................")
    print("Train Data Shape:", train_data.shape)
    print("Test Data Shape:", test_data.shape)
    
    X_train_total = train_data.iloc[:, :-1].to_numpy()
    y_train_total = train_data.iloc[:, -1].to_numpy()

    X_train = X_train_total[y_train_total == 0]
    y_train = y_train_total[y_train_total == 0]

    print("Train Data Labels [0]:", np.unique(y_train))

    n_samples = X_train.shape[0]
    noise_samples_count = int(n_samples * (noise_percentage / 100))

    X_train_noise = X_train_total[y_train_total == 1]
    noisy_indices = np.random.choice(X_train_noise.shape[0], size=noise_samples_count, replace=False)
    X_train_noise = X_train_noise[noisy_indices]
    
    X_train = np.vstack((X_train, X_train_noise))
    y_train = np.concatenate((y_train, np.ones(X_train_noise.shape[0])))
    
    X_test = test_data.iloc[:, :-1].to_numpy()
    y_test = test_data.iloc[:, -1].to_numpy()

    n_samples = X_train.shape[0]
    n_features = X_train.shape[1]
    print("Number of samples after adding noise:", n_samples)
    print("Number of features:", n_features)

    return X_train, y_train, X_test, y_test

# Main processing loop
dataset_prefixes = ['data_CICIoT2023.csv', 'data_N_BaIoT.csv', 'data_BoTIoT.csv']
scaler_names = ['MinMaxScaler', 'Normalizer', 'StandardScaler', 'QuantileTransformer']
noise_percentage = 0  # As per the 5% in the filename
results = []

for prefix in dataset_prefixes:
    print("-"*50)
    print("--------", prefix, "-"*30)
    print("-"*50)

    base_output_file = f"Results/OURMODEL/SCALERS/{prefix}_100k_oldlear555n"
    output_file = base_output_file + "0.csv"
    counter = 0
    while os.path.exists(output_file):
        counter += 1
        output_file = f"{base_output_file}{counter}.csv"

    for scaler in scaler_names:
        print(f"Processing dataset {prefix} with {scaler} scaler ...")
        
        train_file = f'Datascaled/Train_{scaler}_{prefix}'
        test_file = f'Datascaled/Test_{scaler}_{prefix}'
        
        try:
            df_train = pd.read_csv(train_file)
            df_test = pd.read_csv(test_file)
        except FileNotFoundError:
            print(f"Error: Files {train_file} or {test_file} not found. Skipping...")
            continue

        # Preprocess data with noise
        X_train, y_train, X_test, y_test = preprocess_data_noise(df_train, df_test, noise_percentage)

        # Initialize and profile the model
        model = GMM_NFST(type="SemiSup")
        profiler = cProfile.Profile()
        profiler.enable()

        # Fit the model
        model.fit(X_train, y_train)

        # Predict labels and probabilities
        y_pred = model.predict_label(X_test)
        y_proba = model.predict_proba(X_test)

        profiler.disable()
        stats = pstats.Stats(profiler).sort_stats('cumulative')
        stats.dump_stats(f"Results/OURMODEL/SCALERS/56profile_{prefix}_{scaler}.prof")

        # Evaluate performance
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        auc = roc_auc_score(y_test, y_proba) if len(np.unique(y_test)) > 1 else np.nan

        # Store results
        results.append({
            'Dataset': prefix,
            'Scaler': scaler,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'AUC-ROC': auc
        })

        print(f"Results for {prefix} with {scaler}:")
        print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, AUC-ROC: {auc:.4f}")

    # Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

print("Processing complete.")

--------------------------------------------------
-------- data_CICIoT2023.csv ------------------------------
--------------------------------------------------
Processing dataset data_CICIoT2023.csv with MinMaxScaler scaler ...
..............................Data Overview................................
Train Data Shape: (69659, 41)
Test Data Shape: (29855, 41)
Train Data Labels [0]: [0]
Number of samples after adding noise: 2119
Number of features: 40
Starting K-Means clustering...
Final number of clusters: 1
N = 2119, d = 40, c = 1
P_w: d x N = (40, 2119)
P_t: d x N = (40, 2119)
S_w: d x d = (40, 40)
S_t: d x d = (40, 40)
Q: d x r = (40, 40)
B: r x L = (40, 11)
W: d x L = (40, 11)
Results for data_CICIoT2023.csv with MinMaxScaler:
Accuracy: 0.9705, Precision: 0.9705, Recall: 1.0000, F1-Score: 0.9850, AUC-ROC: 0.5000
Processing dataset data_CICIoT2023.csv with Normalizer scaler ...
..............................Data Overview................................
Train Data Shape: (69659, 4

KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_dense'
Traceback (most recent call last):
  File "<__array_function__ internals>", line 180, in where
  File "/home/jupyter-iec_23se07/.local/lib/python3.10/site-packages/numpy/core/multiarray.py", line 345, in where
    @array_function_from_c_func_and_dispatcher(_multiarray_umath.where)
KeyboardInterrupt: 


Final number of clusters: 1
N = 6285, d = 115, c = 1
P_w: d x N = (115, 6285)
P_t: d x N = (115, 6285)
S_w: d x d = (115, 115)
S_t: d x d = (115, 115)
Q: d x r = (115, 115)
B: r x L = (115, 16)
W: d x L = (115, 16)
Results for data_N_BaIoT.csv with MinMaxScaler:
Accuracy: 0.9055, Precision: 0.9055, Recall: 1.0000, F1-Score: 0.9504, AUC-ROC: 0.5000
Processing dataset data_N_BaIoT.csv with Normalizer scaler ...
..............................Data Overview................................
Train Data Shape: (66479, 116)
Test Data Shape: (28491, 116)
Train Data Labels [0]: [0]
Number of samples after adding noise: 6285
Number of features: 115
Starting K-Means clustering...
Final number of clusters: 1
N = 6285, d = 115, c = 1
P_w: d x N = (115, 6285)
P_t: d x N = (115, 6285)
S_w: d x d = (115, 115)
S_t: d x d = (115, 115)
Q: d x r = (115, 115)
B: r x L = (115, 16)
W: d x L = (115, 16)
Results for data_N_BaIoT.csv with Normalizer:
Accuracy: 0.9055, Precision: 0.9055, Recall: 1.0000, F1-Score: 0

ValueError: Input X contains NaN.
KMeans does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values