In [1]:
import pandas as pd
import sklearn.metrics
import matplotlib.pyplot as plt
import itertools
import numpy as np
from typing import Any, Dict, List, Tuple, Union
from sklearn import metrics

from src.Kmeans import Kmeans
from src.DBscan import DBscan
from src.isolationForest import IsolationForest
from src.GAN import GAN

from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.utils.estimator_checks import check_estimator
import csv
import os


2024-03-25 09:15:26.454334: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-03-25 09:15:26.454401: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def compute_metrics(y_predicted, y_true):
    confusion_matrix = sklearn.metrics.confusion_matrix(y_predicted, y_true)
    precision = sklearn.metrics.precision_score(y_predicted, y_true)
    recall = sklearn.metrics.recall_score(y_predicted, y_true)
    f1 = sklearn.metrics.f1_score(y_predicted, y_true)

    return [confusion_matrix, precision, recall, f1]


In [3]:
"""
param_grid_Kmeans = {
    "n_clusters": np.arange(2, 4),
    "treshold": np.arange(0.1, 1, 0.1),
}

param_grid_DBscan = {
    "eps": np.arange(0.5, 1.5, 0.5),
    "treshold": np.arange(0.5, 2, 0.5),
    "min_samples": np.arange(1, 3, 1),
}

algorithms_params = [param_grid_Kmeans] 
algorithms = ["Kmeans"]

metrics = np.empty([2, 2, 5], dtype=object)

for i in range(1,2):

    df_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv")

    y_true = df_test[["error"]]

    for alg_id, alg in enumerate(algorithms_params):
        score = []
        for params in itertools.product(*alg.values()):

            inner_dict = {k: v for (k, v) in zip(algorithms_params[alg_id].keys(), params)}

            conf = {
                "filtering": "None",
                "train_data": f"data/sensor-cleaning-data/cleaned/train/data{i}.csv",
                "input_vector_size": 1,
                "warning_stages": [0.7, 0.9],
                **inner_dict,
                "output": [],
                "output_conf": [{}
                ],
            }

            class_name = algorithms[alg_id]

            class_ = globals()[class_name]

            detector = class_(conf)

            mask = []
            y_predicted = []

            for idx, row in df_test.iterrows():

                status_code = detector.message_insert(
                    {
                        "timestamp": df_test["offset"].iloc[idx],
                        "ftr_vector": [df_test["val"].iloc[idx]],
                    }
                )

                if status_code == 2:
                    y_predicted.append(False)
                if status_code == 1:
                    y_predicted.append(False)
                elif status_code == -1:
                    y_predicted.append(True)


            m = compute_metrics(y_predicted, y_true)
            m.append(params)
            score.append(m)
            print(m)
            #print(f"{algorithms[alg_id]}", score)
        #print(max(score, key=lambda x: x[0]))
        # max_list = max(score, key=lambda x: x[3])

        # metrics[alg_id, i-1, :] = max_list

"""



In [4]:
"""
df_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv")
# select to columns from dataframe
X = df_test[["offset", "val"]]
y = df_test[["error"]]

"""

'\ndf_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv")\n# select to columns from dataframe\nX = df_test[["offset", "val"]]\ny = df_test[["error"]]\n\n'

In [5]:
class Estimator(BaseEstimator):

    def __init__(
        self,
        
        train_data="data/sensor-cleaning-data/cleaned/train/data1.csv",
        test_data = "data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv",
        alg = "Kmeans",

        # Kmeans 
        n_clusters=1,
        treshold=1,

        #DBscan
        eps = 0.5,
        db_treshold=1,
        min_samples = 2,

        #IsolationForest
        max_samples = 100,
        max_features=1,
        contamination = 0.05,
        # GAN
        N_latent=3,
        K=8,
        len_window=500
        
       
    ):
        self.train_data = train_data
        self.test_data = test_data
        self.alg = alg

        # Kmeans
        self.n_clusters = n_clusters
        self.treshold = treshold

        # DBscan
        self.eps = eps
        self.min_samples = min_samples
        self.db_treshold = db_treshold

        # IsolationForest
        self.max_samples = max_samples 
        self.max_features = max_features
        self.contamination = contamination

        # GAN
        self.N_latent = N_latent
        self.K = K
        self.len_window = len_window

    def fit(self, X, y):

        inner_dict = {
            "train_data": self.train_data,
            # Kmeans
            "n_clusters": self.n_clusters,
            "treshold": self.treshold,
            # DBscan
            "eps": self.eps,
            "min_samples": self.min_samples,
            "db_treshold": self.db_treshold,
            # IsolationForest
            "max_samples": self.max_samples,
            "max_features": self.max_features,
            "contamination": self.contamination,
            # GAN
            "N_latent": self.N_latent,
            "K": self.K,
            "len_window": self.len_window,
        }

        conf = {
            "filtering": "None",
            "input_vector_size": 1,
            "warning_stages": [0.7, 0.9],
            "model_name":"IsolationForest",
            **inner_dict,
            "output": [],
            "output_conf": [{}],
        }

        print(conf)

        class_ = globals()[self.alg]
        self.detector_ = class_(conf)

        self.is_fitted_ = True
        # `fit` should always return `self`
        return self

    def predict(self, X):

        print(X.shape, self.alg, self.contamination, self.max_samples, self.max_features, self.train_data, self.test_data)
        print("First 10 ", X.iloc[:10])
        print("Last 10 ", X.iloc[-10:])


        y_predicted = []

        df_test = pd.read_csv(self.test_data)

        # transverse X rows
        for idx, row in X.iterrows():
            message = {
                "timestamp": df_test["offset"].iloc[idx],
                "ftr_vector": [df_test["val"].iloc[idx]],
            }

            status_code = self.detector_.message_insert(message)

            if status_code == 2:
                y_predicted.append(False)
            if status_code == 1:
                y_predicted.append(False)
            elif status_code == -1:
                y_predicted.append(True)

        return y_predicted

In [6]:
algorithms = ["DBscan"]

DBscan_params = {
    
    "alg": ["DBscan"],
    # Kmeans
    "n_clusters": [2],
    "treshold": [0.5],
    # DBscan
    "eps": np.arange(0.1, 1, 0.1),
    "db_treshold": np.arange(0.05, 1, 0.05),
    "min_samples": np.arange(15, 50, 15),
    # IsolationForest
    "max_samples": [100],
    # GAN
    "N_latent": [3],
    "K": [8],
    "len_window": [500],
}

Kmeans_params = {
    
    "alg": ["Kmeans"],
    # Kmeans
    "n_clusters": [2, 3, 4],
    "treshold": np.arange(0.05, 1, 0.05),
    # DBscan
    "eps": [0.5],
    "db_treshold": [0.5],
    "min_samples": [100],
    # IsolationForest
    "max_samples": [100],
    # GAN
    "N_latent": [3],
    "K": [8],
    "len_window": [500],
}

IsolationForest_params = {
    "alg": ["IsolationForest"],
    # Kmeans
    "n_clusters": [2],
    "treshold": [0.5],
    # DBscan
    "eps": [0.5],
    "db_treshold": [0.5],
    "min_samples": [100],
    # IsolationForest
    "max_samples": np.arange(2500, 10001, 2500),
    "max_features": [1],
    "contamination": np.arange(0.001, 0.01, 0.001),
    # GAN
    "N_latent": [3],
    "K": [8],
    "len_window": [500],
}


for alg in algorithms:

    for i in range(7, 10):

        df_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data{i}.csv")
        # select to columns from dataframe
        X = df_test[["offset", "val"]]
        y = df_test[["error"]]

        print("Length of X and y", X.shape, y.shape)

        test_params = {
            **eval(f"{alg}_params"),
            "train_data": [f"data/sensor-cleaning-data/cleaned/train/data{i}.csv"],
            "test_data": [
                f"data/sensor-cleaning-data/cleaned/test/reformatted/data{i}.csv"
            ],
        }

        estimator = Estimator()

        clf = GridSearchCV(
            estimator,
            param_grid=test_params,
            scoring="f1",
            
            cv=TimeSeriesSplit(n_splits=2),
        )

        print("Length of X and y", X.shape, y.shape)
        print(X.iloc[:10])

        selected = clf.fit(X, y)

        print("Length ", X.shape, y.shape)
        print(selected)

        best_estimator = clf.best_estimator_
        print(best_estimator)
        y_pred = best_estimator.predict(X)
        print(y_pred)

        fpr, tpr, thresholds = metrics.roc_curve(y, y_pred)
        roc_auc = metrics.auc(fpr, tpr)

        comp_metrics = compute_metrics(y_pred, y)

        fig, ax = plt.subplots()

        ax.plot(fpr, tpr, color="blue", lw=2, label=f"AUC = {roc_auc}\n Conf_matrix={comp_metrics[0]}\n Precision={comp_metrics[1]}\n Recall={comp_metrics[2]}\n F1={comp_metrics[3]}",)
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.legend()
        alg_name = selected.cv_results_["params"][0]["alg"]
        plt.savefig(f'results_1/{alg_name}-data{i}.png')
        plt.close

        transposed_data = zip(*[selected.cv_results_[key] for key in selected.cv_results_])
        is_empty = (
            not os.path.exists(f"results_1/{alg_name}.csv")
            or os.path.getsize(f"results_1/{alg_name}.csv") == 0
        )

        with open(f"results_1/{alg_name}.csv", "a", newline="") as f:
            writer = csv.writer(f)

            # Write headers only if the file is empty
            if is_empty:
                writer.writerow(selected.cv_results_.keys())

            # Write data rows
            writer.writerows(transposed_data)

Length of X and y (9425, 2) (9425, 1)
Length of X and y (9425, 2) (9425, 1)
   offset       val
0  219.92  1.161799
1  219.93  0.185583
2  219.94  0.921674
3  219.95  0.304838
4  219.96 -2.034051
5  219.97 -0.588988
6  219.98  0.205164
7  219.99  0.245865
8  220.00  0.947986
9  220.01 -0.290984
(3141, 2) DBscan 0.05 100 1 data/sensor-cleaning-data/cleaned/train/data7.csv data/sensor-cleaning-data/cleaned/test/reformatted/data7.csv
First 10        offset       val
3143  251.35  0.744727
3144  251.36 -1.491226
3145  251.37 -1.057988
3146  251.38 -0.436209
3147  251.39 -0.671923
3148  251.40  0.334422
3149  251.41 -0.095149
3150  251.42  1.551743
3151  251.43 -0.176085
3152  251.44 -0.775500
Last 10        offset       val
6274  282.66  0.924732
6275  282.67 -0.035935
6276  282.68  0.500087
6277  282.69  1.333894
6278  282.70 -0.453388
6279  282.71 -0.418612
6280  282.72  0.390717
6281  282.73 -0.045148
6282  282.74  0.067653
6283  282.75 -0.057229
(3141, 2) DBscan 0.05 100 1 data/sensor-

KeyboardInterrupt: 

In [None]:
Estimator(max_features=1, contamination=0.01, max_samples=100).fit(X,y)

In [None]:
print(type(Estimator().fit(X,y).predict(X)))

In [None]:
selected.cv_results_["params"][0]["alg"]

In [None]:
keys_list = list(selected.cv_results_.keys())


In [None]:
for key in selected.cv_results_:
    print(key)
    print(type(selected.cv_results_[key]))
    print(len(selected.cv_results_[key]))

In [None]:
selected.best_params_


In [None]:
selected.best_score_

In [None]:
clf.best_estimator_

In [None]:
best_estimator = clf.best_estimator_
y_pred = best_estimator.predict(X)

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y, y_pred)
roc_auc = metrics.auc(fpr, tpr)
print(fpr, tpr)
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='')
display.plot()
plt.show()

In [None]:
metrics = compute_metrics(y_pred, y)
plt.plot(fpr, tpr, color="blue", lw=2, label=f"AUC = {roc_auc}\n Conf_matrix={metrics[0]}\n Precision={metrics[1]}\n Recall={metrics[2]}\n F1={metrics[3]}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()

In [None]:
transposed_data = zip(*[selected.cv_results_[key] for key in selected.cv_results_])


with open("results_1/Kmeans.csv", "a", newline="") as f:
    writer = csv.writer(f)

    # Write headers
    writer.writerow(selected.cv_results_.keys())

    # Write data rows
    writer.writerows(transposed_data)

In [None]:
df = pd.read_csv("results_1/IsolationForest.csv")

In [None]:
# Check the length of the DataFrame
print(len(df))

# Check the index of the row with the maximum mean test score
print(df["mean_test_score"].idxmax())

351
113


In [None]:
df.iloc[-5:]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_K,param_N_latent,param_alg,param_contamination,param_db_treshold,param_eps,...,param_n_clusters,param_test_data,param_train_data,param_treshold,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
346,3.068847,0.020908,63.101995,0.21515,8,3,IsolationForest,0.008,0.5,0.5,...,2,data/sensor-cleaning-data/cleaned/test/reforma...,data/sensor-cleaning-data/cleaned/train/data8.csv,0.5,"{'K': 8, 'N_latent': 3, 'alg': 'IsolationFores...",0.232558,0.204082,0.21832,0.014238,12
347,2.678781,0.035128,64.127148,1.852152,8,3,IsolationForest,0.009,0.5,0.5,...,2,data/sensor-cleaning-data/cleaned/test/reforma...,data/sensor-cleaning-data/cleaned/train/data8.csv,0.5,"{'K': 8, 'N_latent': 3, 'alg': 'IsolationFores...",0.238095,0.217391,0.227743,0.010352,8
348,2.804109,0.023218,62.942776,0.662056,8,3,IsolationForest,0.009,0.5,0.5,...,2,data/sensor-cleaning-data/cleaned/test/reforma...,data/sensor-cleaning-data/cleaned/train/data8.csv,0.5,"{'K': 8, 'N_latent': 3, 'alg': 'IsolationFores...",0.232558,0.2,0.216279,0.016279,14
349,2.854234,0.017731,62.741606,0.093628,8,3,IsolationForest,0.009,0.5,0.5,...,2,data/sensor-cleaning-data/cleaned/test/reforma...,data/sensor-cleaning-data/cleaned/train/data8.csv,0.5,"{'K': 8, 'N_latent': 3, 'alg': 'IsolationFores...",0.227273,0.204082,0.215677,0.011596,15
350,3.155751,0.017242,63.207887,0.208412,8,3,IsolationForest,0.009,0.5,0.5,...,2,data/sensor-cleaning-data/cleaned/test/reforma...,data/sensor-cleaning-data/cleaned/train/data8.csv,0.5,"{'K': 8, 'N_latent': 3, 'alg': 'IsolationFores...",0.227273,0.196078,0.211676,0.015597,17


In [None]:
df.iloc[df[df["param_train_data"] == "data/sensor-cleaning-data/cleaned/train/data1.csv"]["mean_test_score"].idxmax()]

mean_fit_time                                                   2.518151
std_fit_time                                                    0.006514
mean_score_time                                                64.288157
std_score_time                                                  1.341275
param_K                                                                8
param_N_latent                                                         3
param_alg                                                IsolationForest
param_contamination                                                 0.01
param_db_treshold                                                    0.5
param_eps                                                            0.5
param_len_window                                                     500
param_max_features                                                     1
param_max_samples                                                   2000
param_min_samples                                  

In [None]:
df.iloc[df[df["param_train_data"] == "data/sensor-cleaning-data/cleaned/train/data1.csv"]["split1_test_score"].idxmax()]

In [None]:
df.tail(15)['param_train_data']