In [None]:
import pandas as pd
import sklearn.metrics
import matplotlib.pyplot as plt
import itertools
import numpy as np
from typing import Any, Dict, List, Tuple, Union
from sklearn import metrics

from src.algorithms.kmeans import Kmeans
from src.algorithms.dbscan import DBscan
from src.algorithms.isolation_forest import IsolationForest
from src.algorithms.gan import GAN
from src.algorithms.border_check import BorderCheck
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.utils.estimator_checks import check_estimator
import csv
import os
import multiprocessing as mp
import time


In [None]:
def compute_metrics(y_true, y_predicted):
    """Score binary predictions against the ground truth.

    Args:
        y_true: Ground-truth labels.
        y_predicted: Predicted labels, aligned with ``y_true``.

    Returns:
        dict: confusion matrix, precision, recall, F1 and accuracy as computed
        by ``sklearn.metrics``. Every key ends with a trailing space
        (e.g. ``"precision "``).
    """
    # (result key, scoring function) pairs, evaluated in this order.
    scorers = (
        ("confusion matrix ", sklearn.metrics.confusion_matrix),
        ("precision ", sklearn.metrics.precision_score),
        ("recall ", sklearn.metrics.recall_score),
        ("f1 ", sklearn.metrics.f1_score),
        ("accuracy ", sklearn.metrics.accuracy_score),
    )
    return {key: scorer(y_true, y_predicted) for key, scorer in scorers}


In [None]:
"""
param_grid_Kmeans = {
    "n_clusters": np.arange(2, 4),
    "treshold": np.arange(0.1, 1, 0.1),
}

param_grid_DBscan = {
    "eps": np.arange(0.5, 1.5, 0.5),
    "treshold": np.arange(0.5, 2, 0.5),
    "min_samples": np.arange(1, 3, 1),
}

algorithms_params = [param_grid_Kmeans] 
algorithms = ["Kmeans"]

metrics = np.empty([2, 2, 5], dtype=object)

for i in range(1,2):

    df_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv")

    y_true = df_test[["error"]]

    for alg_id, alg in enumerate(algorithms_params):
        score = []
        for params in itertools.product(*alg.values()):

            inner_dict = {k: v for (k, v) in zip(algorithms_params[alg_id].keys(), params)}

            conf = {
                "filtering": "None",
                "train_data": f"data/sensor-cleaning-data/cleaned/train/data{i}.csv",
                "input_vector_size": 1,
                "warning_stages": [0.7, 0.9],
                **inner_dict,
                "output": [],
                "output_conf": [{}
                ],
            }

            class_name = algorithms[alg_id]

            class_ = globals()[class_name]

            detector = class_(conf)

            mask = []
            y_predicted = []

            for idx, row in df_test.iterrows():

                status_code = detector.message_insert(
                    {
                        "timestamp": df_test["offset"].iloc[idx],
                        "ftr_vector": [df_test["val"].iloc[idx]],
                    }
                )

                if status_code == 2:
                    y_predicted.append(False)
                if status_code == 1:
                    y_predicted.append(False)
                elif status_code == -1:
                    y_predicted.append(True)


            m = compute_metrics(y_predicted, y_true)
            m.append(params)
            score.append(m)
            print(m)
            #print(f"{algorithms[alg_id]}", score)
        #print(max(score, key=lambda x: x[0]))
        # max_list = max(score, key=lambda x: x[3])

        # metrics[alg_id, i-1, :] = max_list

"""

In [None]:
"""
df_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv")
# select to columns from dataframe
X = df_test[["offset", "val"]]
y = df_test[["error"]]

"""

In [None]:
class Estimator(BaseEstimator):
    """scikit-learn style wrapper around the project's streaming detectors.

    The wrapper lets ``GridSearchCV`` tune any of the detector classes imported
    by this notebook (``Kmeans``, ``DBscan``, ``IsolationForest``, ``GAN``,
    ``BorderCheck``). Hyper-parameters of *all* algorithms are accepted and
    forwarded in one configuration dict; ``alg`` selects which detector class
    is instantiated.

    Args:
        train_data: Path of the training CSV forwarded to the detector.
        alg: Name of the detector class, resolved through ``globals()``.
        n_clusters, treshold: Kmeans settings.
        eps, db_treshold, min_samples: DBscan settings.
        max_samples, max_features, contamination: IsolationForest settings.
        N_latent, K, len_window: GAN settings.
        UL, LL: BorderCheck upper/lower limits.

    Attributes:
        detector_: Detector instance created by :meth:`fit`.
        is_fitted_: ``True`` once :meth:`fit` has run.
    """

    def __init__(
        self,
        train_data="data/train/ads-1.csv",
        alg="Kmeans",
        # Kmeans
        n_clusters=1,
        treshold=1,
        # DBscan
        eps=0.5,
        db_treshold=1,
        min_samples=2,
        # IsolationForest
        max_samples=100,
        max_features=1,
        contamination=0.05,
        # GAN
        N_latent=3,
        K=8,
        len_window=500,
        # BorderCheck
        UL=0.5,
        LL=-0.5,
    ):
        # scikit-learn's get_params/set_params/clone rely on every constructor
        # argument being stored unchanged under the same attribute name.
        self.train_data = train_data
        self.alg = alg

        # Kmeans
        self.n_clusters = n_clusters
        self.treshold = treshold

        # DBscan
        self.eps = eps
        self.min_samples = min_samples
        self.db_treshold = db_treshold

        # IsolationForest
        self.max_samples = max_samples
        self.max_features = max_features
        self.contamination = contamination

        # GAN
        self.N_latent = N_latent
        self.K = K
        self.len_window = len_window

        # BorderCheck
        self.UL = UL
        self.LL = LL

    def _build_conf(self):
        """Assemble the configuration dict handed to the detector constructor."""
        return {
            "filtering": "None",
            "input_vector_size": 1,
            "warning_stages": [0.7, 0.9],
            # NOTE(review): the model name is fixed to "IsolationForest" no
            # matter which `alg` is selected - confirm this is intended.
            "model_name": "IsolationForest",
            "train_data": self.train_data,
            # Kmeans
            "n_clusters": self.n_clusters,
            "treshold": self.treshold,
            # DBscan
            "eps": self.eps,
            "min_samples": self.min_samples,
            # NOTE(review): verify the DBscan detector reads "db_treshold" and
            # not "treshold"; otherwise tuning db_treshold has no effect.
            "db_treshold": self.db_treshold,
            # IsolationForest
            "max_samples": self.max_samples,
            "max_features": self.max_features,
            "contamination": self.contamination,
            # GAN
            "N_latent": self.N_latent,
            "K": self.K,
            "len_window": self.len_window,
            # BorderCheck
            "UL": self.UL,
            "LL": self.LL,
            "output": [],
            "output_conf": [{}],
        }

    def fit(self, X, y=None):
        """Instantiate the selected detector from the current hyper-parameters.

        ``X`` and ``y`` are accepted for scikit-learn API compatibility only;
        the detector receives just the configuration dict (which carries the
        ``train_data`` path).

        Returns:
            Estimator: ``self``.

        Raises:
            KeyError: if ``alg`` is not a name defined in the notebook globals.
        """
        # The detector class is looked up by name among the notebook's globals
        # (i.e. the classes imported from src.algorithms).
        detector_class = globals()[self.alg]
        self.detector_ = detector_class(self._build_conf())

        self.is_fitted_ = True

        # `fit` should always return `self`
        return self

    def predict(self, X):
        """Feed the rows of ``X`` to the detector and flag anomalies.

        Args:
            X: DataFrame with ``timestamp`` and ``ftr_vector`` columns; rows
                are sent one by one, in order, through ``message_insert``.

        Returns:
            list[bool]: exactly one entry per row of ``X``; ``True`` when the
            detector returned status code ``-1``, ``False`` for any other code.

        Raises:
            sklearn.exceptions.NotFittedError: if called before :meth:`fit`.
        """
        check_is_fitted(self, "detector_")

        y_predicted = []
        for _, row in X.iterrows():
            status_code = self.detector_.message_insert(
                {
                    "timestamp": row["timestamp"],
                    "ftr_vector": [row["ftr_vector"]],
                }
            )
            # Always emit one prediction per row: an unexpected status code
            # must not shorten the output, or scoring against y would fail
            # with a length mismatch.
            y_predicted.append(bool(status_code == -1))

        return y_predicted

In [None]:
# Names of the algorithms to tune; each needs a matching `<name>_params` grid.
algorithms = ["Kmeans"]

# One search grid per algorithm. "alg" selects the detector class and the
# remaining keys are the hyper-parameters that vary for that algorithm.
DBscan_params = dict(
    alg=["DBscan"],
    eps=np.arange(0.1, 1, 0.1),
    db_treshold=np.arange(0.05, 1, 0.05),
    min_samples=np.arange(15, 50, 15),
)

Kmeans_params = dict(
    alg=["Kmeans"],
    n_clusters=[2, 4],
    treshold=[0.1, 0.15],
)

IsolationForest_params = dict(
    alg=["IsolationForest"],
    max_samples=np.arange(2500, 10001, 2500),
    max_features=[1],
    contamination=np.arange(0.001, 0.01, 0.001),
)

BorderCheck_params = dict(
    alg=["BorderCheck"],
    UL=np.arange(0.1, 1.1, 0.1),
    LL=np.arange(-0.1, -1.1, -0.1),
)





# Grid-search every selected algorithm on every dataset index. After the loop,
# `clf`/`selected`, `X_test` and `y_test` hold the values of the last iteration
# and are used by the reporting cells that follow.
for alg in algorithms:

    for i in range(1, 2):

        df_validation = pd.read_csv(f"data/validation/ads-{i}.csv")
        X_validation = df_validation[["timestamp", "ftr_vector"]]
        y_validation = df_validation[["label"]]

        # The hold-out split must be sliced from the test file; it was
        # previously sliced from the validation frame, so "test" metrics were
        # really computed on the data used for model selection.
        df_test = pd.read_csv(f"data/test/ads-{i}.csv")
        X_test = df_test[["timestamp", "ftr_vector"]]
        y_test = df_test[["label"]]

        # Single-value defaults for every algorithm's parameters. The grid of
        # the algorithm under test is unpacked afterwards, so its value lists
        # override the defaults with the same key.
        test_params = {
            # Kmeans
            "n_clusters": [2],
            "treshold": [0.5],
            # DBscan
            "eps": [0.5],
            "db_treshold": [0.5],
            "min_samples": [100],
            # IsolationForest
            "max_samples": [10],
            "max_features": [1],
            "contamination": [0.01],
            # GAN
            "N_latent": [3],
            "K": [8],
            "len_window": [500],
            # Border Check
            "UL": [0.5],
            "LL": [-0.5],
            # Plain name lookup of `<alg>_params`; no need for eval().
            **globals()[f"{alg}_params"],
            "train_data": [f"data/train/ads-{i}.csv"],
        }

        print(test_params)

        estimator = Estimator()

        clf = GridSearchCV(
            estimator,
            param_grid=test_params,
            scoring="precision",
            # Time-ordered folds: each split trains on earlier rows only.
            cv=TimeSeriesSplit(n_splits=2),
        )

        #print(X_validation.shape, y_validation.shape)
        #print(X_validation.index[0], y_validation.index[0])
        selected = clf.fit(X_validation, y_validation)

        #best_estimator = clf.best_estimator_
        #y_pred = best_estimator.predict(X_test)

        #comp_metrics = compute_metrics(y_test, y_pred)
        #print("Metrics ", comp_metrics)

    # Disabled CSV export of cv_results_, kept as an inert string literal.
    '''  
        
        transposed_data = zip(*[selected.cv_results_[key] for key in selected.cv_results_])
        is_empty = (
            not os.path.exists(f"results_1/Kmeans-precision.csv")
            or os.path.getsize(f"results_1/Kmeans-precision.csv") == 0
        )

        with open(f"results_1/Kmeans-precision.csv", "a", newline="") as f:
            writer = csv.writer(f)

            # Write headers only if the file is empty
            if is_empty:
                writer.writerow(selected.cv_results_.keys())

            # Write data rows
            writer.writerows(transposed_data)

    '''  
        

In [None]:
# Report the winning configuration of the last grid search, then score the
# refitted best estimator on the hold-out split.
best_estimator = clf.best_estimator_
print("Best params ", selected.best_params_)

y_pred = best_estimator.predict(X_test)
mean_test_scores = selected.cv_results_["mean_test_score"]
print(mean_test_scores)

comp_metrics = compute_metrics(y_test, y_pred)
print("Metrics ", comp_metrics)

In [None]:
# Smoke test: build an Estimator with explicit IsolationForest settings (the
# default `alg` is "Kmeans") and fit it.
# NOTE(review): `X` and `y` are not assigned by any live code visible in this
# notebook - confirm where they come from before running.
Estimator(max_features=1, contamination=0.01, max_samples=100).fit(X,y)

In [None]:
# Sanity check of the return type of Estimator.predict with default settings.
# NOTE(review): relies on `X` and `y` being defined elsewhere.
print(type(Estimator().fit(X,y).predict(X)))

In [None]:
# Algorithm name used by the first parameter combination of the last search.
selected.cv_results_["params"][0]["alg"]

In [None]:
# Column names available in the grid-search result table.
keys_list = list(selected.cv_results_.keys())


In [None]:
# For every cv_results_ entry show its name, container type and length.
for key, values in selected.cv_results_.items():
    print(key)
    print(type(values))
    print(len(values))

In [None]:
# Parameter combination with the best mean cross-validated score.
selected.best_params_


In [None]:
# Mean cross-validated score (scoring="precision") of the best combination.
selected.best_score_

In [None]:
# Estimator refitted with the best parameter combination.
clf.best_estimator_

In [None]:
# Predict with the best estimator of the last grid search.
# NOTE(review): `X` is not assigned by any live code visible in this notebook.
best_estimator = clf.best_estimator_
y_pred = best_estimator.predict(X)

In [None]:
# ROC curve of the predictions in `y_pred` against the labels in `y`.
# `sklearn.metrics` is referenced by its full name so the cell keeps working
# even if the short `metrics` alias has been rebound elsewhere in the kernel.
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y, y_pred)
roc_auc = sklearn.metrics.auc(fpr, tpr)
print(fpr, tpr)
# Named `roc_display` so IPython's built-in `display` function is not shadowed.
roc_display = sklearn.metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc, estimator_name='')
roc_display.plot()
plt.show()

In [None]:
# Plot the ROC curve with the summary metrics in the legend.
# - The result is stored in `roc_metrics`, not `metrics`, so the
#   `from sklearn import metrics` alias stays usable on re-runs.
# - compute_metrics expects (y_true, y_predicted); the swapped order used
#   before exchanged precision and recall.
# - compute_metrics returns a dict whose keys end with a trailing space, so
#   values are read by key rather than by position.
roc_metrics = compute_metrics(y, y_pred)
legend_label = (
    f"AUC = {roc_auc}\n"
    f" Conf_matrix={roc_metrics['confusion matrix ']}\n"
    f" Precision={roc_metrics['precision ']}\n"
    f" Recall={roc_metrics['recall ']}\n"
    f" F1={roc_metrics['f1 ']}"
)
plt.plot(fpr, tpr, color="blue", lw=2, label=legend_label)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()

In [None]:
# Best IsolationForest configuration per dataset, from the saved results.
df_isolation = pd.read_csv('results_1/IsolationForest.csv')

print("IsolationForest")
for i in range(1, 9):
    # Match on the file-name part of the training path only.
    mask = df_isolation['param_train_data'].str.split('/').str[-1].eq(f'data{i}.csv')
    # Re-index a fresh frame instead of mutating a filtered slice in place.
    df = df_isolation[mask].reset_index(drop=True)

    # idxmax cannot pick a row when the dataset has no rows or no scores.
    if df['mean_test_score'].dropna().empty:
        print(f'data{i}.csv', 'no results found')
        continue

    max_row = df.loc[df['mean_test_score'].idxmax()]
    # NOTE(review): labels say "f1"; confirm the scoring used for this file.
    print(f'data{i}.csv', 'Max samples=',max_row.param_max_samples, 'Contamination=',max_row.param_contamination, 'Split0 f1=',max_row.split0_test_score, "Split1 f1=",max_row.split1_test_score,'Mean f1=',max_row.mean_test_score)


In [None]:
# Best DBscan configuration per dataset, from the saved results.
df_DBscan = pd.read_csv('results_1/DBscan.csv')

for i in range(1, 10):
    # Match on the file-name part of the training path only.
    mask = df_DBscan['param_train_data'].str.split('/').str[-1].eq(f'data{i}.csv')
    # Re-index a fresh frame instead of mutating a filtered slice in place.
    df = df_DBscan[mask].reset_index(drop=True)

    # idxmax cannot pick a row when the dataset has no rows or no scores.
    if df['split1_test_score'].dropna().empty:
        print(f'data{i}.csv', 'no results found')
        continue

    # NOTE(review): ranks by the second split's score, unlike the sibling
    # summaries which rank by the mean - confirm this is intended.
    max_row = df.loc[df['split1_test_score'].idxmax()]
    print(f'data{i}.csv', 'eps=',max_row.param_eps, 'Min samples=',max_row.param_min_samples, 'Treshold=', max_row.param_db_treshold, 'Split0 f1=',max_row.split0_test_score, 'Split1 f1=',max_row.split1_test_score, "Mean f1=",max_row.mean_test_score)


In [None]:
# Best Kmeans configuration per dataset, from the saved results.
df_kmeans = pd.read_csv('results_1/Kmeans.csv')

for i in range(1, 5):
    mask = df_kmeans['param_train_data'].eq(f'data/train/ads-{i}.csv')
    # Re-index a fresh frame instead of mutating a filtered slice in place.
    df = df_kmeans[mask].reset_index(drop=True)

    # idxmax cannot pick a row when the dataset has no rows or no scores.
    if df['mean_test_score'].dropna().empty:
        print(f'data{i}.csv', 'no results found')
        continue

    max_row = df.loc[df['mean_test_score'].idxmax()]
    print(f'data{i}.csv', 'n_clusters=',max_row.param_n_clusters, 'Treshold=', max_row.param_treshold, 'Split0 f1=',max_row.split0_test_score, 'Split1 f1=',max_row.split1_test_score, "Mean f1=",max_row.mean_test_score)


In [None]:
# Demo: iterrows() yields the index *labels* (3, 4, 5 here), not 0-based
# positions.
df = pd.DataFrame(
    [[1, 1.5], [3, 4], [5, 6]],
    columns=['int', 'float'],
    index=range(3, 6),
)

for idx, _row in df.iterrows():
    print(idx)


In [None]:
# Positional access: iloc[0] is the first row whatever its index label is.
print(df.iloc[0])

In [None]:
# Count of each distinct value in the `label` column of the test frame.
df_test['label'].value_counts()

In [None]:
# Demo: when dicts are merged, the value merged in later wins, so the "a"
# taken from `m` replaces the initial "a": 1.
m = dict(a=5, b=1, c=5)
k = dict(a=2)

j = dict(a=1)
j.update(m)
print(j)


In [None]:
# Reference example: GridSearchCV over an SVC on the iris data, used to look
# at the keys that cv_results_ exposes.
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':np.arange(1,10,0.01)}
svc = svm.SVC()
# Stored as `iris_search` rather than `clf`, so the anomaly-detection grid
# search that other cells read through `clf` is not overwritten.
iris_search = GridSearchCV(svc, parameters, n_jobs=-1)
iris_search.fit(iris.data, iris.target)
sorted(iris_search.cv_results_.keys())

In [None]:
# Reference example: run scikit-learn's estimator API checks on a built-in
# estimator.
from sklearn.utils.estimator_checks import check_estimator
from sklearn.svm import LinearSVC
check_estimator(LinearSVC())  # passes

In [None]:
import multiprocessing as mp


In [None]:
def perform_grid_search(params):
    """Run one DBscan grid search for a dataset and append its results to CSV.

    Args:
        params: dict with the keys ``"i"`` (dataset index used in the
            ``ads-{i}.csv`` file names), ``"eps"``, ``"db_treshold"`` and
            ``"min_samples"`` (the single DBscan setting to evaluate).

    Side effects:
        Prints the parameter grid and the hold-out metrics, and appends every
        ``cv_results_`` row to ``results_1/DBscan-precision.csv``. The header
        row is written only when that file is missing or empty.

    NOTE(review): when several worker processes run this concurrently, the
    "is the file empty" check and the append are not synchronized, so headers
    or rows from different workers may interleave.
    """
    i = params["i"]
    results_path = "results_1/DBscan-precision.csv"

    df_validation = pd.read_csv(f"data/validation/ads-{i}.csv")
    X_validation = df_validation[["timestamp", "ftr_vector"]]
    y_validation = df_validation[["label"]]

    # The hold-out split must be sliced from the test file; it was previously
    # sliced from the validation frame, so the reported metrics were computed
    # on the data used for model selection.
    df_test = pd.read_csv(f"data/test/ads-{i}.csv")
    X_test = df_test[["timestamp", "ftr_vector"]]
    y_test = df_test[["label"]]

    # Every parameter has exactly one value: the DBscan ones come from
    # `params`, the others are fixed placeholders for the unused algorithms.
    test_params = {
        # Kmeans
        "n_clusters": [2],
        "treshold": [0.3],
        # DBscan
        "eps": [params["eps"]],
        "db_treshold": [params["db_treshold"]],
        "min_samples": [params["min_samples"]],
        # IsolationForest
        "max_samples": [10],
        "max_features": [1],
        "contamination": [0.01],
        # GAN
        "N_latent": [3],
        "K": [8],
        "len_window": [500],
        # Border Check
        "UL": [0.5],
        "LL": [-0.5],
        "alg": ["DBscan"],
        "train_data": [f"data/train/ads-{i}.csv"],
    }

    print(test_params)

    estimator = Estimator()

    clf = GridSearchCV(
        estimator,
        param_grid=test_params,
        scoring="precision",
        # Time-ordered folds: each split trains on earlier rows only.
        cv=TimeSeriesSplit(n_splits=2),
    )

    selected = clf.fit(X_validation, y_validation)

    best_estimator = clf.best_estimator_
    y_pred = best_estimator.predict(X_test)

    comp_metrics = compute_metrics(y_test, y_pred)
    print("Metrics ", comp_metrics)

    # cv_results_ maps column name -> per-candidate values; transpose it into
    # one CSV row per candidate.
    transposed_data = zip(*[selected.cv_results_[key] for key in selected.cv_results_])

    # Make sure the output directory exists before appending.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    is_empty = (
        not os.path.exists(results_path)
        or os.path.getsize(results_path) == 0
    )

    with open(results_path, "a", newline="") as f:
        writer = csv.writer(f)

        # Write headers only if the file is empty
        if is_empty:
            writer.writerow(selected.cv_results_.keys())

        # Write data rows
        writer.writerows(transposed_data)

      
        

In [None]:

# Build dbscan_params_list dynamically: one dict per
# (eps, db_treshold, min_samples, dataset index) combination.
dbscan_params_list = [
    {"eps": eps, "db_treshold": db_treshold, "min_samples": min_samples, "i": i}
    for eps in np.arange(0.1, 0.2, 0.1)
    for db_treshold in np.arange(0.05, 0.1, 0.1)
    for min_samples in np.arange(50, 60, 50)
    for  i in np.arange(1,5,1)
]

# Number of grid searches that run in parallel.
batch_size = 2

for i in range(0, len(dbscan_params_list), batch_size):
    batch_params = dbscan_params_list[i:i+batch_size]
    processes = []
    for params in batch_params:
        p = mp.Process(target=perform_grid_search, args=(params,))
        processes.append(p)
        p.start()

    # Wait for all processes in the current batch to finish
    for params, p in zip(batch_params, processes):
        p.join()
        # A worker that crashed would otherwise go unnoticed and silently
        # leave a gap in the results file.
        if p.exitcode != 0:
            print(f"Grid search failed (exit code {p.exitcode}) for {params}")

    # Pause between batches; nothing follows the last one, so skip it there.
    if i + batch_size < len(dbscan_params_list):
        time.sleep(15)

