In [None]:
import pandas as pd
import sklearn.metrics
import matplotlib.pyplot as plt
import itertools
import numpy as np
from typing import Any, Dict, List, Tuple, Union

from src.Kmeans import Kmeans
from src.DBscan import DBscan

In [None]:
def compute_metrics(y_predicted, y_true):
    """Score predicted labels against the ground-truth labels.

    Parameters
    ----------
    y_predicted : array-like
        Labels produced by the detector.
    y_true : array-like
        Ground-truth labels, aligned element-wise with ``y_predicted``.

    Returns
    -------
    list
        ``[confusion_matrix, precision, recall, f1]`` in that order. A list
        (not a tuple) is returned because callers append to it.
    """
    # scikit-learn metric functions expect the ground truth first and the
    # predictions second. Passing them the other way round transposes the
    # confusion matrix and swaps precision with recall.
    # No `pos_label` is passed, so scikit-learn's default positive label is used.
    confusion_matrix = sklearn.metrics.confusion_matrix(y_true, y_predicted)
    precision = sklearn.metrics.precision_score(y_true, y_predicted)
    recall = sklearn.metrics.recall_score(y_true, y_predicted)
    f1 = sklearn.metrics.f1_score(y_true, y_predicted)

    return [confusion_matrix, precision, recall, f1]

In [None]:
# Hyper-parameter grids, one per detector. Every combination of the listed
# values is evaluated on every data file.
param_grid_Kmeans = {
    "n_clusters": np.arange(1, 2),
    "treshold": np.arange(1, 2),
}

param_grid_DBscan = {
    "eps": np.arange(0.5, 1.5, 0.5),
    "treshold": np.arange(0.5, 2, 0.5),
    "min_samples": np.arange(1, 3, 1),
}

# `algorithms_params[k]` is the grid for the detector named `algorithms[k]`.
algorithms_params = [param_grid_Kmeans, param_grid_DBscan]
algorithms = ["Kmeans", "DBscan"]

# Explicit name -> class lookup, so the detector class does not have to be
# fished out of the notebook's global namespace.
algorithm_classes = {"Kmeans": Kmeans, "DBscan": DBscan}

# metrics[alg_id, file_number - 1] holds the best-scoring run for that pair as
# [confusion_matrix, precision, recall, f1, params].
# NOTE(review): the second axis has 9 slots but only files 1..8 are read below,
# so the last slot stays empty - confirm whether a ninth file is expected.
metrics = np.empty([2, 9, 5], dtype=object)

for i in range(1, 9):

    df_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/data{i}.csv")

    df_original = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data{i}.csv")

    # Relabel in one vectorised step: errorRate == 0.0 -> 1, anything else -> -1.
    # Assigning the whole column avoids chained `df[col].iloc[idx] = ...`
    # writes, which pandas does not guarantee to propagate to the frame.
    df_original["errorRate"] = np.where(df_original["errorRate"] == 0.0, 1, -1).astype(int)

    for alg_id, alg in enumerate(algorithms_params):
        score = []
        for params in itertools.product(*alg.values()):

            # Pair each grid key with the value chosen for this combination.
            inner_dict = dict(zip(alg.keys(), params))

            conf = {
                "filtering": "None",
                "train_data": f"data/sensor-cleaning-data/cleaned/train/data{i}.csv",
                "input_vector_size": 1,
                "warning_stages": [0.7, 0.9],
                **inner_dict,
                "output": ["FileOutput()"],
                "output_conf": [
                    {
                        "file_path": f"{algorithms[alg_id]}/sensor-cleaning-data/data{i}.csv",
                        "file_name": f"{algorithms[alg_id]}/sensor-cleaning-data/data{i}.csv",
                        "mode": "w",
                    }
                ],
            }

            # A fresh detector per parameter combination.
            detector = algorithm_classes[algorithms[alg_id]](conf)

            # mask[k] is True when test row k received a prediction;
            # y_predicted holds those predictions in row order.
            mask = []
            y_predicted = []

            # Walk the rows by position so `.iloc` is always given a position.
            for pos in range(len(df_test)):

                status_code = detector.message_insert(
                    {
                        "timestamp": df_test["timestamp"].iloc[pos],
                        "ftr_vector": [df_test["ftr_vector"].iloc[pos]],
                    }
                )

                if status_code == 1:
                    mask.append(True)
                    y_predicted.append(1)
                elif status_code == -1:
                    mask.append(True)
                    y_predicted.append(-1)
                elif status_code == 2:
                    # Status 2 yields no prediction; the row is left out of scoring.
                    mask.append(False)
                else:
                    # Any other code would leave the mask shorter than the data
                    # and fail later with an opaque length error, so fail here.
                    raise ValueError(
                        f"Unexpected status code {status_code!r} at row {pos} of data{i}.csv"
                    )

            # Select into a local so `df_original` stays full-length for the
            # next parameter combination and the next algorithm.
            y_true = df_original.loc[np.asarray(mask, dtype=bool), "errorRate"].tolist()

            m = compute_metrics(y_predicted, y_true)
            m.append(params)
            score.append(m)

        # Keep the combination with the highest f1 (position 3 of each entry).
        max_list = max(score, key=lambda x: x[3])

        # Store field by field so the 2-D confusion matrix and the params tuple
        # are kept as single objects instead of being broadcast.
        for field_id, value in enumerate(max_list):
            metrics[alg_id, i - 1, field_id] = value

        print(metrics)

In [None]:
# Show the best run per data file for the DBscan detector; its position in
# `algorithms` is the index used on the first axis of `metrics`.
print(metrics[algorithms.index("DBscan")])

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances


class TemplateEstimator(BaseEstimator):
    """scikit-learn style wrapper around the project's ``Kmeans`` detector.

    ``fit`` builds a ``Kmeans`` detector from a configuration dictionary and
    ``predict`` streams the rows of a test CSV through that detector,
    collecting the status codes it returns.

    Parameters
    ----------
    demo_param : dict or None, default=None
        Run settings. The keys read by this class are:

        - ``"param_grid_Kmeans"``: dict merged into the detector configuration
          (for example ``{"n_clusters": 2, "treshold": 2}``).
        - ``"trainning_data"``: path of the training CSV named in the detector
          configuration.
        - ``"testing_data"``: path of the CSV streamed through the detector in
          ``predict``.

        Missing keys, or ``None``, fall back to the ``data1`` files and to no
        extra detector settings.
        NOTE(review): whether ``Kmeans`` accepts a configuration without
        ``n_clusters``/``treshold`` is not visible from here - confirm.

    Attributes
    ----------
    detector_ : Kmeans
        Detector created by ``fit``.
    is_fitted_ : bool
        Set to ``True`` once ``fit`` has completed.
    """

    # Fallback locations, used when `demo_param` does not name the files.
    _DEFAULT_TRAIN_DATA = "data/sensor-cleaning-data/cleaned/train/data1.csv"
    _DEFAULT_TEST_DATA = "data/sensor-cleaning-data/cleaned/test/data1.csv"

    def __init__(self, demo_param=None):
        self.demo_param = demo_param

    def fit(self, X, y):
        """Validate the inputs and build the underlying detector.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Training samples. They are validated only; the detector is given
            the path of the training CSV through its configuration.
        y : array-like, shape (n_samples,)
            Target values. They are validated only.

        Returns
        -------
        self : object
            Returns self.
        """
        # Reject malformed input before any detector is constructed.
        X, y = check_X_y(X, y, accept_sparse=True)

        settings = self.demo_param or {}
        inner_dict = settings.get("param_grid_Kmeans", {})
        train_path = settings.get("trainning_data", self._DEFAULT_TRAIN_DATA)

        # The output file is named after the training file, so it no longer
        # depends on a loop variable left behind by another notebook cell.
        train_file_name = train_path.rsplit("/", 1)[-1]
        output_path = f"Kmeans/sensor-cleaning-data/{train_file_name}"

        conf = {
            "filtering": "None",
            "train_data": train_path,
            "input_vector_size": 1,
            "warning_stages": [0.7, 0.9],
            **inner_dict,
            "output": ["FileOutput()"],
            "output_conf": [
                {
                    "file_path": output_path,
                    "file_name": output_path,
                    "mode": "w",
                }
            ],
        }

        self.detector_ = Kmeans(conf)
        self.is_fitted_ = True
        # `fit` should always return `self`
        return self

    def predict(self, X):
        """Stream the test CSV through the detector and collect its labels.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Validated for shape and type only. The rows fed to the detector
            come from the test CSV, because each message also needs the
            ``timestamp`` column, which ``X`` does not carry.

        Returns
        -------
        y : ndarray of int64
            One entry per test row for which the detector returned ``1`` or
            ``-1``, in row order. Rows with any other status code are skipped,
            so the result can be shorter than the test file.
        """
        # Fail with scikit-learn's not-fitted error before touching `detector_`.
        check_is_fitted(self, "is_fitted_")
        X = check_array(X, accept_sparse=True)

        settings = self.demo_param or {}
        df_test = pd.read_csv(settings.get("testing_data", self._DEFAULT_TEST_DATA))

        y_predicted = []

        # Walk the rows by position so `.iloc` is always given a position.
        for pos in range(len(df_test)):

            status_code = self.detector_.message_insert(
                {
                    "timestamp": df_test["timestamp"].iloc[pos],
                    "ftr_vector": [df_test["ftr_vector"].iloc[pos]],
                }
            )

            if status_code == 1:
                y_predicted.append(1)
            elif status_code == -1:
                y_predicted.append(-1)

        return np.array(y_predicted, dtype=np.int64)
    

In [None]:
# One fixed Kmeans setting for a single run on data file 1.
param_grid_Kmeans = {
    "n_clusters": 2,
    "treshold": 2,
}

df_test = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/data1.csv")

df_original = pd.read_csv(f"data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv")

# Relabel in one vectorised step: errorRate == 0.0 -> 1, anything else -> -1.
# Assigning the whole column avoids chained `df[col].iloc[idx] = ...` writes,
# which pandas does not guarantee to propagate to the frame.
df_original["errorRate"] = np.where(df_original["errorRate"] == 0.0, 1, -1).astype(int)


df_train = pd.read_csv("data/sensor-cleaning-data/cleaned/train/data1.csv")
# One feature per sample, shaped (n_samples, 1).
X = np.array(df_train['ftr_vector']).reshape(-1, 1)


# Settings handed to the estimator through its single constructor argument.
params = {
    "trainning_data": "data/sensor-cleaning-data/cleaned/train/data1.csv",
    "testing_data": "data/sensor-cleaning-data/cleaned/test/data1.csv",
    "original_data": "data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv",
    "param_grid_Kmeans": param_grid_Kmeans,
}

temp = TemplateEstimator(demo_param=params)

# The features are reused as a placeholder target.
# NOTE(review): confirm whether real labels should be passed to `fit` here.
y = X
temp.fit(X,y)
temp.predict(X)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each Kmeans setting.
param_grid_Kmeans = dict(
    n_clusters=np.arange(1, 4, 1),
    treshold=np.arange(1, 4, 1),
)

# The search object is only constructed here; it is not fitted in this cell.
# NOTE(review): TemplateEstimator.__init__ declares only `demo_param`, so these
# grid keys do not look like parameters the search could set on it - confirm
# before calling `clf.fit`.
clf = GridSearchCV(
    estimator=TemplateEstimator(demo_param=params),
    param_grid=param_grid_Kmeans,
)

In [None]:
from sklearn.utils.estimator_checks import check_estimator
from sklearn.svm import LinearSVC

# Settings for the estimator under test. `param_grid_Kmeans` is whatever the
# notebook last bound to that name.
params = dict(
    trainning_data="data/sensor-cleaning-data/cleaned/train/data1.csv",
    testing_data="data/sensor-cleaning-data/cleaned/test/data1.csv",
    original_data="data/sensor-cleaning-data/cleaned/test/reformatted/data1.csv",
    param_grid_Kmeans=param_grid_Kmeans,
)


# Run scikit-learn's estimator conformance checks against the wrapper.
# NOTE(review): this call was originally annotated as passing; that has not
# been re-verified.
check_estimator(TemplateEstimator(demo_param=params))

In [None]:
# Synthetic demo input: the integers 0..99 as a single feature column, with
# an all-zero target of matching length.
X = np.arange(100).reshape(-1, 1)
y = np.zeros(100)

# The estimator is built with its default `demo_param` (None); `fit` returns
# the estimator itself, so construction and fitting are chained.
estimator = TemplateEstimator().fit(X, y)

# Plot whatever `predict` returns for the demo input.
plt.plot(estimator.predict(X))
plt.show()

In [112]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV


# Reference example: grid search over an SVM classifier on the iris data set.
iris = datasets.load_iris()
parameters = dict(kernel=("linear", "rbf"), C=[1, 10])
svc = svm.SVC()

# `fit` returns the fitted search object, so it is bound to `clf` directly.
clf = GridSearchCV(svc, parameters).fit(iris.data, iris.target)

# Names of the per-candidate result columns; the value is not used further.
sorted(clf.cv_results_.keys())

# Parameter combination selected by the search.
optimal_params = clf.best_params_

print("Optimal parameters:", optimal_params)

Optimal parameters: {'C': 1, 'kernel': 'linear'}
