https://github.com/Eukla/ETS/tree/master/ets/algorithms

https://github.com/JakubBilski/CALIMERA

# ECTS

In [None]:
import numpy as np
import pandas as pd
from multiprocessing import Pool
from scipy.spatial import distance
from sklearn.neighbors import NearestNeighbors
from typing import Tuple, List, Sequence, Dict, Optional
import multiprocessing as mp

class ECTS():
    """Early Classification on Time Series (ECTS).

    For every training series the model learns a minimum prediction length
    (MPL): the shortest prefix from which its nearest-neighbour structure is
    considered stable.  At prediction time a test series is classified as
    soon as its 1-nearest training neighbour on the current prefix has an MPL
    that is not larger than that prefix.

    NOTE(review): neighbour positions returned by scikit-learn are used
    directly as keys next to ``DataFrame`` index labels, so the code appears
    to assume the training frame has a default ``RangeIndex`` -- confirm.
    NOTE(review): ``train`` slices prefixes by the *position* of a timestamp
    in ``timestamps`` while ``predict`` slices by its *value*; the two agree
    only when ``timestamps`` is ``0, 1, 2, ...`` -- confirm with callers.
    """

    def __init__(self, timestamps, support: float):
        """
        Create an ECTS instance.

        :param timestamps: sequence of time stamps at which early predictions
            may be made
        :param support: minimum support threshold a cluster must reach before
            it is allowed to lower the MPL of its members
        """
        self.rnn: Dict[int, Dict[int, List]] = dict()
        self.nn: Dict[int, Dict[int, List]] = dict()
        self.data: Optional[pd.DataFrame] = None
        self.labels: Optional[pd.Series] = None
        self.mpl: Dict[int, Optional[int]] = dict()
        self.timestamps = timestamps
        self.support = support
        self.clusters: Dict[int, List[int]] = dict()
        self.occur: Dict[int, int] = dict()
        self.correct: Optional[List[Optional[int]]] = None

    def train(self, train_data: pd.DataFrame, labels: Sequence[int]) -> None:
        """
        Train the model.

        :param train_data: training set, one series per row
        :param labels: class label of every training series; a plain sequence
            is wrapped in a ``pd.Series`` aligned with ``train_data.index``
        """
        self.data = train_data
        if not isinstance(labels, pd.Series):
            # The rest of the class relies on Series methods (value_counts,
            # .loc), so accept plain sequences by wrapping them.
            labels = pd.Series(labels, index=train_data.index)
        self.labels = labels

        # Start from a clean state so that a second call to train() is not
        # polluted by clusters / counts of a previous training set.
        self.rnn = {}
        self.nn = {}
        self.mpl = {}
        self.clusters = {}
        self.occur = {}

        for label, count in self.labels.value_counts().items():
            self.occur[label] = count

        # NN / RNN sets of every series for every prefix.
        for time_pos, e in enumerate(self.timestamps):
            self.nn[e], self.rnn[e] = self.__nn_non_cluster(time_pos)

        # 1-NN MPL: walk from the longest prefix to the shortest and keep
        # lowering the MPL while the RNN set of the series does not change.
        previous_rnn = {}
        finished = set()
        for e in reversed(self.timestamps):
            for index in self.data.index:
                current = self.rnn[e][index]
                current.sort()
                if index not in previous_rnn:
                    # Longest prefix: initial MPL is the full length.
                    self.mpl[index] = e
                elif index not in finished:
                    if current == previous_rnn[index]:
                        self.mpl[index] = e
                    else:
                        finished.add(index)
                previous_rnn[index] = current

        self.__mpl_clustering()

    def __nn_non_cluster(self, prefix: int):
        """Find the NN and RNN sets of all training series for one prefix.

        :param prefix: last column position (inclusive) of the prefix
        :return: tuple ``(nn, rnn)`` of dictionaries mapping a series to the
            list of its nearest neighbours / reverse nearest neighbours
        """
        prefix_data = self.data.iloc[:, 0:prefix + 1]
        neigh = NearestNeighbors(n_neighbors=2, metric='euclidean').fit(prefix_data)
        # One batch query instead of one query per row.
        _, neighbour_positions = neigh.kneighbors(prefix_data)

        nn = {index: [] for index in self.data.index}
        rnn = {index: [] for index in self.data.index}
        for index, neighbours in zip(self.data.index, neighbour_positions):
            for item in neighbours:
                if item != index:  # skip the series itself
                    nn[index].append(item)
                    rnn.setdefault(item, []).append(index)
        return nn, rnn

    def __cluster_distance(self, cluster_a: Sequence[int], cluster_b: Sequence[int]):
        """
        Single-linkage distance between two clusters: the minimum Euclidean
        distance over all pairs of members.

        :param cluster_a: first cluster (row positions in the training data)
        :param cluster_b: second cluster (row positions in the training data)
        :return: the distance, ``inf`` if either cluster is empty
        """
        rows_b = [self.data.iloc[j] for j in cluster_b]
        min_distance = float("inf")
        for i in cluster_a:
            row_a = self.data.iloc[i]
            for row_b in rows_b:
                d = distance.euclidean(row_a, row_b)
                if min_distance > d:
                    min_distance = d

        return min_distance

    def nn_cluster(self, cl_key: int, cluster_index: Sequence[int]):
        """
        Find the nearest other cluster using the single-linkage distance.

        :param cl_key: key of the cluster in ``self.clusters``
        :param cluster_index: members of that cluster
        :return: list holding the key of the nearest cluster (the last one
            visited wins on ties), or an empty list if there is no other
            cluster
        """
        dist = float("inf")
        candidate = []

        for key, value in self.clusters.items():
            if cl_key == key:
                continue
            temp = self.__cluster_distance(cluster_index, value)
            if dist >= temp:
                dist = temp
                candidate = [key]
        return candidate

    def __rnn_cluster(self, e: int, cluster: List[int]):
        """
        Compute the RNN set of a cluster for the given prefix.

        :param e: time stamp (prefix) for which the RNN set is computed
        :param cluster: members of the cluster
        :return: set of series outside the cluster that have a cluster member
            as nearest neighbour
        """
        rnn = set()
        for item in cluster:
            # set.union() returns a new set and leaves ``rnn`` untouched;
            # update() is required to actually accumulate the members.
            rnn.update(self.rnn[e][item])
        return {item for item in rnn if item not in cluster}

    def __mpl_calculation(self, cluster: List[int]):
        """
        Compute the MPL of a cluster and lower the MPL of its members to it.

        :param cluster: members of the (label-pure) cluster
        """
        label = self.labels[cluster[0]]
        if self.support > len(cluster) / self.occur[label]:
            return  # cluster too small to be trusted

        full_length = self.timestamps[-1]
        mpl_rnn = full_length
        mpl_nn = full_length

        # RNN consistency: walk towards shorter prefixes while the RNN set of
        # the longer prefix is still contained in the one of the shorter.
        curr_rnn = self.__rnn_cluster(full_length, cluster)
        for e in reversed(self.timestamps):
            temp = self.__rnn_cluster(e, cluster)
            if curr_rnn - temp:
                break
            mpl_rnn = e
            curr_rnn = temp

        # 1-NN consistency: every member's nearest neighbours must stay
        # inside the cluster.
        members = set(cluster)
        for e in reversed(self.timestamps):
            if any(neighbour not in members
                   for series in cluster
                   for neighbour in self.nn[e][series]):
                break
            mpl_nn = e

        pos = max(mpl_rnn, mpl_nn)
        for series in cluster:
            if self.mpl[series] > pos:
                self.mpl[series] = pos

    def __mpl_clustering(self):
        """Run the agglomerative clustering that refines the MPLs.

        Clusters that are mutually nearest to each other are merged; whenever
        a merged cluster contains a single class its MPL is computed.
        """
        n = self.data.shape[0]
        redirect = {}

        # Every series starts as its own cluster.
        for index in self.data.index:
            self.clusters[index] = [index]
            redirect[index] = index

        max_iterations = n * n  # safety cap on the number of rounds
        iter_count = 0

        while n > 1:
            iter_count += 1
            if iter_count > max_iterations:
                break

            # Nearest cluster of every cluster, computed once per round.
            closest = {
                key: self.nn_cluster(key, cluster)
                for key, cluster in self.clusters.items()
            }

            merged = False
            for key, candidates in closest.items():
                for item in candidates:
                    if key not in closest.get(item, []):
                        continue  # not mutually nearest
                    if redirect[item] == redirect[key]:
                        continue  # already merged earlier in this round

                    target = redirect[key]
                    source = redirect[item]
                    self.clusters[target] += self.clusters[source]
                    del self.clusters[source]
                    n -= 1
                    redirect[item] = target

                    members = self.clusters[target]
                    if len({self.labels.loc[idx] for idx in members}) == 1:
                        self.__mpl_calculation(members)

                    # Everything that pointed at the absorbed cluster now
                    # points at the surviving one.
                    for k in redirect:
                        if redirect[k] == item:
                            redirect[k] = target

                    merged = True

            # No merge in this round: nothing will change any more.
            if not merged:
                break

    def predict(self, test_data: pd.DataFrame) -> List[Tuple[int, int]]:
        """Prediction phase.

        :param test_data: test set, one series per row
        :return: one ``(timestamp, label)`` tuple per test series; if no
            prefix yields a prediction the tuple is ``(last timestamp, 0)``
        """
        predictions = []
        fitted = {}  # 1-NN index per timestamp, built once and reused
        for _, test_row in test_data.iterrows():
            decided = False
            for e in self.timestamps:
                neigh = fitted.get(e)
                if neigh is None:
                    neigh = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(
                        self.data.iloc[:, 0:e + 1])
                    fitted[e] = neigh
                neighbors = neigh.kneighbors([test_row.iloc[0:e + 1]])

                # Neighbours whose MPL is already reached at this prefix.
                candidates = [
                    (self.mpl[row[0]], self.labels[row[0]])
                    for row in neighbors[1]
                    if e >= self.mpl[row[0]]
                ]
                if not candidates:
                    continue

                if len(candidates) == 1:
                    label = candidates[0][1]
                else:
                    # Majority label among the candidates with the lowest MPL.
                    best_mpl = min(mpl for mpl, _ in candidates)
                    tied = [lab for mpl, lab in candidates if mpl == best_mpl]
                    label = max(set(tied), key=tied.count)
                predictions.append((e, label))
                decided = True
                break

            if not decided:
                predictions.append((self.timestamps[-1], 0))
        return predictions

# TEASER

In [1]:
from sktime.classification.early_classification import TEASER

KeyboardInterrupt: 

# CALIMERA

In [None]:
import numpy as np
from sktime.transformations.panel.rocket import MiniRocketMultivariate
from sklearn.linear_model import RidgeClassifierCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.calibration import CalibratedClassifierCV


class CALIMERA:
    """Early time-series classifier built from per-timestamp classifiers and
    a learned stopping module.

    For a grid of prefix lengths ("timestamps") a feature extractor and a
    ``WeakClassifier`` are trained; a ``StoppingModule`` then decides, prefix
    by prefix, whether to emit the current prediction or to wait.
    """

    def __init__(self, delay_penalty):
        """
        :param delay_penalty: weight of the delay term passed to the stopping
            module as ``alpha``
        """
        self.delay_penalty = delay_penalty

    @staticmethod
    def _generate_timestamps(max_timestamp):
        """Return the increasing int32 array of prefix lengths to evaluate.

        The step is ``max_timestamp // min(19, max_timestamp)``, the first
        prefix is at least 2 long and the last entry always equals
        ``max_timestamp``.

        :param max_timestamp: full length of the series (must be >= 1)
        :raises ValueError: if ``max_timestamp`` is smaller than 1
        """
        if max_timestamp < 1:
            raise ValueError("max_timestamp must be at least 1")
        NUM_TIMESTAMPS = 20
        num_intervals_between_timestamps = min(NUM_TIMESTAMPS - 1, max_timestamp)
        step = max_timestamp // num_intervals_between_timestamps
        timestamps = np.arange(max(2, step), max_timestamp + step, step).astype(np.int32)
        if timestamps.size == 0:
            # Series of length 1: the only possible prefix is the full series.
            return np.array([max_timestamp], dtype=np.int32)
        timestamps[-1] = max_timestamp
        return timestamps

    @staticmethod
    def _learn_feature_extractors(X, timestamps):
        """Build one feature-extraction callable per timestamp.

        Prefixes shorter than 9 points are simply flattened; longer prefixes
        get a ``MiniRocketMultivariate`` transformer fitted on that prefix.

        :param X: training array indexed as ``X[:, :, :timestamp]``
        :param timestamps: prefix lengths
        :return: list of callables, aligned with ``timestamps``
        """
        extractors = []
        for timestamp in timestamps:
            if timestamp < 9:
                extractors.append(lambda x: x.reshape(x.shape[0], -1))
            else:
                X_sub = X[:, :, :timestamp]
                extractors.append(MiniRocketMultivariate().fit(X_sub).transform)
        return extractors

    @staticmethod
    def _get_features(X, feature_extractors, timestamps):
        """Apply every extractor to its prefix of ``X``.

        :return: list of 2-D arrays ``(n_samples, n_features)``, one per
            timestamp
        """
        features = []
        for extractor, timestamp in zip(feature_extractors, timestamps):
            feature = np.asarray(extractor(X[:, :, :timestamp]))
            features.append(feature.reshape(feature.shape[0], -1))
        return features

    @staticmethod
    def _learn_classifiers(features, ys, timestamps):
        """Fit one ``WeakClassifier`` per timestamp on the matching features."""
        classifiers = []
        for t in range(timestamps.shape[0]):
            classifier = WeakClassifier()
            classifier.fit(features[t], ys)
            classifiers.append(classifier)
        return classifiers

    @staticmethod
    def _generate_data_for_training_stopping_module(classifiers):
        """Collect, per classifier, the stopping-module inputs it stored
        during ``fit``.

        :return: tuple ``(predictors, costs)`` of arrays whose first axis
            runs over the classifiers
        """
        predictors = []
        costs = []
        for classifier in classifiers:
            costs.append(classifier.costs_for_training_stopping_module)
            predictors.append([
                _scores_to_predictors(s)
                for s in classifier.predictors_for_training_stopping_module
            ])
        return np.asarray(predictors), np.asarray(costs)

    def fit(self, X_train, labels):
        """Train feature extractors, classifiers and the stopping module.

        :param X_train: training array whose last axis is time
        :param labels: class label of every training series
        """
        timestamps = CALIMERA._generate_timestamps(max_timestamp=X_train.shape[-1])
        self.feature_extractors = CALIMERA._learn_feature_extractors(X_train, timestamps)
        features_train = CALIMERA._get_features(X_train, self.feature_extractors, timestamps)
        self.classifiers = CALIMERA._learn_classifiers(features_train, labels, timestamps)
        predictors, costs = CALIMERA._generate_data_for_training_stopping_module(self.classifiers)
        self.stopping_module = StoppingModule()
        self.stopping_module.fit(
            predictors,
            costs,
            timestamps,
            self.delay_penalty,
            KernelRidgeRegressionWrapper
        )
        self.timestamps = timestamps

    def test(self, X):
        """Classify every series in ``X`` as early as the stopping module allows.

        :param X: array indexed as ``X[j, :, :timestamp]``
        :return: tuple ``(stop_timestamps, predicted_y)`` of lists, one entry
            per series
        """
        last_t = self.timestamps.shape[0] - 1
        stop_timestamps = []
        predicted_y = []
        for j in range(X.shape[0]):
            for t, timestamp in enumerate(self.timestamps):
                X_sub = X[j, :, :timestamp]
                X_sub = X_sub.reshape(1, -1, X_sub.shape[-1])
                features = np.asarray(self.feature_extractors[t](X_sub)).reshape(1, -1)
                scores = self.classifiers[t].get_scores(features)[0]
                predictors = _scores_to_predictors(scores)
                # The last timestamp always stops: there is nothing to wait for.
                should_stop = (
                    t == last_t
                    or self.stopping_module.should_stop(predictors, t)
                )
                if should_stop:
                    predicted_label = self.classifiers[t].predict(features)[0]
                    stop_timestamps.append(timestamp)
                    predicted_y.append(predicted_label)
                    break
        return stop_timestamps, predicted_y


class KernelRidgeRegressionWrapper:
    """Thin wrapper around ``KernelRidge`` exposing ``fit`` / ``predict``.

    Constructed without arguments it uses an RBF kernel with the remaining
    ``KernelRidge`` defaults.
    """

    def __init__(self, kernel="rbf", **kernel_ridge_kwargs):
        """
        :param kernel: kernel name forwarded to ``KernelRidge``
        :param kernel_ridge_kwargs: any further ``KernelRidge`` keyword
            arguments (for example ``alpha`` or ``gamma``)
        """
        self.model = KernelRidge(kernel=kernel, **kernel_ridge_kwargs)

    def fit(self, X, y):
        """Fit the wrapped regressor and return ``self`` for chaining."""
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        return self.model.predict(X)


class WeakClassifier:
    """Ridge classifier for one prefix length.

    Besides the fitted classifier, ``fit`` stores the data the stopping
    module is trained on (per-sample scores and calibrated costs).
    """

    def normalize_X(self, X):
        """Centre ``X`` with the stored feature means and divide by the
        stored feature norms."""
        centered = X - self.feature_means
        return centered / self.feature_norms

    def fit(self, X, y):
        """Fit the classifier and prepare the stopping-module training data.

        :param X: feature matrix
        :param y: class labels; ``y.shape[0]`` is read, so an array is
            expected
        """
        RC_ALPHAS = np.logspace(-3, 3, 10)

        # Normalisation statistics; zero norms are replaced by 1 to avoid
        # dividing by zero.
        self.feature_means = np.mean(X, axis=0)
        self.feature_norms = np.linalg.norm(X, axis=0)
        self.feature_norms[self.feature_norms == 0] = 1.0

        ridge = RidgeClassifierCV(
            alphas=RC_ALPHAS, store_cv_values=True, scoring='accuracy')
        ridge.fit(self.normalize_X(X), y)

        # Per-sample cross-validation values of the selected alpha, shifted
        # by the intercept.
        best_alpha_index = np.where(RC_ALPHAS == ridge.alpha_)
        loo_scores = ridge.cv_values_[:, :, best_alpha_index]
        n_rows, n_cols = loo_scores.shape[0], loo_scores.shape[1]
        loo_scores = loo_scores.reshape((n_rows, n_cols))
        loo_scores = loo_scores + ridge.intercept_

        # transform to uncalibrated probabilities
        exp_scores = np.exp(loo_scores)
        if len(ridge.classes_) == 2:
            denominator = exp_scores + np.exp(-loo_scores)
        else:
            denominator = np.sum(exp_scores, axis=1)[:, None]
        uncalibrated_probab = exp_scores / denominator

        # Workaround: hand the validation scores to an sklearn calibrator
        # through a stand-in "prefit" classifier; the dummy input is ignored
        # by the stand-in.
        stand_in = MockupClassifierForPassingValidationDataToSklearnCalibrator(
            uncalibrated_probab, ridge.classes_
        )
        calibrator = CalibratedClassifierCV(stand_in, method="sigmoid", cv="prefit")
        dummy_X = np.zeros((y.shape[0], 1))
        calibrator.fit(dummy_X, y)

        # generate data for stopping module training
        calibrated_probab = calibrator.predict_proba(dummy_X)
        self.costs_for_training_stopping_module = 1.0 - np.max(calibrated_probab, axis=1)
        self.predictors_for_training_stopping_module = loo_scores

        # actual classification can be performed with the uncalibrated clf
        self.clf = ridge

    def predict(self, X):
        """Predict class labels for ``X`` (normalised internally)."""
        normalized = self.normalize_X(X)
        return self.clf.predict(normalized)

    def get_scores(self, X):
        """Return the decision-function scores for ``X`` as a 2-D array."""
        normalized = self.normalize_X(X)
        return np.atleast_2d(self.clf.decision_function(normalized))

    def get_labels(self):
        """Return the class labels known to the fitted classifier."""
        return self.clf.classes_


class MockupClassifierForPassingValidationDataToSklearnCalibrator:
    """Stand-in "already fitted" classifier that replays precomputed scores.

    It exists only so that a calibrator can be trained on scores computed
    elsewhere: ``decision_function`` ignores its input and returns the scores
    given at construction time.
    """

    def __init__(self, mockup_scores, classes):
        """
        :param mockup_scores: scores returned by ``decision_function``
        :param classes: class labels exposed as ``classes_``
        """
        self.mockup_scores = mockup_scores
        self.classes_ = classes
        self._estimator_type = "classifier"

    def fit(self, X=None, y=None):
        """No-op; accepts the usual ``fit(X, y)`` call shape and returns
        ``self`` so the object can be used wherever an estimator's ``fit``
        is invoked."""
        return self

    def decision_function(self, X):
        """Return the stored scores regardless of ``X``."""
        return self.mockup_scores


class StoppingModule:
    """Decides, per timestamp, whether to stop and predict or to wait.

    One regressor ("halter") per non-final timestamp is trained to predict
    the difference between the cost of waiting and the cost of stopping now.
    """

    def fit(self, predictors, original_costs, timestamps, alpha, REGRESSOR_WAIT):
        """Train the halters by backward induction over the timestamps.

        :param predictors: array indexed ``[timestamp, sample, ...]`` with the
            regressor inputs
        :param original_costs: array indexed ``[timestamp, sample]`` with the
            classification costs; it is copied, not modified
        :param timestamps: array of prefix lengths
        :param alpha: delay penalty; ``alpha * timestamp / last timestamp`` is
            added to every cost
        :param REGRESSOR_WAIT: zero-argument factory of objects whose
            ``fit(X, y)`` returns the fitted model and that offer
            ``predict(X)``
        """
        costs = np.copy(original_costs)

        T = timestamps.shape[0]
        n = predictors.shape[1]

        # Add the delay penalty of each timestamp.
        for t in range(T):
            costs[t, :] += alpha * (timestamps[t] / timestamps[-1])

        self.halters = [None for _ in range(T - 1)]

        # Work backwards so that costs[t + 1] already holds the cost of the
        # best continuation when timestamp t is processed.
        for t in range(T - 2, -1, -1):
            # Keep the sample axis explicit: squeeze() would drop it when
            # there is a single training sample.
            X = np.asarray(predictors[t]).reshape(n, -1)
            y = costs[t + 1, :] - costs[t, :]

            model = REGRESSOR_WAIT().fit(X, y)
            self.halters[t] = model

            predicted_cost_difference = np.asarray(model.predict(X)).reshape(-1)
            # Where waiting is predicted to be cheaper, the cost at t becomes
            # the cost of the continuation.
            waiting = predicted_cost_difference < 0
            costs[t, waiting] = costs[t + 1, waiting]

    def should_stop(self, predictors, t):
        """Return whether the halter of timestamp ``t`` predicts that waiting
        costs more than stopping (predicted difference > 0).

        :param predictors: regressor input of a single sample
        :param t: index of the timestamp; must be below the last one, which
            has no halter
        :return: the element-wise comparison result for that single sample
        """
        predicted_cost_difference = self.halters[t].predict([predictors])
        return predicted_cost_difference > 0


def _scores_to_predictors(scores):
    if len(scores) == 1:
        return scores
    highest_score = np.max(scores)
    second_highest_score = np.partition(scores, -2)[-2]
    score_diff_stolen_from_teaser = highest_score - second_highest_score
    predictors = np.zeros(scores.shape[0]+2)
    predictors[:-2] = scores
    predictors[-2] = score_diff_stolen_from_teaser
    predictors[-1] = highest_score
    return predictors