KDD-OpenSource · WGierke · Jun 20, 2018 · Jun 5, 2018 · Jun 5, 2018 · Jun 5, 2018
diff --git a/src/evaluation/experiments.py → experiments.py b/src/evaluation/experiments.py → experiments.py
@@ -1,19 +1,15 @@
-import os
-
 import numpy as np
 
-from src.algorithms import DAGMM, Donut, RecurrentEBM, LSTMAD, LSTM_Enc_Dec
-from src.datasets import SyntheticDataGenerator
 from src.evaluation.evaluator import Evaluator
+from src.datasets import SyntheticDataGenerator, MultivariateAnomalyFunction
 
 
 # Validates all algorithms regarding polluted data based on a given outlier type.
 # The pollution of the training data is tested from 0 to 100% (with default steps=5).
-def run_pollution_experiment(outlier_type='extreme_1', output_dir=None, steps=5):
+def run_pollution_experiment(detectors, outlier_type='extreme_1', output_dir=None, steps=5):
     datasets = [
         SyntheticDataGenerator.get(f'{outlier_type}_polluted', pollution) for pollution in np.linspace(0, 1, steps)
     ]
-    detectors = [LSTM_Enc_Dec(num_epochs=15), DAGMM(), Donut(), RecurrentEBM(), LSTMAD()]
     evaluator = Evaluator(datasets, detectors, output_dir)
     evaluator.evaluate()
     evaluator.plot_auroc(title='Area under the curve for polluted data')
@@ -28,11 +24,10 @@ def run_pollution_experiment(outlier_type='extreme_1', output_dir=None, steps=5)
 # The percentage of missing values within the training data is tested from 0 to 100% (with default
 # steps=5). By default the missing values are represented as zeros since no algorithm can't handle
 # nan values.
-def run_missing_experiment(outlier_type='extreme_1', output_dir=None, steps=5):
+def run_missing_experiment(detectors, outlier_type='extreme_1', output_dir=None, steps=5):
     datasets = [
         SyntheticDataGenerator.get(f'{outlier_type}_missing', missing) for missing in np.linspace(0, 1, steps)
     ]
-    detectors = [LSTM_Enc_Dec(num_epochs=200), DAGMM(), Donut(), RecurrentEBM(), LSTMAD()]
     evaluator = Evaluator(datasets, detectors, output_dir)
     evaluator.evaluate()
     evaluator.plot_auroc(title='Area under the curve for missing values')
@@ -45,11 +40,10 @@ def run_missing_experiment(outlier_type='extreme_1', output_dir=None, steps=5):
 
 # Validates all algorithms regarding different heights of extreme outliers
 # The extreme values are added to the outlier timestamps everywhere in the dataset distribution.
-def run_extremes_experiment(outlier_type='extreme_1', output_dir=None, steps=10):
+def run_extremes_experiment(detectors, outlier_type='extreme_1', output_dir=None, steps=10):
     datasets = [
         SyntheticDataGenerator.get(f'{outlier_type}_extremeness', extreme) for extreme in np.linspace(1, 10, steps)
     ]
-    detectors = [LSTM_Enc_Dec(num_epochs=200), DAGMM(), Donut(), RecurrentEBM(), LSTMAD()]
     evaluator = Evaluator(datasets, detectors, output_dir)
     evaluator.evaluate()
     evaluator.plot_auroc(title='Area under the curve for differing outlier heights')
@@ -60,26 +54,16 @@ def run_extremes_experiment(outlier_type='extreme_1', output_dir=None, steps=10)
     return evaluator
 
 
-def run_experiments(outlier_type='extreme_1', output_dir=None, steps=5):
-    output_dir = output_dir or os.path.join('reports/experiments', outlier_type)
-
-    announce_experiment('Missing Values')
-    run_pollution_experiment(outlier_type, output_dir=os.path.join(output_dir, 'pollution'),
-                             steps=steps)
-
-    announce_experiment('Pollution')
-    run_missing_experiment(outlier_type, output_dir=os.path.join(output_dir, 'missing'),
-                           steps=steps)
-
-    announce_experiment('Outlier height')
-    run_extremes_experiment(outlier_type, output_dir=os.path.join(output_dir, 'extremes'),
-                            steps=steps)
-
-
-def announce_experiment(title: str, dashes: int = 70):
-    print(f'\n###{"-"*dashes}###')
-    message = f'Experiment: {title}'
-    before = (dashes - len(message)) // 2
-    after = dashes - len(message) - before
-    print(f'###{"-"*before}{message}{"-"*after}###')
-    print(f'###{"-"*dashes}###\n')
+def run_multivariate_experiment(detectors, output_dir=None):
+    """Validate all detectors on one synthetic multivariate dataset per anomaly function.
+
+    detectors: Algorithms handed to the Evaluator.
+    output_dir: Directory handed to the Evaluator (may be None).
+    Returns the Evaluator after evaluation and plotting.
+    """
+    function_names = ('doubled', 'inversed', 'shrinked', 'delayed', 'xor')
+    datasets = list(map(MultivariateAnomalyFunction.get_multivariate_dataset, function_names))
+
+    evaluator = Evaluator(datasets, detectors, output_dir)
+    evaluator.evaluate()
+    evaluator.plot_auroc(title='Area under the curve for multivariate outliers')
+    # The remaining reports take no arguments; run them in a fixed order.
+    for report in (evaluator.print_tables, evaluator.plot_threshold_comparison,
+                   evaluator.plot_scores, evaluator.plot_roc_curves):
+        report()
+    return evaluator
diff --git a/main.py b/main.py
@@ -6,12 +6,13 @@
 from src.algorithms import DAGMM, Donut, RecurrentEBM, LSTMAD, LSTM_Enc_Dec
 from src.datasets import AirQuality, KDDCup, SyntheticDataGenerator
 from src.evaluation.evaluator import Evaluator
-# from src.evaluation.experiments import run_experiments
+from experiments import run_pollution_experiment, run_missing_experiment, run_extremes_experiment, \
+                        run_multivariate_experiment
 
 
 def main():
     run_pipeline()
-    # run_experiments()
+    run_experiments()
 
 
 def run_pipeline():
@@ -27,14 +28,6 @@ def run_pipeline():
             SyntheticDataGenerator.trend_1(),
             SyntheticDataGenerator.combined_1(),
             SyntheticDataGenerator.combined_4(),
-            SyntheticDataGenerator.variance_1_missing(0.1),
-            SyntheticDataGenerator.variance_1_missing(0.3),
-            SyntheticDataGenerator.variance_1_missing(0.5),
-            SyntheticDataGenerator.variance_1_missing(0.8),
-            SyntheticDataGenerator.extreme_1_polluted(0.1),
-            SyntheticDataGenerator.extreme_1_polluted(0.3),
-            SyntheticDataGenerator.extreme_1_polluted(0.5),
-            SyntheticDataGenerator.extreme_1_polluted(0.9)
         ]
         detectors = [RecurrentEBM(num_epochs=15), LSTMAD(), Donut(), DAGMM(), LSTM_Enc_Dec(num_epochs=15)]
     evaluator = Evaluator(datasets, detectors)
@@ -68,5 +61,40 @@ def evaluate_on_real_world_data_sets():
     print("Donut results: ", pred)
 
 
+def run_experiments(outlier_type='extreme_1', output_dir=None, steps=5):
+    """Run the synthetic-data experiments and store each report in its own sub-directory.
+
+    outlier_type: Base name of the synthetic dataset the experiments are derived from.
+    output_dir: Base directory for the reports; defaults to reports/experiments/<outlier_type>.
+    steps: Number of parameter values evaluated per experiment.
+
+    When the CIRCLECI environment variable is set, only a single-step outlier height
+    experiment with few training epochs is run.
+    """
+    output_dir = output_dir or os.path.join('reports/experiments', outlier_type)
+    if os.environ.get("CIRCLECI", False):
+        detectors = [RecurrentEBM(num_epochs=2), LSTMAD(num_epochs=5), Donut(num_epochs=5), DAGMM(),
+                     LSTM_Enc_Dec(num_epochs=2)]
+        run_extremes_experiment(detectors, outlier_type, output_dir=os.path.join(output_dir, 'extremes'),
+                                steps=1)
+        return
+
+    detectors = [RecurrentEBM(num_epochs=15), LSTMAD(), Donut(), DAGMM(), LSTM_Enc_Dec(num_epochs=15)]
+
+    # Each banner has to name the experiment that is started right after it.
+    announce_experiment('Pollution')
+    run_pollution_experiment(detectors, outlier_type, output_dir=os.path.join(output_dir, 'pollution'),
+                             steps=steps)
+
+    announce_experiment('Missing Values')
+    run_missing_experiment(detectors, outlier_type, output_dir=os.path.join(output_dir, 'missing'),
+                           steps=steps)
+
+    announce_experiment('Outlier height')
+    run_extremes_experiment(detectors, outlier_type, output_dir=os.path.join(output_dir, 'extremes'),
+                            steps=steps)
+
+    announce_experiment('Multivariate Datasets')
+    run_multivariate_experiment(detectors, output_dir=os.path.join(output_dir, 'multivariate'))
+
+
+def announce_experiment(title: str, dashes: int = 70):
+    """Print a three-line banner announcing the experiment `title`.
+
+    The message is centred between dashes on a line that is `dashes` characters wide (plus
+    the surrounding '###'); an odd remainder goes to the right-hand side.
+    """
+    rule = f'###{"-"*dashes}###'
+    message = f'Experiment: {title}'
+    left, odd = divmod(dashes - len(message), 2)
+    right = left + odd
+    print(f'\n{rule}')
+    print(f'###{"-"*left}{message}{"-"*right}###')
+    print(f'{rule}\n')
+
+
 if __name__ == '__main__':
     main()
diff --git a/notebooks/3.0-tk-data-generation.ipynb b/notebooks/3.0-tk-data-generation.ipynb
diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py
@@ -4,12 +4,14 @@
 from .real_dataset import RealDataset
 from .synthetic_data_generator import SyntheticDataGenerator
 from .synthetic_dataset import SyntheticDataset
+from .multivariate_anomaly_function import MultivariateAnomalyFunction
 
 __all__ = [
     'Dataset',
     'SyntheticDataset',
     'RealDataset',
     'AirQuality',
     'KDDCup',
-    'SyntheticDataGenerator'
+    'SyntheticDataGenerator',
+    'MultivariateAnomalyFunction'
 ]
diff --git a/src/datasets/multivariate_anomaly_function.py b/src/datasets/multivariate_anomaly_function.py
@@ -0,0 +1,65 @@
+from .synthetic_multivariate_dataset import SyntheticMultivariateDataset
+import numpy as np
+
+
+class MultivariateAnomalyFunction:
+    """Collection of functions generating the second (possibly anomalous) dimension.
+
+    Every anomaly function receives the noise-free curve of the first dimension, a flag
+    telling whether an anomaly should be created and the length of the whole interval
+    (curve plus following pause). It returns a tuple containing the following three values:
+    * The values of the second dimension (array of max `interval_length` numbers)
+    * Starting point for the anomaly
+    * End point for the anomaly section
+    The last two values are ignored for generation of not anomalous data.
+    """
+
+    @staticmethod
+    def get_multivariate_dataset(method, *args, **kwargs):
+        """Build a SyntheticMultivariateDataset for the anomaly function called `method`.
+
+        All parameters after `method` are passed through to the SyntheticMultivariateDataset
+        constructor (following the anomaly function itself). Unless a name is supplied, the
+        dataset is named after the method. Throws AttributeError if the method was not found.
+        """
+        func = getattr(MultivariateAnomalyFunction, method)
+        if not args:
+            # `name` is the first positional parameter after the anomaly function, so the
+            # default may only be injected when no positional arguments were given.
+            kwargs.setdefault('name', f'Synthetic Multivariate {method} Curve Outliers')
+        return SyntheticMultivariateDataset(func, *args, **kwargs)
+
+    @staticmethod
+    def doubled(curve_values, anomalous, _):
+        """Scale the curve by 2, or by 4 when anomalous; the whole curve is the anomaly."""
+        factor = 4 if anomalous else 2
+        return curve_values * factor, 0, len(curve_values)
+
+    @staticmethod
+    def inversed(curve_values, anomalous, _):
+        """Scale the curve by 2, or by -2 when anomalous; the whole curve is the anomaly."""
+        factor = -2 if anomalous else 2
+        return curve_values * factor, 0, len(curve_values)
+
+    @staticmethod
+    def shrinked(curve_values, anomalous, _):
+        """Return the curve unchanged, or compressed to half its width when anomalous.
+
+        The anomalous curve keeps every second value and is left-padded with zeros so that
+        it has the same length as the original curve.
+        """
+        if not anomalous:
+            return curve_values, -1, -1
+        new_curve = curve_values[::2]
+        nonce = np.zeros(len(curve_values) - len(new_curve))
+        values = np.concatenate([nonce, new_curve])
+        return values, 0, len(values)
+
+    @staticmethod
+    def delayed(curve_values, anomalous, _):
+        """Return the curve unchanged, or shifted to the right when anomalous."""
+        if not anomalous:
+            return curve_values, -1, -1
+        # The curve in the second dimension occurs a few timestamps later
+        nonce = np.zeros(len(curve_values) // 10)
+        values = np.concatenate([nonce, curve_values])
+        return values, 0, len(values)
+
+    @staticmethod
+    def xor(curve_values, anomalous, interval_length):
+        """Place the second-dimension curve in the pause of the first one (normal case).
+
+        When anomalous, the curve is instead inserted after a short random offset so that
+        it overlaps with the curve of the first dimension; the anomaly spans that curve.
+        """
+        # Signed amplitude: largest absolute value, sign taken from the curve's mean
+        orig_amplitude = max(abs(curve_values))
+        orig_amplitude *= np.sign(curve_values.mean())
+        pause_length = interval_length - len(curve_values)
+        if not anomalous:
+            # No curve during the other curve in the 1st dimension
+            nonce = np.zeros(len(curve_values))
+            # Insert a curve with the same amplitude during the pause of the 1st dimension
+            new_curve = SyntheticMultivariateDataset.get_curve(pause_length, orig_amplitude)
+            return np.concatenate([nonce, new_curve]), -1, -1
+        # Anomaly: curves overlap (at the same time or at least half overlapping)
+        max_pause = min(len(curve_values) // 2, pause_length)
+        nonce = np.zeros(np.random.randint(max_pause))
+        return np.concatenate([nonce, curve_values]), len(nonce), len(nonce) + len(curve_values)
diff --git a/src/datasets/synthetic_multivariate_dataset.py b/src/datasets/synthetic_multivariate_dataset.py
@@ -0,0 +1,125 @@
+from typing import Tuple, Callable
+
+import numpy as np
+import pandas as pd
+from . import Dataset
+
+
+class SyntheticMultivariateDataset(Dataset):
+    """Synthetic two-feature time series of random curves separated by noisy pauses.
+
+    The first feature contains the curves. The second feature is produced by `anomaly_func`
+    from each noise-free curve and is where the anomalies are created.
+    """
+
+    def __init__(self,
+                 # anomaly_func: Lambda for curve values of 2nd dimension
+                 anomaly_func: Callable[[np.ndarray, bool, int], Tuple[np.ndarray, int, int]],
+                 name: str = 'Synthetic Multivariate Curve Outliers',
+                 length: int = 5000,
+                 mean_curve_length: int = 40,  # varies between -5 and +5
+                 mean_curve_amplitude: int = 1,  # by default varies by +-0.5, the sign is random
+                 pause_range: Tuple[int, int] = (5, 75),  # min (inclusive) and max (exclusive) length of a pause
+                 labels_padding: int = 6,
+                 random_seed: int = 42,
+                 features: int = 2,
+                 file_name: str = 'synthetic_mv1.pkl'):
+        super().__init__(name, file_name)
+        self.length = length
+        self.mean_curve_length = mean_curve_length
+        self.mean_curve_amplitude = mean_curve_amplitude
+        self.global_noise = 0.1  # Noise added to all dimensions over the whole timeseries
+        self.anomaly_func = anomaly_func
+        self.pause_range = pause_range
+        self.labels_padding = labels_padding
+        self.random_seed = random_seed
+        self.features = features
+
+    @staticmethod
+    def get_noisy_value(x, strength=1):
+        """Add uniform noise from [-strength / 2, strength / 2) to every element of `x`."""
+        return x + np.random.random(np.shape(x)) * strength - strength / 2
+
+    # Use part of a sine wave to create a curve starting and ending with zero gradients.
+    # Using `length` and `amplitude` you can adjust it in both dimensions.
+    @staticmethod
+    def get_curve(length, amplitude):
+        """Return `length` samples of one sine period rescaled to [0, amplitude].
+
+        `length` must be an integer because it is used as the sample count of np.linspace.
+        """
+        # Transformed sine curve: [-1, 1] -> [0, amplitude]
+        def curve(t: int):
+            return amplitude * (np.sin(t)/2 + 0.5)
+        # Start and end of one curve section of the sine wave
+        from_ = 1.5 * np.pi
+        to_ = 3.5 * np.pi
+        return np.array([curve(t) for t in np.linspace(from_, to_, length)])
+
+    # Randomly adjust curve size by adding noise to the passed parameters
+    def get_random_curve(self, length_randomness=10, amplitude_randomness=1):
+        """Return a curve with randomly varied length and amplitude and a random sign."""
+        is_negative = np.random.choice([True, False])
+        sign = -1 if is_negative else 1
+        # get_noisy_value returns a float, but np.linspace requires an integer number of
+        # samples (a float raises TypeError on current NumPy), so truncate it here.
+        new_length = int(self.get_noisy_value(self.mean_curve_length, length_randomness))
+        new_amplitude = self.get_noisy_value(sign * self.mean_curve_amplitude, amplitude_randomness)
+        return self.get_curve(new_length, new_amplitude)
+
+    # The interval between two curves must be random so a detector doesn't recognize a pattern
+    def create_pause(self):
+        """Return a random pause length from `pause_range` (upper bound excluded)."""
+        xmin, xmax = self.pause_range
+        diff = xmax - xmin
+        return xmin + np.random.randint(diff)
+
+    def add_global_noise(self, x):
+        """Add the dataset-wide noise (`self.global_noise`) to `x`."""
+        return self.get_noisy_value(x, self.global_noise)
+
+    def generate_data(self, pollution=0.5):
+        """Generate the time series and its anomaly labels.
+
+        pollution: Portion of anomalous curves. Because it's not known how many curves there
+            are in the end, each curve is randomly made anomalous with this probability.
+            To avoid anomalies set this to zero.
+
+        Returns a DataFrame with the values (one column per feature) and a Series with the
+        labels.
+        """
+        values = np.zeros((self.length, self.features))
+        labels = np.zeros(self.length)
+        pos = self.create_pause()
+
+        # First pos data points are noise (don't start directly with curve)
+        values[:pos] = self.add_global_noise(values[:pos])
+
+        while pos < self.length - self.mean_curve_length - 20:
+            # General outline for the repeating curves, varying height and length
+            curve = self.get_random_curve()
+            # Outlier generation in second dimension
+            create_anomaly = np.random.choice([False, True], p=[1-pollution, pollution])
+            # After curve add pause, only noise
+            end_of_interval = pos + len(curve) + self.create_pause()
+            # The slices are views, so insert_features writes directly into values and labels
+            self.insert_features(values[pos:end_of_interval], labels[pos:end_of_interval], curve, create_anomaly)
+            pos = end_of_interval
+        # rest of values is noise
+        values[pos:] = self.add_global_noise(values[pos:])
+        return pd.DataFrame(values), pd.Series(labels)
+
+    def insert_features(self, interval_values: np.ndarray, interval_labels: np.ndarray,
+                        curve: np.ndarray, create_anomaly: bool):
+        """Insert values for curve and following pause over all dimensions.
+
+        interval_values and interval_labels are changed in place (by slice assignment), so
+        this function doesn't return anything.
+        """
+        assert self.features == 2, 'Only two features are supported right now!'
+
+        # Insert curve and pause in first dimension (after adding the global noise)
+        interval_values[:len(curve), 0] = self.add_global_noise(curve)
+        interval_values[len(curve):, 0] = self.add_global_noise(interval_values[len(curve):, 0])
+
+        # Get values of anomaly_func and fill missing spots with noise
+        # anomaly_func function gets the clean curve values (not noisy)
+        interval_length = interval_values.shape[0]
+        anomaly_values, start, end = self.anomaly_func(curve, create_anomaly, interval_length)
+        assert len(anomaly_values) <= interval_length, f'Interval too long: {len(anomaly_values)} > {interval_length}'
+
+        interval_values[:len(anomaly_values), 1] = self.add_global_noise(anomaly_values)
+        # Fill interval up with noisy zero values
+        interval_values[len(anomaly_values):, 1] = self.add_global_noise(interval_values[len(anomaly_values):, 1])
+
+        # Add anomaly labels with slight padding (don't start with the first interval value).
+        # The padding is the anomaly length divided by `labels_padding`.
+        if create_anomaly:
+            assert end > start and start >= 0, f'Invalid anomaly indizes: {start} to {end}'
+            padding = (end - start) // self.labels_padding
+            interval_labels[start+padding:end-padding] += 1
+
+    def load(self):
+        """Seed NumPy's RNG and generate an anomaly-free training set and a polluted test set."""
+        np.random.seed(self.random_seed)
+        X_train, y_train = self.generate_data(pollution=0)
+        X_test, y_test = self.generate_data(pollution=0.5)
+        self._data = X_train, y_train, X_test, y_test
diff --git a/src/evaluation/evaluator.py b/src/evaluation/evaluator.py
@@ -156,10 +156,8 @@ def plot_threshold_comparison(self, steps=40, store=True):
             for det, ax in zip(self.detectors, axes_row):
                 score = np.array(self.results[(ds.name, det.name)])
 
-                anomalies, _, prec, rec, f_score, f01_score, thresh = self.get_optimal_threshold(det,
-                                                                                                 y_test,
-                                                                                                 score,
-                                                                                                 return_metrics=True)
+                anomalies, _, prec, rec, f_score, f01_score, thresh = self.get_optimal_threshold(
+                    det, y_test, score, return_metrics=True)
 
                 ax.plot(thresh, anomalies / len(y_test),
                         label=fr"anomalies ({len(y_test)} $\rightarrow$ 1)")