In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found beneath it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [14]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from collections import deque

# Parameters
start_datetime = datetime(2024, 1, 1, 0, 0)
end_datetime = datetime(2024, 1, 1, 23, 59)
frequency = 'min'  # One sample per minute ('T' is a deprecated alias in recent pandas)
num_kpis = 10  # Number of KPIs
outlier_duration = 5  # Length of each injected outlier burst, in samples
outlier_magnitude = 10  # Amount added to each sample inside a burst
outlier_prob = 0.05  # Fraction of rows picked as burst start positions, per KPI
random_seed = 42  # Fixed seed so a fresh "Run All" regenerates the same dataset

# A single seeded generator drives both the baseline noise and the burst placement.
rng = np.random.default_rng(random_seed)

# Generate time series
date_rng = pd.date_range(start=start_datetime, end=end_datetime, freq=frequency)
df = pd.DataFrame(date_rng, columns=['datetime'])

# Generate random KPI data: one normally distributed column per KPI
for i in range(1, num_kpis + 1):
    df[f'kpi_{i}'] = rng.normal(loc=50, scale=10, size=len(df))

# Inject outliers
num_outliers = int(len(df) * outlier_prob)  # identical for every KPI, so computed once
for i in range(1, num_kpis + 1):
    outlier_indices = rng.choice(len(df), num_outliers, replace=False)

    for idx in outlier_indices:
        # Only inject bursts that fit entirely inside the frame. `<=` (not `<`) keeps the
        # last valid start position, whose burst ends exactly on the final row.
        if idx + outlier_duration <= len(df):
            # .loc slicing is end-inclusive, hence the -1. Bursts whose ranges overlap
            # stack, because each one adds `outlier_magnitude` again.
            df.loc[idx:idx + outlier_duration - 1, f'kpi_{i}'] += outlier_magnitude

# Save to CSV
df.to_csv('/kaggle/working/timeseries_kpis_with_outliers.csv', index=False)


In [15]:
# Settings for the fixed-threshold detector defined in this cell.
outlier_threshold = 90  # a KPI sample above this value is flagged as an outlier
outlier_duration = 2  # consecutive flagged samples required before an alarm is raised

# One bounded flag buffer per KPI; each keeps only the latest `outlier_duration` flags.
windows = dict(
    (f'kpi_{n}', deque(maxlen=outlier_duration))
    for n in range(1, num_kpis + 1)
)

def process_record(record):
    """Flag threshold breaches in one record and report sustained ones.

    Args:
        record (dict): Must contain 'datetime' plus one entry per 'kpi_<n>' column
            for n in 1..num_kpis.

    Returns:
        list: One alarm message per KPI whose flag buffer in the module-level
        `windows` is full and contains only True values after this record.
    """
    when = record['datetime']
    triggered = []
    for name in (f'kpi_{n}' for n in range(1, num_kpis + 1)):
        value = record[name]
        flags = windows[name]
        # Store a plain bool describing whether this sample exceeds the threshold.
        flags.append(True if value > outlier_threshold else False)

        # A full buffer of True flags means the breach persisted for the whole window.
        if len(flags) == outlier_duration and all(flags):
            triggered.append(f'Alarm triggered for {name} at {when}')

    return triggered


def stream_data(data):
    """Replay a DataFrame one row at a time through `process_record`.

    Each row is converted to a dict, passed to `process_record`, and every alarm
    message it returns is printed on its own line. Nothing is returned.
    """
    records = (series.to_dict() for _label, series in data.iterrows())
    for record in records:
        for message in process_record(record):
            print(message)


# Reload the generated dataset from disk so this cell works from the saved CSV rather
# than from whatever `df` happens to be in kernel memory.
data = pd.read_csv('timeseries_kpis_with_outliers.csv', parse_dates=['datetime'])

# Stream the frame that was just loaded. Previously the in-memory `df` was streamed
# here, which left the freshly loaded `data` unused.
stream_data(data)


Alarm triggered for kpi_4 at 2023-01-01 00:51:00
Alarm triggered for kpi_6 at 2023-01-01 05:54:00
Alarm triggered for kpi_8 at 2023-01-01 09:26:00
Alarm triggered for kpi_8 at 2023-01-01 09:27:00
Alarm triggered for kpi_8 at 2023-01-01 09:28:00
Alarm triggered for kpi_8 at 2023-01-01 09:29:00
Alarm triggered for kpi_4 at 2023-01-01 16:14:00
Alarm triggered for kpi_1 at 2023-01-01 17:12:00
Alarm triggered for kpi_1 at 2023-01-01 17:15:00
Alarm triggered for kpi_8 at 2023-01-01 19:33:00


In [20]:
# Show the row at one alarm timestamp and the row one minute before it.
# NOTE(review): these timestamps are in 2023, while the generation cell currently starts
# the series at 2024-01-01 -- confirm the dates before re-running, or the selections may be empty.
print(data.loc[data['datetime'].eq('2023-01-01 00:51:00')])
print(data.loc[data['datetime'].eq('2023-01-01 00:50:00')])

              datetime      kpi_1      kpi_2      kpi_3      kpi_4      kpi_5  \
51 2023-01-01 00:51:00  41.316099  59.389862  63.798403  94.712291  52.157851   

        kpi_6     kpi_7     kpi_8      kpi_9     kpi_10  
51  72.763472  49.26415  52.05176  61.960015  69.458719  
              datetime      kpi_1     kpi_2      kpi_3       kpi_4      kpi_5  \
50 2023-01-01 00:50:00  37.036014  46.44119  72.600071  106.767597  51.037961   

        kpi_6      kpi_7      kpi_8      kpi_9    kpi_10  
50  73.668314  55.192981  77.799894  37.541537  55.26551  


In [21]:
import pandas as pd
import numpy as np
from collections import deque


# Settings for the z-score detector defined in this cell.
outlier_duration = 2  # consecutive outlier samples required before an alarm is raised
num_kpis = 10  # KPI columns are named kpi_1 .. kpi_<num_kpis>
z_score_threshold = 3  # a sample is an outlier when |z-score| exceeds this value

# Per-KPI state keyed by column name: every value seen so far, and a bounded buffer
# holding the latest `outlier_duration` outlier flags.
history = dict((f'kpi_{n}', []) for n in range(1, num_kpis + 1))
outlier_windows = dict(
    (f'kpi_{n}', deque(maxlen=outlier_duration))
    for n in range(1, num_kpis + 1)
)


def process_record(record):
    """Score one record against each KPI's accumulated history and report alarms.

    For every 'kpi_<n>' column (n in 1..num_kpis) the value is appended to the
    module-level `history`, a z-score is computed against the mean and standard
    deviation of that history (which includes the new value), and the resulting
    outlier flag is pushed into the KPI's buffer in `outlier_windows`. An alarm is
    produced when that buffer is full and every flag in it is True.

    Missing values (None/NaN) are not added to `history`: a single NaN there would
    turn every later mean/std into NaN and silently disable detection for that KPI.
    A missing sample is recorded as "not an outlier", so it breaks a consecutive run.

    Args:
        record (dict): Must contain 'datetime' plus one entry per KPI column.

    Returns:
        list: Alarm messages raised by this record (possibly empty).
    """
    timestamp = record['datetime']
    alarms = []
    for i in range(1, num_kpis + 1):
        kpi_name = f'kpi_{i}'
        kpi_value = record[kpi_name]
        window = outlier_windows[kpi_name]

        if pd.isna(kpi_value):
            # Keep missing samples out of the statistics; see docstring.
            window.append(False)
            continue

        samples = history[kpi_name]
        samples.append(kpi_value)

        is_outlier = False
        if len(samples) > 1:
            mean = np.mean(samples)
            std = np.std(samples)
            if std > 0:  # a constant history has no spread to measure against
                z_score = (kpi_value - mean) / std
                is_outlier = abs(z_score) > z_score_threshold

        # Update outlier sliding window
        window.append(is_outlier)

        # `is_outlier` being True guarantees `z_score` was computed in this iteration,
        # so the message below can never read an unbound or stale score.
        if is_outlier and len(window) == outlier_duration and all(window):
            alarms.append(
                f"Alarm triggered for {kpi_name} at {timestamp} (Value: {kpi_value}, Z-Score: {z_score:.2f})"
            )
    return alarms


def stream_data(data):
    """Replay a DataFrame one row at a time through `process_record`.

    Each row is converted to a dict, passed to `process_record`, and every alarm
    message it returns is printed on its own line. Nothing is returned.
    """
    records = (series.to_dict() for _label, series in data.iterrows())
    for record in records:
        for message in process_record(record):
            print(message)


# Load the saved dataset (parsing `datetime` into timestamps) and replay it through
# the z-score detector defined above.
data = pd.read_csv(
    'timeseries_kpis_with_outliers.csv',
    parse_dates=['datetime'],
)
stream_data(data)


Alarm triggered for kpi_8 at 2023-01-01 09:26:00 (Value: 109.18661363849918, Z-Score: 4.51)
Alarm triggered for kpi_8 at 2023-01-01 09:27:00 (Value: 100.12099881619724, Z-Score: 3.69)
Alarm triggered for kpi_8 at 2023-01-01 09:28:00 (Value: 99.50591333507283, Z-Score: 3.60)
Alarm triggered for kpi_1 at 2023-01-01 17:12:00 (Value: 97.23057257420068, Z-Score: 3.53)
Alarm triggered for kpi_1 at 2023-01-01 17:15:00 (Value: 94.79094671564349, Z-Score: 3.29)
Alarm triggered for kpi_8 at 2023-01-01 19:33:00 (Value: 92.2841946444093, Z-Score: 3.04)


In [22]:
# Show the row just before one alarm timestamp and the row at that timestamp.
# NOTE(review): these timestamps are in 2023, while the generation cell currently starts
# the series at 2024-01-01 -- confirm the dates before re-running, or the selections may be empty.
print(data.loc[data['datetime'].eq('2023-01-01 19:32:00')])
print(data.loc[data['datetime'].eq('2023-01-01 19:33:00')])

                datetime      kpi_1      kpi_2      kpi_3      kpi_4  \
1172 2023-01-01 19:32:00  84.960059  60.265133  68.827719  54.642632   

          kpi_5      kpi_6      kpi_7      kpi_8      kpi_9     kpi_10  
1172  60.930068  44.967836  46.912729  95.649897  64.356182  63.566348  
                datetime      kpi_1      kpi_2      kpi_3      kpi_4  \
1173 2023-01-01 19:33:00  83.885398  59.764106  60.130541  43.707564   

          kpi_5      kpi_6      kpi_7      kpi_8      kpi_9     kpi_10  
1173  57.924255  51.341943  35.866212  92.284195  62.274585  67.403986  


In [None]:
import pandas as pd
import numpy as np
from collections import deque

class KPIOutlierDetection:
    """Streaming z-score outlier detector for an arbitrary set of KPI columns.

    Each KPI keeps a list of every valid value seen so far and a bounded buffer of
    its most recent outlier flags. An alarm is raised for a KPI once its last
    `outlier_duration` samples were all outliers.
    """

    def __init__(self, kpi_columns, z_score_threshold=3, outlier_duration=2):
        """
        Initialize the outlier detection with dynamic column names.

        Args:
            kpi_columns (list): Column names for the KPIs. Any iterable of strings is
                accepted; it is copied into a list so one-shot iterators work too.
            z_score_threshold (float): Z-score threshold for outlier detection.
            outlier_duration (int): Number of consecutive outliers that triggers an alarm.

        Raises:
            ValueError: If `outlier_duration` is smaller than 1.
        """
        if outlier_duration < 1:
            # A zero-length flag buffer would count as "all outliers" on every record.
            raise ValueError("outlier_duration must be at least 1")

        # Materialize once: the names are iterated again below and on every record.
        self.kpi_columns = list(kpi_columns)
        self.z_score_threshold = z_score_threshold
        self.outlier_duration = outlier_duration

        # Historical values and recent outlier flags for each KPI
        self.history = {col: [] for col in self.kpi_columns}
        self.outlier_windows = {col: deque(maxlen=outlier_duration) for col in self.kpi_columns}

    def process_record(self, record):
        """
        Processes each record and checks for outliers based on z-score.

        The z-score of a value is computed against the mean and standard deviation
        of that KPI's history, which already includes the value itself. Missing
        values (None/NaN) are left out of the history, because a single NaN there
        would make every later mean/std NaN and silently disable detection; they
        are recorded as "not an outlier", so they break a consecutive run.

        Args:
            record (dict): A single record with 'datetime' and one entry per KPI column.

        Returns:
            alarms (list): A list of alarm messages triggered by this record.
        """
        timestamp = record['datetime']
        alarms = []

        # Check each KPI column
        for kpi_name in self.kpi_columns:
            kpi_value = record[kpi_name]
            window = self.outlier_windows[kpi_name]

            if pd.isna(kpi_value):
                window.append(False)
                continue

            # Update historical data
            samples = self.history[kpi_name]
            samples.append(kpi_value)

            # Calculate dynamic mean and std for z-score
            is_outlier = False
            if len(samples) > 1:
                mean = np.mean(samples)
                std = np.std(samples)
                if std > 0:  # Avoid division by zero
                    z_score = (kpi_value - mean) / std
                    is_outlier = abs(z_score) > self.z_score_threshold

            # Update outlier sliding window
            window.append(is_outlier)

            # `is_outlier` being True guarantees `z_score` was computed in this
            # iteration, so the message never reads an unbound or stale score.
            if is_outlier and len(window) == self.outlier_duration and all(window):
                alarms.append(
                    f"Alarm triggered for {kpi_name} at {timestamp} (Value: {kpi_value}, Z-Score: {z_score:.2f})"
                )

        return alarms

    def stream_data(self, data):
        """
        Simulates streaming data and processes each record for outliers.

        Every alarm message returned by `process_record` is printed; nothing is returned.

        Args:
            data (pd.DataFrame): A DataFrame containing time series data for KPIs.
        """
        for _, row in data.iterrows():
            for alarm in self.process_record(row.to_dict()):
                print(alarm)


In [None]:
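To convert the above code block into a reusable module, you can create a Python module containing the detector class, which other scripts can then import. Note that the version below assumes the KPI columns are named `kpi_1` … `kpi_N` (configured through `num_kpis`) rather than taking an explicit list of column names. Here's a step-by-step guide on how to achieve this.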

### Step 1: Create the Module File
Create a new Python file (e.g., `kpi_outlier_detection.py`) to contain the code logic.

#### `kpi_outlier_detection.py`

```python
import pandas as pd
import numpy as np
from collections import deque

class KPIOutlierDetection:
    """Streaming z-score outlier detector for KPI columns named kpi_1 .. kpi_N.

    Each KPI keeps a rolling window of its most recent valid values and a bounded
    buffer of its most recent outlier flags. An alarm is raised for a KPI once its
    last `outlier_duration` samples were all outliers.
    """

    def __init__(self, num_kpis=10, z_score_threshold=3, outlier_duration=2, window_size=30):
        """
        Args:
            num_kpis (int): Number of KPI columns (kpi_1 .. kpi_<num_kpis>).
            z_score_threshold (float): Z-score threshold for outlier detection.
            outlier_duration (int): Number of consecutive outliers that triggers an alarm.
            window_size (int or None): Number of most recent values used for the
                rolling mean and std. None keeps the full history.

        Raises:
            ValueError: If `outlier_duration` < 1, or `window_size` is not None and < 2.
        """
        if outlier_duration < 1:
            # A zero-length flag buffer would count as "all outliers" on every record.
            raise ValueError("outlier_duration must be at least 1")
        if window_size is not None and window_size < 2:
            # A z-score needs at least two values in the window.
            raise ValueError("window_size must be at least 2 (or None for unbounded history)")

        self.num_kpis = num_kpis  # Number of KPIs
        self.z_score_threshold = z_score_threshold  # Z-score threshold for outlier detection
        self.outlier_duration = outlier_duration  # Duration for consecutive outliers to trigger alarm
        self.window_size = window_size  # Window size for rolling mean and std calculation

        # Rolling history per KPI: a bounded deque drops the oldest value on its own,
        # so the statistics really are computed over the last `window_size` samples.
        self.history = {f'kpi_{i}': deque(maxlen=window_size) for i in range(1, num_kpis + 1)}
        self.outlier_windows = {f'kpi_{i}': deque(maxlen=outlier_duration) for i in range(1, num_kpis + 1)}

    def process_record(self, record):
        """
        Processes each record and checks for outliers based on z-score.

        The z-score of a value is computed against the mean and standard deviation
        of that KPI's rolling window, which already includes the value itself.
        Missing values (None/NaN) are left out of the window, because a NaN there
        would make the mean/std NaN until it rolls out; they are recorded as
        "not an outlier", so they break a consecutive run.

        Args:
            record (dict): A single record with datetime and KPI values.
        Returns:
            alarms (list): A list of alarms triggered for the KPIs.
        """
        timestamp = record['datetime']
        alarms = []

        # Check each KPI
        for i in range(1, self.num_kpis + 1):
            kpi_name = f'kpi_{i}'
            kpi_value = record[kpi_name]
            window = self.outlier_windows[kpi_name]

            if pd.isna(kpi_value):
                window.append(False)
                continue

            # Update rolling history
            samples = self.history[kpi_name]
            samples.append(kpi_value)

            # Calculate rolling mean and std for z-score
            is_outlier = False
            if len(samples) > 1:
                values = list(samples)  # plain list for numpy
                mean = np.mean(values)
                std = np.std(values)
                if std > 0:  # Avoid division by zero
                    z_score = (kpi_value - mean) / std
                    is_outlier = abs(z_score) > self.z_score_threshold

            # Update outlier sliding window
            window.append(is_outlier)

            # `is_outlier` being True guarantees `z_score` was computed in this
            # iteration, so the message never reads an unbound or stale score.
            if is_outlier and len(window) == self.outlier_duration and all(window):
                alarms.append(
                    f"Alarm triggered for {kpi_name} at {timestamp} (Value: {kpi_value}, Z-Score: {z_score:.2f})"
                )

        return alarms

    def stream_data(self, data):
        """
        Simulates streaming data and processes each record for outliers.

        Every alarm message returned by `process_record` is printed; nothing is returned.

        Args:
            data (pd.DataFrame): A DataFrame containing time series data for KPIs.
        """
        for _, row in data.iterrows():
            for alarm in self.process_record(row.to_dict()):
                print(alarm)
```

### Step 2: Create an Entry Point or Script for Testing
You can create a separate script (e.g., `test_outlier_detection.py`) to test the module and simulate how it works.

#### `test_outlier_detection.py`

```python
import pandas as pd
from kpi_outlier_detection import KPIOutlierDetection

# Example: Load sample data (replace with your actual data)
data = pd.read_csv(
    'timeseries_kpis_with_outliers.csv',
    parse_dates=['datetime'],
)

# Example: Create an instance of the KPIOutlierDetection class
kpi_detector = KPIOutlierDetection(
    num_kpis=10,
    z_score_threshold=3,
    outlier_duration=2,
)

# Simulate streaming data: each row is fed to the detector and any alarms are printed
kpi_detector.stream_data(data)
```

### Step 3: Organize Your Files

- `kpi_outlier_detection.py`: Contains the class and logic for outlier detection.
- `test_outlier_detection.py`: Tests the functionality of the `KPIOutlierDetection` class using a sample dataset.

Both files should be placed in the same directory or in a Python package structure.

### Step 4: Reusing the Module
Now, you can import and reuse the `KPIOutlierDetection` class in any other script by importing it as follows:

```python
import pandas as pd  # needed for the example DataFrame below

from kpi_outlier_detection import KPIOutlierDetection

# Create an instance of the outlier detector
kpi_detector = KPIOutlierDetection(num_kpis=5, z_score_threshold=2.5, outlier_duration=3)

# Example DataFrame (replace with real data); it needs a 'datetime' column plus
# one column per KPI named kpi_1 .. kpi_5 to match num_kpis=5 above.
data = pd.DataFrame({
    'datetime': ['2024-01-01 00:01', '2024-01-01 00:02'],
    'kpi_1': [10, 15],
    'kpi_2': [5, 6],
    'kpi_3': [12, 13],
    'kpi_4': [20, 21],
    'kpi_5': [30, 25]
})

# Process data to detect outliers
kpi_detector.stream_data(data)
```

### Step 5: Install and Use the Module
If you're working with multiple projects, you might want to package and install the module for easier reuse. Here's a simple example of packaging it with `setuptools`:

1. **Directory structure**:
    ```
    my_kpi_outlier_package/
    ├── kpi_outlier_detection.py
    ├── test_outlier_detection.py
    └── setup.py
    ```

2. **`setup.py`**:
    ```python
    from setuptools import setup, find_packages

    # kpi_outlier_detection.py is a single top-level module, not a package directory.
    # find_packages() only returns directories that contain an __init__.py, so the
    # module has to be listed in py_modules for `pip install .` to install it.
    setup(
        name='kpi_outlier_detection',
        version='0.1',
        py_modules=['kpi_outlier_detection'],
        packages=find_packages(),
        install_requires=[
            'numpy',
            'pandas',
        ],
    )
    ```

3. Install the package locally:
    ```bash
    pip install .
    ```

Now, you can import and use `KPIOutlierDetection` in any Python script or project.

---

### Summary:
- We created a Python module (`kpi_outlier_detection.py`) that contains the outlier detection logic.
- The module is reusable and can be imported into any other script.
- You can use it for processing time-series data and detecting outliers based on the z-score of historical KPI values.

Let me know if you need further assistance!