In [1]:
import os

# Collect every CSV stream under NAB/streams.
# - Only *.csv files are kept, so stray files (editor backups, .DS_Store, ...)
#   cannot reach pd.read_csv or the label lookup in later cells.
# - os.walk yields entries in arbitrary, filesystem-dependent order; sorting
#   makes the assignment of streams to saved plot pages reproducible.
NAB_files = sorted(
    os.path.join(path, name)
    for path, _, files in os.walk("NAB/streams")
    for name in files
    if name.endswith(".csv")
)

In [2]:
# Sanity check: the cell's value is displayed, showing how many stream files were found.
f"Number of files in NAB dataset: {len(NAB_files)}"

'Number of files in NAB dataset: 58'

In [3]:
import json


# Load the anomaly labels: a JSON object mapping each stream's path
# (relative to NAB/streams) to a list of anomaly timestamps.
with open("NAB/labels/combined_labels.json") as labels_file:
    raw_json = labels_file.read()
labels = json.loads(raw_json)

# Display the parsed mapping as the cell output.
labels

{'artificialNoAnomaly/art_daily_no_noise.csv': [],
 'artificialNoAnomaly/art_daily_perfect_square_wave.csv': [],
 'artificialNoAnomaly/art_daily_small_noise.csv': [],
 'artificialNoAnomaly/art_flatline.csv': [],
 'artificialNoAnomaly/art_noisy.csv': [],
 'artificialWithAnomaly/art_daily_flatmiddle.csv': ['2014-04-11 00:00:00'],
 'artificialWithAnomaly/art_daily_jumpsdown.csv': ['2014-04-11 09:00:00'],
 'artificialWithAnomaly/art_daily_jumpsup.csv': ['2014-04-11 09:00:00'],
 'artificialWithAnomaly/art_daily_nojump.csv': ['2014-04-11 09:00:00'],
 'artificialWithAnomaly/art_increase_spike_density.csv': ['2014-04-07 23:10:00'],
 'artificialWithAnomaly/art_load_balancer_spikes.csv': ['2014-04-11 04:35:00'],
 'realAWSCloudwatch/ec2_cpu_utilization_24ae8d.csv': ['2014-02-26 22:05:00',
  '2014-02-27 17:15:00'],
 'realAWSCloudwatch/ec2_cpu_utilization_53ea38.csv': ['2014-02-19 19:10:00',
  '2014-02-23 20:05:00'],
 'realAWSCloudwatch/ec2_cpu_utilization_5f5533.csv': ['2014-02-19 00:22:00',
  '20

In [4]:
import matplotlib.pyplot as plt
import pandas as pd

In [5]:
# Plot every NAB stream as a line chart with its labelled anomalies marked in
# red, packing up to plots_per_file streams into each saved PNG page.
plots_per_file = 25
figure_size = 30
grid_side = 5  # a grid_side x grid_side grid holds plots_per_file subplots

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("plots_NAB", exist_ok=True)

# One figure per page of streams. Iterating over page offsets means an empty
# NAB_files produces no figures instead of failing on unbound loop variables,
# and every figure is saved exactly once, right after it has been filled.
for page, first in enumerate(range(0, len(NAB_files), plots_per_file), start=1):
    figure, axis = plt.subplots(grid_side, grid_side, figsize=(figure_size, figure_size))

    for slot, filename in enumerate(NAB_files[first:first + plots_per_file]):
        row, col = divmod(slot, grid_side)

        # Keys in `labels` are paths relative to NAB/streams written with "/",
        # so normalise the OS-specific separators produced by os.path.join.
        stream_name = os.path.relpath(filename, "NAB/streams").replace(os.sep, "/")

        data = pd.read_csv(filename)
        axis[row, col].plot(data["value"])

        # Select all labelled rows at once. A label with no matching row is
        # skipped rather than raising IndexError, and repeated timestamps
        # each get their own marker.
        anomalies = data[data["timestamp"].isin(labels[stream_name])]
        if not anomalies.empty:
            axis[row, col].plot(
                anomalies.index,
                anomalies["value"],
                linestyle="none",  # markers only; never connect separate anomalies
                marker="o",
                markeredgecolor="red",
                markerfacecolor="red",
            )
        axis[row, col].set_title(stream_name)

    # Save through the figure object rather than pyplot's implicit current figure.
    figure.savefig(f"plots_NAB/NAB_stacked{page}.png")
    plt.close(figure)

In [6]:
# Draw a boxplot of the "value" column for every NAB stream, packing up to
# plots_per_file streams into each saved PNG page.
plots_per_file = 25
figure_size = 30
grid_side = 5  # a grid_side x grid_side grid holds plots_per_file subplots

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("plots_NAB", exist_ok=True)

# One figure per page of streams. Iterating over page offsets means an empty
# NAB_files produces no figures instead of failing on unbound loop variables,
# and every figure is saved exactly once, right after it has been filled.
for page, first in enumerate(range(0, len(NAB_files), plots_per_file), start=1):
    figure, axis = plt.subplots(grid_side, grid_side, figsize=(figure_size, figure_size))

    for slot, filename in enumerate(NAB_files[first:first + plots_per_file]):
        row, col = divmod(slot, grid_side)

        # Title with the path relative to NAB/streams, using "/" on every OS.
        stream_name = os.path.relpath(filename, "NAB/streams").replace(os.sep, "/")

        data = pd.read_csv(filename)
        axis[row, col].boxplot(data["value"])
        axis[row, col].set_title(stream_name)

    # Save through the figure object rather than pyplot's implicit current figure.
    figure.savefig(f"plots_NAB/NAB_stacked_boxplot{page}.png")
    plt.close(figure)

Wybrałem 8 najciekawszych wykresów do pokazania.

In [22]:
# Eight hand-picked streams, in display order (left to right, top to bottom
# in the 2 x 4 grids drawn below).
choosen_plots = [
    # Twitter volume
    "NAB/streams/realTweets/Twitter_volume_GOOG.csv",
    # Artificial stream with an injected anomaly
    "NAB/streams/artificialWithAnomaly/art_daily_jumpsup.csv",
    # Real stream with a known cause
    "NAB/streams/realKnownCause/machine_temperature_system_failure.csv",
    # AWS CloudWatch
    "NAB/streams/realAWSCloudwatch/ec2_cpu_utilization_ac20cd.csv",
    # Traffic
    "NAB/streams/realTraffic/speed_t4013.csv",
    # AWS CloudWatch
    "NAB/streams/realAWSCloudwatch/ec2_cpu_utilization_53ea38.csv",
    "NAB/streams/realAWSCloudwatch/iio_us-east-1_i-a2eb1cd9_NetworkIn.csv",
    "NAB/streams/realAWSCloudwatch/ec2_network_in_257a54.csv",
]

In [24]:
# Line plots of the hand-picked streams in a 2 x 4 grid, with labelled
# anomalies marked in red, saved as a single PNG.
n_rows, n_cols = 2, 4

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("plots_NAB", exist_ok=True)

figure, axis = plt.subplots(n_rows, n_cols, figsize=(30, 15))

for idx, plot_path in enumerate(choosen_plots):
    # Grid position depends only on the grid width; it no longer goes through
    # plots_per_file, a constant left over from an earlier, unrelated cell.
    row, col = divmod(idx, n_cols)

    data = pd.read_csv(plot_path)
    axis[row, col].plot(data["value"])

    # Keys in `labels` are paths relative to NAB/streams.
    stream_name = plot_path[len("NAB/streams/"):]

    # Select all labelled rows at once. A label with no matching row is
    # skipped rather than raising IndexError, and repeated timestamps each
    # get their own marker.
    anomalies = data[data["timestamp"].isin(labels[stream_name])]
    if not anomalies.empty:
        axis[row, col].plot(
            anomalies.index,
            anomalies["value"],
            linestyle="none",  # markers only; never connect separate anomalies
            marker="o",
            markeredgecolor="red",
            markerfacecolor="red",
        )
    axis[row, col].set_title(plot_path)

# Save through the figure object rather than pyplot's implicit current figure.
figure.savefig("plots_NAB/choosen.png")
plt.close(figure)

In [26]:
# Boxplots of the "value" column of the hand-picked streams in a 2 x 4 grid,
# saved as a single PNG.
n_rows, n_cols = 2, 4

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("plots_NAB", exist_ok=True)

figure, axis = plt.subplots(n_rows, n_cols, figsize=(30, 15))

for idx, plot_path in enumerate(choosen_plots):
    # Grid position depends only on the grid width; it no longer goes through
    # plots_per_file, a constant left over from an earlier, unrelated cell.
    row, col = divmod(idx, n_cols)

    data = pd.read_csv(plot_path)

    axis[row, col].boxplot(data["value"])
    axis[row, col].set_title(plot_path)

# Save through the figure object rather than pyplot's implicit current figure.
figure.savefig("plots_NAB/choosen_boxplot.png")
plt.close(figure)