In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
sys.argv=['']
del sys
os.chdir("../")

In [2]:
import matplotlib.pyplot as plt
import os
import json
import math
import torch
import logging
import numpy as np
import pandas as pd

from IPython import embed
from common import data_preprocess 
from common.utils import print_to_json, iter_thresholds
from common.dataloader import load_dataset
from common.sliding import WindowIterator
from common.config import parse_arguments, set_logger, initialize_config
from networks.mlstm import MultiLSTMEncoder
from torch import nn
%matplotlib inline

In [139]:
# train
args = parse_arguments()

# load config: use args["load"] when it is set, otherwise the default ./hypers/
config_dir = args["load"] or "./hypers/"
params = initialize_config(config_dir, args)

# notebook-specific overrides applied on top of the loaded configuration
notebook_overrides = {
    "clear": 1,
    "nrows": None,
    "inter": "MEAN",
    "window_size": 32,
}
for option, value in notebook_overrides.items():
    params[option] = value

In [140]:
# Load the "machine-2-4" subset of the SMD dataset, requesting all dimensions.
dataset_name, entity_name = "SMD", "machine-2-4"
data_dict = load_dataset(dataset_name, entity_name, use_dim="all")

2021-02-22 00:18:47,052 P25707 INFO Loading machine-2-4 of SMD dataset
2021-02-22 00:18:47,053 P25707 INFO 1 files found.


In [141]:
pp = data_preprocess.preprocessor()
# Normalize before persisting the preprocessor: the original order pickled the
# object before normalize() had run, so anything normalize() records on it
# would be missing from the saved file.
# NOTE(review): assumes normalize() keeps its scaling state on `pp` — confirm;
# if it is stateless, this ordering is equivalent to the previous one.
data_dict = pp.normalize(data_dict, method="minmax")
pp.save(params["save_path"])
# Cut the normalized series into sliding windows; every entry of `params` is
# forwarded as a keyword argument as well.
window_dict = data_preprocess.generate_windows(data_dict, data_hdf5_path=params["path"], **params)

2021-02-22 00:18:47,242 P25707 INFO Saving preprocessor into ./checkpoints/20210222-001846/preprocessor.pkl
2021-02-22 00:18:47,244 P25707 INFO Normalizing data
2021-02-22 00:18:47,261 P25707 INFO Generating sliding windows (size 32).
2021-02-22 00:18:47,367 P25707 INFO Train windows #: (4732, 32, 38)
2021-02-22 00:18:47,368 P25707 INFO Test windows #: (23657, 32, 38)


In [142]:
# Sanity check: (max, min) of the train split followed by (max, min) of the test split.
train_split, test_split = data_dict["train"], data_dict["test"]
(train_split.max(), train_split.min(), test_split.max(), test_split.min())

(1.0, 0.0, 1.0, 0.0)

In [143]:
# Inspect the shape of the training windows returned by generate_windows.
window_dict["train_windows"].shape

(4732, 32, 38)

In [144]:
# Batch iterators over the sliding windows: training batches are shuffled,
# test batches keep their original order.
batch_size = params["batch_size"]
train_iterator = WindowIterator(
    window_dict["train_windows"], batch_size=batch_size, shuffle=True
)
test_iterator = WindowIterator(
    window_dict["test_windows"], batch_size=batch_size, shuffle=False
)

# Record the dimension count reported by the dataset in the shared params dict.
params['in_channels'] = data_dict["dim"]

In [145]:
# Override nb_steps in the shared params dict; the same dict is unpacked again
# into encoder.fit(...) later, so this setting reaches training as well.
params["nb_steps"] = 200
# Build the model from the full parameter set (in_channels included).
encoder = MultiLSTMEncoder(**params)

2021-02-22 00:18:47,963 P25707 INFO Compiling finished.


In [146]:
# training: fit on the shuffled training iterator, passing the test loader
# without labels, then persist the encoder once fit() returns.
eval_kwargs = dict(test_iterator=test_iterator.loader, test_labels=None)
encoder.fit(train_iterator, **eval_kwargs, **params)
encoder.save_encoder()

2021-02-22 00:18:48,133 P25707 INFO Start training for 74 batches.
2021-02-22 00:18:48,361 P25707 INFO Epoch: 1, loss: 54.57195
2021-02-22 00:18:48,544 P25707 INFO Epoch: 2, loss: 16.00438
2021-02-22 00:18:48,758 P25707 INFO Epoch: 3, loss: 13.09840
2021-02-22 00:18:48,951 P25707 INFO Epoch: 4, loss: 12.01773
2021-02-22 00:18:49,128 P25707 INFO Epoch: 5, loss: 11.38449
2021-02-22 00:18:49,307 P25707 INFO Epoch: 6, loss: 10.88530
2021-02-22 00:18:49,493 P25707 INFO Epoch: 7, loss: 10.61105
2021-02-22 00:18:49,671 P25707 INFO Epoch: 8, loss: 10.26685
2021-02-22 00:18:49,853 P25707 INFO Epoch: 9, loss: 9.95500
2021-02-22 00:18:50,034 P25707 INFO Epoch: 10, loss: 9.77768
2021-02-22 00:18:50,218 P25707 INFO Epoch: 11, loss: 9.53253
2021-02-22 00:18:50,445 P25707 INFO Epoch: 12, loss: 9.36402
2021-02-22 00:18:50,654 P25707 INFO Epoch: 13, loss: 9.28466
2021-02-22 00:18:50,883 P25707 INFO Epoch: 14, loss: 9.26840
2021-02-22 00:18:51,134 P25707 INFO Epoch: 15, loss: 8.99486
2021-02-22 00:18:51

KeyboardInterrupt: 

In [None]:
# Run the encoder over a loader without gradient tracking and collect its
# reconstructions next to the targets it returns, then score the test set.

# Unshuffled iterator over the training windows. It is not used by the
# evaluation below (which runs on the test windows, matching the labels and
# scores used afterwards); point `loader` at
# train_iterator_non_shuffle.loader to inspect the training set instead.
train_iterator_non_shuffle = WindowIterator(window_dict["train_windows"],
                                            batch_size=params["batch_size"], shuffle=False)
loader = test_iterator.loader

encoder = encoder.eval()
recst_list = []
real_list = []
loss = 0
with torch.no_grad():
    for batch in loader:
        batch = batch.to(encoder.device)
        return_dict = encoder(batch)
        recst = return_dict["recst"]  # reconstruction returned by the model
        recst_list.append(recst)
        real_list.append(return_dict["y"])  # target returned alongside it
        loss += return_dict["loss"].item()
# Mean of the per-batch losses.
print(loss / len(loader))

# Concatenate the per-batch outputs and drop singleton dimensions.
recst_list = torch.cat(recst_list).squeeze()
real_list = torch.cat(real_list).squeeze()
# Signed reconstruction error passed through a sigmoid; this one stays a torch
# tensor, while the two arrays below are converted to NumPy for plotting.
diff_list = (recst_list - real_list).sigmoid()
recst_list = recst_list.cpu().numpy()
real_list = real_list.cpu().numpy()

# Score the test windows against their labels and search for a threshold.
score_dict = encoder.score(test_iterator.loader, window_dict["test_labels"])
best_f1, best_theta, best_adjust, best_raw = iter_thresholds(
    score_dict["score"], score_dict["anomaly_label"]
)
# Quick visual checks (disabled):
# plt.plot(best_raw, "r")
# plt.plot(best_adjust + 0.1, "b")
# plt.plot(score_dict["anomaly_label"]+0.3, "g")
# plt.plot(score_dict["score"], "g")

In [None]:
# Shapes of the reconstruction, target, error and label arrays, one per line.
for collected in (recst_list, real_list, diff_list, window_dict["test_labels"]):
    print(collected.shape)
# Value ranges of the targets and of the reconstructions.
print(real_list.min(), real_list.max(), recst_list.min(), recst_list.max())

In [None]:
# One panel per selected dimension (reconstruction vs. real values), followed
# by a final panel with the entity-level score, the threshold and the labels.
dims = [9,10,13,14]
fig, ax = plt.subplots(nrows=len(dims)+1, sharey=False, figsize=(12,12))

for panel, dim in zip(ax, dims):
    panel.plot(recst_list[0:, dim].reshape(-1), 'r', label='recst')
    panel.plot(real_list[0:, dim].reshape(-1), 'b', label='real')
    panel.legend(loc='best')

# Score and threshold are shifted by 0.5 and scaled by 10 for display; the
# labels are scaled by 0.3.
summary_panel = ax[-1]
scores = score_dict["score"]
summary_panel.plot((scores-0.5) * 10, label='Entity error score')
summary_panel.plot([(best_theta-0.5)*10] * len(scores), label='Threshold')
summary_panel.plot(window_dict["test_labels"][0:, -1].reshape(-1)*0.3, 'brown', label='Label')

plt.legend()
plt.suptitle("Result of the autoencoder")
plt.show()

In [29]:
# Column names for the results log; because names= is given without header=,
# pandas reads the first line of the file as data rather than as a header.
names=["time", "expid", "dataset", "auc", "f1", "f1a"]
# The file is tab-separated despite its .csv extension.
df = pd.read_csv("./experiment_results.csv", sep="\t", names=names)

In [30]:
# Each "f1a" entry is split on "-" and its second field is parsed as a float;
# the cell displays the mean of those values.
f1a_values = df["f1a"].map(lambda cell: float(cell.split("-")[1]))
f1a_values.mean()

0.9235357142857143