In [15]:

import numpy as np
import pandas as pd
import data_loader
import modules
import time
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch._C import device
import utils
import eval_methods

device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")

# Customized Dataset Setting

In [16]:
# customized dataset structure
'''
You need to define x_train, x_test, y_test variable as followed with ndarray format.
x_train: train data (usually without any label)
x_test: test data
y_test: label for test data. (1: anomaly, 0: normal)
'''

# We use one of the entity in SMD dataset for customized setting example.
PATH = os.getcwd() + "\\data/\\"
dataset = data_loader.dataset_choice("SMD", path = PATH)
data = next(iter(dataset))

x_train = data.x_train
x_test = data.x_test
y_test = data.y_test

x_dim = x_train.shape[1] # dimension of given time-series data

This is multi-entity dataset.


In [17]:
# data shape
print("train data shape: ", x_train.shape)
print("test data shape: ", x_test.shape)
print("label data shape: ", y_test.shape)

train data shape:  (28479, 38)
test data shape:  (28479, 38)
label data shape:  (28479,)


# Proposed Model (INRAD)

## temporal encoding

In [18]:
# If acutal timestamps are not available, you can arbitrarily make timestamps as belowed. 
train_timestamp = None

# For the detailed understanding, please refer to the attached technical appendix pdf file.

# default start: 2021-01-01 00:00:00
# default interval unit : 1 minute

# making timestamps for train set
if train_timestamp is None:
    train_timestamps = modules.timestamp_maker(
        len(x_train) + 1,
    )
# '+1' is needed for setting start timestamp for test set  

# making timestamps for test set
test_timestamps = modules.timestamp_maker(
                        len(x_test), start=train_timestamps[-1], unit="1 min"
                    )

In [19]:
# train data timestamps
print(train_timestamps[:-1])

DatetimeIndex(['2021-01-01 00:00:00', '2021-01-01 00:01:00',
               '2021-01-01 00:02:00', '2021-01-01 00:03:00',
               '2021-01-01 00:04:00', '2021-01-01 00:05:00',
               '2021-01-01 00:06:00', '2021-01-01 00:07:00',
               '2021-01-01 00:08:00', '2021-01-01 00:09:00',
               ...
               '2021-01-20 18:29:00', '2021-01-20 18:30:00',
               '2021-01-20 18:31:00', '2021-01-20 18:32:00',
               '2021-01-20 18:33:00', '2021-01-20 18:34:00',
               '2021-01-20 18:35:00', '2021-01-20 18:36:00',
               '2021-01-20 18:37:00', '2021-01-20 18:38:00'],
              dtype='datetime64[ns]', length=28479, freq='T')


In [20]:
# test data timestamps
print(test_timestamps)

DatetimeIndex(['2021-01-20 18:39:00', '2021-01-20 18:40:00',
               '2021-01-20 18:41:00', '2021-01-20 18:42:00',
               '2021-01-20 18:43:00', '2021-01-20 18:44:00',
               '2021-01-20 18:45:00', '2021-01-20 18:46:00',
               '2021-01-20 18:47:00', '2021-01-20 18:48:00',
               ...
               '2021-02-09 13:08:00', '2021-02-09 13:09:00',
               '2021-02-09 13:10:00', '2021-02-09 13:11:00',
               '2021-02-09 13:12:00', '2021-02-09 13:13:00',
               '2021-02-09 13:14:00', '2021-02-09 13:15:00',
               '2021-02-09 13:16:00', '2021-02-09 13:17:00'],
              dtype='datetime64[ns]', length=28479, freq='T')


In [21]:
# temporal encoding
train_encoded_input = modules.temporal_encoding(train_timestamps[:-1])
test_encoded_input = modules.temporal_encoding(test_timestamps)

In [22]:
# encoded time (input of our method) for train set
print(train_encoded_input)

tensor([[-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000],
        [-1.0000, -1.0000, -1.0000, -1.0000, -0.9661, -1.0000],
        [-1.0000, -1.0000, -1.0000, -1.0000, -0.9322, -1.0000],
        ...,
        [-1.0000, -1.0000,  0.2667,  0.5652,  0.2203, -1.0000],
        [-1.0000, -1.0000,  0.2667,  0.5652,  0.2542, -1.0000],
        [-1.0000, -1.0000,  0.2667,  0.5652,  0.2881, -1.0000]])


In [23]:
# encoded time (input of our method) for test set
print(test_encoded_input)

tensor([[-1.0000, -1.0000,  0.2667,  0.5652,  0.3220, -1.0000],
        [-1.0000, -1.0000,  0.2667,  0.5652,  0.3559, -1.0000],
        [-1.0000, -1.0000,  0.2667,  0.5652,  0.3898, -1.0000],
        ...,
        [-1.0000, -0.8182, -0.4667,  0.1304, -0.4915, -1.0000],
        [-1.0000, -0.8182, -0.4667,  0.1304, -0.4576, -1.0000],
        [-1.0000, -0.8182, -0.4667,  0.1304, -0.4237, -1.0000]])


## Implicit Neural Representation model

In [24]:
# Hyperparameters 
# We fix these across all datasets
hidden_dim=256
batch_size=131072 # 2^17 (full batch as long as memory capacity allows)
epochs=1 # For simplicity, we set it as 1, however originally we set it as 10000
earlystopping_patience=30
first_omega_0=3000

# Model initialization

model = modules.Siren(
    in_features=train_encoded_input.shape[1],
    out_features=x_dim,
    hidden_features=hidden_dim,
    hidden_layers=3,
    first_omega_0=first_omega_0,
    outermost_linear=True,
)
model.to(device)

optim = torch.optim.Adam(lr=1e-4, params=model.parameters())

In [25]:
# Implicit Nerual Representation Learning on train set.

data_train = modules.Timedata(x_train, train_encoded_input)
train_dataloader = DataLoader(
    data_train,
    shuffle=True,
    batch_size=batch_size,
    pin_memory=True,
    num_workers=0,
)

early_stopping = utils.EarlyStopping(
    patience=earlystopping_patience, verbose=False
)

epoch_time = []
for step in range(epochs):
    epoch_start = time.time()
    model_loss = 0
    for batch_model_input, batch_ground_truth in train_dataloader:
        batch_model_input = batch_model_input.to(device)
        batch_ground_truth = batch_ground_truth.to(device)

        batch_model_output, _ = model(batch_model_input)
        loss = F.mse_loss(batch_model_output, batch_ground_truth)
        optim.zero_grad()
        loss.backward()
        optim.step()
        model_loss += loss.item()
        batch_model_input = batch_model_input.detach().cpu()
        batch_ground_truth = batch_ground_truth.detach().cpu()
    epoch_time.append(time.time() - epoch_start)
    early_stopping(model_loss)
    if early_stopping.early_stop:
        break
    
print("average training time per epoch: ", np.mean(epoch_time))
    



average training time per epoch:  0.277463436126709


In [26]:
# Implicit Nerual Representation Learning on test set (re_training). For adopting variants of our method, INRAD-c, you can directly start this phase without using train set.

data_test = modules.Timedata(x_test, test_encoded_input)
test_dataloader = DataLoader(
    data_test,
    shuffle=True,
    batch_size=batch_size,
    pin_memory=True,
    num_workers=0,
)

early_stopping = utils.EarlyStopping(
    patience=earlystopping_patience, verbose=False
)

print("re-training start")
for step in range(epochs):
    epoch_start = time.time()
    model_loss = 0
    for batch_model_input, batch_ground_truth in train_dataloader:
        batch_model_input = batch_model_input.to(device)
        batch_ground_truth = batch_ground_truth.to(device)
        batch_model_output, _ = model(batch_model_input)
        loss = F.mse_loss(batch_model_output, batch_ground_truth)
        optim.zero_grad()
        loss.backward()
        optim.step()
        model_loss += loss.item()
        batch_model_input = batch_model_input.detach().cpu()
        batch_ground_truth = batch_ground_truth.detach().cpu()
    early_stopping(model_loss)
    if early_stopping.early_stop:
        break
print("re-training end")

re-training start
re-training end


In [27]:
# anomaly score calculation
total_input = data_test.timepoints
model = model.cpu()
total_ground_truth = data_test.data_ready
total_model_output, _ = model(total_input)

anomaly_score = np.mean(
    np.abs(
        np.squeeze(
            total_ground_truth.numpy()
            - total_model_output.detach().cpu().numpy()
        )
    ),
    axis=1,
)
# The larger the anomaly score is, the higher possiblity of abnormal status is.

In [28]:
# Evaluation based on Best F1-score 
# Note that for simplicity, we set number of epoch as 1.

accuracy, threshold = eval_methods.bf_search(anomaly_score, y_test, verbose = False)
print("Precision: {}, Recall {}, F1-score: {}".format(accuracy[1], accuracy[2], accuracy[0]))

Precision: 0.9262323304852383, Recall 0.9974016295567868, F1-score: 0.9604954502640649
