In [None]:
import torch
from torch import nn
from statistics import mean

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from himodule.custom_classes import NasaDataset, WindowedLoader, CAE
from himodule.normalisation import StandardScaler, MinMaxScaler, ErrorScaler
from himodule.secondary_funcs import save_object, load_object, check_path, split_dataset, \
    seed_everything, split_anomaly_normal, split_anomaly_normal23

from collections import defaultdict

import os

# Configure seaborn's global theme (white grid background, fonts scaled by 1.2)
# for any plots produced later.
sns.set_theme(style='whitegrid', font_scale=1.2)

In [None]:
# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f'{device=}')

In [None]:
def calculate_window_size(window_size: int, kernel: int, stride: int):
    """Return how many positions a sliding window of length ``kernel`` can take.

    The window is moved over ``window_size`` elements in steps of ``stride``
    with no padding, so the result is ``floor((window_size - kernel) / stride) + 1``.

    Args:
        window_size: Length of the input along the sliding axis.
        kernel: Length of the sliding window.
        stride: Step between consecutive window positions.

    Returns:
        The length of the output along the sliding axis.
    """
    # Number of whole strides that still fit after placing the first window.
    whole_steps = (window_size - kernel) // stride
    return whole_steps + 1

def calculate_kernel_size(new_window_size: int, last_window_size: int, stride: int):
    """Return the kernel length that maps ``new_window_size`` back to ``last_window_size``.

    Solves ``last_window_size = (new_window_size - 1) * stride + kernel`` for
    ``kernel``.

    Args:
        new_window_size: Current (shorter) length along the sliding axis.
        last_window_size: Target (longer) length to be restored.
        stride: Step between consecutive window positions.

    Returns:
        The kernel length satisfying the relation above.
    """
    # Span covered by the strides alone, before the kernel's own length is added.
    stride_span = stride * (new_window_size - 1)
    return last_window_size - stride_span

In [None]:
seed = 37
batch_size = 20
window_size = 20

# Load the full train and test datasets.
train_dataset = NasaDataset('../datasets/clean_train_data.csv')
test_dataset = NasaDataset('../datasets/clean_test_data.csv')

# Load the previously saved scaler. The path is defined once and reused so the
# variable and the file actually loaded can never drift apart.
scaler_path = '../scalers/MinMaxScaler.pkl'
scaler = load_object(scaler_path)

# Short scaler name: first space-separated token of the repr, then the part
# after its last dot. str.split with an explicit separator always returns at
# least one element, so neither index can raise and no IndexError fallback is
# needed here.
norm_name = repr(scaler).split(' ', maxsplit=2)[0].split('.')[-1]

# Move each dataset to the selected device, then scale it with the loaded scaler.
train_dataset.to(device)
train_dataset.dataset = scaler.transform(train_dataset.dataset)

test_dataset.to(device)
test_dataset.dataset = scaler.transform(test_dataset.dataset)

# Re-seed before building each loader so both start from the same RNG state.
seed_everything(seed)
train_loader = WindowedLoader(train_dataset, batch_size=batch_size, window_size=window_size, for_conv=True)

seed_everything(seed)
test_loader = WindowedLoader(test_dataset, batch_size=batch_size, window_size=window_size, for_conv=True)

print(f'Train: {len(train_dataset)}\nTest: {len(test_dataset)}')

input_shape = train_dataset.get_input_shape()

# Model creating
params_dct = {
    'conv_kernel': 3,
    'conv_stride': 1,
    'pool_kernel': 2,
    'pool_stride': 1,
    'unconv_stride': 1
}

conv_dct = {'kernel': params_dct['conv_kernel'], 'stride': params_dct['conv_stride']}
pool_dct = {'kernel': params_dct['pool_kernel'], 'stride': params_dct['pool_stride']}

# Track the window length after each of the four stages (conv, pool, conv, pool).
window_sizes = [window_size]
for dct in (conv_dct, pool_dct, conv_dct, pool_dct):
    window_sizes.append(calculate_window_size(window_sizes[-1], **dct))

layers_sizes = (input_shape*2,
                input_shape*4,
                window_sizes[-1]*input_shape*4,
                window_sizes[-1]*input_shape*4 // 8,
                window_sizes[-1]*input_shape*4 // 16)

models_path = f'../Models/cae/{len(layers_sizes)+1}'

# Walk back from the final window length to the length before each conv+pool
# pair (window_sizes[-3::-2] picks indices 2 and 0) and compute the kernel that
# restores it in a single step.
kernels = list()
new_window_size = window_sizes[-1]
for last_window_size in window_sizes[-3::-2]:
    kernels.append(calculate_kernel_size(new_window_size, last_window_size, params_dct['unconv_stride']))
    new_window_size = last_window_size
params_dct.update({'unconv_kernels': kernels})

seed_everything(seed)
model_cae = CAE(input_channels=input_shape, layers=layers_sizes, **params_dct).to(device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
state_dict = torch.load(os.path.join(models_path, f'{window_size}.pth'), map_location=device)
model_cae.load_state_dict(state_dict)

# The model is only used for inference below: switch to evaluation mode so
# train-only behaviour (e.g. dropout, batch-norm statistics updates) is
# disabled. This is a no-op for layers without a train/eval distinction.
model_cae.eval()

# Element-wise loss (no reduction) so the error can be aggregated per sample later.
loss_func = nn.MSELoss(reduction='none')

In [None]:
def get_losses(loader: WindowedLoader, model_cae: CAE, loss_func, window_size: int, input_shape: int) -> dict:
    """Compute the mean reconstruction loss for every sample index in ``loader``.

    Each batch must provide ``'sensors'`` (the model input) and ``'indeces'``
    (one index per loss value produced below). The element-wise loss is viewed
    as ``(-1, input_shape, window_size)`` and averaged over the ``input_shape``
    axis, which yields one value per window position. An index that occurs in
    several windows gets the mean of all its values.

    Args:
        loader: Iterable of batches (dicts with ``'sensors'`` and ``'indeces'``).
        model_cae: Callable returning a pair whose second item is the reconstruction.
        loss_func: Element-wise loss, e.g. ``nn.MSELoss(reduction='none')``.
        window_size: Window length used to reshape the loss.
        input_shape: Number of input channels used to reshape the loss.

    Returns:
        A plain ``dict`` mapping index -> mean loss, in order of first appearance.
    """
    flatten = nn.Flatten()  # built once instead of once per batch
    collected = defaultdict(list)

    with torch.no_grad():
        for batch in loader:
            sample = batch['sensors']
            indeces = batch['indeces'].flatten()
            _, reconstruction = model_cae(sample)

            # Flatten both sides so the loss works even if their layouts differ,
            # then restore the (batch, channel, window) view to average channels.
            loss = loss_func(flatten(reconstruction), flatten(sample))
            loss = loss.view(-1, input_shape, window_size).mean(dim=1).flatten()

            # One bulk conversion per tensor instead of an .item() call
            # (and a possible device sync) for every element.
            for idx, value in zip(indeces.tolist(), loss.tolist()):
                collected[idx].append(value)

    # Return a plain dict: a defaultdict would silently insert an empty list
    # for any unknown index looked up by the caller.
    return {idx: mean(values) for idx, values in collected.items()}

In [None]:
train_losses = get_losses(train_loader, model_cae, loss_func,
                          window_size=window_size, input_shape=input_shape)
test_losses = get_losses(test_loader, model_cae, loss_func,
                         window_size=window_size, input_shape=input_shape)

# The tensors are later indexed by dataset index, so position i must hold the
# loss of index i. Build them in sorted-key order instead of relying on the
# order in which the loader happened to yield the indices.
# NOTE(review): this still assumes the keys are contiguous 0..N-1 — confirm.
train_losses = torch.FloatTensor([train_losses[idx] for idx in sorted(train_losses)])
test_losses = torch.FloatTensor([test_losses[idx] for idx in sorted(test_losses)])

losses = {'train': train_losses, 'test': test_losses}

In [None]:
arrays_path = '../HIs/cae/'

# Run check_path on one sub-directory per sample type before anything is written.
for key in losses:
    check_path(os.path.join(arrays_path, key))

error_scaler = ErrorScaler()

# losses and the dataset tuple are paired positionally.
for sample_type, dataset in zip(losses, (train_dataset, test_dataset)):
    loss = losses[sample_type]
    for machine_id in dataset.machine_ids.unique():
        indeces = dataset.get_indeces(machine_id)

        # Scale this machine's losses on their own and dump the raw array to
        # "<arrays_path>/<sample_type>/<machine_id>.dat".
        machine_loss = loss[indeces.cpu()]
        arr = error_scaler.fit_transform(machine_loss).numpy()
        target_file = os.path.join(arrays_path, sample_type, f'{int(machine_id)}.dat')
        arr.tofile(target_file)