# 0. Загрузка библиотек

In [1]:
import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn import model_selection, linear_model, metrics
from sklearn.preprocessing import StandardScaler

plt.rcParams.update({'figure.max_open_warning': 0})
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
import joblib
import pickle
import tensorflow as tf

2024-09-02 00:04:21.950810: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-02 00:04:21.950880: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-02 00:04:21.952678: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
import tensorflow as tf
print("GPU Available:", tf.test.is_gpu_available())

GPU Available: False


In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2550405252319849391
xla_global_id: -1
]


# 1. Загрузка данных

In [4]:
# загрузка тестовой выборки
with open("/kaggle/input/ml-cource-cifrum-anomaly-public/test.pkl", "rb") as f:
    list_of_df = pickle.load(f)


In [5]:
len(pd.concat(list_of_df))

37401

# 2. Подготовка данных

In [6]:
for ind in range(len(list_of_df)):
    # признаки, имеющие физический смысл
    # мощность
    list_of_df[ind]["Power"] = list_of_df[ind]["Current"] * list_of_df[ind]["Voltage"] 
    # отношение расхода к мощности
    list_of_df[ind]["Power_flow_rate"] = list_of_df[ind]["Volume Flow RateRMS"] / list_of_df[ind]["Power"]
    # разница температур (если признаки скоррелированы и с одним из них что-то происходит, то покажет наличие аномалии)
    list_of_df[ind]["Temperature_diff"] = list_of_df[ind]['Temperature'] - list_of_df[ind]['Thermocouple'] 
    # разница акселлерометров (если признаки скоррелированы и с одним из них что-то происходит, то покажет наличие аномалии)
    list_of_df[ind]["Accel_diff"] = list_of_df[ind]['Accelerometer1RMS'] - list_of_df[ind]['Accelerometer2RMS']
    
    # сглаживания
    list_of_df[ind]["Volume Flow RateRMS_10mean"] = list_of_df[ind]["Volume Flow RateRMS"].rolling(window = 10, min_periods=0).mean()
    
    # list_of_df[ind] = list_of_df[ind].drop(['Thermocouple', 'Accelerometer2RMS'], axis=1)
    # out_std = list_of_df[ind].std()
    # out_mean = list_of_df[ind].mean()
    # list_of_df[ind] = (list_of_df[ind] - out_mean) / out_std
   

# 3. Инициализация и обучение модели

In [7]:
# from cnn_ae import Conv_AE

In [8]:
from tensorflow.keras.layers import Input, Conv1D, Dropout, Conv1DTranspose
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, History
from tensorflow.keras.models import load_model, save_model

import tensorflow as tf
import os
import random
import numpy as np

In [9]:
class Conv_AE_6:     
    def __init__(self):
        self._Random(0)
        
    def _Random(self, seed_value):      
        os.environ['PYTHONHASHSEED'] = str(seed_value)
        random.seed(seed_value)
        np.random.seed(seed_value)
        tf.random.set_seed(seed_value)
        
    def _build_model(self):
        
        model = Sequential(
            [
                Input(shape=(self.shape[1], self.shape[2])),
                Conv1D(
                    filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                Dropout(rate=0.2),
                Conv1D(
                    filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                Dropout(rate=0.2),
                Conv1D(
                    filters=8, kernel_size=7, padding="same", strides=1, activation="relu"
                ),
                Conv1DTranspose(
                    filters=8, kernel_size=7, padding="same", strides=1, activation="relu"
                ),
                Dropout(rate=0.2),
                Conv1DTranspose(
                    filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                Conv1DTranspose(
                    filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                Conv1DTranspose(filters=1, kernel_size=7, padding="same"),
            ]
        )
        model.compile(optimizer=Adam(learning_rate=0.001), loss="mse")
        
        return model
    
    def fit(self, data, validation_split=0.1, epochs=40, verbose=0, shuffle=True, batch_size = 32):      
        self.shape = data.shape
        self.model = self._build_model()
        
        history = History()
        
        return self.model.fit(
            data,
            data,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            verbose=verbose,
            callbacks=[
                EarlyStopping(monitor="val_loss", patience=5, mode="min", verbose=0),history
            ],
        )

    def predict(self, data):       
        return self.model.predict(data)

In [16]:
from tensorflow.keras.layers import BatchNormalization

In [17]:
class Conv_AE_12:     
    def __init__(self):
        self._Random(0)
        
    def _Random(self, seed_value):      
        os.environ['PYTHONHASHSEED'] = str(seed_value)
        random.seed(seed_value)
        np.random.seed(seed_value)
        tf.random.set_seed(seed_value)
        
    def _build_model(self):
        
        model = Sequential(
            [
                Input(shape=(self.shape[1], self.shape[2])),
                Conv1D(
                    filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                BatchNormalization(),
                # Dropout(rate=0.2),
                Conv1D(
                    filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                Dropout(rate=0.2),
                Conv1D(
                    filters=8, kernel_size=7, padding="same", strides=1, activation="relu"
                ),
                Conv1DTranspose(
                    filters=8, kernel_size=7, padding="same", strides=1, activation="relu"
                ),
                Dropout(rate=0.2),
                Conv1DTranspose(
                    filters=16, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                Conv1DTranspose(
                    filters=32, kernel_size=7, padding="same", strides=2, activation="relu"
                ),
                Conv1DTranspose(filters=1, kernel_size=7, padding="same"),
            ]
        )
        model.compile(optimizer=Adam(learning_rate=0.001), loss="mse")
        
        return model
    
    def fit(self, data, validation_split=0.1, epochs=40, verbose=0, shuffle=True, batch_size = 32):      
        self.shape = data.shape
        self.model = self._build_model()
        
        history = History()
        
        return self.model.fit(
            data,
            data,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            verbose=verbose,
            callbacks=[
                EarlyStopping(monitor="val_loss", patience=5, mode="min", verbose=0),history
            ],
        )

    def predict(self, data):       
        return self.model.predict(data)

In [10]:
EPOCHS = 10
BATCH_SIZE = 32 # test 10
# BATCH_SIZE = 16 # test 12 #64
VAL_SPLIT = 0.1
N_STEPS = 60 # test 10
# N_STEPS = 100 # test 11
Q = 0.999

In [11]:
# функция для генерации выборок для обучения
def create_sequences(values, time_steps=N_STEPS):
    output = []
    for i in range(len(values) - time_steps + 1):
        output.append(values[i : (i + time_steps)])
    return np.stack(output)

In [18]:
%%time
# инференс
predicted_outlier, predicted_cp = [], []
with tf.device("/device:GPU:0"):
    for df in list_of_df:
        X_train = df[:400]

        # инициализация и обучение нормализатора данных
        StSc = StandardScaler()
        StSc.fit(X_train)

        # масштабирование данных и генерация выборок для обучения
        X = create_sequences(StSc.transform(X_train), N_STEPS)

        # инициализация модели
        # model = Conv_AE_6()
        model = Conv_AE_12()
        # обучение модели
        model.fit(X)

        # прогноз на обучающей выборке и выбор порога
        residuals = pd.Series(np.sum(np.mean(np.abs(X - model.predict(X)), axis=1), axis=1))
        # UCL = residuals.quantile(Q) * 1.1 # test 6
        # UCL = residuals.quantile(Q) # test 7
        # UCL = residuals.quantile(Q) * 0.9 # test 8
        # UCL = residuals.quantile(Q) * 1.2 # test 9
        UCL = residuals.quantile(Q) * 1.15 # test 10, 12

        # прогноз на всей выборке и поиск аномалий по порогу
        X = create_sequences(StSc.transform(df), N_STEPS)
        cnn_residuals = pd.Series(np.sum(np.mean(np.abs(X - model.predict(X)), axis=1), axis=1))
        
        anomalous_data = cnn_residuals > UCL
        anomalous_data_indices = []
        for data_idx in range(N_STEPS - 1, len(X) - N_STEPS + 1):
            if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
                anomalous_data_indices.append(data_idx)

        prediction = pd.Series(data=0, index=df.index)
        prediction.iloc[anomalous_data_indices] = 1

        # сохранение результатов предсказания аномалий
        predicted_outlier.append(prediction)

        # сохранение предсказания точек изменения состояния
        prediction_cp = abs(prediction.diff())
        prediction_cp[0] = prediction[0]
        predicted_cp.append(prediction_cp)

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

# 4. Формирование и сохранение предсказания моделью

In [19]:
# формирование вектора предсказаний по всем экспериментам
pred = pd.concat(predicted_outlier)

# формирование массива для загрузки на платформу
y_pred=pd.Series(pred, name = "anomaly")
y_pred.index = np.arange(0, y_pred.shape[0])

y_pred = y_pred.reset_index()
y_pred.columns = ["ID", "anomaly"]
y_pred.to_csv("predict.csv", index = False)

In [20]:
# отображение созданного массива с предсказанием
y_pred

Unnamed: 0,ID,anomaly
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
37396,37396,0
37397,37397,0
37398,37398,0
37399,37399,0


In [21]:
y_pred["anomaly"].value_counts()

anomaly
0    24470
1    12931
Name: count, dtype: int64