# Metrics

Evaluates a trained model accordingly to the metrics specified on the paper

In [17]:
import sys
sys.path.append("../src/")

import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, multilabel_confusion_matrix 
from sklearn.model_selection import train_test_split
from ModelHandler import ModelHandler
import pickle
import h5py
from sklearn.metrics import f1_score, precision_score, recall_score     
from tqdm import tqdm

configs = {
    "N_GRIDS": 5, 
    "SIGNAL_BASE_LENGTH": 12800, 
    "N_CLASS": 26, 
    "USE_NO_LOAD": False, 
    "AUGMENTATION_RATIO": 5, 
    "MARGIN_RATIO": 0.15, 
    "DATASET_PATH": "../Synthetic_Full_iHall.hdf5",
    "TRAIN_SIZE": 0.8,
    "FOLDER_PATH": "../TrainedWeights/Final/", 
    "FOLDER_DATA_PATH": "../TrainedWeights/Final/", 
    "N_EPOCHS_TRAINING": 250,
    "INITIAL_EPOCH": 0,
    "TOTAL_MAX_EPOCHS": 250,
    "SNRdb": None # Noise level on db
}


folderPath = configs["FOLDER_PATH"]
folderDataPath = configs["FOLDER_DATA_PATH"]
signalBaseLength = configs["SIGNAL_BASE_LENGTH"]
ngrids = configs["N_GRIDS"]
trainSize = configs["TRAIN_SIZE"]

dict_data = pickle.load(open(folderDataPath + "sorted_aug_data_" + str(ngrids) + "_" + str(signalBaseLength) + ".p", "rb")) # Load data
x_train = dict_data["x_train"]
x_test = dict_data["x_test"]
y_train = dict_data["y_train"]
y_test = dict_data["y_test"]

## Evaluates the identification

This step generates a dict with the ground truth and the prediction for each test example

In [12]:
from tqdm import tqdm

X_all = np.vstack((x_train, x_test))
ydet_all = np.vstack((y_train["detection"], y_test["detection"]))
ytype_all = np.vstack((y_train["type"], y_test["type"]))
yclass_all = np.vstack((y_train["classification"], y_test["classification"]))

final_acc_on, final_acc_off, final_acc = [], [], []
y = {}
for fold in tqdm(range(1, 11)):
    foldFolderPath = folderPath + str(fold) + "/"
    
    train_index = np.load(foldFolderPath + "train_index.npy")
    test_index = np.load(foldFolderPath + "test_index.npy")

    bestModel = ModelHandler.loadModel(foldFolderPath + "best_model.h5", type_weights=None) # Load model

    x_train = X_all[train_index]
    x_test = X_all[test_index]
    y_train["detection"] = ydet_all[train_index]
    y_test["detection"] = ydet_all[test_index]
    y_train["type"] = ytype_all[train_index]
    y_test["type"] = ytype_all[test_index]
    y_train["classification"] = yclass_all[train_index]
    y_test["classification"] = yclass_all[test_index]

    final_prediction = []
    final_groundTruth = []
    for xi, yclass, ytype in zip(x_test, y_test["classification"], y_test["type"]):
        pred = bestModel.predict(np.expand_dims(xi, axis=0))
        prediction = np.max(pred[2][0],axis=0)
        groundTruth = np.max(yclass,axis=0)

        final_prediction.append(prediction)
        final_groundTruth.append(groundTruth) 

        del xi, yclass, ytype

    y[fold] = {}
    y[fold]["true"] = final_groundTruth.copy()
    y[fold]["pred"] = final_prediction.copy()

100%|██████████| 10/10 [14:23<00:00, 86.34s/it]


### F1 Score

#### F1 Macro:
$$
\begin{gather*}
F1_{Macro} = \frac{1}{Y} \sum_{i=1}^{Y} \frac{2 \cdot tp_i}{2 \cdot tp_i + fp_i + fn_i}
\end{gather*}
$$

#### F1 Micro:
$$
\begin{gather*}
F1_{Micro} = \frac{2 \cdot \sum_{i=1}^{Y} tp_i}{\sum_{i=1}^{Y} 2 \cdot tp_i + fp_i + fn_i}
\end{gather*}
$$

- $tp_i$: True positives classifications for appliance $i$
- $fp_i$: False positives classifications for appliance $i$
- $fn_i$: False negatives classifications for appliance $i$

In [13]:
from sklearn.metrics import f1_score, precision_score, recall_score   
from PostProcessing import PostProcessing

postProcessing = PostProcessing(configs=configs)

threshold = 0.5
f1_macro, f1_micro = [], []
for fold in range(1, 11):
    f1_macro.append(f1_score(np.array(y[fold]["true"]) > threshold, np.array(y[fold]["pred"]) > threshold, average='macro'))
    f1_micro.append(f1_score(np.array(y[fold]["true"]) > threshold, np.array(y[fold]["pred"]) > threshold, average='micro'))
    print(f"Fold {fold}: F1 Macro: {f1_macro[-1] * 100:.1f}, F1 Micro: {f1_micro[-1] * 100:.1f}")

print(f"Average: F1 Macro: {np.average(f1_macro) * 100:.1f}, F1 Micro: {np.average(f1_micro) * 100:.1f}")

Fold 1: F1 Macro: 96.9, F1 Micro: 97.3
Fold 2: F1 Macro: 96.6, F1 Micro: 96.3
Fold 3: F1 Macro: 96.5, F1 Micro: 96.5
Fold 4: F1 Macro: 97.1, F1 Micro: 97.1
Fold 5: F1 Macro: 97.8, F1 Micro: 97.4
Fold 6: F1 Macro: 96.8, F1 Micro: 97.2
Fold 7: F1 Macro: 96.8, F1 Micro: 96.6
Fold 8: F1 Macro: 96.5, F1 Micro: 96.5
Fold 9: F1 Macro: 97.6, F1 Micro: 97.1
Fold 10: F1 Macro: 95.5, F1 Micro: 96.1
Average: F1 Macro: 96.8, F1 Micro: 96.8


### Accuracy (ACC)

$$
\begin{gather*}
ACC_i = \frac{CCE_i}{TNE_i} \\ \\
ACC = \frac{1}{Y} \sum_{i = 1}^{Y} ACC_i
\end{gather*}
$$

- $ACC_i$: Accuracy for appliance $i$
- $CCE_i$: Load connected successfully identified
- $TNE_i$: Total of connected events

In [15]:
threshold = 0.5

ytype_all = np.vstack((y_train["type"], y_test["type"]))

acc_on, acc_off, acc_no_event, acc_total = [], [], [], []
for fold in range(1, 11):
    correct_on = np.zeros((26,1))
    total_on = np.zeros((26,1))
    correct_off = np.zeros((26,1))
    total_off = np.zeros((26,1))
    correct_no_event = np.zeros((26,1))
    total_no_event = np.zeros((26,1))

    train_index = np.load(folderPath + str(fold) + "/train_index.npy")
    test_index = np.load(folderPath + str(fold) + "/test_index.npy")

    ytype_train = ytype_all[train_index]
    ytype_test = ytype_all[test_index]

    for ytype, ytrue, ypred in zip(ytype_test, y[fold]["true"], y[fold]["pred"]):
        event_type = np.min(np.argmax(ytype, axis=1))
        if event_type == 0:
            correct_on[np.bitwise_and(ytrue > threshold, ypred > threshold)] += 1
            total_on[ytrue > threshold] += 1
        elif event_type == 1:
            correct_off[np.bitwise_and(ytrue > threshold, ypred > threshold)] += 1
            total_off[ytrue > threshold] += 1
        else:
            correct_no_event[np.bitwise_and(ytrue > threshold, ypred > threshold)] += 1
            total_no_event[ytrue > threshold] += 1
    
    acc_on.append(100 * np.average(np.nan_to_num(correct_on/total_on)))
    acc_off.append(100 * np.average(np.nan_to_num(correct_off/total_off)))
    acc_no_event.append(100 * np.average(np.nan_to_num(correct_no_event/total_no_event)))
    acc_total.append(100 * np.average(np.nan_to_num((correct_on + correct_off + correct_no_event)/(total_on + total_off + total_no_event))))

    print(f"Fold {fold}, Acc on: {acc_on[-1]:.1f}, Acc off: {acc_off[-1]:.1f}, Acc no event: {acc_no_event[-1]:.1f} Acc total: {acc_total[-1]:.1f}")

print(f"Total, Acc on: {np.average(acc_on):.1f}, Acc off: {np.average(acc_off):.1f}, Acc no event: {np.average(acc_no_event):.1f}, Acc total: {np.average(acc_total):.1f}")

Fold 1, Acc on: 99.2, Acc off: 97.6, Acc no event: 95.4 Acc total: 96.7
Fold 2, Acc on: 95.7, Acc off: 93.8, Acc no event: 97.4 Acc total: 97.0
Fold 3, Acc on: 95.6, Acc off: 97.6, Acc no event: 96.9 Acc total: 96.9
Fold 4, Acc on: 97.3, Acc off: 96.8, Acc no event: 97.1 Acc total: 97.1
Fold 5, Acc on: 99.0, Acc off: 97.9, Acc no event: 98.5 Acc total: 98.5
  acc_off.append(100 * np.average(np.nan_to_num(correct_off/total_off)))
Fold 6, Acc on: 97.3, Acc off: 97.5, Acc no event: 96.7 Acc total: 97.0
Fold 7, Acc on: 97.8, Acc off: 97.6, Acc no event: 96.8 Acc total: 97.1
Fold 8, Acc on: 95.8, Acc off: 94.7, Acc no event: 97.7 Acc total: 96.7
Fold 9, Acc on: 96.9, Acc off: 94.9, Acc no event: 97.6 Acc total: 97.7
Fold 10, Acc on: 95.6, Acc off: 92.4, Acc no event: 96.3 Acc total: 95.8
Total, Acc on: 97.0, Acc off: 96.1, Acc no event: 97.0, Acc total: 97.0


## Detection Metrics

### D
$$
\begin{gather*}
D = \frac{ \sum_{i=1}^{A} |d(i) - ev(i)|}{A}
\end{gather*}
$$

- `A`: Total of events correctly detected ($\pm$ 10 semi cycles tolerance)
- `d(i)`: Detection for appliance $i$
- `ev(i)`: Ground truth detection for appliance $i$

## PC

$$
\begin{gather*}
PC = \frac{A}{N}
\end{gather*}
$$

- `A`: Total of events correctly detected ($\pm$ 10 semi cycles tolerance)
- `N`: Total of events

In [19]:
from PostProcessing import PostProcessing
from DataHandler import DataHandler

postProcessing = PostProcessing(configs=configs)
dataHandler = DataHandler(configs=configs)

group_distribution = {
    "1": 4139,
    "2": 6916,
    "3": 7128,
    "8": 2629
}

general_qtd_train, general_qtd_test = dataHandler.generateAcquisitionType(trainSize, distribution=group_distribution)
X_all = np.vstack((x_train, x_test))
ydet_all = np.vstack((y_train["detection"], y_test["detection"]))
ytype_all = np.vstack((y_train["type"], y_test["type"]))
yclass_all = np.vstack((y_train["classification"], y_test["classification"]))

general_qtd = np.vstack((np.expand_dims(general_qtd_train, axis=1), np.expand_dims(general_qtd_test, axis=1)))

pcMetric, dMetric = [], []
for fold in range(1, 11):
    foldFolderPath = folderPath + str(fold) + "/"
    
    train_index = np.load(foldFolderPath + "train_index.npy")
    test_index = np.load(foldFolderPath + "test_index.npy")

    bestModel = ModelHandler.loadModel(foldFolderPath + "best_model.h5", type_weights=None) # Load model

    x_train = X_all[train_index]
    x_test = X_all[test_index]
    y_train["detection"] = ydet_all[train_index]
    y_test["detection"] = ydet_all[test_index]
    y_train["type"] = ytype_all[train_index]
    y_test["type"] = ytype_all[test_index]
    y_train["classification"] = yclass_all[train_index]
    y_test["classification"] = yclass_all[test_index]

    general_qtd_test = general_qtd[test_index]

    print(f"-------------- FOLD {fold} ---------------")
    pcMetric_fold, dMetric_fold = postProcessing.checkModel(bestModel, x_test, y_test, general_qtd=general_qtd_test, print_error=False)
    pcMetric.append(pcMetric_fold)
    dMetric.append(dMetric_fold)

print("------------ AVERAGE --------------")
avgPCMetric = np.average(pcMetric, axis=0) * 100
avgDMetric = np.average(dMetric, axis=0)
for i, subset in enumerate(["1", "2", "3", "8", "All"]):
    print(f"Average, LIT-SYN-{subset}, PCMetric - On: {avgPCMetric[i][0]:.1f}, Off: {avgPCMetric[i][1]:.1f}, Total: {avgPCMetric[i][2]:.1f}")
    print(f"Average, LIT-SYN-{subset}, DMetric - On: {avgDMetric[i][0]:.1f}, Off: {avgDMetric[i][1]:.1f}, Total: {avgDMetric[i][2]:.1f}")

-------------- FOLD 1 ---------------
Total time: 108.65421447705921, Average Time: 0.05208735113952982
LIT-SYN-1 PCmetric: (1.0, 1.0, 1.0)
LIT-SYN-1 Dmetric: (0.7837837837837838, 0.71875, 0.7536231884057971)
LIT-SYN-2 PCmetric: (0.991304347826087, 0.9791666666666666, 0.9845559845559846)
LIT-SYN-2 Dmetric: (0.7456140350877193, 0.6808510638297872, 0.7098039215686275)
LIT-SYN-3 PCmetric: (0.9870967741935484, 0.9840425531914894, 0.9854227405247813)
LIT-SYN-3 Dmetric: (0.8431372549019608, 0.8324324324324325, 0.8372781065088757)
LIT-SYN-8 PCmetric: (0.9142857142857143, 0.9324324324324325, 0.9236111111111112)
LIT-SYN-8 Dmetric: (1.25, 1.1014492753623188, 1.1729323308270676)
LIT-SYN-All PCmetric: (0.9761273209549072, 0.9748858447488584, 0.9754601226993865)
LIT-SYN-All Dmetric: (0.8777173913043478, 0.8173302107728337, 0.8452830188679246)
-------------- FOLD 2 ---------------
Total time: 85.39059834195359, Average Time: 0.04081768563190898
LIT-SYN-1 PCmetric: (0.96875, 1.0, 0.9855072463768116)
