In [1]:
# In[] # import libraries 
import gc
import pickle
import random
import time

import numpy as np
import pandas as pd
import pyarrow
import pyarrow.parquet as pq
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

from train import whole_process_training_single_iter, whole_process_training, whole_Network_training
from utils_model import display_metrics
from utils_process import peaks_on_flatten, choose_chunk_peak, calculate_50hz_fourier_coefficient, process_measurement, create_global_features

In [2]:
# In[] # settings
# --- locations ---
data_dir = 'vsb-power-line-fault-detection/'   # directory train.parquet is read from
input_dir = 'processed_input/'                 # directory for cached intermediate results

# --- which stages to recompute ---
preprocessed = True         # True: load cached waveforms / global features instead of recomputing them
recalculate_peaks = False   # True: recompute signal_peaks.pkl during preprocessing instead of loading it

# --- chunking ---
signal_len = 800000         # length of one signal (presumably in samples)
window_size = 4000          # chunk length handed to choose_chunk_peak; sizes tried: 2000, 4000, 5000, 8000
nchunks = signal_len // window_size   # number of chunks per signal; also part of cache/result names

In [3]:
# In[] metadata
# Load the metadata table and keep its signal_id column as an array; both are
# consumed by the preprocessing and training cells further down.
metadata_path = 'metadata_train.csv'
meta_df = pd.read_csv(metadata_path)
signal_ids = meta_df.loc[:, 'signal_id'].values

# STEP 1 Preprocessing & Feature Extraction

In [4]:
# =============================================================================
# ############ STEP 1 Preprocessing & Feature Extraction ######################
# =============================================================================
# Produces the two inputs used by the training cells:
#   waveforms - result of choose_chunk_peak for the configured window_size
#   X_global  - global feature table, indexed by id_measurement
# With preprocessed=True both are loaded from input_dir; otherwise they are
# recomputed from train.parquet and the cache files are rewritten.
#
# NOTE: pickle.load can execute arbitrary code contained in the file it reads,
# so the .dat/.pkl caches in input_dir must come from a trusted source.
waveform_cache = input_dir + 'all_chunk_waves_{}chunks.dat'.format(nchunks)
global_feature_cache = input_dir + 'global_features.csv'

if not preprocessed:
    signal_df = pq.read_pandas(data_dir + 'train.parquet').to_pandas()
    _, all_flat_signals, all_points = peaks_on_flatten(signal_df, signal_ids)

    ##### STEP 1A. Denoise & Extract Waveforms #####
    # construct waveforms with given window size
    waveforms = choose_chunk_peak(all_flat_signals, all_points, window_size=window_size)
    print(waveforms.shape)
    # The context manager flushes and closes the cache file deterministically.
    with open(waveform_cache, 'wb') as cache_file:
        pickle.dump(waveforms, cache_file)

    ##### STEP 1B. Extract Global Features #####
    if recalculate_peaks:
        signal_fft = calculate_50hz_fourier_coefficient(signal_df.values)
        signal_peaks = process_measurement(signal_df, meta_df, signal_fft)
        signal_peaks.to_pickle(input_dir + 'signal_peaks.pkl')
        # signal_fft is only an intermediate result; release it before continuing.
        del signal_fft
        gc.collect()
    else:
        signal_peaks = pd.read_pickle(input_dir + 'signal_peaks.pkl')
    # Attach the measurement id and target from the metadata to every signal row.
    signal_peaks = pd.merge(signal_peaks, meta_df[['signal_id', 'id_measurement', 'target']], on='signal_id', how='left')

    ### load KMeans results
    # The stored models were originally fitted as follows:
    # x, x_A, x_B, x_C = pickle.load(open('waves_list.dat','rb'))
    # kmeans = KMeans(n_clusters=15, random_state=9, init='k-means++').fit(x)
    # kmeans_A = KMeans(n_clusters=6, random_state=9, init='k-means++').fit(x_A)
    # kmeans_B = KMeans(n_clusters=6, random_state=9, init='k-means++').fit(x_B)
    # kmeans_C = KMeans(n_clusters=6, random_state=9, init='k-means++').fit(x_C)
    fitted_kmeans = []
    for kmeans_file in ('kmeans.dat', 'kmeans_A.dat', 'kmeans_B.dat', 'kmeans_C.dat'):
        with open(input_dir + kmeans_file, 'rb') as model_file:
            fitted_kmeans.append(pickle.load(model_file))
    kmeans, kmeans_A, kmeans_B, kmeans_C = fitted_kmeans

    X_global = create_global_features(meta_df, signal_peaks, kmeans, kmeans_A, kmeans_B, kmeans_C)
    # NOTE(review): to_csv also writes the frame's index, while the cached branch
    # below reads the file back without index_col, so the two branches may end up
    # with different columns - confirm against create_global_features' output.
    X_global.to_csv(global_feature_cache)

else:
    X_global = pd.read_csv(global_feature_cache)
    with open(waveform_cache, 'rb') as cache_file:
        waveforms = pickle.load(cache_file)

# Index by measurement id; rebinding (rather than inplace=True) keeps the cell
# free of in-place mutation.
X_global = X_global.set_index('id_measurement')

# STEP 2 Model Training

In [5]:
# Seed Python's, NumPy's and TensorFlow's random number generators with one value.
seed = 0
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(seed)

# In[] parameters
# Values below are forwarded as keyword arguments to the training calls later on.

# --- run bookkeeping ---
loss_name = 'weighted_bce'
output_folder = 'results_{}chunks_{}'.format(nchunks, loss_name)   # where prediction CSVs are kept
pretrained = True
predict = True

# --- network settings (grouping is presumed from the names - TODO confirm) ---
NN_level = 'signal'
NN_model = 'LSTM'
NN_pretrained = True
NN_batch_size = 512
Dense_layers = 2
layer_idx = 5
dropout = 0.4
regularizer = 'l2'
from_logits = True
monitor = 'val_loss'
n_epochs = 100
weights_dict = None

# --- classifier / feature selection ---
classifier = 'XGboost'
classifier_level = 'measurement'
feature_set = 'global'
load_local_features = True   # `local_features` itself is passed explicitly by each run

# --- cross-validation ---
num_folds = 5
num_iterations = 25
kfold_random_state = 123948

## LMFE

In [6]:
########### LMFE ########### 
_, best_proba_LMFE, metrics_LMFE, test_pred_LMFE = whole_process_training(meta_df, waveforms, X_global,
    local_features=True, NN_level=NN_level, NN_model=NN_model,
    Dense_layers=Dense_layers, NN_pretrained=NN_pretrained, layer_idx=layer_idx, NN_batch_size=NN_batch_size, 
    output_folder=output_folder, classifier=classifier, classifier_level=classifier_level, num_folds=num_folds,
    num_iterations=num_iterations, feature_set=feature_set, kfold_random_state=kfold_random_state, 
    load_local_features=load_local_features, pretrained=pretrained, predict=predict, early_stopping_rounds=100, 
    verbose_eval=0, weights_dict=weights_dict, monitor=monitor, dropout=dropout, regularizer=regularizer, 
    loss_name=loss_name, from_logits=from_logits, n_epochs=n_epochs)
# test_pred_LMFE.to_csv(output_folder + '/test_pred_LMFE.csv')
metrics_LMFE = display_metrics(test_pred_LMFE)

100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:22<00:00,  1.13it/s]


Best Probability Threshold based on validation set: 0.404
MCC Training: 0.946
MCC Validation: 0.727
MCC Test: 0.730
For best probability thresholded: 0.404,
         mcc:0.7302598775889093, precision:0.7115716753022453, recall:0.7847619047619048, f1:0.746376811594203, acc:0.967860422405877, roc_auc:0.8821818562529445,average_precision:0.5713849585281522



  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


## Only global-scale features

In [7]:
########### Only global-scale features ###########
# The three prediction files scored below were written by earlier runs of
#   _, best_proba_<name>, metrics_<name>, test_pred_<name> = whole_process_training(
#       meta_df, waveforms, X_global, local_features=False, classifier='LightGBM',
#       feature_set=<value noted per section>, ...)
# with every remaining keyword argument identical to the LMFE run, followed by
#   test_pred_<name>.to_csv(output_folder + '/test_pred_<name>.csv')
# Here only the stored predictions are loaded and scored again.

def _score_cached_predictions(csv_name):
    """Read a stored prediction CSV from output_folder and score it.

    csv_name is appended verbatim to output_folder. Returns the loaded
    DataFrame together with whatever display_metrics returns for it.
    """
    cached_pred = pd.read_csv(output_folder + csv_name)
    return cached_pred, display_metrics(cached_pred)

###### all global-scale features (feature_set='global')
test_pred_global, metrics_global = _score_cached_predictions('/test_pred_global.csv')

###### only phase-level features (feature_set='phase_level')
test_pred_phase, metrics_phase = _score_cached_predictions('/test_pred_phase.csv')

###### only measurement-level features (feature_set='measurement_level')
test_pred_measure, metrics_measure = _score_cached_predictions('/test_pred_measure.csv')

## Only local-scale features

In [8]:
########### Only local-scale features ###########
# The stored predictions were produced by this (disabled) network-only run:
# _, best_proba_RNN, metrics_RNN, test_pred_RNN, _ = whole_Network_training(
#     meta_df, waveforms,
#     NN_level=NN_level, NN_model=NN_model, Dense_layers=Dense_layers,
#     NN_pretrained=NN_pretrained, layer_idx=layer_idx, NN_batch_size=NN_batch_size,
#     indice_level=classifier_level, output_folder=output_folder,
#     kfold_random_state=kfold_random_state, num_folds=num_folds,
#     num_iterations=num_iterations, predict=predict, monitor=monitor,
#     dropout=dropout, regularizer=regularizer, from_logits=from_logits,
#     loss_name=loss_name, extract_attention_weights=False)
# test_pred_RNN.to_csv(output_folder + '/test_pred_RNN.csv')

# Load the stored test-set predictions and score them instead of retraining.
rnn_pred_csv = output_folder + '/test_pred_RNN.csv'
test_pred_RNN = pd.read_csv(rnn_pred_csv)
metrics_RNN = display_metrics(test_pred_RNN)

# Performance Comparison

In [9]:
# In[] display the performance
# One row per approach (in the order the cells above scored them), one column per metric.
method_names = ['LMFE', 'Global-scale', 'Phase', 'Measurement', 'RNN']
metric_names = ['MCC', 'Precision', 'Recall', 'F1 Score', 'AUC']
all_metrics = np.array([metrics_LMFE, metrics_global, metrics_phase, metrics_measure, metrics_RNN])
df_res = pd.DataFrame(all_metrics, index=method_names, columns=metric_names)
print(df_res)

                   MCC  Precision    Recall  F1 Score       AUC
LMFE          0.730260   0.711572  0.784762  0.746377  0.882182
Global-scale  0.705548   0.735700  0.710476  0.722868  0.847054
Phase         0.708669   0.712454  0.740952  0.726424  0.860888
Measurement   0.706481   0.796296  0.655238  0.718913  0.822245
RNN           0.679368   0.632970  0.775238  0.696918  0.873206
