In [None]:
# In[] # import libraries 
import gc
import pickle
import time

import pandas as pd
import pyarrow
import pyarrow.parquet as pq
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

from train import whole_process_training_single_iter, whole_process_training, whole_Network_training
from utils_process import peaks_on_flatten, choose_chunk_peak, calculate_50hz_fourier_coefficient, process_measurement, create_global_features

In [None]:
# In[] # settings 
# --- where cached artefacts live, and whether to reuse them -------------------
data_dir = 'processed_input/'
# True  -> STEP 1 only loads previously saved files from `data_dir`.
# False -> STEP 1 recomputes waveforms and global features from the raw parquet.
preprocessed = True
# Only consulted when `preprocessed` is False: recompute the peak table instead
# of loading the pickled one.
recalculate_peaks = False 

# --- chunking -----------------------------------------------------------------
signal_len = 800000
# Candidate window sizes noted by the original author: [2000, 4000, 5000, 8000]
window_size = 4000
# Number of whole windows that fit into one signal.
nchunks = signal_len // window_size

In [None]:
# In[] parameters
# Every name defined in this cell is forwarded as a keyword argument of the
# same name to `whole_process_training` in STEP 2; `load_local_features` is
# additionally read in STEP 1 to decide whether waveforms must be loaded.

# --- loss / output location ---------------------------------------------------
loss_name = 'weighted_bce'
from_logits = True
weights_dict = None 
# Result folder name encodes the chunk count and the loss in use.
output_folder = 'results_{}chunks_{}'.format(nchunks, loss_name)

# --- local (per-waveform) feature handling ------------------------------------
local_features = True 
load_local_features = True 

# --- neural-network options (presumably consumed by the NN stage; confirm in train.py)
NN_level = 'signal'
NN_model = 'LSTM'
NN_pretrained = True 
NN_batch_size = 512 
Dense_layers = 2
layer_idx = 5 
dropout = 0.4 
regularizer = 'l2'
monitor = 'val_loss'
n_epochs = 100

# --- classifier / cross-validation options ------------------------------------
classifier = 'XGboost'
classifier_level = 'measurement'
feature_set = 'global'
num_folds = 5 
num_iterations = 25 
kfold_random_state = 123948

# --- run-mode switches --------------------------------------------------------
pretrained = True 
predict = True 

In [None]:
# In[] metadata
# The metadata CSV is read from the current working directory (not `data_dir`).
metadata_path = 'metadata_train.csv'
meta_df = pd.read_csv(metadata_path)
# Raw array of the `signal_id` column, handed to `peaks_on_flatten` in STEP 1.
signal_ids = meta_df.loc[:, 'signal_id'].values

In [None]:
# =============================================================================
# ############ STEP 1 Preprocessing & Feature Extraction ######################
# =============================================================================
# Produces two objects for STEP 2:
#   * `waveforms` - chunked waveforms (or None when local features are loaded
#                   from disk downstream), and
#   * `X_global`  - global feature table indexed by `id_measurement`.
# When `preprocessed` is False everything is recomputed from the raw parquet
# and cached under `data_dir`; otherwise the cached files are loaded.


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only use this on files produced by this project.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


if not preprocessed:
    signal_df = pq.read_pandas(data_dir + 'train.parquet').to_pandas()
    _, all_flat_signals, all_points = peaks_on_flatten(signal_df, signal_ids)

    ##### STEP 1A. Denoise & Extract Waveforms #####
    # construct waveforms with given window size 
    waveforms = choose_chunk_peak(all_flat_signals, all_points, window_size=window_size)
    print(waveforms.shape)
    # Cache the waveforms; the context manager guarantees the file is flushed and closed.
    with open(data_dir + 'all_chunk_waves_{}chunks.dat'.format(nchunks), 'wb') as fh:
        pickle.dump(waveforms, fh)

    ##### STEP 1B. Extract Global Features #####
    if recalculate_peaks:
        signal_fft = calculate_50hz_fourier_coefficient(signal_df.values)
        signal_peaks = process_measurement(signal_df, meta_df, signal_fft)
        # Write to the same location the `else` branch reads from, so that a
        # recalculation actually refreshes the cache used by later runs.
        signal_peaks.to_pickle(data_dir + 'signal_peaks.pkl')
        # Release the FFT result before building the feature table.
        del signal_fft
        gc.collect()
    else:
        signal_peaks = pd.read_pickle(data_dir + 'signal_peaks.pkl')
    # Attach measurement id and target to each peak row; the left join keeps every peak row.
    signal_peaks = pd.merge(signal_peaks, meta_df[['signal_id', 'id_measurement', 'target']], on='signal_id', how='left')

    ### load KMeans results
    # Kept for reference - how the original author fitted the models
    # (presumably the ones cached below; confirm):
    # x, x_A, x_B, x_C = pickle.load(open('waves_list.dat','rb'))
    # kmeans = KMeans(n_clusters=15, random_state=9, init='k-means++').fit(x)
    # kmeans_A = KMeans(n_clusters=6, random_state=9, init='k-means++').fit(x_A)
    # kmeans_B = KMeans(n_clusters=6, random_state=9, init='k-means++').fit(x_B)
    # kmeans_C = KMeans(n_clusters=6, random_state=9, init='k-means++').fit(x_C)
    kmeans = _load_pickle(data_dir + 'kmeans.dat')
    kmeans_A = _load_pickle(data_dir + 'kmeans_A.dat')
    kmeans_B = _load_pickle(data_dir + 'kmeans_B.dat')
    kmeans_C = _load_pickle(data_dir + 'kmeans_C.dat')

    X_global = create_global_features(meta_df, signal_peaks, kmeans, kmeans_A, kmeans_B, kmeans_C)
    # NOTE(review): to_csv writes the index as an extra leading column by default,
    # so the frame read back in the `else` branch may carry one more column than
    # this in-memory one - confirm both paths give STEP 2 the same columns.
    X_global.to_csv(data_dir + 'global_features.csv')

else:
    X_global = pd.read_csv(data_dir + 'global_features.csv')
    if not load_local_features:
        waveforms = _load_pickle(data_dir + 'all_chunk_waves_{}chunks.dat'.format(nchunks))
    else:
        # Waveforms are not needed here when local features are loaded from disk.
        waveforms = None 

# Both paths end with the feature table keyed by measurement id.
X_global = X_global.set_index('id_measurement')

In [None]:
# =============================================================================
# ###################### STEP 2 Model Training ################################
# =============================================================================
import random
import numpy as np
import tensorflow as tf

# Seed Python, NumPy and TensorFlow (in that order) with the same value.
seed = 0
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(seed)

# Keyword arguments for the training entry point. All values come from the
# parameters cell except the two literals `early_stopping_rounds` and
# `verbose_eval`, which are fixed here.
training_kwargs = dict(
    local_features=local_features,
    NN_level=NN_level,
    NN_model=NN_model,
    Dense_layers=Dense_layers,
    NN_pretrained=NN_pretrained,
    layer_idx=layer_idx,
    NN_batch_size=NN_batch_size,
    output_folder=output_folder,
    classifier=classifier,
    classifier_level=classifier_level,
    num_folds=num_folds,
    num_iterations=num_iterations,
    feature_set=feature_set,
    kfold_random_state=kfold_random_state,
    load_local_features=load_local_features,
    pretrained=pretrained,
    predict=predict,
    early_stopping_rounds=100,
    verbose_eval=0,
    weights_dict=weights_dict,
    monitor=monitor,
    dropout=dropout,
    regularizer=regularizer,
    loss_name=loss_name,
    from_logits=from_logits,
    n_epochs=n_epochs,
)

training_result = whole_process_training(meta_df, waveforms, X_global, **training_kwargs)
# The first returned element is discarded; the remaining three are kept for inspection.
_, best_proba, metrics, test_pred = training_result
