### Meeting a Sayed Athar's request, I'm using the Kernel altered by Khoi Nguyen to explain how the whole code works.
### If any part is not clear, please comment.  
### Please upvote if it was helpful.

In [1]:
import pandas as pd
import pyarrow.parquet as pq # Used to read the data
import os 
import numpy as np
from keras.layers import * # Keras is the most friendly Neural Network library, this Kernel use a lot of layers classes
from keras.models import Model
from tqdm import tqdm # Processing time measurement
from sklearn.model_selection import train_test_split 
from keras import backend as K # The backend give us access to tensorflow operations and allow us to create the Attention class
from keras import optimizers # Allow us to access the Adam class to modify some parameters
from sklearn.model_selection import GridSearchCV, StratifiedKFold # Used to use Kfold to train our model
from keras.callbacks import * # This object helps the model to train in a smarter way, avoiding overfitting

from scipy.signal import chirp, find_peaks, peak_widths
import pywt

from multiprocessing import Pool
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import warnings

warnings.filterwarnings('ignore')

import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# select how many folds will be created
N_SPLITS = 5
# it is just a constant with the measurements data size
sample_size = 800000

In [3]:
def matthews_correlation(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.
    '''
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())

In [4]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        
        x = K.concatenate([weighted_input, x], axis=2)
        return x

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[1], self.features_dim*2

In [5]:
# just load train data
df_train = pd.read_csv('../input/metadata_train.csv')
# set index, it makes the data access much faster
df_train = df_train.set_index(['id_measurement', 'phase'])
df_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,signal_id,target
id_measurement,phase,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
0,1,1,0
0,2,2,0
1,0,3,1
1,1,4,1


In [6]:
# in other notebook I have extracted the min and max values from the train data, the measurements
max_num = 127
min_num = -128

In [7]:
# This function standardize the data from (-128 to 127) to (-1 to 1)
# Theoretically it helps in the NN Model training, but I didn't tested without it
def min_max_transf(ts, min_data, max_data, range_needed=(-1,1)):
    if min_data < 0:
        ts_std = (ts + abs(min_data)) / (max_data + abs(min_data))
    else:
        ts_std = (ts - min_data) / (max_data - min_data)
    if range_needed[0] < 0:    
        return ts_std * (range_needed[1] + abs(range_needed[0])) + range_needed[0]
    else:
        return ts_std * (range_needed[1] - range_needed[0]) + range_needed[0]

In [8]:
def maddest(d, axis=None):
    """
    Mean Absolute Deviation
    """
    return np.mean(np.absolute(d - np.mean(d, axis)), axis)

In [9]:
def denoise_signal( x, wavelet='db4', level=1):
    """
    1. Adapted from waveletSmooth function found here:
    http://connor-johnson.com/2016/01/24/using-pywavelets-to-remove-high-frequency-noise/
    2. Threshold equation and using hard mode in threshold as mentioned
    in section '3.2 denoising based on optimized singular values' from paper by Tomas Vantuch:
    http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    """
    
    # Decompose to get the wavelet coefficients
    coeff = pywt.wavedec( x, wavelet, mode="per", level=level)
    
    # Calculate sigma for threshold as defined in http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    # As noted by @harshit92 MAD referred to in the paper is Mean Absolute Deviation not Median Absolute Deviation
    sigma = (1/0.6745) * maddest( coeff[-level] )

    # Calculte the univeral threshold
    uthresh = sigma * np.sqrt( 2*np.log( len( x ) ) )
    coeff[1:] = ( pywt.threshold( i, value=uthresh, mode='hard' ) for i in coeff[1:] )
    
    # Reconstruct the signal using the thresholded coefficients
    return pywt.waverec( coeff[1:], wavelet, mode='per' )

In [10]:
def denoise_signal_2( x, wavelet='db4', level=1):
    """
    1. Adapted from waveletSmooth function found here:
    http://connor-johnson.com/2016/01/24/using-pywavelets-to-remove-high-frequency-noise/
    2. Threshold equation and using hard mode in threshold as mentioned
    in section '3.2 denoising based on optimized singular values' from paper by Tomas Vantuch:
    http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    """
    
    # Decompose to get the wavelet coefficients
    coeff = pywt.wavedec( x, wavelet, mode="per", level=level)
    
    # Calculate sigma for threshold as defined in http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    # As noted by @harshit92 MAD referred to in the paper is Mean Absolute Deviation not Median Absolute Deviation
    sigma = (1/0.6745) * maddest( coeff[-level] )

    # Calculte the univeral threshold
    uthresh = sigma * np.sqrt( 2*np.log( len( x ) ) )
    coeff[1:] = ( pywt.threshold( i, value=uthresh, mode='hard' ) for i in coeff[1:] )
    
    # Reconstruct the signal using the thresholded coefficients
    return pywt.waverec( coeff[0:], wavelet, mode='per' )

In [11]:
def remove_corona(x_dn, maxDistance=10, maxHeightRatio=0.25, maxTicksRemoval=500):
    index = pd.Series(x_dn).loc[np.abs(x_dn)>0].index
    corona_idx = []
    for idx in index:
        for i in range(1,maxDistance+1):
            if idx+i < pd.Series(x_dn).shape[0]:
                if x_dn[idx+i]/(x_dn[idx]+1e-04)<-maxHeightRatio:
                    x_dn[idx:idx+maxTicksRemoval] = 0
                    corona_idx.append(idx)
    return x_dn, corona_idx

In [12]:
# This is one of the most important peace of code of this Kernel
# Any power line contain 3 phases of 800000 measurements, or 2.4 millions data 
# It would be praticaly impossible to build a NN with an input of that size
# The ideia here is to reduce it each phase to a matrix of <n_dim> bins by n features
# Each bean is a set of 5000 measurements (800000 / 160), so the features are extracted from this 5000 chunk data.
def transform_ts(ts, n_dim=160, min_max=(-1,1)):
    # convert data into -1 to 1
    ts_std = min_max_transf(ts, min_data=min_num, max_data=max_num)
    # bucket or chunk size, 5000 in this case (800000 / 160)
    bucket_size = int(sample_size / n_dim)
    # new_ts will be the container of the new data
    
    new_ts = []
    # this for iteract any chunk/bucket until reach the whole sample_size (800000)
    for i in range(0, sample_size, bucket_size):
        # cut each bucket to ts_range
        ts_range = ts_std[i:i + bucket_size]
        
        # calculate each feature
        mean = ts_range.mean()
        std = ts_range.std() # standard deviation
        std_top = mean + std # I have to test it more, but is is like a band
        std_bot = mean - std
        # I think that the percentiles are very important, it is like a distribuiton analysis from eath chunk
        percentil_calc = np.percentile(ts_range, [0, 1, 25, 50, 75, 99, 100]) 
        max_range = percentil_calc[-1] - percentil_calc[0] # this is the amplitude of the chunk
        relative_percentile = percentil_calc - mean # maybe it could heap to understand the asymmetry
                
        
        
        feat_array = np.asarray([mean, std, std_top, std_bot, max_range], dtype=np.float32)
        
        new_ts.append(np.concatenate([feat_array
                                      , percentil_calc, relative_percentile]))
    
        
    return new_ts

In [13]:
# this function take a piece of data and convert using transform_ts(), but it does to each of the 3 phases
# if we would try to do in one time, could exceed the RAM Memmory
def prep_data(start, end):
    # load a piece of data from file
    praq_train = pq.read_pandas('../input/train.parquet', columns=[str(i) for i in range(start, end)]).to_pandas()
    praq_train_2 = praq_train*-1
    X = []
    y = []
    # using tdqm to evaluate processing time
    # takes each index from df_train and iteract it from start to end
    # it is divided by 3 because for each id_measurement there are 3 id_signal, and the start/end parameters are id_signal
    for id_measurement in tqdm(df_train.index.levels[0].unique()[int(start/3):int(end/3)]):
        X_signal = []
        # for each phase of the signal
        for phase in [0,1,2]:
            # extract from df_train both signal_id and target to compose the new data sets
            signal_id, target = df_train.loc[id_measurement].loc[phase]
            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)                
            # extract and transform data into sets of features
            X_signal.append(transform_ts(np.asarray(praq_train[str(signal_id)], dtype=np.float32)))
        # concatenate all the 3 phases in one matrix
        X_signal = np.concatenate(X_signal, axis=1)
        # add the data to X
        X.append(X_signal)
        
    for id_measurement in tqdm(df_train.index.levels[0].unique()[int(start/3):int(end/3)]):
        X_signal = []
        # for each phase of the signal
        for phase in [0,1,2]:
            # extract from df_train both signal_id and target to compose the new data sets
            signal_id, target = df_train.loc[id_measurement].loc[phase]
            # but just append the target one time, to not triplicate it
            if phase == 0:
                y.append(target)                
            # extract and transform data into sets of features
            X_signal.append(transform_ts(np.asarray(praq_train_2[str(signal_id)], dtype=np.float32)))
        # concatenate all the 3 phases in one matrix
        X_signal = np.concatenate(X_signal, axis=1)
        # add the data to X
        X.append(X_signal)
        
    X = np.asarray(X, dtype=np.float32)
    y = np.asarray(y, dtype=np.int32)
    return X, y

In [14]:
def process_subtrain(arg_tuple):
    start, end, idx = arg_tuple
    X, y = prep_data(start, end)
    return idx, X, y

In [15]:
# this code is very simple, divide the total size of the df_train into two sets and process it
#X = []
#y = []
all_chunks = []

num_cores = 8 
#def load_all():
total_size = len(df_train)
chunk_size = total_size/num_cores

for i in range(8):
    start_idx = int(i * chunk_size)
    end_idx = int(start_idx + chunk_size)
    chunk = (start_idx, end_idx, i)
    all_chunks.append(chunk)

pool = Pool()
results = pool.map(process_subtrain, all_chunks)    
results = sorted(results, key=lambda tup: tup[0])

X = np.concatenate([item[1] for item in results], axis=0)
y = np.concatenate([item[2] for item in results], axis=0)

#load_all()

#X = np.asarray(X)
#y = np.asarray(y)

100%|██████████| 363/363 [01:33<00:00,  3.62it/s]
100%|██████████| 363/363 [01:33<00:00,  3.76it/s]
100%|██████████| 363/363 [01:33<00:00,  3.73it/s]
100%|██████████| 363/363 [01:34<00:00,  3.63it/s]
100%|██████████| 363/363 [01:34<00:00,  3.64it/s]
100%|██████████| 363/363 [01:34<00:00,  3.67it/s]
  0%|          | 0/363 [00:00<?, ?it/s] 3.80it/s]
  0%|          | 1/363 [00:00<01:37,  3.71it/s]s]
100%|██████████| 363/363 [01:39<00:00,  3.73it/s]
100%|██████████| 363/363 [01:39<00:00,  3.79it/s]
100%|██████████| 363/363 [01:39<00:00,  3.75it/s]

 99%|█████████▉| 361/363 [01:39<00:00,  3.71it/s]
100%|██████████| 363/363 [01:39<00:00,  3.60it/s]
100%|██████████| 363/363 [01:39<00:00,  4.00it/s]
100%|██████████| 363/363 [01:39<00:00,  3.83it/s]
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Process ForkPoolWorker-8:
Process ForkPoolWorker-7:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last

In [16]:
# The X shape here is very important. It is also important undertand a little how a LSTM works
# X.shape[0] is the number of id_measuremts contained in train data
# X.shape[1] is the number of chunks resultant of the transformation, each of this date enters in the LSTM serialized
# This way the LSTM can understand the position of a data relative with other and activate a signal that needs
# a serie of inputs in a specifc order.
# X.shape[3] is the number of features multiplied by the number of phases (3)
print(X.shape, y.shape)

(5808, 160, 57) (5808,)


In [17]:
print(X.dtype, y.dtype)

float32 int32


In [18]:
# save data into file, a numpy specific format
np.save("X_8.npy",X)
np.save("y_8.npy",y)

In [48]:
X = np.load("./X.npy")
y = np.load("./y.npy")

In [19]:
class SimpleAttention(Layer):
  
    def __init__(self, depth:int, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.depth = depth
        self.q_dense_layer = Dense(depth, use_bias=False)
        self.k_dense_layer = Dense(depth, use_bias=False)
        self.v_dense_layer = Dense(depth, use_bias=False)
        self.output_dense_layer = Dense(depth, use_bias=False)
    
    def call(self, inp):
        q = self.q_dense_layer(inp)  # [batch_size, q_length, depth]
        q *= self.depth ** -0.5
        print(q.shape)
        
        k = self.k_dense_layer(inp)  # [batch_size, m_length, depth]
        v = self.v_dense_layer(inp)

        logit = tf.matmul(q, k, transpose_b=True)
        print(logit.shape)
        
        attention_weight = tf.nn.softmax(logit, name='attention_weight')
        
        attention_output = tf.matmul(attention_weight, v)  # [batch_size, q_length, depth]
        print(attention_output.shape)
        
        x = self.output_dense_layer(attention_output) + q
        print(x.shape)
        
        
        return x
    
    def compute_output_shape(self, input_shape):
        if len(input_shape)==3:
            return input_shape[0], input_shape[1], self.depth
        if len(input_shape)==4:
            return input_shape[0], input_shape[1], input_shape[2], self.depth

In [25]:
# This is NN LSTM Model creation
def model_lstm(input_shape):
    # The shape was explained above, must have this order
    inp = Input(shape=(input_shape[1], input_shape[2]))
    
    x = Bidirectional(LSTM(30, return_sequences=True))(inp)
    x = Bidirectional(LSTM(30, return_sequences=True))(x)
    
    x = Attention(input_shape[1])(x)
    x = Lambda(lambda x: K.sum(x, axis=1))(x)


    x = Activation('tanh')(x)
    # A binnary classification as this must finish with shape (1,)
    x = Dense(1, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=x)
    # Pay attention in the addition of matthews_correlation metric in the compilation, it is a success factor key
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[matthews_correlation])
    
    return model

In [None]:
# Here is where the training happens

# First, create a set of indexes of the 5 folds
splits = list(StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=2019).split(X, y))
preds_val = []
y_val = []
# Then, iteract with each fold
# If you dont know, enumerate(['a', 'b', 'c']) returns [(0, 'a'), (1, 'b'), (2, 'c')]
for idx, (train_idx, val_idx) in enumerate(splits):
    K.clear_session() # I dont know what it do, but I imagine that it "clear session" :)
    print("Beginning fold {}".format(idx+1))
    # use the indexes to extract the folds in the train and validation data
    train_X, train_y, val_X, val_y = X[train_idx], y[train_idx], X[val_idx], y[val_idx]
    # instantiate the model for this fold
    model = model_lstm(train_X.shape)
    # This checkpoint helps to avoid overfitting. It just save the weights of the model if it delivered an
    # validation matthews_correlation greater than the last one.
    ckpt = ModelCheckpoint('weights_{}.h5'.format(idx), save_best_only=True, save_weights_only=True, verbose=1, monitor='val_matthews_correlation', mode='max')
    # Train, train, train
    model.fit(train_X, train_y, batch_size=150, epochs=50, validation_data=[val_X, val_y], callbacks=[ckpt])
    # loads the best weights saved by the checkpoint
    model.load_weights('weights_{}.h5'.format(idx))
    # Add the predictions of the validation to the list preds_val
    preds_val.append(model.predict(val_X, batch_size=512))
    # and the val true y
    y_val.append(val_y)

# concatenates all and prints the shape    
preds_val = np.concatenate(preds_val)[...,0]
y_val = np.concatenate(y_val)
preds_val.shape, y_val.shape

Beginning fold 1
Train on 4645 samples, validate on 1163 samples
Epoch 1/50

Epoch 00001: val_matthews_correlation improved from -inf to 0.00000, saving model to weights_0.h5
Epoch 2/50

Epoch 00002: val_matthews_correlation did not improve from 0.00000
Epoch 3/50

Epoch 00003: val_matthews_correlation did not improve from 0.00000
Epoch 4/50

Epoch 00004: val_matthews_correlation improved from 0.00000 to 0.46281, saving model to weights_0.h5
Epoch 5/50

Epoch 00005: val_matthews_correlation improved from 0.46281 to 0.60827, saving model to weights_0.h5
Epoch 6/50

Epoch 00006: val_matthews_correlation improved from 0.60827 to 0.61714, saving model to weights_0.h5
Epoch 7/50

Epoch 00007: val_matthews_correlation did not improve from 0.61714
Epoch 8/50

Epoch 00008: val_matthews_correlation did not improve from 0.61714
Epoch 9/50

Epoch 00009: val_matthews_correlation improved from 0.61714 to 0.64456, saving model to weights_0.h5
Epoch 10/50

Epoch 00010: val_matthews_correlation did no

In [24]:
np.save('./tmp/y_val_tmp_8.npy', y_val)
np.save('./tmp/preds_val_tmp_8.npy', preds_val)

In [101]:
# It is the official metric used in this competition
# below is the declaration of a function used inside the keras model, calculation with K (keras backend / thensorflow)
def matthews_correlation(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.
    '''
    
    #y_pred = K.cast(y_pred, np.float)
    y_pred_pos = np.round(np.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = np.round(np.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = np.sum(y_pos * y_pred_pos)
    tn = np.sum(y_neg * y_pred_neg)

    fp = np.sum(y_neg * y_pred_pos)
    fn = np.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())

In [102]:
# The output of this kernel must be binary (0 or 1), but the output of the NN Model is float (0 to 1).
# So, find the best threshold to convert float to binary is crucial to the result
# this piece of code is a function that evaluates all the possible thresholds from 0 to 1 by 0.01
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in tqdm([i * 0.01 for i in range(100)]):
        #score = matthews_correlation(y_true, (y_proba > threshold).astype(int))
        #score = K.eval(matthews_correlation(y_true, (y_proba > threshold).astype(int)))
        #score = K.eval(matthews_correlation(y_true.astype(np.float64), (y_proba > threshold).astype(np.float64)))
        score = matthews_correlation(y_true.astype(np.float64), (y_proba > threshold).astype(np.float64))
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'matthews_correlation': best_score}
    return search_result

In [76]:
best_threshold = threshold_search(y_val, preds_val)['threshold']

100%|██████████| 100/100 [00:00<00:00, 7696.82it/s]


In [77]:
best_threshold

0.5

In [78]:
matthews_correlation(y_val, preds_val)

0.6883714509140777

In [79]:
threshold_search(y_val[:582], preds_val[:582])['threshold']

100%|██████████| 100/100 [00:00<00:00, 14110.83it/s]


0.51

In [103]:
threshold_search(y_val[582:582+581], preds_val[582:582+581])['threshold']

100%|██████████| 100/100 [00:00<00:00, 12112.81it/s]


0.51

In [104]:
threshold_search(y_val[582+581:582+581*2], preds_val[582+581:582+581*2])['threshold']

100%|██████████| 100/100 [00:00<00:00, 15628.81it/s]


0.5

In [105]:
threshold_search(y_val[582+581*2:582+581*2+580], preds_val[582+581*2:582+581*2+580])['threshold']

100%|██████████| 100/100 [00:00<00:00, 15360.38it/s]


0.5

In [107]:
threshold_search(y_val[582+581*2+580:], preds_val[582+581*2+580:])['threshold']

100%|██████████| 100/100 [00:00<00:00, 15026.35it/s]


0.33

In [80]:
%%time
# Now load the test data
# This first part is the meta data, not the main data, the measurements
meta_test = pd.read_csv('../input/metadata_test.csv')
df_test = meta_test.set_index(['id_measurement', 'phase'])

CPU times: user 34.2 ms, sys: 8.27 ms, total: 42.5 ms
Wall time: 71.1 ms


In [81]:
meta_test = meta_test.set_index(['signal_id'])
meta_test.head()

Unnamed: 0_level_0,id_measurement,phase
signal_id,Unnamed: 1_level_1,Unnamed: 2_level_1
8712,2904,0
8713,2904,1
8714,2904,2
8715,2905,0
8716,2905,1


In [82]:
# this function take a piece of data and convert using transform_ts(), but it does to each of the 3 phases
# if we would try to do in one time, could exceed the RAM Memmory
def prep_data_test(start, end):
    # load a piece of data from file
    praq_test = pq.read_pandas('../input/test.parquet', columns=[str(i) for i in range(start+8712, end+8712)]).to_pandas()
    X = []

    # using tdqm to evaluate processing time
    # takes each index from df_train and iteract it from start to end
    # it is divided by 3 because for each id_measurement there are 3 id_signal, and the start/end parameters are id_signal
    for id_measurement in tqdm(df_test.index.levels[0].unique()[int(start/3):int(end/3)]):
        X_signal = []
        # for each phase of the signal
        for phase in [0,1,2]:
            # extract from df_train both signal_id and target to compose the new data sets
            signal_id = df_test.loc[id_measurement].loc[phase][0]
            # but just append the target one time, to not triplicate it
            #if phase == 0:
                #ts_1 = min_max_transf(praq_test[str(signal_id)], min_data=min_num, max_data=max_num)
                #ts_2 = min_max_transf(praq_test[str(signal_id+1)], min_data=min_num, max_data=max_num)
                #ts_3 = min_max_transf(praq_test[str(signal_id+2)], min_data=min_num, max_data=max_num)
                
                #ts_wave_1 = denoise_signal(ts_1, wavelet='haar', level=1)
                #ts_wave_2 = denoise_signal(ts_2, wavelet='haar', level=1)
                #ts_wave_3 = denoise_signal(ts_3, wavelet='haar', level=1)
                
                #ts_rm_1, _ = remove_corona(ts_wave_1)
                #ts_rm_2, _ = remove_corona(ts_wave_2)
                #ts_rm_3, _ = remove_corona(ts_wave_3)
                
                #ts_sum = ts_rm_1 + ts_rm_2 + ts_rm_3
                #X_signal.append(transform_ts_sum(ts_sum))
            # extract and transform data into sets of features
            X_signal.append(transform_ts(praq_test[str(signal_id)]))
        # concatenate all the 3 phases in one matrix
        X_signal = np.concatenate(X_signal, axis=1)
        # add the data to X
        X.append(X_signal)
    X = np.asarray(X)
    return X

In [83]:
def process_subtest(arg_tuple):
    start, end, idx = arg_tuple
    X = prep_data_test(start, end)
    return idx, X

In [84]:
all_chunks = []

num_cores = 16 
#def load_all():
total_size = len(meta_test)
chunk_size = np.ceil(total_size/num_cores)
#train_size = len(df_train)

for i in range(16):
    if i != 15:
        start_idx = int(i * chunk_size)
        end_idx = int(start_idx + chunk_size)
        #chunk = (start_idx+train_size, end_idx+train_size, i)
        chunk = (start_idx, end_idx, i)
        all_chunks.append(chunk)
    else:
        start_idx = int(i * chunk_size)
        end_idx = int(total_size)
        #chunk = (start_idx+train_size, end_idx+train_size, i)
        chunk = (start_idx, end_idx, i)
        all_chunks.append(chunk)
        

In [85]:
pool = Pool()
results_1 = pool.map(process_subtest, all_chunks[0:8])    
results_1 = sorted(results_1, key=lambda tup: tup[0])

100%|██████████| 424/424 [02:49<00:00,  2.55it/s]
100%|██████████| 424/424 [02:49<00:00,  2.47it/s]
100%|██████████| 424/424 [02:50<00:00,  2.52it/s]
100%|██████████| 424/424 [02:50<00:00,  2.52it/s]
100%|██████████| 424/424 [02:50<00:00,  2.49it/s]
100%|██████████| 424/424 [02:50<00:00,  2.66it/s]
100%|██████████| 424/424 [02:50<00:00,  2.80it/s]
100%|██████████| 424/424 [02:50<00:00,  2.84it/s]
100%|██████████| 419/419 [02:45<00:00,  2.58it/s]
100%|██████████| 424/424 [02:48<00:00,  2.64it/s]
100%|██████████| 424/424 [02:48<00:00,  2.70it/s]
100%|██████████| 424/424 [02:48<00:00,  2.80it/s]
100%|██████████| 424/424 [02:48<00:00,  2.84it/s]
100%|██████████| 424/424 [02:47<00:00,  2.93it/s]
100%|██████████| 424/424 [02:48<00:00,  3.39it/s]
100%|██████████| 424/424 [02:48<00:00,  3.39it/s]


In [86]:
results_2 = pool.map(process_subtest, all_chunks[8:16])    
results_2 = sorted(results_2, key=lambda tup: tup[0])

In [87]:
results = results_1 + results_2
X_test = np.concatenate([item[1] for item in results], axis=0)

In [43]:
np.save("X_test_7.npy",X_test)

In [88]:
submission = pd.read_csv('../input/sample_submission.csv')
print(len(submission))
submission.head()

20337


Unnamed: 0,signal_id,target
0,8712,0
1,8713,0
2,8714,0
3,8715,0
4,8716,0


splits = list(StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=2019).split(X, y))

for idx, (train_idx, val_idx) in enumerate(splits):
    K.clear_session() # I dont know what it do, but I imagine that it "clear session" :)
    print("Beginning fold {}".format(idx+1))
    # use the indexes to extract the folds in the train and validation data
    train_X, train_y, val_X, val_y = X[train_idx], y[train_idx], X[val_idx], y[val_idx]
    # instantiate the model for this fold
    model = model_lstm(train_X.shape)

In [45]:
X_test[np.isnan(X_test)] = 0

In [28]:
X_test = np.load("X_test.npy")

In [35]:
X.shape

(2904, 160, 57)

In [34]:
X_test.shape

(6779, 160, 57)

In [140]:
X

array([[[ 0.14322358,  0.00713458,  0.15035816, ...,  0.00510432,
          0.02079059,  0.03647687],
        [ 0.14152163,  0.00867319,  0.15019482, ...,  0.00490825,
          0.02059452,  0.02843766],
        [ 0.1306479 ,  0.00819584,  0.13884374, ...,  0.00410981,
          0.01979609,  0.01979609],
        ...,
        [ 0.1508832 ,  0.00819278,  0.15907598, ...,  0.00390902,
          0.0195953 ,  0.02743843],
        [ 0.148993  ,  0.00857161,  0.15756461, ...,  0.00399999,
          0.01968627,  0.03537254],
        [ 0.145873  ,  0.00776367,  0.15363666, ...,  0.00493018,
          0.02061646,  0.02845959]],

       [[-0.1186494 ,  0.00703992, -0.11160948, ...,  0.00143373,
          0.01712   ,  0.06417882],
        [-0.11164077,  0.0072766 , -0.10436417, ...,  0.0018902 ,
          0.01757647,  0.10385098],
        [-0.10408157,  0.0066795 , -0.09740207, ...,  0.00229335,
          0.01797962,  0.0336659 ],
        ...,
        [-0.13483763,  0.00655647, -0.12828116, ...,  

In [139]:
X_test

array([[[ 0.12737725,  0.00671154,  0.13408879, ...,  0.00604392,
          0.01388706,  0.06878902],
        [ 0.13553725,  0.00676144,  0.14229869, ...,  0.00260078,
          0.01828706,  0.04965961],
        [ 0.14090824,  0.00665593,  0.14756416, ...,  0.00105569,
          0.01674196,  0.05595765],
        ...,
        [ 0.1080502 ,  0.00712202,  0.11517221, ...,  0.00661333,
          0.01445647,  0.03798588],
        [ 0.11318431,  0.00706231,  0.12024663, ...,  0.00566118,
          0.01350431,  0.02919059],
        [ 0.11981333,  0.00701456,  0.12682789, ...,  0.00527686,
          0.01312   ,  0.02096314]],

       [[ 0.11695686,  0.01065462,  0.12761148, ...,  0.00762824,
          0.02331451,  0.03900078],
        [ 0.12716235,  0.01088737,  0.13804972, ...,  0.00403137,
          0.01971765,  0.03540392],
        [ 0.13502902,  0.01066675,  0.14569577, ...,  0.00606275,
          0.02174902,  0.03743529],
        ...,
        [ 0.10294902,  0.01080448,  0.1137535 , ...,  

In [109]:
preds_test = []
for i in range(4):
    model.load_weights('weights_{}.h5'.format(i))
    pred = model.predict(X_test, batch_size=300, verbose=1)
    pred_3 = []
    for pred_scalar in pred:
        for i in range(3):
            pred_3.append(pred_scalar)
    preds_test.append(pred_3)



In [131]:
np.sum(np.asarray(preds_test[1])>0.51)

525

In [137]:
preds_test_3 = (np.asarray(preds_test[1])>0.51).astype(np.int).reshape(20337)

In [111]:
preds_test_2 = (np.squeeze(np.mean(preds_test, axis=0)) > 0.5).astype(np.int)
preds_test_2.shape

(20337,)

In [138]:
submission['target'] = preds_test_3
submission.to_csv('../output/submission_27.csv', index=False)
submission.head()

Unnamed: 0,signal_id,target
0,8712,0
1,8713,0
2,8714,0
3,8715,0
4,8716,0


In [None]:
lstm_preds = np.squeeze(np.mean(preds_test, axis=0))
np.save("./lstm_preds.npy", lstm_preds)

In [112]:
np.sum(preds_test_2[preds_test_2==1])

0

In [59]:
submission['target'].value_counts()

0    19404
1      933
Name: target, dtype: int64

In [60]:
model.load_weights('weights_0.h5')
pred = model.predict(X_test, batch_size=300, verbose=1)
pred_3 = []
for pred_scalar in pred:
    for i in range(3):
        pred_3.append(pred_scalar)



In [62]:
pred_3>0.5

TypeError: '>' not supported between instances of 'list' and 'float'

In [65]:
preds_3 = (np.squeeze(pred_3) > 0.5).astype(np.int)

In [66]:
np.sum(preds_3[preds_3==1])

1209

In [69]:
submission['target'] = preds_3
submission.to_csv('../output/submission_22.csv', index=False)

In [68]:
submission['target'].value_counts()

0    19128
1     1209
Name: target, dtype: int64