In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pyarrow.parquet as pq
import gc
import pywt
from statsmodels.robust import mad
import scipy
from scipy import signal
from scipy.signal import butter
from tqdm import tqdm_notebook
import os
from multiprocessing import Pool
import functools

from numba import jit

import warnings

# Suppress pandas future warnings, I am using different library versions locally
# that do not raise warnings.
warnings.simplefilter(action='ignore', category=FutureWarning)

data_dir = '../input'

In [2]:
print(scipy.__version__)

1.1.0


In [3]:
# 800,000 data points taken over 20 ms
# Grid operates at 50hz, 0.02 * 50 = 1, so 800k samples in 20 milliseconds will capture one complete cycle
n_samples = 800000

# Sample duration is 20 miliseconds
sample_duration = 0.02

# Sample rate is the number of samples in one second
# Sample rate will be 40mhz
sample_rate = n_samples * (1 / sample_duration)

In [4]:
def maddest(d, axis=None):
    """
    Mean Absolute Deviation
    """
    return np.mean(np.absolute(d - np.mean(d, axis)), axis)

In [5]:
def high_pass_filter(x, low_cutoff=1000, sample_rate=sample_rate):
    """
    From @randxie https://github.com/randxie/Kaggle-VSB-Baseline/blob/master/src/utils/util_signal.py
    Modified to work with scipy version 1.1.0 which does not have the fs parameter
    """
    
    # nyquist frequency is half the sample rate https://en.wikipedia.org/wiki/Nyquist_frequency
    nyquist = 0.5 * sample_rate
    norm_low_cutoff = low_cutoff / nyquist
    
    # Fault pattern usually exists in high frequency band. According to literature, the pattern is visible above 10^4 Hz.
    # scipy version 1.2.0
    #sos = butter(10, low_freq, btype='hp', fs=sample_fs, output='sos')
    
    # scipy version 1.1.0
    sos = butter(10, Wn=[norm_low_cutoff], btype='highpass', output='sos')
    filtered_sig = signal.sosfilt(sos, x)

    return filtered_sig

In [6]:
def denoise_signal( x, wavelet='db4', level=1):
    """
    1. Adapted from waveletSmooth function found here:
    http://connor-johnson.com/2016/01/24/using-pywavelets-to-remove-high-frequency-noise/
    2. Threshold equation and using hard mode in threshold as mentioned
    in section '3.2 denoising based on optimized singular values' from paper by Tomas Vantuch:
    http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    """
    
    # Decompose to get the wavelet coefficients
    coeff = pywt.wavedec( x, wavelet, mode="per", level=level)
    
    # Calculate sigma for threshold as defined in http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    # As noted by @harshit92 MAD referred to in the paper is Mean Absolute Deviation not Median Absolute Deviation
    sigma = (1/0.6745) * maddest( coeff[-level] )

    # Calculte the univeral threshold
    uthresh = sigma * np.sqrt( 2*np.log( len( x ) ) )
    coeff[1:] = ( pywt.threshold( i, value=uthresh, mode='hard' ) for i in coeff[1:] )
    
    # Reconstruct the signal using the thresholded coefficients
    return pywt.waverec( coeff[1:], wavelet, mode='per' )

In [7]:
def denoise_signal_2( x, wavelet='db4', level=1):
    """
    1. Adapted from waveletSmooth function found here:
    http://connor-johnson.com/2016/01/24/using-pywavelets-to-remove-high-frequency-noise/
    2. Threshold equation and using hard mode in threshold as mentioned
    in section '3.2 denoising based on optimized singular values' from paper by Tomas Vantuch:
    http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    """
    
    # Decompose to get the wavelet coefficients
    coeff = pywt.wavedec( x, wavelet, mode="per", level=level)
    
    # Calculate sigma for threshold as defined in http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf
    # As noted by @harshit92 MAD referred to in the paper is Mean Absolute Deviation not Median Absolute Deviation
    sigma = (1/0.6745) * maddest( coeff[-level] )

    # Calculte the univeral threshold
    uthresh = sigma * np.sqrt( 2*np.log( len( x ) ) )
    coeff[1:] = ( pywt.threshold( i, value=uthresh, mode='hard' ) for i in coeff[1:] )
    
    # Reconstruct the signal using the thresholded coefficients
    return pywt.waverec( coeff[0:], wavelet, mode='per' )

In [8]:
def remove_corona(x_dn):
    index = pd.Series(x_dn).loc[np.abs(x_dn)>0].index
    corona_idx = []
    for idx in index:
        for i in range(1,maxDistance+1):
            if idx+i < pd.Series(x_dn).shape[0]:
                if x_dn[idx+i]/(x_dn[idx]+1e-04)<-maxHeightRatio:
                    x_dn[idx:idx+maxTicksRemoval] = 0
                    corona_idx.append(idx)
    return x_dn, corona_idx

In [10]:
def main(x_df):
    for col in tqdm_notebook(range(0, x_df.shape[1], 3)):
        X_HP = []
        X_DN = []
        
        pool = Pool()

        X_HP.append(pool.map(functools.partial(high_pass_filter, low_cutoff=10000, sample_rate=sample_rate), x_df.iloc[:,col]))
        X_HP.append(pool.map(functools.partial(high_pass_filter, low_cutoff=10000, sample_rate=sample_rate), x_df.iloc[:,col+1]))
        X_HP.append(pool.map(functools.partial(high_pass_filter, low_cutoff=10000, sample_rate=sample_rate), x_df.iloc[:,col+2]))

        
        X_DN.append(pool.map(functools.partial(denoise_signal, wavelet='haar', level=1), X_HP[0]))
        X_DN.append(pool.map(functools.partial(denoise_signal, wavelet='haar', level=1), X_HP[1]))
        X_DN.append(pool.map(functools.partial(denoise_signal, wavelet='haar', level=1), X_HP[2]))


        X_DN[0], corona_idx = remove_corona(X_DN[0])
        X_DN[1], corona_idx = remove_corona(X_DN[1])
        X_DN[2], corona_idx = remove_corona(X_DN[2])

        total = X_DN[0] + X_DN[1] + X_DN[2]
        total = pd.Series(total)
        
        meanamppos_id[col:col+3] = np.mean(total[total>0])
        meanampneg_id[col:col+3] = np.mean(total[total<0])
        
        stdamp_id[col:col+3] = np.std(total)
        stdamppos_id[col:col+3] = np.std(total[total>0])
        stdampneg_id[col:col+3] = np.std(total[total<0])

In [11]:
maxDistance = 10
maxHeightRatio = 0.25
maxTicksRemoval =500

In [12]:
meanamppos_id_test = []
meanampneg_id_test = []

stdamp_id_test = []
stdamppos_id_test = []
stdampneg_id_test = []

In [13]:
for i in tqdm_notebook(range(7)):
    subset_test = pq.read_pandas('../input/test.parquet', columns=[str(i*3000 + j + 8712) for j in range(3000)]).to_pandas()
    
    meanamppos_id = np.zeros(subset_test.shape[1])
    meanampneg_id = np.zeros(subset_test.shape[1])
    stdamp_id = np.zeros(subset_test.shape[1])
    stdamppos_id = np.zeros(subset_test.shape[1])
    stdampneg_id = np.zeros(subset_test.shape[1])
    
    main(subset_test)
    
    meanamppos_id_test = np.hstack((meanamppos_id_test, meanamppos_id))
    meanampneg_id_test = np.hstack((meanampneg_id_test, meanampneg_id))
    stdamp_id_test = np.hstack((stdamp_id_test, stdamp_id))
    stdamppos_id_test = np.hstack((stdamppos_id_test, stdamppos_id))
    stdampneg_id_test = np.hstack((stdampneg_id_test, stdampneg_id))
    
    gc.collect()

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




ValueError: selected axis is out of range

In [13]:
np.save('../features/meanamppos_id_test.npy', meanamppos_id_test)
np.save('../features/meanampneg_id_test.npy', meanampneg_id_test)

np.save('../features/stdamp_id_test.npy', stdamp_id_test)
np.save('../features/stdamppos_id_test.npy', stdamppos_id_test)
np.save('../features/stdampneg_id_test.npy', stdampneg_id_test)

In [14]:
pd.Series(stdamp_id_test)

0        0.216964
1        0.216964
2        0.216964
3        0.000000
4        0.000000
5        0.000000
6        0.030055
7        0.030055
8        0.030055
9        0.135807
10       0.135807
11       0.135807
12       0.023521
13       0.023521
14       0.023521
15       0.039193
16       0.039193
17       0.039193
18       0.404793
19       0.404793
20       0.404793
21       0.000000
22       0.000000
23       0.000000
24       0.114569
25       0.114569
26       0.114569
27       0.078003
28       0.078003
29       0.078003
           ...   
20307    0.177893
20308    0.177893
20309    0.177893
20310    0.110347
20311    0.110347
20312    0.110347
20313    0.000000
20314    0.000000
20315    0.000000
20316    0.325294
20317    0.325294
20318    0.325294
20319    0.374014
20320    0.374014
20321    0.374014
20322    0.347570
20323    0.347570
20324    0.347570
20325    0.122885
20326    0.122885
20327    0.122885
20328    0.050337
20329    0.050337
20330    0.050337
20331    0