In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pyarrow.parquet as pq
import gc
import pywt
from statsmodels.robust import mad
import scipy
from scipy import signal
from scipy.signal import butter
from tqdm import tqdm_notebook
import os
from multiprocessing import Pool

import warnings

# Silence every FutureWarning. The target is the pandas ones: the library
# versions used here emit them, whereas the author's local versions do not.
warnings.simplefilter('ignore', FutureWarning)

# Directory that holds the competition input files.
data_dir = '../input'

In [3]:
# Number of CPUs reported by the OS; os.cpu_count() returns None when it cannot
# be determined. Not referenced elsewhere in the visible part of this notebook.
numcpu = os.cpu_count()

In [4]:
# Record the installed SciPy version: high_pass_filter in this notebook is
# written for 1.1.0, whose butter() does not accept the fs parameter.
print(scipy.__version__)

1.1.0


In [5]:
# Read the training metadata CSV from the input directory and preview the
# first rows (the preview shows signal_id, id_measurement, phase and target).
metadata_path = '/'.join((data_dir, 'metadata_train.csv'))
metadata_train = pd.read_csv(metadata_path)
metadata_train.head()

Unnamed: 0,signal_id,id_measurement,phase,target
0,0,0,0,0
1,1,0,1,0
2,2,0,2,0
3,3,1,0,1
4,4,1,1,1


In [6]:
# Load the training signals into a DataFrame. The read and the conversion stay
# chained so the intermediate Arrow table is released as soon as possible.
train_path = '/'.join((data_dir, 'train.parquet'))
subset_train = pq.read_pandas(train_path).to_pandas()

In [7]:
# Total number of bytes held by the training frame, index included.
subset_train.memory_usage(index=True).sum()

6969600080

In [8]:
# Each signal holds 800,000 readings taken over 20 ms.
# The grid runs at 50 Hz and 0.02 s * 50 Hz = 1, so one recording spans
# exactly one complete cycle.
n_samples = 800000

# Length of one recording in seconds (20 milliseconds).
sample_duration = 0.02

# Sample rate = number of samples per second:
# 800,000 samples / 0.02 s = 40,000,000 Hz, i.e. 40 MHz.
sample_rate = n_samples / sample_duration

In [9]:
def maddest(d, axis=None):
    """
    Mean Absolute Deviation

    Computes mean(|d - mean(d)|) along ``axis``.

    Parameters
    ----------
    d : array-like
        Input values.
    axis : int or None, optional
        Axis along which the deviation is computed. ``None`` (the default)
        reduces over the whole array and returns a scalar.

    Returns
    -------
    Scalar when ``axis`` is None, otherwise an array with ``axis`` removed.
    """
    d = np.asarray(d)
    # keepdims=True keeps the reduced axis as length 1 so that the mean
    # broadcasts back against ``d`` for every axis. Without it the subtraction
    # is only correct for axis=None or axis=0: it raises on non-square input
    # and silently mixes rows and columns on square input.
    center = np.mean(d, axis=axis, keepdims=True)
    return np.mean(np.absolute(d - center), axis=axis)

In [10]:
def high_pass_filter(x, low_cutoff=1000, sample_rate=sample_rate):
    """
    Apply a 10th-order Butterworth high-pass filter to ``x``.

    From @randxie https://github.com/randxie/Kaggle-VSB-Baseline/blob/master/src/utils/util_signal.py
    Modified to work with scipy version 1.1.0, whose ``butter`` has no ``fs``
    parameter: the cutoff is therefore normalised by the Nyquist frequency
    here. With scipy 1.2.0 the source passes the cutoff in Hz together with
    ``fs=<sample rate>`` instead.

    Fault pattern usually exists in high frequency band. According to
    literature, the pattern is visible above 10^4 Hz.

    Parameters
    ----------
    x : array-like
        Signal to filter.
    low_cutoff : number, optional
        Cutoff frequency, in the same unit as ``sample_rate``.
    sample_rate : number, optional
        Sampling frequency of ``x``; defaults to the module-level value.

    Returns
    -------
    The output of ``scipy.signal.sosfilt`` for the designed filter.
    """
    # The Nyquist frequency is half the sample rate
    # (https://en.wikipedia.org/wiki/Nyquist_frequency).
    half_rate = 0.5 * sample_rate
    normalised_cutoff = low_cutoff / half_rate

    # Design the filter as second-order sections and run it over the signal.
    sections = butter(10, Wn=[normalised_cutoff], btype='highpass', output='sos')
    return signal.sosfilt(sections, x)

In [11]:
def denoise_signal(x, wavelet='db4', level=1):
    """
    Wavelet-threshold ``x`` and rebuild a signal from the detail coefficients.

    1. Adapted from the waveletSmooth function found here:
    http://connor-johnson.com/2016/01/24/using-pywavelets-to-remove-high-frequency-noise/
    2. The threshold equation and the use of hard thresholding follow section
    '3.2 denoising based on optimized singular values' of the paper by Tomas Vantuch:
    http://dspace.vsb.cz/bitstream/handle/10084/133114/VAN431_FEI_P1807_1801V001_2018.pdf

    Parameters
    ----------
    x : array-like
        Input signal.
    wavelet : str, optional
        Wavelet name handed to PyWavelets.
    level : int, optional
        Decomposition level.

    Returns
    -------
    The result of ``pywt.waverec`` applied to the thresholded detail
    coefficients only.
    """
    # Wavelet decomposition with periodic signal extension.
    coeffs = pywt.wavedec(x, wavelet, mode="per", level=level)

    # Noise scale for the threshold, as defined in the paper linked above.
    # As noted by @harshit92, the MAD in the paper is the Mean Absolute
    # Deviation, not the Median Absolute Deviation.
    noise_sigma = (1/0.6745) * maddest(coeffs[-level])

    # Universal threshold.
    universal_threshold = noise_sigma * np.sqrt(2*np.log(len(x)))

    # Hard-threshold every detail band; the approximation band (coeffs[0]) is
    # not thresholded.
    detail_bands = [
        pywt.threshold(band, value=universal_threshold, mode='hard')
        for band in coeffs[1:]
    ]

    # NOTE(review): only the detail bands are passed to waverec, so the
    # approximation band is dropped from the reconstruction. The feature
    # loop in this notebook slices the result in windows up to index 400000,
    # so it presumably relies on this - confirm before changing it.
    return pywt.waverec(detail_bands, wavelet, mode='per')

In [17]:
# Settings for the sign-flip suppression step applied to every denoised signal.
maxDistance = 10        # how many ticks after a non-zero sample are inspected
maxHeightRatio = 0.25   # a later sample triggers removal when later / (sample + 1e-04) < -maxHeightRatio
maxTicksRemoval = 500   # number of ticks set to zero, starting at the sample itself

# Windows of the denoised signal for which the spread features are repeated.
# NOTE(review): the bounds are reproduced exactly from the original cell; each
# window starts one tick after the round boundary, so ticks 100000, 200000 and
# 300000 belong to no window - confirm whether that is intended.
quarterWindows = {
    'q2': (100001, 200000),
    'q3': (200001, 300000),
    'q4': (300001, 400000),
}


def _nonzero_ticks(values):
    """Return the positions of the samples whose absolute value is > 0."""
    return np.flatnonzero(np.abs(values) > 0)


def _std_or_nan(values):
    """Population standard deviation of ``values``; NaN, without a RuntimeWarning, when empty."""
    values = np.asarray(values)
    if values.size == 0:
        return np.nan
    return np.std(values)


def _zero_sign_flip_windows(x, max_distance, max_height_ratio, max_ticks_removal):
    """Zero, in place, a window after every non-zero sample that is followed by a sign flip.

    The candidate positions are taken once, before anything is modified. For
    each candidate the next ``max_distance`` ticks (clipped to the end of
    ``x``) are inspected; whenever ``x[later] / (x[candidate] + 1e-04)`` is
    below ``-max_height_ratio`` the ``max_ticks_removal`` ticks starting at the
    candidate are set to zero.
    """
    n_ticks = len(x)
    for start in _nonzero_ticks(x).tolist():
        last = min(start + max_distance, n_ticks - 1)
        for tick in range(start + 1, last + 1):
            # x[start] is re-read on every pass: it may have been zeroed by an
            # earlier window, in which case 1e-04 keeps the denominator non-zero.
            if x[tick] / (x[start] + 1e-04) < -max_height_ratio:
                x[start:start + max_ticks_removal] = 0


def _peak_spread(segment):
    """Return (std of amplitudes, std of positions, std of gaps) of the non-zero samples of ``segment``.

    Positions are relative to the start of ``segment``; gaps are the
    differences between consecutive non-zero positions.
    """
    ticks = _nonzero_ticks(segment)
    return _std_or_nan(segment[ticks]), _std_or_nan(ticks), _std_or_nan(np.diff(ticks))


# One entry per signal (column of subset_train) is appended to each list.
numpeaks = []
numpospeaks = []
numnegpeaks = []

maxpeakwidth = []
minpeakwidth = []
meanpeakwidth = []
maxamp = []
minamp = []
meanamp = []

stdampall = []
stdposall = []
stdwidthall = []

stdampq2 = []
stdposq2 = []
stdwidthq2 = []
stdampq3 = []
stdposq3 = []
stdwidthq3 = []
stdampq4 = []
stdposq4 = []
stdwidthq4 = []

# Output lists for each window, in (amplitude, position, width) order.
quarterOutputs = (
    (quarterWindows['q2'], stdampq2, stdposq2, stdwidthq2),
    (quarterWindows['q3'], stdampq3, stdposq3, stdwidthq3),
    (quarterWindows['q4'], stdampq4, stdposq4, stdwidthq4),
)


for col in tqdm_notebook(range(subset_train.shape[1])):
    x_hp = high_pass_filter(subset_train.iloc[:,col], low_cutoff=10000, sample_rate=sample_rate)
    x_dn = denoise_signal(x_hp, wavelet='haar', level=1)

    _zero_sign_flip_windows(x_dn, maxDistance, maxHeightRatio, maxTicksRemoval)

    # The non-zero positions are computed once per signal and reused for all
    # count and width features, instead of being rebuilt inside every loop
    # iteration.
    peak_ticks = _nonzero_ticks(x_dn)

    # Gaps between consecutive non-zero samples. Kept as a module-level list
    # named `peakwidth` because a later cell inspects it.
    peakwidth = np.diff(peak_ticks).tolist()

    numpeaks.append(len(peak_ticks))
    numpospeaks.append(int(np.count_nonzero(x_dn > 0)))
    numnegpeaks.append(int(np.count_nonzero(x_dn < 0)))

    if len(peakwidth) == 0:
        # Fewer than two non-zero samples: the width summaries default to 0.
        maxpeakwidth.append(0)
        minpeakwidth.append(0)
        meanpeakwidth.append(0)
    else:
        maxpeakwidth.append(np.max(peakwidth))
        minpeakwidth.append(np.min(peakwidth))
        meanpeakwidth.append(np.mean(peakwidth))

    maxamp.append(x_dn.max())
    minamp.append(x_dn.min())
    meanamp.append(np.mean(x_dn))

    # Spread of the non-zero samples over the whole signal...
    amp_std, pos_std, width_std = _peak_spread(x_dn)
    stdampall.append(amp_std)
    stdposall.append(pos_std)
    stdwidthall.append(width_std)

    # ...and inside each window.
    for (lo, hi), amp_out, pos_out, width_out in quarterOutputs:
        amp_std, pos_std, width_std = _peak_spread(x_dn[lo:hi])
        amp_out.append(amp_std)
        pos_out.append(pos_std)
        width_out.append(width_std)

  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [23]:
# Turn the per-signal feature lists into NumPy arrays, keeping the same names.

# Peak counts.
numpeaks, numpospeaks, numnegpeaks = [
    np.array(values) for values in (numpeaks, numpospeaks, numnegpeaks)
]

# Width and amplitude summaries.
maxpeakwidth, minpeakwidth, meanpeakwidth = [
    np.array(values) for values in (maxpeakwidth, minpeakwidth, meanpeakwidth)
]
maxamp, minamp, meanamp = [
    np.array(values) for values in (maxamp, minamp, meanamp)
]

# Spread features over the whole signal.
stdampall, stdposall, stdwidthall = [
    np.array(values) for values in (stdampall, stdposall, stdwidthall)
]

# Spread features per window.
stdampq2, stdposq2, stdwidthq2 = [
    np.array(values) for values in (stdampq2, stdposq2, stdwidthq2)
]
stdampq3, stdposq3, stdwidthq3 = [
    np.array(values) for values in (stdampq3, stdposq3, stdwidthq3)
]
stdampq4, stdposq4, stdwidthq4 = [
    np.array(values) for values in (stdampq4, stdposq4, stdwidthq4)
]

In [19]:
# Quick inspection: `peakwidth` is reassigned on every pass of the feature loop,
# so this is the number of widths left over from the last signal processed.
len(peakwidth)

81

In [26]:
# Write every feature array to ../features/<name>.npy, in the listed order.
feature_arrays = (
    ('numpeaks', numpeaks),
    ('numpospeaks', numpospeaks),
    ('numnegpeaks', numnegpeaks),
    ('maxpeakwidth', maxpeakwidth),
    ('minpeakwidth', minpeakwidth),
    ('meanpeakwidth', meanpeakwidth),
    ('maxamp', maxamp),
    ('minamp', minamp),
    ('meanamp', meanamp),
    ('stdampall', stdampall),
    ('stdposall', stdposall),
    ('stdwidthall', stdwidthall),
    ('stdampq2', stdampq2),
    ('stdposq2', stdposq2),
    ('stdwidthq2', stdwidthq2),
    ('stdampq3', stdampq3),
    ('stdposq3', stdposq3),
    ('stdwidthq3', stdwidthq3),
    ('stdampq4', stdampq4),
    ('stdposq4', stdposq4),
    ('stdwidthq4', stdwidthq4),
)

for feature_name, feature_values in feature_arrays:
    np.save('../features/' + feature_name + '.npy', feature_values)


In [28]:
# Display the per-signal peak counts as a one-column table (row = signal position).
pd.DataFrame({'NumPeaks':numpeaks})

Unnamed: 0,NumPeaks
0,105
1,23
2,121
3,217
4,141
5,269
6,8
7,10
8,14
9,26
