## Import

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')
plt.rcParams.update({'font.size': 32})
plt.rcParams["figure.figsize"] = (12,8)
import torch
import numpy as np
import scipy.signal
import scipy.io
import pandas as pd
import itertools
from itertools import product

from tqdm.notebook import tqdm, trange

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import random

In [6]:
import h5py

## Data import

In [15]:
# Sampling frequency forwarded to tsfel as `fs` (presumably in Hz, i.e. 50 kHz — TODO confirm).
fs = 50e3

In [16]:
# Open the HDF5 file read-only; the handle is kept open because H5Dataset reads from it in later cells.
dataset = h5py.File('dataset_.hdf5', 'r')

In [18]:
class H5Dataset:
    """Segment-oriented helper around an open HDF5 file (or any mapping of 2-D arrays).

    Every top-level entry ("part") is indexed as ``[source, sample]``. A source
    row is cut into consecutive, non-overlapping segments of ``segment_size``
    samples; samples that do not fill a whole segment are ignored.
    """

    def __init__(self, h5dataset, segment_size = 1):
        """Store the dataset handle and the segment length.

        Args:
            h5dataset: object exposing ``keys()`` and ``[]`` access to its parts.
            segment_size: samples per segment; ``None`` leaves segmentation
                unconfigured (segment-based methods then raise ``ValueError``).

        Raises:
            ValueError: if ``segment_size`` is given but not positive.
        """
        self.dataset  = h5dataset
        self.ds_parts = list(self.dataset.keys())
        # Keep None as-is so the "not configured" guards below are reachable;
        # int(None) would otherwise fail here with an unrelated TypeError.
        if segment_size is None:
            self.seg_size = None
        else:
            self.seg_size = int(segment_size)
            if self.seg_size <= 0:
                raise ValueError(f'segment_size must be positive, got {segment_size!r}')

    def visit(self):
        """Print the name of every object reachable from the dataset root."""
        self.dataset.visit(lambda name: print(name))

    def inspect(self):
        """Print name, type, shape and dtype of every part (plus segment count if configured)."""
        for key in self.ds_parts:
            part = self.dataset[key]
            info = f'Name {key},\tType {type(part)}'
            info +=f' Shape {part.shape}, DType {part.dtype}'
            if self.seg_size is not None:
                info +=f' N segments {part.shape[1]//self.seg_size}'
            print(info)

    def __len__(self):
        """Return the number of parts found in the dataset when it was wrapped."""
        # The original returned an attribute that was never assigned.
        return len(self.ds_parts)

    def get_source(self, part, source, label = None):
        """Return one source row split into segments, with a constant label per segment.

        Args:
            part: name of the part to read from.
            source: row index inside the part.
            label: value used for every entry of the target vector;
                defaults to ``source``.

        Returns:
            ``(segment, target)`` where ``segment`` has shape
            ``(n_segments, seg_size)`` and ``target`` has shape ``(n_segments,)``.

        Raises:
            ValueError: if no segment size is configured.
        """
        if self.seg_size is None:
            raise ValueError('segment_size must be set to split a source into segments')

        samples = np.asarray(self.dataset[part][source]).reshape(-1)

        # Drop the trailing remainder so the reshape cannot fail; this matches
        # the floor division used by inspect() and get_range().
        n_segments = samples.size // self.seg_size
        segment = samples[:n_segments * self.seg_size].reshape(n_segments, self.seg_size)

        if label is None:
            label = source
        target  = label * np.ones(segment.shape[0])
        return segment, target

    def part(self, part_name):
        """Return the raw entry stored under ``part_name``."""
        return self.dataset[part_name]

    def get_range(self, part, sources = None, segments = None, label = None):
        """Return a rectangular slice of a part, addressed in sources and segments.

        Args:
            part: name of the part to read from.
            sources: ``(start, stop)`` row indices, stop exclusive; defaults to all rows.
            segments: ``(start, stop)`` segment indices, stop exclusive;
                defaults to all whole segments.
            label: accepted for signature compatibility; not used.

        Returns:
            ``data[source_start:source_stop, sample_start:sample_stop]`` with
            both ranges clamped to the bounds of the part.

        Raises:
            ValueError: if no segment size is configured.
        """
        if self.seg_size is None:
            raise ValueError('segment_size must be set to address a part by segments')

        data = self.dataset[part]
        n_sources, n_samples = data.shape[0], data.shape[1]

        if sources is None:
            sources = (0, n_sources)
        if segments is None:
            segments = (0, n_samples // self.seg_size)

        # Clamp the source range, then convert segment indices to sample offsets and clamp again.
        src_start = max(0, int(sources[0]))
        src_stop  = min(n_sources, int(sources[1]))
        smp_start = max(0, int(segments[0] * self.seg_size))
        smp_stop  = min(n_samples, int(segments[1] * self.seg_size))

        return data[src_start:src_stop, smp_start:smp_stop]
        
    

In [19]:
# Wrap the open file with 10 000-sample segments and list every part it contains.
ds = H5Dataset(dataset, segment_size=int(10e3))
ds.inspect()

Name x_test_1,	Type <class 'h5py._hl.dataset.Dataset'> Shape (20, 12500000), DType float32 N segments 1250
Name x_test_2,	Type <class 'h5py._hl.dataset.Dataset'> Shape (20, 12500000), DType float32 N segments 1250
Name x_train_1,	Type <class 'h5py._hl.dataset.Dataset'> Shape (20, 12500000), DType float32 N segments 1250
Name x_train_2,	Type <class 'h5py._hl.dataset.Dataset'> Shape (20, 12500000), DType float32 N segments 1250


In [20]:
# Raw handle to the 'x_test_1' entry, exactly as stored in the file (no segmentation applied).
part = ds.part('x_test_1')

In [21]:
# With no sources/segments given, get_range returns every source row and every whole segment.
range_ = ds.get_range('x_test_1')
print(range_.shape)

(20, 12500000)


In [22]:
# Restrict to source rows 0 and 1; the (start, stop) pair is used as a slice, so stop is exclusive.
range_ = ds.get_range('x_test_1',(0,2))
print(range_.shape)

(2, 12500000)


In [30]:
import tsfel

# Feature configuration handed to tsfel's extractor; called without a domain argument
# (presumably this selects every feature domain — confirm against the tsfel docs).
cfg_file = tsfel.get_features_by_domain()

# Number of source rows iterated per dataset part; matches the first dimension (20)
# of every part shape printed by ds.inspect().
CLASSES_PER_PART = 20

In [31]:
def get_features(ds, parts, fs, cfg_file, label_column = None):
    """Extract tsfel features for every source row of the given dataset parts.

    Each (part, source) pair is split into segments with ``ds.get_source`` and
    passed to ``tsfel.time_series_features_extractor``; the per-pair frames are
    stacked row-wise in iteration order (parts outer, sources inner).

    Args:
        ds: object exposing ``get_source(part, source)`` that returns
            ``(segments, target)``.
        parts: part name, or iterable of part names, to process.
        fs: sampling frequency forwarded to tsfel.
        cfg_file: tsfel feature configuration.
        label_column: optional column name. When given, every row also carries
            the class id ``part_position * CLASSES_PER_PART + source`` in that
            column. The default ``None`` returns features only, as before.

    Returns:
        A ``pandas.DataFrame`` with a fresh ``0..n-1`` index, or an empty frame
        when ``parts`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(parts, str):
        parts = [parts]

    # Materialise the work list so tqdm knows the total instead of showing "0it".
    jobs = list(product(enumerate(parts), range(CLASSES_PER_PART)))

    frames = []
    for (cntpart, part), i in tqdm(jobs):
        x_, _ = ds.get_source(part, i)

        df_ = tsfel.time_series_features_extractor(cfg_file, x_, fs=fs)

        if label_column is not None:
            # Class ids are unique across parts: part k covers k*CLASSES_PER_PART onwards.
            df_[label_column] = cntpart * CLASSES_PER_PART + i

        frames.append(df_)

    if not frames:
        return pd.DataFrame([])

    # Concatenate once; growing the frame inside the loop re-copies all previous rows each time.
    return pd.concat(frames, axis=0, ignore_index=True)


In [40]:
# Extract features for both training parts; this calls tsfel once per source row and is slow.
df_train = get_features(ds, ['x_train_1','x_train_2'], fs = fs, cfg_file = cfg_file)
# The test parts are extracted in a separate cell further down.

0it [00:00, ?it/s]

*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***


Process SpawnPoolWorker-392:
Process SpawnPoolWorker-385:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/opt/anaconda3/lib/python3.9/site-packages/tsfel/feature_extraction/calc_features.py", line 185, in calc_features
    feat_val = calc_window_features(dict_features, wind_sig, fs, features_path=features_path, header_names=names)
  File "/opt/anaconda3/lib/python3.9/site-packages/tsfel/feature_extraction/calc_features.py", line 462, in calc_window_features
    eval_result = eval(execf, locals())
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.9/site-packages/tsfel/feature_extraction/features.py", line 450, in ne

KeyboardInterrupt: 

elf._kwargs)
  File "/opt/anaconda3/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/opt/anaconda3/lib/python3.9/site-packages/tsfel/feature_extraction/calc_features.py", line 185, in calc_features
    feat_val = calc_window_features(dict_features, wind_sig, fs, features_path=features_path, header_names=names)
  File "/opt/anaconda3/lib/python3.9/site-packages/tsfel/feature_extraction/calc_features.py", line 462, in calc_window_features
    eval_result = eval(execf, locals())
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.9/site-packages/tsfel/feature_extraction/features.py", line 1603, in wavelet_entropy
    cwt = wavelet(signal, function, widths)
  File "/opt/anaconda3/lib/python3.9/site-packages/tsfel/feature_extraction/features_utils.py", line 345, in wavelet
    cwt = scipy.signal.cwt(signal, function, widths)
  File "/opt/anaconda3/lib/python3.9/site-packages/scipy/signal/wavelets.py", line 48

In [67]:
# Round-trip the training features through CSV so they can be reloaded without recomputing.
# NOTE: to_csv writes the row index by default, so the frame read back gains an
# 'Unnamed: 0' column (and another one each time this cell is re-run).
df_train.to_csv('tsfel_train.csv')
df_train = pd.read_csv('tsfel_train.csv')

In [32]:
# Extract features for both test parts with the same sampling rate and feature configuration as training.
df_test  = get_features(ds, ['x_test_1','x_test_2'], fs = fs, cfg_file = cfg_file)

0it [00:00, ?it/s]

*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***
*** Feature extraction started ***



*** Feature extraction finished ***


In [None]:
# Round-trip the test features through their own CSV file. The original wrote to
# 'tsfel_train.csv', which silently replaced the cached training features.
# NOTE: to_csv writes the row index by default, so the reloaded frame gains an
# 'Unnamed: 0' column.
df_test.to_csv('tsfel_test.csv')
df_test = pd.read_csv('tsfel_test.csv')

In [42]:
# Spot-check ten randomly chosen rows of the training feature table.
df_train.sample(10)

Unnamed: 0.1,Unnamed: 0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,...,0_Wavelet variance_0,0_Wavelet variance_1,0_Wavelet variance_2,0_Wavelet variance_3,0_Wavelet variance_4,0_Wavelet variance_5,0_Wavelet variance_6,0_Wavelet variance_7,0_Wavelet variance_8,0_Zero crossing rate
45873,45873,109.719883,0.011695,109.719883,0.100577,2000.0,8000.0,-0.03053,0.030122,0.0001,...,6.2e-05,0.000239,0.000565,0.001077,0.001823,0.002913,0.004494,0.006706,0.009665,306.0
45287,45287,112.342696,0.011708,112.342696,0.099211,2000.0,8000.0,-0.03021,0.029893,0.0001,...,7.7e-05,0.000292,0.000685,0.001282,0.002142,0.003373,0.005116,0.007502,0.010645,334.0
13704,13704,174.12366,0.013007,174.12366,0.099319,2000.0,8000.0,-0.01983,0.019276,0.0001,...,8e-06,6.3e-05,0.000261,0.000707,0.001526,0.002877,0.00495,0.00793,0.011997,126.0
27653,27653,168.448411,0.013201,168.448411,0.097987,2000.0,8000.0,-0.029555,0.029097,0.0001,...,5.3e-05,0.00044,0.001459,0.003104,0.005356,0.00831,0.012171,0.017162,0.023435,278.0
21644,21644,156.093809,0.012224,156.093809,0.098297,2000.0,8000.0,-0.021126,0.020524,0.0001,...,4e-05,0.000304,0.000905,0.001813,0.003073,0.004877,0.007429,0.010895,0.015406,118.0
10960,10960,155.466106,0.01286,155.466106,0.098806,2000.0,8000.0,-0.020717,0.020163,0.0001,...,1.6e-05,0.000109,0.000362,0.000835,0.001578,0.002695,0.004331,0.006626,0.009695,100.0
43676,43676,110.349087,0.011242,110.349087,0.101873,2000.0,8000.0,-0.026964,0.026542,0.0001,...,4.8e-05,0.000207,0.000528,0.001067,0.001886,0.003104,0.004881,0.007356,0.010629,310.0
1352,1352,133.003346,0.011631,133.003346,0.097911,2000.0,8000.0,-0.02023,0.019874,0.0001,...,6e-06,4.1e-05,0.000171,0.000481,0.001075,0.002074,0.003617,0.005859,0.008957,106.0
35774,35774,103.6873,0.010146,103.6873,0.100609,2000.0,8000.0,-0.024351,0.024056,0.0001,...,5.3e-05,0.000359,0.001146,0.002476,0.004344,0.006775,0.009875,0.013794,0.018659,296.0
18208,18208,71.534575,0.008619,71.534575,0.098781,2000.0,8000.0,-0.018189,0.017904,0.0001,...,3.9e-05,0.000188,0.000456,0.000825,0.001315,0.002033,0.003097,0.004603,0.006634,138.0


In [43]:
# Remove every index column left behind by CSV round trips ('Unnamed: 0', 'Unnamed: 0.1', ...).
# Rows are ordered by source, so a stray index column would leak the class into the features.
# Selecting by prefix also keeps the cell safe to re-run once the columns are gone.
index_cols = [col for col in df_train.columns if str(col).startswith('Unnamed:')]
df_train = df_train.drop(columns=index_cols)

In [35]:
# Spot-check ten randomly chosen rows of the test feature table.
df_test.sample(10)

Unnamed: 0,0_Absolute energy,0_Area under the curve,0_Autocorrelation,0_Centroid,0_ECDF Percentile Count_0,0_ECDF Percentile Count_1,0_ECDF Percentile_0,0_ECDF Percentile_1,0_ECDF_0,0_ECDF_1,...,0_Wavelet variance_0,0_Wavelet variance_1,0_Wavelet variance_2,0_Wavelet variance_3,0_Wavelet variance_4,0_Wavelet variance_5,0_Wavelet variance_6,0_Wavelet variance_7,0_Wavelet variance_8,0_Zero crossing rate
27673,166.738368,0.013199,166.738368,0.103006,2000.0,8000.0,-0.029471,0.029538,0.0001,0.0002,...,7.6e-05,0.000492,0.001354,0.002625,0.004309,0.006543,0.009541,0.013521,0.01869,338.0
8169,98.385407,0.00983,98.385407,0.099479,2000.0,8000.0,-0.017489,0.017024,0.0001,0.0002,...,2.3e-05,0.000147,0.000449,0.00101,0.001902,0.003212,0.005029,0.007426,0.010459,108.0
42227,63.075534,0.008609,63.075534,0.10116,2000.0,8000.0,-0.023214,0.022878,0.0001,0.0002,...,4e-05,0.000174,0.000421,0.000821,0.001422,0.0023,0.003556,0.005283,0.007563,298.0
16285,138.604059,0.011517,138.604059,0.096771,2000.0,8000.0,-0.019798,0.019174,0.0001,0.0002,...,3e-05,0.000252,0.000756,0.001505,0.002545,0.004051,0.006204,0.00915,0.013004,108.0
38742,65.700769,0.008416,65.700769,0.096826,2000.0,8000.0,-0.02191,0.021885,0.0001,0.0002,...,5.7e-05,0.000289,0.000693,0.001281,0.002078,0.003158,0.004641,0.006642,0.009267,366.0
1098,124.999009,0.011381,124.999009,0.097236,2000.0,8000.0,-0.020293,0.020027,0.0001,0.0002,...,1e-05,7.2e-05,0.000265,0.000678,0.001402,0.002544,0.004226,0.006565,0.009668,106.0
2466,129.943564,0.011629,129.943564,0.102655,2000.0,8000.0,-0.020505,0.020017,0.0001,0.0002,...,1.1e-05,7.6e-05,0.000275,0.000694,0.001425,0.002581,0.004289,0.006667,0.009824,114.0
8026,102.974663,0.009847,102.974663,0.101928,2000.0,8000.0,-0.017505,0.017167,0.0001,0.0002,...,1.5e-05,0.000102,0.000346,0.000848,0.001712,0.003052,0.004978,0.007581,0.010934,108.0
15794,210.540157,0.014279,210.540157,0.102334,2000.0,8000.0,-0.019822,0.019221,0.0001,0.0002,...,2.4e-05,0.000198,0.000668,0.001522,0.002862,0.00488,0.007787,0.011751,0.016892,106.0
10528,154.660699,0.012864,154.660699,0.099681,2000.0,8000.0,-0.020924,0.020315,0.0001,0.0002,...,2.4e-05,0.000141,0.000409,0.000881,0.001611,0.002707,0.004312,0.006554,0.009546,106.0
