In [1]:
import pandas as pd
import biosppy.signals.ecg as ecg
import numpy as np
import pickle

# Load the training table. Column 0 is not used here, column 1 is taken as the
# label, and all remaining columns are treated as the raw signal values (rows
# contain NaN entries that are filtered out in the next cell).
train = pd.read_csv('train.csv')
train_labels = train.iloc[:, 1].to_numpy()
train_features = train.iloc[:, 2:].to_numpy()

In [64]:
# Window length in samples. 1500 samples correspond to 5 s at the 300 Hz
# sampling rate that filtered_ecg_dataset uses by default.
SEGMENT_LENGTH = 1500

# Drop the NaN values from every row.
train_features_no_nan = [features[~np.isnan(features)] for features in train_features]

# Cut every recording into consecutive, non-overlapping windows of
# SEGMENT_LENGTH samples. Each window inherits the label of the recording it
# was cut from. The tail that does not fill a whole window is discarded, and
# a recording shorter than SEGMENT_LENGTH contributes no window at all.
subsequences = []
subsequence_labels = []

for features, label in zip(train_features_no_nan, train_labels):
    num_subsequences = len(features) // SEGMENT_LENGTH

    for i in range(num_subsequences):
        start = i * SEGMENT_LENGTH
        subsequence = features[start:start + SEGMENT_LENGTH]
        subsequences.append(subsequence)
        subsequence_labels.append(label)

In [65]:
# Turn the Python lists built in the previous cell into NumPy arrays and
# report their shapes as a quick consistency check (both should have the same
# first dimension).
subsequence_labels = np.array(subsequence_labels)
subsequences = np.array(subsequences)

print(f'Subsequences shape: {subsequences.shape}')
print(f'Subsequence labels shape: {subsequence_labels.shape}')

Subsequences shape: (28106, 1500)
Subsequence labels shape: (28106,)


In [109]:
def filtered_ecg_dataset(data, sampling_rate=300):
    """Run ``ecg.ecg`` over every signal in ``data`` and collect the results.

    Parameters
    ----------
    data : sequence of signals
        Each element is passed unchanged to ``ecg.ecg``.
    sampling_rate : int, optional
        Forwarded to ``ecg.ecg`` as ``sampling_rate`` (default 300).

    Returns
    -------
    filtered_signals : list
        The ``'filtered'`` entry of each successful ``ecg.ecg`` result.
    beat_templates : list
        The ``'templates'`` entry of each successful ``ecg.ecg`` result.
    error_indices : list of int
        Positions in ``data`` that were skipped. The two lists above contain
        no entry for these positions, so callers must drop the matching
        labels/ids themselves to keep everything aligned.

    Raises
    ------
    ValueError
        Any ``ValueError`` whose message is not exactly
        "Not enough beats to compute heart rate." is propagated unchanged.
    """
    filtered_signals, beat_templates, error_indices = [], [], []

    for idx, signal in enumerate(data):
        try:
            result = ecg.ecg(signal, sampling_rate=sampling_rate, show=False)
            filtered, templates = result['filtered'], result['templates']
        except ValueError as err:
            # Only the "too few beats" failure is tolerated; anything else is
            # a genuine error and must surface to the caller.
            if str(err) != "Not enough beats to compute heart rate.":
                raise
            print(f"Skipping signal {idx} due to insufficient beats.")
            error_indices.append(idx)
        else:
            filtered_signals.append(filtered)
            beat_templates.append(templates)

    return filtered_signals, beat_templates, error_indices

In [110]:
# Filter every training window. The third return value lists the positions of
# the windows that were skipped, i.e. the labels that must be removed to stay
# aligned with train_filtered_signal.
(
    train_filtered_signal,
    train_beats,
    train_labeltodelete,
) = filtered_ecg_dataset(data=subsequences)

Skipping signal 4596 due to insufficient beats.
Skipping signal 15102 due to insufficient beats.
Skipping signal 20964 due to insufficient beats.
Skipping signal 25117 due to insufficient beats.


In [112]:
# Remove the labels of the windows that filtered_ecg_dataset skipped so that
# labels and filtered signals line up again. The deletion is guarded by a
# length comparison: re-running this cell after the labels have already been
# trimmed would otherwise delete further (valid) labels and silently shift
# every label after them.
_labels_already_trimmed = len(subsequence_labels) == len(train_filtered_signal)
subsequence_labels = np.delete(
    subsequence_labels,
    [] if _labels_already_trimmed else train_labeltodelete,
)
assert len(subsequence_labels) == len(train_filtered_signal), (
    "labels and filtered training signals are out of sync; "
    "re-run the segmentation and filtering cells"
)

In [113]:
# Sanity check: number of labels left after removing the skipped windows.
print(len(subsequence_labels))

28102


In [114]:
# Load the test table. Column 0 is taken as the sample identifier and all
# remaining columns are treated as the raw signal values.
test = pd.read_csv('test.csv')
test_features = test.iloc[:, 1:].to_numpy()
test_idx = test.iloc[:, 0].to_numpy()

In [115]:
# Peek at the identifiers taken from the first column of test.csv.
print(test_idx)

[   0    1    2 ... 3408 3409 3410]


In [118]:
# drop nan value
test_features_no_nan = [features[~np.isnan(features)] for features in test_features]

# truncate to get sequence with length 500
subsequences_test = []
subsequence_idx = []

for features, idx in zip(test_features_no_nan, test_idx):
    num_subsequences = len(features) // 1500
    
    for i in range(num_subsequences):
        subsequence = features[i * 1500:(i + 1) * 1500]
        subsequences_test.append(subsequence)
        subsequence_idx.append(idx)

In [122]:
# Filter every test window. The third return value lists the positions of the
# windows that were skipped, i.e. the ids that must be removed to stay aligned
# with filtered_signal_test.
(
    filtered_signal_test,
    beats_test,
    test_idxtodelete,
) = filtered_ecg_dataset(data=subsequences_test)

Skipping signal 6188 due to insufficient beats.
Skipping signal 6189 due to insufficient beats.
Skipping signal 6191 due to insufficient beats.
Skipping signal 7948 due to insufficient beats.
Skipping signal 10328 due to insufficient beats.
Skipping signal 14534 due to insufficient beats.
Skipping signal 14536 due to insufficient beats.


In [123]:
# Remove the ids of the windows that filtered_ecg_dataset skipped so that ids
# and filtered signals line up again. The deletion is guarded by a length
# comparison: re-running this cell after the ids have already been trimmed
# would otherwise delete further (valid) ids and silently shift every id
# after them. np.delete is still called in both cases so the result is always
# a NumPy array.
_idx_already_trimmed = len(subsequence_idx) == len(filtered_signal_test)
subsequence_idx = np.delete(
    subsequence_idx,
    [] if _idx_already_trimmed else test_idxtodelete,
)
assert len(subsequence_idx) == len(filtered_signal_test), (
    "ids and filtered test signals are out of sync; "
    "re-run the segmentation and filtering cells"
)

In [126]:
# Persist the preprocessed splits. 'X' holds the list of filtered signals;
# 'y' / 'idx' hold the matching labels / sample ids. The beat templates
# (train_beats, beats_test) are not written out.
train_payload = {'X': train_filtered_signal, 'y': subsequence_labels}
test_payload = {'idx': subsequence_idx, 'X': filtered_signal_test}

for target, payload in (('train_1500.pkl', train_payload), ('test_1500.pkl', test_payload)):
    with open(target, 'wb') as f:
        pickle.dump(payload, f)