# Data Processing

## Import Libraries

In [None]:
# import library
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import trange

# sklearn
import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, r_regression, f_regression
from sklearn.gaussian_process.kernels import Matern, RBF, CompoundKernel, Product, Sum, ExpSineSquared, RationalQuadratic
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, IsolationForest, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer, f1_score, confusion_matrix
from sklearn.decomposition import PCA

# boost algorithm
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

# torch
import torch
from torch import nn
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu, sigmoid, binary_cross_entropy, nll_loss
from torch.optim import Adam, SGD

# bio library
import biosppy
from biosppy import storage
from biosppy.signals import ecg

DATA_DIR = "Data"
RESULT_DIR = "Result"

## Load Data

In [None]:
# Load the training features, test features and training labels.
# Each CSV has a header row and its first column is used as the row index.
X_train_df, X_test_df, y_train_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name), header=0, index_col=0)
    for csv_name in ("X_train.csv", "X_test.csv", "y_train.csv")
)

# Raw NumPy arrays of the tables; the label table is flattened to 1-D.
X_train, X_test = X_train_df.values, X_test_df.values
y_train = y_train_df.values.ravel()

In [None]:
# Get the valid length of every signal row.
def get_valid_lengths(X):
    """Return, for each row of ``X``, the index of its first NaN.

    Rows that contain no NaN are assigned the full row width ``X.shape[1]``.
    Presumably the signals are NaN-padded at the tail, so this index is the
    number of usable samples -- TODO confirm against the data files.

    Args:
        X: 2-D array with one signal per row.

    Returns:
        np.ndarray with one length per row of ``X``.
    """
    lengths = []
    for row in X:
        nan_positions = np.flatnonzero(np.isnan(row))
        lengths.append(nan_positions[0] if nan_positions.size > 0 else X.shape[1])
    return np.array(lengths)


# The same computation is applied to the training and the test signals.
X_train_len = get_valid_lengths(X_train)
X_test_len = get_valid_lengths(X_test)

## Extract Valid Features

In [None]:
# get ecg info (lenient handling)
def check_template_result(templates: np.ndarray) -> list:
    """Return the indices of templates that appear to hold more than one beat.

    A template is flagged when the samples exceeding 70% of its maximum span
    more than half of the template length.

    Args:
        templates: iterable of 1-D heartbeat templates.

    Returns:
        List of indices (into ``templates``) of the flagged templates.
    """
    error_ids = []
    for template_i, template in enumerate(templates):
        peak_threshold = np.max(template) * 0.7
        peak_region = np.flatnonzero(template > peak_threshold)
        if peak_region.size == 0:
            # No sample exceeds the threshold (e.g. the maximum is <= 0), so
            # there is no peak region to measure; do not flag the template.
            continue
        # flatnonzero is sorted, so last - first is the extent of the region.
        if peak_region[-1] - peak_region[0] > 0.5 * template.shape[0]:
            error_ids.append(template_i)
    return error_ids

def get_ecg_info(X, X_len):
    """Run ``ecg.ecg`` on every signal and collect its outputs field by field.

    Each signal is truncated to its valid length and processed with a
    sampling rate of 300. Signals for which no template remains are skipped,
    so the returned lists can be shorter than ``X``.

    Args:
        X: iterable of signals.
        X_len: iterable with the number of leading samples to use per signal.

    Returns:
        Tuple of seven lists, one entry per kept signal: time axis, filtered
        signal, R-peak indices, template time axis, templates, heart-rate
        time axis and instantaneous heart rate.
    """
    (ts_out, filtered_out, rpeaks_out, tpl_ts_out,
     tpl_out, hr_ts_out, hr_out) = ([] for _ in range(7))

    for signal, sig_len in zip(X, X_len):
        (ts, filtered, rpeaks, templates_ts,
         templates, heart_rate_ts, heart_rate) = ecg.ecg(
            signal[:sig_len], sampling_rate=300., show=False)

        # Template screening is switched off, so no beat is rejected here;
        # the deletions below therefore remove nothing.
        rejected_beats = []
        rpeaks, templates_ts, templates = (
            np.delete(arr, rejected_beats, axis=0)
            for arr in (rpeaks, templates_ts, templates)
        )

        # Drop the whole signal when it has no usable template.
        if len(templates) < 1:
            continue

        ts_out.append(ts)                # signal time axis reference (seconds)
        filtered_out.append(filtered)    # filtered ECG signal
        rpeaks_out.append(rpeaks)        # R-peak location indices
        tpl_ts_out.append(templates_ts)  # templates time axis reference
        tpl_out.append(templates)        # extracted heartbeat templates
        hr_ts_out.append(heart_rate_ts)  # heart rate time axis reference (seconds)
        hr_out.append(heart_rate)        # instantaneous heart rate (bpm)

    return ts_out, filtered_out, rpeaks_out, tpl_ts_out, tpl_out, hr_ts_out, hr_out

# ECG pipeline outputs for the training signals.
(
    ts_lst,
    filtered_lst,
    rpeaks_lst,
    templates_ts_lst,
    templates_lst,
    heart_rate_ts_lst,
    heart_rate_lst,
) = get_ecg_info(X_train, X_train_len)

# ECG pipeline outputs for the test signals.
(
    ts_lst_test,
    filtered_lst_test,
    rpeaks_lst_test,
    templates_ts_lst_test,
    templates_lst_test,
    heart_rate_ts_lst_test,
    heart_rate_lst_test,
) = get_ecg_info(X_test, X_test_len)

In [None]:
# get average template
# Largest sample value found in any training template; used as the single
# scale factor for both the training and the test templates.
max_height = max((np.max(templates) for templates in templates_lst), default=None)


# The scaler is a plain rescaling for now; it is unclear whether the absolute
# height carries useful information.
def scaler(template: np.ndarray):
    """Divide ``template`` by the global training maximum ``max_height``."""
    return template / max_height


def get_average_templates(templates):
    """Scale the templates of one recording and return their mean beat."""
    templates = scaler(templates)
    return templates.sum(axis=0) / templates.shape[0]


# Average the still-unscaled templates: get_average_templates scales them
# itself, so running it after the in-place rescale below would divide by
# max_height twice.
avg_templates_lst = [get_average_templates(templates) for templates in templates_lst]

# Rescale all templates. The test templates use the same training-derived
# factor so that amplitude features are comparable between train and test.
# NOTE: this cell is not idempotent; re-running it rescales the lists again.
templates_lst = [scaler(templates) for templates in templates_lst]
templates_lst_test = [scaler(templates) for templates in templates_lst_test]

In [None]:
def get_PQRST_from_template(template: np.ndarray):
    """Locate the P, Q, R, S and T points of a single heartbeat template.

    R is the global maximum. Q is the minimum before R, P the maximum before
    Q, S the minimum after R and T the maximum after S. When there is no
    sample left to search (the reference point sits on the first or last
    sample) the point collapses onto its neighbour and ``error_state`` is set.

    Args:
        template: 1-D array holding one heartbeat template.

    Returns:
        ``(P, Q, R, S, T)`` amplitudes, ``(P_id, Q_id, R_id, S_id, T_id)``
        sample indices, ``(QRS, PR, PQ, ST, QT)`` index distances and the
        boolean ``error_state``.
    """
    # Derive the last index from the template itself rather than assuming a
    # fixed template length.
    last_id = template.shape[0] - 1
    error_state = False

    # get R: first occurrence of the maximum
    R_id = np.argmax(template)
    R = template[R_id]

    # get Q: lowest point before R
    if R_id == 0:
        Q_id = R_id
        error_state = True
    else:
        Q_id = np.argmin(template[:R_id])
    Q = template[Q_id]

    # get P: highest point before Q
    if Q_id == 0:
        P_id = 0
        error_state = True
    else:
        P_id = np.argmax(template[:Q_id])
    P = template[P_id]

    # get S: lowest point after R
    if R_id == last_id:
        S_id = R_id
        error_state = True
    else:
        S_id = np.argmin(template[R_id + 1:]) + R_id + 1
    S = template[S_id]

    # get T: highest point after S
    if S_id == last_id:
        T_id = last_id
        error_state = True
    else:
        T_id = np.argmax(template[S_id + 1:]) + S_id + 1
    T = template[T_id]

    # Intervals, each measured as end index minus start index.
    QRS = S_id - Q_id
    PR = R_id - P_id
    PQ = Q_id - P_id  # P-to-Q distance (was R_id - Q_id, i.e. the Q-R distance)
    ST = T_id - S_id
    QT = T_id - Q_id
    return (P, Q, R, S, T), (P_id, Q_id, R_id, S_id, T_id), (QRS, PR, PQ, ST, QT), error_state

# Collect the P, Q, R, S, T measurements of every template of every recording.
def get_PQRST(templates_lst):
    """Apply ``get_PQRST_from_template`` to all templates, grouped by recording.

    Args:
        templates_lst: iterable of recordings, each an iterable of templates.

    Returns:
        One dict per recording. Each dict maps the amplitude names, the
        ``*_id`` index names, the interval names and ``"error_state"`` to a
        list holding one value per template.
    """
    # Same order as the three tuples returned by get_PQRST_from_template.
    value_keys = (
        "P", "Q", "R", "S", "T",
        "P_id", "Q_id", "R_id", "S_id", "T_id",
        "QRS", "PR", "PQ", "ST", "QT",
    )

    records = []
    for templates in templates_lst:
        record = {key: [] for key in value_keys}
        record["error_state"] = []
        for template in templates:
            amplitudes, positions, intervals, error_state = get_PQRST_from_template(template)
            for key, value in zip(value_keys, (*amplitudes, *positions, *intervals)):
                record[key].append(value)
            record["error_state"].append(error_state)
        records.append(record)
    return records
# PQRST measurements for the training and the test templates.
PQRST, PQRST_test = get_PQRST(templates_lst), get_PQRST(templates_lst_test)

In [None]:
# Process rpeaks: turn R-peak indices into distances between consecutive peaks.
rpeaks_new = [
    np.array([beats[k] - beats[k - 1] for k in range(1, beats.shape[0])])
    for beats in rpeaks_lst
]

rpeaks_new_test = [
    np.array([beats[k] - beats[k - 1] for k in range(1, beats.shape[0])])
    for beats in rpeaks_lst_test
]

# Process heart_rate: an empty heart-rate series is replaced by the
# placeholder value -100 so later statistics have something to work on.
heart_rate_new = [
    np.array(-100) if rates.shape[0] == 0 else rates
    for rates in heart_rate_lst
]

heart_rate_new_test = [
    np.array(-100) if rates.shape[0] == 0 else rates
    for rates in heart_rate_lst_test
]

In [None]:
def get_PQRST_scope_stats(templates_lst: list, start_ids: list, end_ids: list) -> tuple:
    """Summarise template amplitudes inside a per-template index window.

    For every recording the templates are divided by the recording's overall
    maximum. Each template is then cut to ``[start, end]`` (both inclusive)
    and its mean, std, max and min are taken; these four values are finally
    averaged over the templates of the recording.

    Args:
        templates_lst: one 2-D collection of templates per recording.
        start_ids: per recording, the window start index of each template.
        end_ids: per recording, the window end index of each template.

    Returns:
        Tuple ``(scope_mean, scope_std, scope_max, scope_min)`` of 1-D arrays
        with one entry per recording.
    """
    scope_mean, scope_std, scope_max, scope_min = [], [], [], []
    for templates, start_id, end_id in zip(templates_lst, start_ids, end_ids):
        templates = templates / np.max(templates)  # normalise per recording
        windows = [
            template[s_i: e_i + 1]
            for template, s_i, e_i in zip(templates, start_id, end_id)
        ]
        scope_mean.append(np.mean([np.mean(window) for window in windows]))
        scope_std.append(np.mean([np.std(window) for window in windows]))
        scope_max.append(np.mean([np.max(window) for window in windows]))
        scope_min.append(np.mean([np.min(window) for window in windows]))
    return np.array(scope_mean), np.array(scope_std), np.array(scope_max), np.array(scope_min)

def get_PQRST_range_stats(start_ids: list, end_ids: list) -> tuple:
    """Summarise the index distance ``end - start`` for every recording.

    Args:
        start_ids: per recording, the start index of each template.
        end_ids: per recording, the end index of each template.

    Returns:
        Tuple ``(range_mean, range_std, range_max, range_min)`` of 1-D float
        arrays with one entry per recording.
    """
    range_mean, range_std, range_max, range_min = [], [], [], []
    for start_id, end_id in zip(start_ids, end_ids):
        diff_id = np.asarray(end_id) - np.asarray(start_id)
        range_mean.append(np.mean(diff_id))
        range_std.append(np.std(diff_id))
        range_max.append(np.max(diff_id))
        range_min.append(np.min(diff_id))
    # dtype=float keeps max/min as floating point, like mean and std.
    return (
        np.array(range_mean, dtype=float),
        np.array(range_std, dtype=float),
        np.array(range_max, dtype=float),
        np.array(range_min, dtype=float),
    )

def get_PQRST_slope_stats(templates_lst: list, start_ids: list, end_ids: list) -> tuple:
    """Summarise the straight-line slope between two points of each template.

    For every recording the templates are divided by the recording's overall
    maximum. The slope of a template is the amplitude difference between its
    end and start index divided by the index distance, or 0 when both indices
    coincide.

    Args:
        templates_lst: one 2-D collection of templates per recording.
        start_ids: per recording, the start index of each template.
        end_ids: per recording, the end index of each template.

    Returns:
        Tuple ``(slope_mean, slope_std, slope_max, slope_min)`` of 1-D arrays
        with one entry per recording.
    """
    slope_mean, slope_std, slope_max, slope_min = [], [], [], []
    for templates, start_id, end_id in zip(templates_lst, start_ids, end_ids):
        templates = templates / np.max(templates)  # normalise per recording
        slopes = [
            0 if s_i == e_i else (template[e_i] - template[s_i]) / (e_i - s_i)
            for template, s_i, e_i in zip(templates, start_id, end_id)
        ]
        slope_mean.append(np.mean(slopes))
        slope_std.append(np.std(slopes))
        slope_max.append(np.max(slopes))
        slope_min.append(np.min(slopes))
    return np.array(slope_mean), np.array(slope_std), np.array(slope_max), np.array(slope_min)

In [None]:
# Compute the per-recording statistics used as model features.
def _five_point_summary(groups):
    """Return ``[mean, median, std, min, max]`` arrays, one value per group."""
    return [
        np.array([reduce_fn(group) for group in groups])
        for reduce_fn in (np.mean, np.median, np.std, np.min, np.max)
    ]


def _count_level_crossings(signal, level):
    """Count adjacent sample pairs lying on opposite sides of ``level``."""
    above = signal >= level
    return int(np.count_nonzero(above[:-1] != above[1:]))


def _count_direction_changes(signal):
    """Count sample triples where the ``>=`` relation of neighbours flips."""
    non_increasing = signal[:-1] >= signal[1:]
    return int(np.count_nonzero(non_increasing[:-1] != non_increasing[1:]))


def get_valid_features(rpeaks, heart_rate, PQRST, rpeaks_ids, filtered_ecg, templates_lst, head_len=2000):
    """Build the list of feature arrays, one entry per recording in each array.

    The order of the returned arrays is part of the contract because callers
    concatenate them column-wise.

    Args:
        rpeaks: per recording, the distances between consecutive R-peaks.
        heart_rate: per recording, the heart-rate values.
        PQRST: per recording, the dict produced by ``get_PQRST``.
        rpeaks_ids: per recording, the R-peak sample indices.
        filtered_ecg: per recording, the filtered ECG signal (NumPy array).
        templates_lst: per recording, the heartbeat templates.
        head_len: number of leading samples used by the "first N samples"
            features and the index bound for the peak count.

    Returns:
        List of 1-D arrays in a fixed order.
    """
    features = []

    # R-R distances and heart rate: mean, median, std, min, max.
    features += _five_point_summary(rpeaks)
    features += _five_point_summary(heart_rate)

    # PQRST amplitudes, positions and intervals: mean, median, std, min, max.
    for key in ("P", "Q", "R", "S", "T",
                "P_id", "Q_id", "R_id", "S_id", "T_id",
                "QRS", "PR", "PQ", "ST", "QT"):
        features += _five_point_summary([sample[key] for sample in PQRST])

    wave_ids = {
        wave: [sample[wave + "_id"] for sample in PQRST]
        for wave in ("P", "Q", "R", "S", "T")
    }

    # The same window is requested more than once below; compute it once.
    scope_cache = {}

    def scope_stats(start, end):
        if (start, end) not in scope_cache:
            scope_cache[(start, end)] = get_PQRST_scope_stats(
                templates_lst, wave_ids[start], wave_ids[end])
        return scope_cache[(start, end)]

    def range_stats(start, end):
        return get_PQRST_range_stats(wave_ids[start], wave_ids[end])

    # Amplitude ("scope") and index-distance ("range") statistics, each
    # contributing (mean, std, max, min).
    # NOTE(review): the Q-S scope and the S-T scope each appear twice. The
    # duplicate columns are kept so the feature layout stays unchanged.
    segment_layout = (
        (scope_stats, "Q", "S"),  # QRS
        (scope_stats, "P", "R"),
        (scope_stats, "Q", "T"),
        (scope_stats, "Q", "R"),
        (range_stats, "Q", "R"),
        (scope_stats, "R", "S"),
        (range_stats, "R", "S"),
        (scope_stats, "S", "T"),
        (scope_stats, "P", "Q"),
        (scope_stats, "Q", "S"),  # QS, same window as QRS
        (range_stats, "Q", "S"),
        (scope_stats, "R", "T"),
        (range_stats, "R", "T"),
        (scope_stats, "S", "T"),  # repeated ST scope
        (range_stats, "S", "T"),
    )
    for stats_fn, start, end in segment_layout:
        features += list(stats_fn(start, end))

    # R-S and S-T slopes: (mean, std, max, min).
    for start, end in (("R", "S"), ("S", "T")):
        features += list(get_PQRST_slope_stats(templates_lst, wave_ids[start], wave_ids[end]))

    # How many / what share of the templates hit a boundary case.
    features.append(np.array([np.sum(sample["error_state"]) for sample in PQRST]))
    features.append(np.array([np.mean(sample["error_state"]) for sample in PQRST]))

    # Number of R-peaks with index <= head_len.
    features.append(np.array([(rpeak <= head_len).sum() for rpeak in rpeaks_ids]))

    # Std of the first head_len samples.
    features.append(np.array([np.std(signal[:head_len]) for signal in filtered_ecg]))

    # Total absolute sample-to-sample change of the first head_len samples,
    # after dividing them by their own maximum.
    x_diff = []
    for signal in filtered_ecg:
        head = signal[:head_len] / np.max(signal[:head_len])
        x_diff.append(np.sum(np.abs(head[1:] - head[:-1])))
    features.append(np.array(x_diff))

    # Mean and std of the signal at the R-peaks, relative to the signal maximum.
    features.append(np.array([
        np.mean(signal[peak_ids] / np.max(signal))
        for peak_ids, signal in zip(rpeaks_ids, filtered_ecg)
    ]))
    features.append(np.array([
        np.std(signal[peak_ids] / np.max(signal))
        for peak_ids, signal in zip(rpeaks_ids, filtered_ecg)
    ]))

    # Whole-signal statistics.
    ecg_max = np.array([np.max(signal) for signal in filtered_ecg])
    ecg_min = np.array([np.min(signal) for signal in filtered_ecg])
    features += [
        ecg_max,
        ecg_min,
        np.array([np.mean(signal) for signal in filtered_ecg]),
        np.array([np.std(signal) for signal in filtered_ecg]),
        np.array([np.median(signal) for signal in filtered_ecg]),
        np.abs(ecg_max / ecg_min),  # |max / min|
    ]

    # Number of "saw teeth" over the whole signal.
    features.append(np.array([_count_direction_changes(signal) for signal in filtered_ecg]))

    # Levels whose crossings are counted in the first head_len samples. Each
    # level is a fraction of the whole-signal maximum or minimum; the entry
    # with None is the zero line.
    crossing_levels = (
        (0.75, "max"), (0.5, "max"), (0.25, "max"),
        (0.0, None),
        (0.25, "min"), (0.5, "min"), (0.75, "min"),
        (0.01, "max"), (0.10, "max"), (0.90, "max"), (0.95, "max"), (0.99, "max"),
        (0.01, "min"), (0.10, "min"), (0.90, "min"), (0.95, "min"), (0.99, "min"),
    )
    # Bands (as fractions of max, resp. min) whose sample counts are taken.
    band_edges = ((0.01, 0.99), (0.05, 0.95), (0.1, 0.9), (0.25, 0.75))

    crossing_cols = [[] for _ in crossing_levels]
    pos_band_cols = [[] for _ in band_edges]
    neg_band_cols = [[] for _ in band_edges]
    for signal in filtered_ecg:
        # Extremes come from the whole signal, counts from its head only.
        extreme = {"max": np.max(signal), "min": np.min(signal)}
        head = signal[:head_len]
        for column, (fraction, which) in zip(crossing_cols, crossing_levels):
            level = 0 if which is None else fraction * extreme[which]
            column.append(_count_level_crossings(head, level))
        for column, (low, high) in zip(pos_band_cols, band_edges):
            inside = (head >= low * extreme["max"]) & (head <= high * extreme["max"])
            column.append(inside.sum())
        for column, (low, high) in zip(neg_band_cols, band_edges):
            inside = (head <= low * extreme["min"]) & (head >= high * extreme["min"])
            column.append(inside.sum())

    features += [np.array(column) for column in crossing_cols]
    pos_bands = [np.array(column) for column in pos_band_cols]
    neg_bands = [np.array(column) for column in neg_band_cols]
    # NOTE(review): the 1-99 and 5-95 positive bands are emitted twice; the
    # duplicates are kept so the feature layout stays unchanged.
    features += pos_bands[:2] + pos_bands + neg_bands

    return features

# Feature arrays for the training recordings.
valid_features = get_valid_features(
    rpeaks_new,
    heart_rate_new,
    PQRST,
    rpeaks_lst,
    filtered_lst,
    templates_lst,
)

# Feature arrays for the test recordings.
valid_features_test = get_valid_features(
    rpeaks_new_test,
    heart_rate_new_test,
    PQRST_test,
    rpeaks_lst_test,
    filtered_lst_test,
    templates_lst_test,
)

In [None]:
# Build the new training data X_train_features: every feature array becomes
# one or more columns with one row per training signal.
X_train_features = np.concatenate(
    [column.reshape((X_train.shape[0], -1)) for column in valid_features],
    axis=1,
)

# Same layout for the test signals.
X_test_features = np.concatenate(
    [column.reshape((X_test.shape[0], -1)) for column in valid_features_test],
    axis=1,
)

# Standardise with statistics fitted on the training features only, then
# apply the fitted scaler to both matrices.
feature_scaler = StandardScaler().fit(X_train_features)
X_train_features = feature_scaler.transform(X_train_features)
X_test_features = feature_scaler.transform(X_test_features)

In [None]:
# Persist the feature matrices and labels next to the input data. The paths
# are built from DATA_DIR so they follow the directory the CSVs were read from.
np.save(os.path.join(DATA_DIR, "X_train_features.npy"), X_train_features)
np.save(os.path.join(DATA_DIR, "X_test_features.npy"), X_test_features)
np.save(os.path.join(DATA_DIR, "y_train.npy"), y_train)