# Task 2

## Import Libraries

In [1]:
# import library
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
import sklearn
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, r_regression, f_regression
from sklearn.gaussian_process.kernels import Matern, RBF, CompoundKernel, Product, Sum, ExpSineSquared, RationalQuadratic
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, IsolationForest, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, make_scorer
from sklearn.decomposition import PCA

# boost algorithm
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

# torch
import torch
from torch import nn
from torch.nn import Module, Linear, Dropout
from torch.nn.functional import tanh, softmax, mse_loss, relu, sigmoid
from torch.optim import Adam, SGD

# bio library
import biosppy
from biosppy import storage
from biosppy.signals import ecg

# Directory (relative path) that holds the X_train.csv, X_test.csv and
# y_train.csv files read by the data-loading cell.
DATA_DIR = "Data"

## Data Preprocessing

### Load Data

In [2]:
# Load Data
# Every CSV is read with its first row as header and its first column as index.
X_train_df, X_test_df, y_train_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name), header=0, index_col=0)
    for csv_name in ("X_train.csv", "X_test.csv", "y_train.csv")
)

# Plain NumPy views of the frames; the target frame is flattened to 1-D.
X_train, X_test = X_train_df.values, X_test_df.values
y_train = y_train_df.values.ravel()

In [None]:
# Get the valid length of every signal
def get_valid_lengths(signals: np.ndarray) -> np.ndarray:
    """Return the number of leading non-NaN samples in each row.

    Args:
        signals: 2-D array with one signal per row.

    Returns:
        1-D integer array holding, for each row, the index of its first NaN,
        or ``signals.shape[1]`` when the row contains no NaN at all.
    """
    nan_mask = np.isnan(signals)
    # argmax on a boolean row yields the position of the first True (first NaN);
    # rows without any NaN would report 0, so they are mapped to the full width.
    first_nan = nan_mask.argmax(axis=1)
    return np.where(nan_mask.any(axis=1), first_nan, signals.shape[1])


# The same rule is applied to both splits instead of repeating the loop twice.
X_train_len = get_valid_lengths(X_train)
X_test_len = get_valid_lengths(X_test)

In [None]:
# get ecg features
# One accumulator per output of ecg.ecg, filled in the order of the rows of X_train.
ts_lst, filtered_lst, rpeaks_lst = [], [], []
templates_ts_lst, templates_lst = [], []
heart_rate_ts_lst, heart_rate_lst = [], []

for signal, sig_len in zip(X_train, X_train_len):
    # Only the first sig_len samples of the row are handed to the ECG pipeline.
    (ts, filtered, rpeaks, templates_ts,
     templates, heart_rate_ts, heart_rate) = ecg.ecg(signal[:sig_len], sampling_rate=300., show=False)

    ts_lst.append(ts)                        # Signal time axis reference (seconds)
    filtered_lst.append(filtered)            # Filtered ECG signal
    rpeaks_lst.append(rpeaks)                # R-peak location indices
    templates_ts_lst.append(templates_ts)    # Templates time axis reference
    templates_lst.append(templates)          # Extracted heartbeat templates
    heart_rate_ts_lst.append(heart_rate_ts)  # Heart rate time axis reference (seconds)
    heart_rate_lst.append(heart_rate)        # Instantaneous heart rate (bpm)

In [None]:
# get average template
# Largest sample value found in any single template of any signal;
# stays None when there is no template at all.
template_peaks = [
    np.max(template)
    for templates_of_signal in templates_lst
    for template in templates_of_signal
]
max_height = max(template_peaks) if template_peaks else None

# The scaler is currently just a plain rescaling; it is not yet clear whether
# the absolute height carries useful information.
def scaler(template: np.ndarray, height=None) -> np.ndarray:
    """Divide ``template`` by a reference height.

    Args:
        template: Array of template samples to rescale.
        height: Divisor to use. When omitted, the module-level ``max_height``
            is used, which is the behaviour existing callers rely on.

    Returns:
        ``template / height`` as a new array.
    """
    if height is None:
        # Fall back to the notebook-wide maximum computed from the templates.
        height = max_height
    return template / height

def get_average_templates(templates):
    """Rescale a stack of heartbeat templates with ``scaler`` and average them.

    The average is taken along axis 0: the rescaled rows are summed and the
    sum is divided by the number of rows (``shape[0]``).
    """
    scaled = scaler(templates)
    n_templates = scaled.shape[0]
    return scaled.sum(axis=0) / n_templates

# One averaged template per entry of templates_lst, in the same order.
avg_templates_lst = list(map(get_average_templates, templates_lst))

In [None]:
# Try out the PQRST extraction on a single averaged template
sample_index = 10
templates = avg_templates_lst[sample_index]

def get_PQRST(template: np.ndarray):
    """Locate the P, Q, R, S and T points on a single 1-D template.

    The points are found by successive extremum searches:
    R is the global maximum, Q the minimum before R, P the maximum before Q,
    S the minimum after R and T the maximum after S. Ties resolve to the
    first occurrence.

    Args:
        template: 1-D array of samples. This argument is what gets analysed;
            the function no longer reads the global ``templates`` variable.

    Returns:
        A 3-tuple ``((R, Q, P, S, T), (R_id, Q_id, P_id, S_id, T_id),
        (QRS, PR, PQ, ST, QT))`` holding the sample values, their indices in
        ``template`` and index differences between the points.

    Raises:
        ValueError: If ``template`` is not a non-empty 1-D array, or if one of
            the search windows is empty (e.g. the maximum is the first or the
            last sample).
    """
    template = np.asarray(template)
    if template.ndim != 1 or template.size == 0:
        raise ValueError("template must be a non-empty 1-D array")
    n_samples = template.shape[0]

    # get R: global maximum
    R_id = np.argmax(template)
    R = template[R_id]

    # get Q: minimum strictly before R
    if R_id == 0:
        raise ValueError("no samples before the R point; cannot locate Q")
    Q_id = np.argmin(template[:R_id])
    Q = template[Q_id]

    # get P: maximum strictly before Q
    if Q_id == 0:
        raise ValueError("no samples before the Q point; cannot locate P")
    P_id = np.argmax(template[:Q_id])
    P = template[P_id]

    # get S: minimum strictly after R (offset maps the slice index back)
    if R_id + 1 >= n_samples:
        raise ValueError("no samples after the R point; cannot locate S")
    S_id = np.argmin(template[R_id + 1:]) + R_id + 1
    S = template[S_id]

    # get T: maximum strictly after S (offset maps the slice index back)
    if S_id + 1 >= n_samples:
        raise ValueError("no samples after the S point; cannot locate T")
    T_id = np.argmax(template[S_id + 1:]) + S_id + 1
    T = template[T_id]

    # Holds by construction of the search windows; kept as a sanity check.
    assert (P_id < Q_id and Q_id < R_id and R_id < S_id and S_id < T_id)

    # cal interval (all values are differences of sample indices)
    QRS = S_id - Q_id
    PR = R_id - P_id
    # NOTE(review): "PQ" is computed as R_id - Q_id, not Q_id - P_id; left
    # unchanged to keep the returned values stable -- confirm this is intended.
    PQ = R_id - Q_id
    ST = T_id - S_id
    QT = T_id - Q_id
    return (R, Q, P, S, T), (R_id, Q_id, P_id, S_id, T_id), (QRS, PR, PQ, ST, QT)

# Locate the characteristic points on the sample template and mark them on its curve
peak_values, peak_ids, peak_intervals = get_PQRST(templates)
R, Q, P, S, T = peak_values
R_id, Q_id, P_id, S_id, T_id = peak_ids
QRS, PR, PQ, ST, QT = peak_intervals

sample_axis = np.arange(templates.shape[0])
plt.plot(sample_axis, templates)
plt.scatter([R_id, Q_id, P_id, S_id, T_id], [R, Q, P, S, T], c="r")

NameError: name 'avg_templates_lst' is not defined

In [None]:
# get P Q R S T
for templates in avg_templates_lst:
    R_id = np.where(templates == np.max(templates))