In [None]:
import os
from functools import reduce

import numpy as np
import pandas as pd
import scipy
import scipy.stats

import neurokit2 as nk
import xgboost as xgb
from biosppy.signals import ecg
from imblearn.over_sampling import RandomOverSampler
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# --- Input files: names are joined onto root_path by data_loading() ---
root_path = './'
X_train_path, X_test_path = 'X_train.csv', 'X_test.csv'
y_train_path = 'y_train.csv'

# --- Signal processing ---
# Passed to the ECG processing call as its sampling_rate argument.
sampling_rate = 300

# --- Class re-balancing / reproducibility ---
# Forwarded to RandomOverSampler(shrinkage=...).
shrinkage = 0.1
# Shared seed for the over-sampler and the classifier.
random_state = 29

# NOTE(review): not referenced anywhere in the visible notebook (and the name
# is misspelled); kept because other cells may still read it.
threshhold = 0

# Validation split ratio, currently disabled:
# val_ratio = 0.1

## 1. HRV feature extraction

In [None]:
def data_loading(root_path, data_path):
    """Read a CSV file and return its contents without the first column.

    Parameters
    ----------
    root_path : str
        Directory that contains the file.
    data_path : str
        File name, joined onto ``root_path``.

    Returns
    -------
    numpy.ndarray
        2-D array holding every column of the CSV except the first one
        (presumably an id/index column -- confirm against the data files).
    """
    csv_file = os.path.join(root_path, data_path)
    table = pd.read_csv(csv_file)
    return table.values[:, 1:]

# Raw inputs: one row per recording, first CSV column already stripped.
X_train_raw = data_loading(root_path, X_train_path)
X_test_raw = data_loading(root_path, X_test_path)

# Labels arrive as a single-column 2-D array; flatten them to 1-D.
y_train_raw = np.ravel(data_loading(root_path, y_train_path))

In [None]:
def generate_list(matrix):
    """Turn a 2-D array into a list of 1-D arrays with the NaNs removed.

    Each row of ``matrix`` becomes one list entry that contains only the
    row's non-NaN values, so entries may have different lengths.
    """
    return [signal[~np.isnan(signal)] for signal in matrix]

# Thresholds applied to the absolute successive RR-interval differences for
# the NNx counts and pNNx fractions (same unit as the RR intervals).
_NN_THRESHOLDS = (10, 20, 50, 100, 200, 500)

# Constant heart rate substituted when a recording yields no heart-rate sample.
_FALLBACK_HEART_RATE = 90.0


def _mad(values):
    """Return the normal-consistent median absolute deviation of ``values``."""
    # scipy.stats.median_absolute_deviation (scaled by 1.4826 by default) was
    # deprecated in SciPy 1.5 and removed in 1.9. median_abs_deviation with
    # scale='normal' applies the same normal-consistency scaling.
    return scipy.stats.median_abs_deviation(values, scale='normal')


def _summary_stats(values):
    """Return ``[mean, median, std, var, MAD]`` of a 1-D array."""
    return np.array([
        np.mean(values),
        np.median(values),
        np.std(values),
        np.var(values),
        _mad(values),
    ])


def _rr_interval_features(rri):
    """Return the 21 RR-interval statistics for one recording.

    Order: mean, median, std, var, NN10..NN500 (6 counts), pNN10..pNN500
    (6 fractions), RMSSD, CVSD, SDSD, MADNN, MCVNN.
    """
    rri_mean = np.mean(rri)
    rri_median = np.median(rri)
    successive = np.absolute(np.diff(rri))
    n_successive = successive.shape[0]

    counts = [np.count_nonzero(successive > limit) for limit in _NN_THRESHOLDS]
    if n_successive:
        fractions = [count / n_successive for count in counts]
    else:
        # Fewer than three R-peaks: no successive differences exist. Report
        # NaN instead of raising ZeroDivisionError; the caller's NaN check
        # then zeroes the feature block.
        fractions = [np.nan] * len(counts)

    rmssd = np.sqrt(np.mean(successive ** 2))
    madnn = _mad(rri)
    return np.array([
        rri_mean, rri_median, np.std(rri), np.var(rri),
        *counts,
        *fractions,
        rmssd,
        rmssd / rri_mean,      # CVSD
        np.std(successive),    # SDSD
        madnn,
        madnn / rri_median,    # MCVNN
    ])


def generate_features(X, fs=None):
    """Extract heart-rate / R-peak / RR-interval features from ECG signals.

    Parameters
    ----------
    X : iterable of 1-D numpy arrays
        One signal per recording (NaN-free, e.g. from ``generate_list``).
    fs : number, optional
        Sampling rate handed to ``ecg.ecg``. Defaults to the module-level
        ``sampling_rate`` so existing ``generate_features(X)`` calls behave
        as before.

    Returns
    -------
    numpy.ndarray
        One row of 36 values per signal:
        5 heart-rate statistics (mean, median, std, var, MAD),
        5 R-peak amplitude statistics (same order),
        then the same 5 R-peak amplitude statistics again followed by the
        21 RR-interval features. The repeated amplitude columns are kept so
        the column layout stays identical to earlier runs.
        An empty ``X`` yields an empty array.
    """
    if fs is None:
        fs = sampling_rate

    rows = []
    for signal in X:
        out = ecg.ecg(signal=signal, sampling_rate=fs, show=False)

        # --- heart-rate statistics ---
        heart_rates = out['heart_rate'][~np.isnan(out['heart_rate'])]
        if heart_rates.size == 0:
            # No usable heart-rate sample: use the statistics of a constant
            # fallback series (mean = median = fallback, zero spread).
            hr_features = np.array(
                [_FALLBACK_HEART_RATE, _FALLBACK_HEART_RATE, 0.0, 0.0, 0.0])
        else:
            hr_features = _summary_stats(heart_rates)

        # --- amplitude of the raw signal at the detected R-peaks ---
        r_int_features = _summary_stats(signal[out['rpeaks']])

        # --- RR intervals ---
        # Peak timestamps scaled by 1000 (assumes out['ts'] is in seconds,
        # giving milliseconds -- TODO confirm against the biosppy docs).
        rri = np.diff(out['ts'][out['rpeaks']]) * 1000
        r_features = np.concatenate(
            (r_int_features, _rr_interval_features(rri)), axis=None)

        # Any undefined statistic invalidates the whole block; zero it out.
        if np.any(np.isnan(r_features)):
            r_features = np.zeros(r_features.shape[0])

        rows.append(
            np.concatenate((hr_features, r_int_features, r_features), axis=None))

    # Build the matrix once, after the loop (also defined for empty input).
    return np.asarray(rows)

In [None]:
# Drop the NaN entries from every row so each recording is a plain 1-D signal.
train_signals = generate_list(X_train_raw)
test_signals = generate_list(X_test_raw)

# Per-recording feature vectors.
train_features = generate_features(train_signals)
test_features = generate_features(test_signals)

# Standardise using statistics estimated on the training features only.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(train_features)
X_test = scaler.transform(test_features)

In [None]:
# Re-balance the training set by over-sampling the minority class.
# NOTE(review): X_train is overwritten here while y_train_raw is not, so
# re-running only this cell feeds an already resampled X_train together with
# the original labels. Re-run the feature cell first.
# NOTE(review): resampling happens before cross-validation, so resampled
# points can land in validation folds and may inflate the CV score -- confirm
# whether this is intended.
oversampler = RandomOverSampler(
    sampling_strategy='minority',
    shrinkage=shrinkage,
    random_state=random_state,
)
X_train, y_train = oversampler.fit_resample(X_train, y_train_raw)
print(y_train.shape)

In [None]:
# Number of cross-validation folds used by the grid search.
num_KFold = 5
# Width of the feature matrix (not used further in the visible cells).
num_features = X_train.shape[1]

# Hyper-parameter grid for the XGBoost classifier. Single-element lists pin a
# value; only max_depth, learning_rate and reg_alpha are actually searched.
# reg_lambda is not part of the grid.
xgb_param_grid = dict(
    n_estimators=[160],
    max_depth=[5, 15],
    subsample=[0.9],
    min_child_weight=[5],
    colsample_bytree=[0.6],
    colsample_bylevel=[0.8],
    colsample_bynode=[0.5],
    # Float-step arange: the stop value 0.25 is excluded.
    learning_rate=np.arange(0.15, 0.25, step=0.05),
    reg_alpha=[1e-2, 1e-1, 1e0],
)

In [None]:
fit_params={
    'verbose': False}

xgboost = xgb.XGBClassifier(verbosity = 0, objective='multi:softmax', tree_method='gpu_hist', 
                            random_state=random_state, gpu_id=0, predictor='gpu_predictor')

%time clsf = GridSearchCV(xgboost, xgb_param_grid, scoring='f1_micro', n_jobs=16, cv=num_KFold).fit(X_train, y_train, **fit_params)

In [None]:
# Summarise the winning configuration of the grid search.
best_model = clsf.best_estimator_
report = (
    ("Best Estmator: ", best_model),
    ("Best Score: ", clsf.best_score_),
    ("Feature Importances: ", best_model.feature_importances_),
)
for label, value in report:
    print(label, value)

## 3. Test Results

In [None]:
# Output file for the test-set predictions, written next to the input data.
y_test_path = 'y_test_yutong_v26.csv'
submission_file = os.path.join(root_path, y_test_path)

# Predict with the refitted best estimator of the grid search.
y_test = clsf.predict(X_test)

# One integer label per row; the row index is written as the 'id' column.
df_result = pd.DataFrame(data=y_test.astype(int), columns=['y'])
df_result.to_csv(submission_file, index_label='id')