In [None]:
# %cd C:\Users\piotr\Desktop\PSG data
# %pip install -r requirements.txt
import wfdb
import mne
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, hamming_loss,ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import FunctionTransformer,StandardScaler
from mne.datasets import sample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from mne_features.feature_extraction import extract_features, FeatureExtractor
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest,f_classif
import neurokit2 as nk
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import warnings
# NOTE(review): no category or module is given, so this suppresses every warning
# raised after this cell runs, including deprecation and numerical warnings
# from the libraries imported above. Consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
def data_creation(record_path, half_window=1500, min_quality=0.5):
    """Build per-annotation HRV features and apnea labels for one WFDB record.

    Reads the ``.apn`` annotations and the signal of the record at
    ``record_path``. For every annotation except the first and the last one,
    a window of ``half_window`` samples on each side of the annotation
    position is cleaned with NeuroKit2, quality-checked and converted into
    one row of HRV features. Windows that fail the quality check or raise
    any error are dropped together with their label.

    Parameters
    ----------
    record_path : str
        Path of the WFDB record, without file extension.
    half_window : int, optional
        Number of samples taken on each side of the annotation position.
        The default (1500) reproduces the previously hard-coded window.
    min_quality : float, optional
        Windows whose mean ``nk.ecg_quality`` value is below this threshold
        are dropped. The default (0.5) is the previously hard-coded value.

    Returns
    -------
    features : numpy.ndarray
        One row of HRV features per kept window. Has shape ``(0, 0)`` when
        no window was kept.
    labels : numpy.ndarray
        ``int32`` labels aligned with the rows of ``features``: 0 where the
        annotation symbol is ``"N"`` (no apnea), 1 otherwise (apnea).
    """
    annotation = wfdb.rdann(record_path, extension='apn')

    # Read the record only once: rdsamp returns the samples together with a
    # metadata dict that already carries the sampling frequency.
    signal, fields = wfdb.rdsamp(record_path)
    sampling_rate = fields['fs']
    signal = signal.flatten()

    # Convert annotation sample indices to times in seconds
    annotation_times = np.array(annotation.sample) / sampling_rate

    # Relabel the annotations: 0 for no apnea ("N"), 1 for apnea
    annotation_labels = np.where(np.array(annotation.symbol) == "N", 0, 1).astype(np.int32)

    # The first and last annotation are skipped
    annotation_times = annotation_times[1:-1]
    annotation_labels = annotation_labels[1:-1]

    feature_rows = []  # one single-row DataFrame per kept window
    bad_times = []     # positions (into annotation_labels) of dropped windows
    for i in range(len(annotation_times)):
        try:
            # Sample index of the annotation, truncated to a whole second.
            # The outer int() keeps slicing valid if the sampling rate is a float.
            center = int(int(annotation_times[i]) * sampling_rate)
            start = center - half_window
            if start < 0:
                # A negative start would silently wrap around to the end of
                # the signal, so treat the window as unusable instead.
                raise ValueError("window starts before the beginning of the signal")

            # Clean the ECG signal
            clean = nk.ecg_clean(signal[start:center + half_window], sampling_rate=sampling_rate)

            # Compute ECG quality and filter bad segments
            quality = np.mean(nk.ecg_quality(clean, sampling_rate=sampling_rate))
            low_quality = quality < min_quality

            if not low_quality:
                # Extract features from cleaned ECG
                peaks = nk.ecg_peaks(clean, sampling_rate=sampling_rate)
                feature_rows.append(nk.hrv(peaks[0], sampling_rate=sampling_rate))

        except Exception as e:
            # If any error occurs, consider this segment as bad
            bad_times.append(i)
            print(f"Warning: ECG quality check failed at index {i}. Marking as bad data. Error: {e}")
            continue

        if low_quality:
            # Reported outside the try block so the window is recorded exactly
            # once and no undefined exception variable is referenced.
            bad_times.append(i)
            print(f"Warning: ECG quality check failed at index {i}. Marking as bad data. Mean quality: {quality}")

    # Concatenate once instead of growing a DataFrame inside the loop.
    if feature_rows:
        features = pd.concat(feature_rows).to_numpy()
    else:
        features = pd.DataFrame().to_numpy()
    labels = np.delete(annotation_labels, np.asarray(bad_times, dtype=int))
    return features, labels

In [None]:
# Extract features and labels for every record named in the training list.
list_of_features_train = []
list_of_labels_train = []
# `with` closes the list file again. Blank lines (for example a trailing
# newline) are skipped: an empty name would build a path to the data folder
# itself instead of to a record.
with open(r"C:\Users\piotr\Desktop\PSG data\Apnea ECG\list_train") as name_file:
    list_of_file_names = [line.strip() for line in name_file if line.strip()]
for record_name in list_of_file_names:
    record_path = fr"C:\Users\piotr\Desktop\PSG data\Apnea ECG\{record_name}"
    features, labels = data_creation(record_path)
    list_of_features_train.append(features)
    list_of_labels_train.append(labels)

In [None]:
# Stack the per-record arrays. A record without any usable window comes back
# from data_creation as an empty (0, 0) array, which np.concatenate cannot
# stack with the others, so such arrays are skipped (their labels are empty too).
extracted_features = np.concatenate(
    [record_features for record_features in list_of_features_train if record_features.size]
)
extracted_labels = np.concatenate(list_of_labels_train)
assert len(extracted_features) == len(extracted_labels), "features and labels are misaligned"

# Treat +/-inf like a missing value.
extracted_features[np.isinf(extracted_features)] = np.nan

# Drop the feature columns that are NaN in every row ...
nan_cols = np.all(np.isnan(extracted_features), axis=0)
clean_features = extracted_features[:, ~nan_cols]

# ... then drop every row that still contains a NaN, together with its label.
nan_rows = np.isnan(clean_features).any(axis=1)
nan_indexes = np.where(nan_rows)[0]
features_cleaned = clean_features[~nan_rows]
labels_cleaned = extracted_labels[~nan_rows]

In [None]:
# Persist the cleaned training arrays in the current working directory.
np.save(file="features_cleaned_30s", arr=features_cleaned)
np.save(file="labels_cleaned_30s", arr=labels_cleaned)

In [None]:
# Extract features and labels for every record named in the test list.
list_of_features_test = []
list_of_labels_test = []
# `with` closes the list file again. Blank lines (for example a trailing
# newline) are skipped: an empty name would build a path to the data folder
# itself instead of to a record.
with open(r"C:\Users\piotr\Desktop\PSG data\Apnea ECG\list_test") as name_file:
    list_of_file_names = [line.strip() for line in name_file if line.strip()]
for record_name in list_of_file_names:
    record_path = fr"C:\Users\piotr\Desktop\PSG data\Apnea ECG\{record_name}"
    features, labels = data_creation(record_path)
    list_of_features_test.append(features)
    list_of_labels_test.append(labels)

In [None]:
# Stack the per-record arrays. A record without any usable window comes back
# from data_creation as an empty (0, 0) array, which np.concatenate cannot
# stack with the others, so such arrays are skipped (their labels are empty too).
extracted_features_test = np.concatenate(
    [record_features for record_features in list_of_features_test if record_features.size]
)
extracted_labels_test = np.concatenate(list_of_labels_test)
assert len(extracted_features_test) == len(extracted_labels_test), "features and labels are misaligned"

# Treat +/-inf like a missing value.
extracted_features_test[np.isinf(extracted_features_test)] = np.nan

# Drop the feature columns that are NaN in every row ...
# NOTE(review): this mask is derived from the test data alone. If it differs
# from the mask computed for the training features, the two cleaned matrices
# end up with different columns - confirm they match before fitting a model.
nan_cols = np.all(np.isnan(extracted_features_test), axis=0)
clean_features_test = extracted_features_test[:, ~nan_cols]

# ... then drop every row that still contains a NaN, together with its label.
nan_rows = np.isnan(clean_features_test).any(axis=1)
nan_indexes = np.where(nan_rows)[0]
features_cleaned_test = clean_features_test[~nan_rows]
labels_cleaned_test = extracted_labels_test[~nan_rows]

In [None]:
# Persist the cleaned test arrays in the current working directory.
np.save(file="features_cleaned_30s_test", arr=features_cleaned_test)
np.save(file="labels_cleaned_30s_test", arr=labels_cleaned_test)

In [None]:
# Display the boolean row mask. When the cells are run top to bottom, `nan_rows`
# was last assigned while cleaning the test features, so this shows the
# test-set mask rather than the training one.
nan_rows

In [None]:
# Sanity check on the cleaned training features: list the distinct values of the
# per-row "contains a NaN" flag.
np.unique(np.any(np.isnan(features_cleaned), axis=1))