In [None]:
!pip install -q wfdb neurokit2

In [None]:
import os
import wfdb
import math
import numpy as np
import pandas as pd
from scipy import interpolate
from scipy import signal
import neurokit2 as nk
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the machine-measurement table; the default integer index is kept.
measure_df = pd.read_csv(
    "/kaggle/input/mimic-iv-full-3-5gb/mimic - iv 3.5gb/machine_measurements.csv",
    index_col=None,
)
measure_df

In [None]:
# True when at least one study_id value occurs more than once in measure_df.
has_duplicates = measure_df.duplicated(subset=['study_id']).any()
print('has duplicates ', has_duplicates)

In [None]:
# Restrict to subjects with id <= 10370032 and renumber the rows from 0.
subject_in_range = measure_df['subject_id'].le(10370032)
measure_df = measure_df[subject_in_range].reset_index(drop=True)
measure_df

In [None]:
def find_class(df, row_no):
    """Derive a rhythm class label from the free-text report columns of one row.

    The row's ``report_0`` ... ``report_17`` cells are lower-cased (missing
    values are skipped) and searched by substring, in priority order:

    1. ``"bradycardia"``         -> ``"SB"``
    2. ``"tachycardia"``         -> ``"ST"``
    3. ``"atrial fibrillation"`` -> ``"AF"``
    4. any report containing ``"abnormal"`` but not ``"possible"`` -> ``None``
    5. otherwise                 -> ``"NORM"``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``report_0`` ... ``report_17`` columns.
    row_no : int
        Positional (``iloc``) index of the row to classify -- not an index label.

    Returns
    -------
    str or None
        ``"SB"``, ``"ST"``, ``"AF"``, ``"NORM"``, or ``None`` when the row is
        flagged as abnormal without matching one of the three rhythm keywords.
    """
    # Materialise the row once: df.iloc[row_no] builds a new Series on every
    # call, so doing it inside the column loop repeated that work 18 times.
    row = df.iloc[row_no]

    report_list = []
    for pos in range(18):
        rep = row[f'report_{pos}']
        if not pd.isna(rep):
            report_list.append(rep.lower())

    # Keywords are checked in priority order across *all* reports, so a
    # bradycardia mention anywhere wins over a tachycardia mention, and so on.
    keyword_to_class = (
        ("bradycardia", "SB"),
        ("tachycardia", "ST"),
        ("atrial fibrillation", "AF"),
    )
    for keyword, class_name in keyword_to_class:
        if any(keyword in rep for rep in report_list):
            return class_name

    # An "abnormal" statement not softened by "possible" excludes the record.
    if any("abnormal" in rep and "possible" not in rep for rep in report_list):
        return None

    return "NORM"

# Spot-check the classifier on the row at position 1050.
ret = find_class(df=measure_df, row_no=1050)
ret

In [None]:
# Load the record list (provides the file_name / path columns used below).
recoder_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/mimic-iv-full-3-5gb/mimic - iv 3.5gb/record_list.csv"
)
recoder_df

In [None]:
# True when at least one file_name value occurs more than once in recoder_df.
has_duplicates = recoder_df.duplicated(subset=['file_name']).any()
print('has duplicates ', has_duplicates)

In [None]:
from tqdm import tqdm

# find_class() addresses rows by *position* (df.iloc), so iterate over positions
# explicitly. Passing `row.name` (the index label) only gave the right row while
# the frame carried a fresh 0..n-1 index.
measure_df["class name"] = [
    find_class(measure_df, pos)
    for pos in tqdm(range(len(measure_df)), desc="Classifying reports")
]

# Keep only the identifying columns plus the derived class, then display.
measure_df = measure_df[["subject_id","study_id",'ecg_time','class name']]
measure_df

In [None]:
# Left-join the class labels onto the record list, keyed by subject_id + study_id;
# records without a matching measurement row get a missing class name.
class_lookup = measure_df[["subject_id", "study_id", "class name"]]
final_df = pd.merge(
    recoder_df,
    class_lookup,
    how="left",
    on=["subject_id", "study_id"],
)

final_df

In [None]:
# Keep only records that received a class. dropna() removes both NaN (no matching
# measurement row after the left merge) and None (find_class rejected the report),
# because pandas treats None as missing. The former `!= None` filter was dropped:
# an element-wise comparison against None is True for every row, so it never
# removed anything.
final_df = final_df.dropna(subset=['class name'])

# final_df now has no missing values in 'class name'
final_df

In [None]:
# Define the base directory: root folder of the dataset on the Kaggle input mount.
# Relative record paths (the "path" column of final_df) are joined onto it.
base_dir = "/kaggle/input/mimic-iv-full-3-5gb/mimic - iv 3.5gb"

# Function to check if the .hea file exists
def check_hea_exists(path):
    """Return True if `<base_dir>/<path>.hea` exists on disk, else False."""
    header_file = os.path.join(base_dir, path) + ".hea"
    return os.path.exists(header_file)

# Keep only the rows whose .hea file is present, with a tqdm progress bar
# while the paths are checked; the index is renumbered from 0 afterwards.
tqdm.pandas()
hea_present = final_df["path"].progress_apply(check_hea_exists)
final_df = final_df[hea_present].reset_index(drop=True)
final_df

In [None]:
# resampling to 250Hz
def resampling(array, freq, kind='linear', target_freq=250):
    """Resample a 1-D signal from `freq` Hz to `target_freq` Hz by interpolation.

    Parameters
    ----------
    array : array-like, 1-D
        Input samples, treated as evenly spaced.
    freq : int or float
        Sampling rate of `array`.
    kind : str, optional
        Interpolation kind forwarded to ``scipy.interpolate.interp1d``.
    target_freq : int or float, optional
        Desired output sampling rate. Defaults to 250, the previously
        hard-coded value, so existing calls behave as before.

    Returns
    -------
    numpy.ndarray
        ``floor(len(array) * target_freq / freq)`` samples spanning the same
        interval as the input (first and last input samples are preserved
        whenever at least two output samples are produced).
    """
    n_in = len(array)
    # Multiply before dividing: n_in / freq * target_freq can land a hair below
    # the exact integer (e.g. 0.29 * 100 -> 28.999...) and int() would then
    # silently drop one output sample.
    n_out = int(n_in * target_freq / freq)
    t = np.linspace(1, n_in, n_in)
    f = interpolate.interp1d(t, array, kind=kind)
    t_new = np.linspace(1, n_in, n_out)
    return f(t_new)

# standard normalization 
def normalize(data):
    """Fit a fresh StandardScaler on `data` and return the transformed array."""
    return StandardScaler().fit_transform(data)

In [None]:
base_dir = "/kaggle/input/mimic-iv-full-3-5gb/mimic - iv 3.5gb"

# main
feature_path = './Feature'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(feature_path, exist_ok=True)

for i, row in tqdm(final_df.iterrows(), total=final_df.shape[0], desc="Processing ECG Data"):
    tri_path = os.path.join(base_dir, row['path'])
    # Read the raw .dat file as little-endian int16 and view it as 12 columns.
    ecg_data = np.fromfile(tri_path+'.dat', dtype='<i2').reshape(-1, 12)
    # Resample every column independently (input rate passed as 500; the
    # resampler's default target is used), then stack back to (samples, columns).
    trial = np.array([
        resampling(ecg_data[:, ch], freq=500, kind='linear')
        for ch in range(ecg_data.shape[1])
    ]).T
    trial_norm = normalize(trial)
    # Split the trial into consecutive 250-sample windows (10s trial -> 1s samples).
    sub = trial_norm.reshape(-1, 250, trial_norm.shape[-1])
    # File number is the row index + 1, zero-padded to five digits.
    np.save(feature_path + f'/feature_{i+1:05d}.npy', sub)



In [None]:
# Load one of the saved feature files back to inspect its contents.
loaded = np.load(file='./Feature/feature_00003.npy')
loaded

In [None]:
# Walk one hard-coded record through the same steps as the main loop
# (read -> resample each column -> normalize -> window) and display the result.
tri_path = os.path.join(base_dir, 'files/p1001/p10011668/s44623549/44623549')
ecg_data = np.fromfile(tri_path+'.dat', dtype='<i2').reshape(-1, 12)
trial = np.array([
    resampling(ecg_data[:, ch], freq=500, kind='linear')
    for ch in range(ecg_data.shape[1])
]).T
trial_norm = normalize(trial)
sub = np.array([trial_norm])
sub = sub.reshape(-1, 250, sub.shape[-1])  # split 10s trial into 1s sample
sub

In [None]:
# label.npy

label_path = './Label'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(label_path, exist_ok=True)

# Classes SB, ST and AF all map to 0; every other class name maps to 1.
# NOTE(review): the three rhythm classes collapse into a single label here --
# confirm that a binary target is intended.
rhythm_tags = ('SB', 'ST', 'AF')

label = []
for i, row in tqdm(final_df.iterrows(), total=final_df.shape[0], desc="Processing ECG Data Labels"):
    class_name = row['class name']
    diag = 0 if any(tag in class_name for tag in rhythm_tags) else 1
    # Second column is the row index + 1, the same number used in the
    # feature_XXXXX.npy file names.
    label.append([int(diag), int(i+1)])
label = np.array(label)
print(label)
np.save(label_path + '/label.npy', label)

In [None]:
# Number of label rows produced above.
label.shape[0]