In [1]:
import sys
sys.path.append('..')

import numpy as np
import h5py
import os
from collections import Counter

from lib.CustomDataset import TimeSeriesHDF5Dataset
from lib.Utilities import *


In [2]:
# Build a segmented dataset from the ABP waveform of a single recording.
# NOTE(review): the last two arguments appear to be the segment length in
# seconds and the overlap fraction (the log below reports "segments of 10
# seconds with overlap of 50.0%") — confirm against lib.CustomDataset.
filepath = '/storage/ms5267@drexel.edu/precicecap_downloads/90_Patient_2023-03-21_12:19.h5'
t  = TimeSeriesHDF5Dataset(filepath, 'Waveforms/ABP_na','Waveforms/ABP_na_Timestamps',10,0.5)

18:41:09 :	  Reading /storage/ms5267@drexel.edu/precicecap_downloads/90_Patient_2023-03-21_12:19.h5 

18:41:09 :	  Sampling frequency for this file is: 125 

18:41:09 :	  There are a total of : 4586 segments of 10 seconds with overlap of 50.0% 



In [3]:
# Same recording as the ABP cell, this time reading the ECG lead II waveform.
# Rebinds the notebook-level names `filepath` and `t`.
filepath = '/storage/ms5267@drexel.edu/precicecap_downloads/90_Patient_2023-03-21_12:19.h5'
t  = TimeSeriesHDF5Dataset(filepath, 'Waveforms/ECG_II','Waveforms/ECG_II_Timestamps',10,0.5)

18:41:10 :	  Reading /storage/ms5267@drexel.edu/precicecap_downloads/90_Patient_2023-03-21_12:19.h5 

18:41:10 :	  Sampling frequency for this file is: 500 

18:41:10 :	  Frequency will be resampled to 125Hz. 

18:41:10 :	  There are a total of : 4586 segments of 10 seconds with overlap of 50.0% 



In [2]:
import yaml


# Read the project configuration from the YAML file one directory up.
config_path = '../config.yaml'
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

# Location prefix of the '<data file name>-annotations.csv' files; read as a
# module-level global by is_artifact_overlap.
annotation_dir = config['annotation_dir']

def load_annotation_file(ann_file_path):
    """Load a headerless, 8-column annotation CSV into a DataFrame of strings.

    Every field is passed through a converter that unwraps values written as
    byte-string literals (``b'ABP'`` -> ``ABP``) and removes stray surrounding
    single quotes.

    Args:
        ann_file_path (str): Path to the annotation CSV file.

    Returns:
        pandas.DataFrame: One row per CSV line with the columns ``ID1``,
        ``ID2``, ``Session``, ``Data_Type``, ``Start_Time``, ``End_Time``,
        ``Signal_Type`` and ``Lead_Type``. All values are ``str``.
    """
    import pandas as pd

    def convert_bytes(b):
        # Unwrap a value serialized as a bytes literal (b'...').
        # str.strip("b'") must NOT be used here: its argument is a *set of
        # characters*, so it would also eat legitimate leading/trailing 'b'
        # characters of the payload (e.g. b'bob' -> 'o').
        text = b.decode('utf-8') if isinstance(b, bytes) else b
        if len(text) >= 3 and text.startswith("b'") and text.endswith("'"):
            return text[2:-1]
        # Not a bytes literal: only drop surrounding single quotes, if any.
        return text.strip("'")

    # Load the CSV file, applying the conversion to all eight columns
    df = pd.read_csv(ann_file_path, converters={i: convert_bytes for i in range(8)}, header=None)

    # NOTE(review): is_artifact_overlap reads the first two columns as the
    # artifact start/end indices, so the 'ID1'/'ID2' labels may be misleading —
    # confirm against the annotation file format before relying on the names.
    df.columns = ['ID1', 'ID2', 'Session', 'Data_Type', 'Start_Time', 'End_Time', 'Signal_Type', 'Lead_Type']

    return df

def is_artifact_overlap(file_path, mode, candidate_interval, verbose=True):
    """Report whether a candidate interval overlaps any annotated artifact.

    The annotation file is looked up under the module-level ``annotation_dir``
    as ``<basename of file_path>-annotations.csv``. Its rows are filtered by
    signal type (second-to-last column) and the first two columns of the
    remaining rows are used as the artifact ``[start, end]`` pairs.

    Args:
        file_path (str): Path of the data file; only its base name is used,
            to derive the annotation file name.
        mode (str): ``'ABP'`` keeps annotations whose signal type is one of
            ABP/ART/ART1/ART2; any other value keeps ``'ECG'`` annotations.
        candidate_interval (sequence): ``(start_idx, end_idx)`` pair to test.
        verbose (bool): When True (the default, matching the historical
            behavior) the selected artifact array is printed.

    Returns:
        bool: True if ``has_artifact`` finds an overlap, False otherwise.
    """
    import os
    file_name = os.path.basename(file_path)
    # os.path.join gives the same result as plain concatenation when
    # annotation_dir ends with a separator, and a valid path when it does not.
    annotation_file_name = os.path.join(annotation_dir, file_name + '-annotations.csv')
    annotation_df = load_annotation_file(annotation_file_name)

    # Series.isin requires a list-like; a bare string such as 'ECG' raises
    # TypeError, so both branches must produce a list.
    if mode == 'ABP':
        signal_types = ['ABP', 'ART', 'ART1', 'ART2']
    else:
        signal_types = ['ECG']

    # Keep only the annotations for the requested signal family
    filtered_df = annotation_df[annotation_df.iloc[:, -2].isin(signal_types)]
    # Extract the first two columns and convert to an integer NumPy array
    artifact_arr = filtered_df.iloc[:, :2].astype(int).to_numpy()

    if verbose:
        print(artifact_arr)
    return has_artifact(candidate_interval, artifact_arr)


def has_artifact(candidate_interval, artifacts):
    """Tell whether ``candidate_interval`` overlaps any entry of ``artifacts``.

    Two intervals overlap when the later of the two starts lies strictly
    before the earlier of the two ends; intervals that merely touch, and
    zero-length artifacts, therefore do not count.

    Args:
        candidate_interval: Indexable ``(start, end)`` pair.
        artifacts: Iterable of indexable ``(start, end)`` pairs.

    Returns:
        bool: True as soon as one overlapping artifact is found, else False.
    """
    return any(
        max(candidate_interval[0], span[0]) < min(candidate_interval[1], span[1])
        for span in artifacts
    )


# Spot-check: does the interval [523176, 622176] of this recording overlap
# any ABP-family artifact annotation? The cell's value is the boolean result.
datafile = '/storage/ms5267@drexel.edu/precicecap_downloads/90_Patient_2023-03-21_12:19.h5'
is_artifact_overlap(datafile, 'ABP', [523176,622176])

[[  39767   40392]
 [ 332635  333385]
 [ 256794  256919]
 [ 256919  257544]
 [ 255170  255795]
 [ 522176  524175]
 [ 335384  347754]
 [     35      35]
 [ 194197  194697]
 [  34646   34771]
 [ 227932  228432]
 [  99111   99486]
 [ 307147  307522]
 [ 308396  308771]
 [ 805175  805799]
 [1032074 1032448]
 [1125157 1125282]
 [1287960 1369173]
 [1857456 1857706]
 [2270522 2272895]
 [2821519 2822518]
 [ 976099 1003836]
 [1047817 1048066]
 [1084675 1085175]
 [1939919 1955787]
 [2294886 2295136]
 [1852708 1852958]
 [1021078 1029450]
 [1102667 1111413]
 [1269718 1283462]
 [2618236 2618611]
 [2816521 2817770]
 [ 308521  308771]
 [ 100860  101110]
 [ 227932  228557]
 [ 335384  347754]
 [     35      35]
 [  34646   34771]
 [     35      35]
 [ 194322  194697]
 [  37019   37144]
 [ 332510  333760]
 [2867623 2867623]
 [ 255295  257544]
 [2867623 2867623]
 [  39517   40517]
 [ 307147  307522]
 [  97986   99611]
 [ 521426  524300]
 [1007210 1007460]
 [1489120 1489745]
 [2089352 2091601]
 [2613238 26

True

In [11]:
import torch
from torch.utils.data import DataLoader
from lib.CustomDataset import TimeSeriesHDF5Dataset
from tqdm.notebook import tqdm

############### SCRIPT VARIABLES #######################
# NOTE: this cell uses `yaml` without importing it; it relies on an earlier
# cell having executed `import yaml`.
config_path = '../config.yaml'
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

# Segmentation settings shared through the config file; compute_mean_std uses
# segment_length_sec * sampling_rate as the per-segment sample count.
segment_length_sec = config['segment_length_sec']
sampling_rate = config['sampling_rate']
overlap = config['overlap']

# Hyper-parameters; not referenced by the cells visible in this notebook
# except batch_size, which sizes the DataLoader batches in compute_mean_std.
latent_dim = 20
lr = 1e-4
epochs = 5
batch_size=64
percentile_threshold = 99.9
# Module-level n; compute_mean_std keeps its own local counter named n.
n=0

# Device the filtered batch is moved to inside compute_mean_std.
device = 'cpu'
best_model_path = 'models/deep_clean_abp_best.pt'
# Folder that the file names in train_files are joined onto.
directory_path = '/storage/ms5267@drexel.edu/precicecap_downloads/'
# Passed as the second argument to TimeSeriesHDF5Dataset in compute_mean_std.
mode = 'ABP'
#########################################################

def compute_mean_std(train_files, accum_device=None):
    """Compute the per-sample-position mean and standard deviation of all
    retained training segments.

    For every file a ``TimeSeriesHDF5Dataset`` is built from the module-level
    ``directory_path``, ``mode``, ``segment_length_sec`` and ``overlap``;
    batches are filtered with ``filter_batch`` and the running sum and sum of
    squares are accumulated, from which mean and std are derived.

    Args:
        train_files (iterable of str): File names relative to
            ``directory_path``.
        accum_device (str | torch.device | None): Device holding the running
            sums and the returned tensors. ``None`` (default) selects
            ``'cuda'`` when available — the historical behavior — and falls
            back to ``'cpu'`` otherwise instead of failing.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(mean, std_dev)``, each a float32
        tensor of shape ``(1, int(segment_length_sec * sampling_rate))`` on
        ``accum_device``.

    Raises:
        ValueError: If no segment survives filtering, so the statistics are
            undefined (previously this silently produced NaNs).
    """
    if accum_device is None:
        accum_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    accum_device = torch.device(accum_device)

    segment_len = int(segment_length_sec * sampling_rate)
    # Accumulate in float64: E[x^2] - E[x]^2 suffers from cancellation, and
    # float32 sums over many segments lose enough precision to matter.
    sum_data = torch.zeros((1, segment_len), dtype=torch.float64, device=accum_device)
    sum_sq_data = torch.zeros((1, segment_len), dtype=torch.float64, device=accum_device)
    n = 0

    with torch.no_grad():
        for filename in tqdm(train_files):
            log_info(f"Processing {filename}")
            datafile = os.path.join(directory_path, filename)

            # Load the dataset
            dataset = TimeSeriesHDF5Dataset(datafile, mode, segment_length_sec, overlap)
            dataloader = DataLoader(
                dataset,
                batch_size=batch_size,
                num_workers=4,
                shuffle=False,
                # Pinned memory only helps (and only avoids a warning) when
                # the batches are subsequently copied to a CUDA device.
                pin_memory=(accum_device.type == 'cuda'),
            )

            # Loop through batches in the DataLoader
            for _, data, _ in dataloader:
                # assumes filter_batch returns a mask/index selecting the
                # segments to keep along the batch dimension — TODO confirm
                keep = filter_batch(data)
                data = data.unsqueeze(1).float().to(device)[keep]

                data = data.to(accum_device, dtype=torch.float64)
                sum_data += torch.sum(data, dim=0)
                sum_sq_data += torch.sum(data ** 2, dim=0)
                # Update sample count
                n += len(data)

    if n == 0:
        raise ValueError("compute_mean_std: no segments left after filtering; cannot compute statistics")

    # Compute mean and standard deviation
    mean = sum_data / n
    # Clamp guards against tiny negative variances caused by rounding, which
    # would otherwise turn into NaN under sqrt.
    variance = torch.clamp(sum_sq_data / n - mean ** 2, min=0.0)
    std_dev = torch.sqrt(variance)

    # Return float32 tensors, matching the dtype produced historically.
    return mean.float(), std_dev.float()

# Recordings (file names under directory_path) whose segments feed the
# statistics; the result is bound to the notebook-level names `mean` and `std`.
train_files = ['59_Patient_2022-01-31_23:19.h5'
, '74_Patient_2023-08-05_06:00.h5'
, '110_Patient_2023_Sep_28__23_52_07_705708.h5'
, '90_Patient_2023-03-21_19:57.h5', '4_Patient_2022-02-05_08:59.h5', '73_Patient_2017_Dec_18__11_19_55_297272.h5', '34_Patient_2023-04-04_22:31.h5', '53_Patient_2023-06-25_21:39.h5', '101_Patient_2023_Nov_9__22_24_41_155873.h5', '90_Patient_2023-03-21_12:19.h5', '50_Patient_2023-06-12_21:10.h5', '35_Patient_2023-04-03_19:51.h5', '55_Patient_2023-06-13_00:47.h5', '139_Patient_2024_Mar_4__7_32_51_662674.h5', '34_Patient_2023-04-05_12:23.h5']
mean, std= compute_mean_std(train_files)


  0%|          | 0/15 [00:00<?, ?it/s]

17:32:43 :	  Processing 59_Patient_2022-01-31_23:19.h5 

17:32:44 :	  Processing 74_Patient_2023-08-05_06:00.h5 

17:33:41 :	  Processing 110_Patient_2023_Sep_28__23_52_07_705708.h5 

17:36:13 :	  Processing 90_Patient_2023-03-21_19:57.h5 

17:37:54 :	  Processing 4_Patient_2022-02-05_08:59.h5 

17:37:55 :	  Processing 73_Patient_2017_Dec_18__11_19_55_297272.h5 

17:37:58 :	  Processing 34_Patient_2023-04-04_22:31.h5 

17:38:00 :	  Processing 53_Patient_2023-06-25_21:39.h5 

17:38:01 :	  Processing 101_Patient_2023_Nov_9__22_24_41_155873.h5 

17:38:43 :	  Processing 90_Patient_2023-03-21_12:19.h5 

17:38:46 :	  Processing 50_Patient_2023-06-12_21:10.h5 

17:39:39 :	  Processing 35_Patient_2023-04-03_19:51.h5 

17:39:40 :	  Processing 55_Patient_2023-06-13_00:47.h5 

17:41:07 :	  Processing 139_Patient_2024_Mar_4__7_32_51_662674.h5 

17:41:13 :	  Processing 34_Patient_2023-04-05_12:23.h5 



In [7]:
# Sanity check: both statistics should have one value per sample position.
mean.shape, std.shape

(torch.Size([1, 1250]), torch.Size([1, 1250]))

In [12]:
# Persist the ABP statistics next to the models.
# NOTE(review): the tensors are saved on whatever device they currently live
# on; a later torch.load without map_location will try to restore them to
# that same device — confirm this is intended for the consumers of these files.
torch.save(mean, '../models/mean_abp_10sec')
torch.save(std, '../models/std_abp_10sec')

In [13]:
# Display the mean tensor currently bound to `mean`.
mean

tensor([[79.6564, 79.1526, 78.6991,  ..., 77.3311, 77.6051, 77.9201]],
       device='cuda:0')

In [14]:
# Scalar summary of the mean tensor (sum over all sample positions).
torch.sum(mean)

tensor(101781.5703, device='cuda:0')

In [15]:
# Display the standard-deviation tensor currently bound to `std`.
std

tensor([[28.4017, 28.0697, 27.7477,  ..., 24.9447, 24.8768, 24.7370]],
       device='cuda:0')

In [6]:
import torch
# Load mean/std tensors from ../models (file names suggest ECG, 10-second
# segments). This rebinds the notebook-level `mean` and `std`.
# NOTE(review): no map_location is given, so loading depends on the device
# the tensors were saved from being available on this machine.
mean = torch.load('../models/mean_ecg_10sec')
std = torch.load('../models/std_ecg_10sec')


In [7]:
# Display the loaded mean and std tensors together.
mean, std

(tensor([[ 3.6175e-02,  3.4293e-02,  2.1703e-02,  ..., -3.8359e-05,
          -2.2871e-02,  1.6050e-02]], device='cuda:2'),
 tensor([[0.4039, 0.7181, 0.6091,  ..., 0.1972, 0.1942, 0.1896]],
        device='cuda:2'))

In [10]:
# Scalar summary of the loaded mean tensor (sum over all sample positions).
torch.sum(mean)

tensor(222.5967, device='cuda:2')

In [3]:
# Count the segments produced for one recording.
# NOTE(review): this call passes four arguments (path, 'ART', 3, 0.5), the
# same arity compute_mean_std uses, whereas the earlier cells pass five
# (explicit waveform and timestamp dataset names) — presumably a different
# revision of TimeSeriesHDF5Dataset; confirm which signature is current.
filepath = '/storage/ms5267@drexel.edu/precicecap_downloads/4_Patient_2022-02-05_08:59.h5'
dataset  =TimeSeriesHDF5Dataset(filepath, 'ART', 3, 0.5)    

len(dataset)

2665