In [1]:
import os, sys

sys.path.insert(0, '..')

import h5py
import pandas as pd
import numpy as np
from Utilities import *
from tqdm import tqdm
from CustomDataset import CustomHD5Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split


def read_hd5(hd5file_path, data_group_name, timestamp_group_name):
	"""
	Open an HDF5 file read-only and load two of its entries fully into memory.

	Parameters:
		hd5file_path: path of the HDF5 file to open.
		data_group_name: key of the entry holding the data values.
		timestamp_group_name: key of the entry holding the timestamps.

	Returns:
		A tuple ``(data, timestamps)``; each element is the complete content
		of the corresponding entry, read with ``[:]`` before the file is closed.
	"""
	with h5py.File(hd5file_path, 'r') as hd5:
		# Slice with [:] while the file is still open so the values outlive the handle.
		return hd5[data_group_name][:], hd5[timestamp_group_name][:]

In [2]:
####################################################################################################
# Configuration: input files, annotation filter and sampling parameters.
# NOTE(review): the two paths below are absolute and user-specific; consider deriving them
# from a configurable data directory so the notebook runs on other machines.
hdf5_file_path = '/home/ms5267@drexel.edu/moberg-precicecap/data/Patient_2021-12-21_04_16.h5'
annotation_path = '/home/ms5267@drexel.edu/moberg-precicecap/data/20240207-annotations-export-workspace=precicecap-patient=7-annotation_group=90.csv'
annotation_metadata = {
	'modality':'ART'
	,'location':'na'
	# Factor applied to annotation start/end times before they are compared with the
	# HDF5 timestamps (presumably a unit conversion — TODO confirm the units).
	,'scale_wrt_hd5':1e3
}

base_file = '../data/ABP_test_samples_5sec.csv'
segment_length_sec = 5
sampling_frequency = 125
data_group_name='Waveforms/ART_na'
timestamp_group_name = 'Waveforms/ART_na_Timestamps'

# Count the lines of the base file; the context manager closes the handle, which the
# previous bare open() never did.
# NOTE(review): the "- 1" presumably discounts a header line, but np.loadtxt later reads
# this same file without skipping anything — confirm whether a header really exists.
with open(base_file) as base_fh:
	row_count = sum(1 for _ in base_fh) - 1

# Number of additional non-artifact segments to sample: 8 for every 2 rows of the base file
# (presumably half of the base rows are artifacts — TODO confirm). Stored as an int because
# it is used as a count.
new_entries_num = int((row_count / 2) * 8)

####################################################################################################
# Keep only the annotations matching the configured modality/location and rescale their
# [start_time, end_time] intervals with 'scale_wrt_hd5'.
df_annotation = pd.read_csv(annotation_path)
df_annotation_filtered = df_annotation[(df_annotation['modality']==annotation_metadata['modality']) & (df_annotation['location']==annotation_metadata['location'])]
artifacts = df_annotation_filtered[["start_time","end_time"]].to_numpy() * int(annotation_metadata['scale_wrt_hd5'])

# Load the full waveform and its timestamps into memory.
data, timestamp = read_hd5(hdf5_file_path, data_group_name, timestamp_group_name)



In [3]:
# Number of samples in one segment.
segment_length = segment_length_sec * sampling_frequency

# Draw candidate start indices (unique, without replacement) from the whole recording.
# 1.5x as many candidates as needed are drawn to leave headroom for the candidates that
# are rejected below (overlap with an artifact, or too close to the end of the recording).
# NOTE(review): no random seed is set, so the sampled segments differ from run to run.
random_values = np.random.choice(len(timestamp), int(new_entries_num*1.5), replace=False)

count_negative, i = 0, 0

# Collect full-length segments that do not overlap any annotated artifact.
non_artifact_raw=[]
while count_negative<new_entries_num:
	if i >= len(random_values):
		# Fail with an explicit message instead of an IndexError on random_values[i].
		raise RuntimeError(
			f"Ran out of candidate start indices after {i} draws: only "
			f"{count_negative} of {new_entries_num} artifact-free segments were found."
		)
	start_idx = random_values[i]
	i+=1

	# Slice once and reuse; a start index near the end yields a short slice, which is skipped.
	temp_data = data[start_idx: start_idx+segment_length]
	if len(temp_data)!=segment_length:
		continue

	temp_ts = timestamp[start_idx : start_idx+segment_length]
	if not has_artifact(temp_ts, artifacts):
		non_artifact_raw.append(temp_data)
		count_negative+=1



In [4]:
# NOTE(review): `data` is rebound here from the HDF5 waveform to the rows of the base CSV.
# Re-running the sampling cell after this one would therefore slice the CSV rows instead
# of the waveform; restart the kernel before re-running earlier cells.
data = np.loadtxt(base_file, delimiter=',')
non_artifact_raw_np = np.asarray(non_artifact_raw)

# Append a label column of zeros to the newly sampled non-artifact segments.
non_artifact_labels = np.zeros((len(non_artifact_raw_np), 1))
non_artifact_labeled = np.concatenate((non_artifact_raw_np, non_artifact_labels), axis=1)

# Stack the new rows underneath the rows of the base file.
combined_data = np.vstack((data, non_artifact_labeled))


# Build the output path next to the base file: "<name>_imbalanced<ext>".
dir_name = os.path.dirname(base_file)
file_name = os.path.basename(base_file)
base, ext = os.path.splitext(file_name)
new_file_name = base + "_imbalanced" + ext
imbalanced_file = os.path.join(dir_name, new_file_name)

# Written with savetxt's default number format; pass fmt='%f' if a specific format is needed.
np.savetxt(imbalanced_file, combined_data, delimiter=',')


In [5]:
# Sanity check of the array shapes: rows loaded from the base file, the newly sampled
# non-artifact segments (no label column yet, hence one column fewer), and the combined array.
data.shape, non_artifact_raw_np.shape, combined_data.shape

((7364, 626), (29452, 625), (36816, 626))