In [1]:
import pandas as pd
import os
import glob
from collections import defaultdict

input_dir = '../data/ecg'
# glob returns paths in arbitrary order; sorting keeps the floating-point
# accumulation order (and therefore the printed statistics) reproducible.
ecg_files = sorted(glob.glob(os.path.join(input_dir, '*.csv')))


def mean_std_from_moments(total, total_sq, n):
    """Turn running moments of one column into its mean and population std.

    Parameters
    ----------
    total : float
        Sum of all non-missing samples.
    total_sq : float
        Sum of the squares of all non-missing samples.
    n : int
        Number of non-missing samples.

    Returns
    -------
    dict
        ``{'mean': ..., 'std': ...}``. Both values are NaN when ``n`` is 0.
    """
    if n == 0:
        # No valid samples at all: report NaN instead of dividing by zero.
        return {'mean': float('nan'), 'std': float('nan')}
    mean = total / n
    var = (total_sq / n) - mean ** 2
    # E[x^2] - E[x]^2 can come out a hair below zero through rounding when the
    # data is (almost) constant; the square root of that would be NaN.
    # A NaN variance fails this comparison and is passed through unchanged.
    if var < 0:
        var = 0.0
    return {'mean': mean, 'std': var ** 0.5}


# Running per-column accumulators, filled one file at a time so that only a
# single CSV has to be held in memory.
sums = defaultdict(float)
sumsq = defaultdict(float)
counts = defaultdict(int)

for file_path in ecg_files:
    df = pd.read_csv(file_path)
    for col in df.columns:
        if col == 'time':
            continue
        sums[col] += df[col].sum()
        sumsq[col] += (df[col] ** 2).sum()
        counts[col] += df[col].count()  # count() ignores missing values

# Compute mean and std for each lead
global_stats = {}
for col in sums:
    global_stats[col] = mean_std_from_moments(sums[col], sumsq[col], counts[col])

print(global_stats)  # Optionally view lead-wise stats

{'I': {'mean': np.float64(0.0017377080217930512), 'std': np.float64(0.14356736611851956)}, 'II': {'mean': np.float64(0.0038203141832033796), 'std': np.float64(0.17755781191435402)}, 'III': {'mean': np.float64(0.002078985369418842), 'std': np.float64(0.15620894004410205)}, 'aVR': {'mean': np.float64(-0.001593612601878088), 'std': np.float64(0.14130977828751345)}, 'aVL': {'mean': np.float64(-0.0013561875531537973), 'std': np.float64(0.12093651107055514)}, 'aVF': {'mean': np.float64(0.0017667012491141157), 'std': np.float64(0.15103439842117525)}, 'V1': {'mean': np.float64(-0.0004684570266756133), 'std': np.float64(0.22579097827980024)}, 'V2': {'mean': np.float64(0.0029829813693335624), 'std': np.float64(0.3648487670717997)}, 'V3': {'mean': np.float64(0.0034779498549144957), 'std': np.float64(0.3746732053707821)}, 'V4': {'mean': np.float64(0.00407681692861873), 'std': np.float64(0.38782930525871673)}, 'V5': {'mean': np.float64(0.0032441011508382465), 'std': np.float64(0.3897460908800855)},

In [2]:
n_std = 5  # Clip at mean +/- this many standard deviations

# For every lead in global_stats, derive a symmetric interval around its mean.
clipping_bounds = {}
for lead, stats in global_stats.items():
    center = float(stats['mean'])
    half_width = n_std * float(stats['std'])
    clipping_bounds[lead] = {
        'lower': center - half_width,
        'upper': center + half_width,
    }

# Show the resulting bounds, one lead per line
import pprint
pprint.pprint(clipping_bounds)


{'I': {'lower': -0.7160991225708047, 'upper': 0.7195745386143908},
 'II': {'lower': -0.8839687453885667, 'upper': 0.8916093737549735},
 'III': {'lower': -0.7789657148510913, 'upper': 0.7831236855899291},
 'V1': {'lower': -1.1294233484256768, 'upper': 1.1284864343723258},
 'V2': {'lower': -1.8212608539896649, 'upper': 1.827226816728332},
 'V3': {'lower': -1.869888076998996, 'upper': 1.876843976708825},
 'V4': {'lower': -1.935069709364965, 'upper': 1.9432233432222024},
 'V5': {'lower': -1.9454863532495892, 'upper': 1.9519745555512655},
 'V6': {'lower': -2.0788057066822465, 'upper': 2.077879843311759},
 'aVF': {'lower': -0.7534052908567621, 'upper': 0.7569386933549904},
 'aVL': {'lower': -0.6060387429059295, 'upper': 0.6033263677996219},
 'aVR': {'lower': -0.7081425040394453, 'upper': 0.7049552788356892}}


In [3]:
import json

# Step 1: Make sure everything is converted to standard float (not np.float64)
def clean_for_json(d):
    """Return a new two-level dict whose inner values are built-in floats.

    ``d`` maps each outer key to an inner dict; every inner value is passed
    through ``float()``. The input is not modified.
    """
    cleaned = {}
    for outer_key, inner in d.items():
        cleaned[outer_key] = {
            inner_key: float(value) for inner_key, value in inner.items()
        }
    return cleaned

# Clean versions
global_stats_clean = clean_for_json(global_stats)
clipping_bounds_clean = clean_for_json(clipping_bounds)

# Step 2: Combine into one metadata dictionary
metadata = {
    'global_stats': global_stats_clean,
    'clipping_bounds': clipping_bounds_clean
}

# Step 3: Save to JSON file
metadata_path = '../metadata/ecg_metadata.json'
# open(..., 'w') does not create missing parent folders, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=4)

print("✅ Metadata saved to ecg_metadata.json")


✅ Metadata saved to ecg_metadata.json


In [12]:
import pandas as pd
import os
import glob
import json

# Read the per-lead clipping limits back from the metadata file
with open('../metadata/ecg_metadata.json', 'r') as f:
    metadata = json.load(f)

clipping_bounds = metadata['clipping_bounds']

# Source and destination folders
input_dir = '../data/ecg'
output_dir = '../data/ecg_clipped'
os.makedirs(output_dir, exist_ok=True)

# Every CSV found in the input folder
ecg_files = glob.glob(os.path.join(input_dir, '*.csv'))

for file_path in ecg_files:
    df = pd.read_csv(file_path)
    clipped_df = df.copy()

    # Limit every column except 'time' to its own [lower, upper] interval
    for col in (name for name in df.columns if name != 'time'):
        bounds = clipping_bounds[col]
        clipped_df[col] = df[col].clip(lower=bounds['lower'], upper=bounds['upper'])

    # Write the result under the same file name in the output folder
    filename = os.path.basename(file_path)
    clipped_df.to_csv(os.path.join(output_dir, filename), index=False)

print("✅ All ECG files clipped and saved to:", output_dir)


✅ All ECG files clipped and saved to: ../data/ecg_clipped


In [15]:

input_dir = '../data/ecg_clipped'
# glob returns paths in arbitrary order; sorting keeps the floating-point
# accumulation order (and therefore the printed statistics) reproducible.
ecg_files = sorted(glob.glob(os.path.join(input_dir, '*.csv')))

# Running per-column accumulators, filled one file at a time
sums = defaultdict(float)
sumsq = defaultdict(float)
counts = defaultdict(int)

for file_path in ecg_files:
    df = pd.read_csv(file_path)
    for col in df.columns:
        if col == 'time':
            continue
        sums[col] += df[col].sum()
        sumsq[col] += (df[col] ** 2).sum()
        counts[col] += df[col].count()  # count() ignores missing values

# Compute mean and std for each lead
global_stats_clipped = {}
for col in sums:
    mean = sums[col] / counts[col]
    var = (sumsq[col] / counts[col]) - mean ** 2
    # E[x^2] - E[x]^2 can come out a hair below zero through rounding when the
    # data is (almost) constant; the square root of that would be NaN.
    # A NaN variance fails this comparison and is passed through unchanged.
    if var < 0:
        var = 0.0
    std = var ** 0.5
    global_stats_clipped[col] = {'mean': mean, 'std': std}

print(global_stats_clipped)  # Optionally view lead-wise stats

{'I': {'mean': np.float64(0.0008244467063146014), 'std': np.float64(0.12618619423799107)}, 'II': {'mean': np.float64(0.002639598971731266), 'std': np.float64(0.15602616848001877)}, 'III': {'mean': np.float64(0.00233063847090593), 'std': np.float64(0.13065246740675088)}, 'aVR': {'mean': np.float64(-0.0006678672134255577), 'std': np.float64(0.12592566953809425)}, 'aVL': {'mean': np.float64(-0.002165100527038807), 'std': np.float64(0.10321755991291601)}, 'aVF': {'mean': np.float64(0.0012038095120968282), 'std': np.float64(0.1287620572062578)}, 'V1': {'mean': np.float64(0.0010650213937836385), 'std': np.float64(0.1916399300532119)}, 'V2': {'mean': np.float64(0.005617837505461991), 'std': np.float64(0.3202138962626205)}, 'V3': {'mean': np.float64(0.005117576033032021), 'std': np.float64(0.3285690375964132)}, 'V4': {'mean': np.float64(0.002886342475415718), 'std': np.float64(0.32946913388098775)}, 'V5': {'mean': np.float64(0.0009154152963690726), 'std': np.float64(0.30538764668023605)}, 'V6'

In [16]:
# Print the per-lead mean/std dict computed from the clipped files once more
print(global_stats_clipped)

{'I': {'mean': np.float64(0.0008244467063146014), 'std': np.float64(0.12618619423799107)}, 'II': {'mean': np.float64(0.002639598971731266), 'std': np.float64(0.15602616848001877)}, 'III': {'mean': np.float64(0.00233063847090593), 'std': np.float64(0.13065246740675088)}, 'aVR': {'mean': np.float64(-0.0006678672134255577), 'std': np.float64(0.12592566953809425)}, 'aVL': {'mean': np.float64(-0.002165100527038807), 'std': np.float64(0.10321755991291601)}, 'aVF': {'mean': np.float64(0.0012038095120968282), 'std': np.float64(0.1287620572062578)}, 'V1': {'mean': np.float64(0.0010650213937836385), 'std': np.float64(0.1916399300532119)}, 'V2': {'mean': np.float64(0.005617837505461991), 'std': np.float64(0.3202138962626205)}, 'V3': {'mean': np.float64(0.005117576033032021), 'std': np.float64(0.3285690375964132)}, 'V4': {'mean': np.float64(0.002886342475415718), 'std': np.float64(0.32946913388098775)}, 'V5': {'mean': np.float64(0.0009154152963690726), 'std': np.float64(0.30538764668023605)}, 'V6'