<a href="https://colab.research.google.com/github/JahnviAghera/DAA/blob/main/ECG_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import math
import os

!pip install wfdb # Install missing library
import wfdb

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Dropout, LSTM, Dense, TimeDistributed, Flatten
from tensorflow.keras import optimizers, callbacks

# Input record directory and the file the best checkpoint is written to.
DATA_PATH = '/content/drive/MyDrive/mit-bih-arrhythmia-database'
MODEL_SAVE_PATH = "saved_models/CNN_LSTM_Oversampled_Aggressive.h5"

# Windowing: beats per input sequence, and signal samples per beat.
SEQUENCE_LENGTH = 3
BEAT_LENGTH = 180

# Beat classes. CLASSES[i] is the display name of class index i, and CLASS_MAP
# translates a single-character annotation symbol into that same index.
# NOTE(review): these look like the WFDB beat annotation symbols rather than
# the AAMI superclasses — confirm the intended taxonomy.
CLASSES = ['N', 'LBBB', 'RBBB', 'APC', 'AESC', 'ABERR', 'NPC', 'NESC']
CLASS_MAP = dict(
    zip(
        ('N', 'L', 'R', 'A', 'e', 'a', 'J', 'j'),
        range(len(CLASSES)),
    )
)

Collecting wfdb
  Downloading wfdb-4.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pandas>=2.2.3 (from wfdb)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading wfdb-4.3.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m89.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, wfdb
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into accou

In [4]:
def build_cnn_lstm_model(sequence_length, beat_length, num_classes):
    """Assemble the (uncompiled) per-beat CNN + LSTM classifier.

    Every beat in the input sequence is run through the same stack of 1-D
    convolutions (wrapped in ``TimeDistributed``), flattened into a feature
    vector, and the sequence of feature vectors is fed to a single LSTM whose
    final output drives a softmax layer.

    Args:
        sequence_length: Number of beats per input sample.
        beat_length: Number of signal samples per beat.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` whose per-sample input shape is
        ``(sequence_length, beat_length, 1)`` and whose output has
        ``num_classes`` probabilities.
    """
    inputs = Input(shape=(sequence_length, beat_length, 1))

    # Two convolutional stages (32 then 64 filters), each made of two
    # same-padded kernel-5 convolutions, a pool by 2 and 25% dropout.
    features = inputs
    for n_filters in (32, 64):
        for _ in range(2):
            features = TimeDistributed(
                Conv1D(n_filters, 5, activation='relu', padding='same')
            )(features)
        features = TimeDistributed(MaxPooling1D(2))(features)
        features = TimeDistributed(Dropout(0.25))(features)

    # One flat feature vector per beat, so the LSTM sees (sequence, features).
    features = TimeDistributed(Flatten())(features)

    # Only the LSTM's last output is kept (return_sequences=False).
    sequence_summary = LSTM(128, activation='tanh', return_sequences=False)(features)
    sequence_summary = Dropout(0.5)(sequence_summary)

    class_probs = Dense(num_classes, activation='softmax')(sequence_summary)
    return Model(inputs=inputs, outputs=class_probs)

In [5]:
def main():
    """Run the full pipeline: load, split, oversample, train and evaluate.

    Steps:
        1. Load beat sequences and labels with ``ECGDataProcessor``.
        2. Stratified 80/20 train/test split.
        3. Tile every training class that has fewer than
           ``MIN_SAMPLES_TARGET`` samples up to exactly that many.
        4. Add the trailing channel axis expected by the model input.
        5. Compute log-smoothed class weights, capped at ``MAX_CLASS_WEIGHT``.
        6-8. Build, compile and fit the CNN-LSTM with early stopping,
             learning-rate reduction and best-checkpoint saving.
        9-10. Reload the best checkpoint and print a classification report
              for the held-out test split.

    Returns:
        None. Progress and results are printed; the best model is written to
        ``MODEL_SAVE_PATH``. If loading the data fails, the error is printed
        and the function returns early.
    """
    MIN_SAMPLES_TARGET = 500   # floor applied to each training class
    MAX_CLASS_WEIGHT = 7.0     # upper bound on any single class weight
    SEED = 42                  # shared by the split and the shuffle

    # 1. Load and Process Data
    processor = ECGDataProcessor(SEQUENCE_LENGTH, BEAT_LENGTH)
    try:
        X, y = processor.load_real_data(DATA_PATH)
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # 2. Train / Test Split
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=SEED,
        stratify=y
    )

    print(f"Initial Train Shape: {X_train_raw.shape}")
    print(f"Test Shape: {X_test_raw.shape}")
    print(f"Initial Class Distribution (Train): {Counter(y_train)}")

    # 3. Aggressive Oversampling
    X_resampled, y_resampled = [], []

    for cls in np.unique(y_train):
        indices = np.where(y_train == cls)[0]
        X_cls = X_train_raw[indices]
        y_cls = y_train[indices]
        count = len(y_cls)

        if count < MIN_SAMPLES_TARGET:
            # Repeat the whole class enough times to reach the target, then
            # truncate so the class ends up with exactly MIN_SAMPLES_TARGET.
            oversample_factor = math.ceil(MIN_SAMPLES_TARGET / count)

            X_aug = np.tile(X_cls, (oversample_factor, 1, 1))[:MIN_SAMPLES_TARGET]
            y_aug = np.tile(y_cls, oversample_factor)[:MIN_SAMPLES_TARGET]

            X_resampled.append(X_aug)
            y_resampled.append(y_aug)

            print(f"Oversampling Class {cls} ({CLASSES[cls]}): {count} → {len(y_aug)}")
        else:
            X_resampled.append(X_cls)
            y_resampled.append(y_cls)

    X_train_resampled = np.concatenate(X_resampled, axis=0)
    y_train_resampled = np.concatenate(y_resampled, axis=0)

    # Seeded shuffle: the concatenation above is grouped by class, and the
    # validation slice taken by `validation_split` depends on sample order,
    # so a fixed seed keeps the train/validation partition reproducible.
    shuffle_idx = np.random.default_rng(SEED).permutation(len(X_train_resampled))
    X_train_final = X_train_resampled[shuffle_idx]
    y_train_final = y_train_resampled[shuffle_idx]

    # 4. Reshape for CNN input
    X_train = X_train_final.reshape(-1, SEQUENCE_LENGTH, BEAT_LENGTH, 1)
    X_test = X_test_raw.reshape(-1, SEQUENCE_LENGTH, BEAT_LENGTH, 1)

    print(f"\nFinal Train Shape: {X_train.shape}")
    print(f"Final Class Distribution: {Counter(y_train_final)}")

    # 5. Log-Smoothed Class Weights
    unique, counts = np.unique(y_train_final, return_counts=True)
    max_count = counts.max()

    class_weights_dict = {}
    print("\nFinal Smoothed Class Weights:")
    for cls, count in zip(unique, counts):
        # The largest class gets ~1.0; rarer classes grow logarithmically.
        weight = np.log(max_count / (count + 1e-6)) + 1.0
        # Plain Python int/float keep the mapping free of NumPy scalar types.
        class_weights_dict[int(cls)] = float(min(weight, MAX_CLASS_WEIGHT))
        print(f"Class {cls} ({CLASSES[cls]}): Count={count}, Weight={class_weights_dict[int(cls)]:.4f}")

    # 6. Build & Compile Model
    model = build_cnn_lstm_model(
        SEQUENCE_LENGTH,
        BEAT_LENGTH,
        len(CLASSES)
    )

    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # 7. Callbacks (Best Model Saving)
    # os.makedirs('') raises, so only create a directory when the save path
    # actually has a directory component.
    save_dir = os.path.dirname(MODEL_SAVE_PATH)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    early_stop = callbacks.EarlyStopping(
        monitor='val_loss',
        patience=15,
        restore_best_weights=True
    )

    reduce_lr = callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    )

    best_model_ckpt = callbacks.ModelCheckpoint(
        filepath=MODEL_SAVE_PATH,
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        verbose=1
    )

    # 8. Train
    # NOTE(review): the validation slice is carved out of the already
    # oversampled data, so tiled copies of a minority sample can presumably
    # land on both sides of the split and make val_loss optimistic. Consider
    # holding out validation data before step 3 — confirm before relying on
    # val_loss for model selection.
    print("\nStarting training with aggressive oversampling and class weighting...")
    model.fit(
        X_train,
        y_train_final,
        epochs=50,
        batch_size=32,
        validation_split=0.1,
        class_weight=class_weights_dict,
        callbacks=[early_stop, reduce_lr, best_model_ckpt],
        verbose=1
    )

    print(f"\nBest model saved to: {MODEL_SAVE_PATH}")

    # 9. Load Best Model for Evaluation
    model = tf.keras.models.load_model(MODEL_SAVE_PATH)
    print("Loaded best validation model for evaluation.")

    # 10. Evaluate on Test Set
    y_pred_probs = model.predict(X_test)
    y_pred = np.argmax(y_pred_probs, axis=1)

    print("\nClassification Report (Test Data):")
    print(
        classification_report(
            y_test,
            y_pred,
            # Pin the label set so target_names always lines up, even when a
            # rare class is missing from both y_test and y_pred.
            labels=np.arange(len(CLASSES)),
            target_names=CLASSES,
            zero_division=0,
            digits=4
        )
    )


In [6]:
class ECGDataProcessor:
    """Turn WFDB ECG records into labelled, fixed-length sequences of beats.

    Attributes are set in ``__init__``:
        seq_len: number of consecutive beats per input sequence.
        beat_len: number of signal samples cut out around each annotation.
        scaler: ``StandardScaler`` fitted inside ``load_real_data``.
    """

    def __init__(self, sequence_length=3, beat_length=180):
        self.seq_len = sequence_length
        self.beat_len = beat_length
        self.scaler = StandardScaler()

    def load_real_data(self, record_path):
        """Load every ``.dat`` record in a directory and build the dataset.

        For each record, the first signal column is denoised, cut into beats
        around the ``atr`` annotations and grouped into sequences. Records
        that fail to load or segment are reported and skipped.

        Args:
            record_path: Directory containing the WFDB record files.

        Returns:
            ``(X, y)`` where ``X`` is a float array of shape
            ``(n_sequences, seq_len, beat_len)`` after standard scaling and
            ``y`` is the array of integer class indices, one per sequence.

        Raises:
            FileNotFoundError: If ``record_path`` does not exist.
            ValueError: If no sequence could be extracted from any record.
        """
        if not os.path.exists(record_path):
            raise FileNotFoundError(f"The directory '{record_path}' does not exist.")

        # splitext keeps record names that themselves contain dots intact.
        records = sorted({
            os.path.splitext(f)[0]
            for f in os.listdir(record_path)
            if f.endswith('.dat')
        })

        X_all, y_all = [], []

        print(f"Found {len(records)} records. Loading...")

        for record_name in records:
            record_base = os.path.join(record_path, record_name)
            try:
                record = wfdb.rdrecord(record_base)
                annotation = wfdb.rdann(record_base, 'atr')

                signal = self.denoise_signal(record.p_signal[:, 0])
                X_rec, y_rec = self.segment_beats(
                    signal, annotation.sample, annotation.symbol
                )
            except Exception as e:
                # Best effort: one bad record must not abort the whole load,
                # but say so instead of dropping it silently.
                print(f"Skipping {record_name}: {e}")
                continue

            X_all.extend(X_rec)
            y_all.extend(y_rec)

        if not X_all:
            raise ValueError("No valid beats were extracted.")

        X_all = np.array(X_all)
        y_all = np.array(y_all)

        # Scale per sample position within a beat: flatten to (beats, B),
        # standardise each of the B columns, then restore (N, S, B).
        # NOTE(review): the scaler is fitted on all data before the caller
        # splits into train/test, so test statistics influence the training
        # normalisation. Fixing that needs a change in the caller as well.
        N, S, B = X_all.shape
        X_reshaped = X_all.reshape(-1, B)
        X_scaled = self.scaler.fit_transform(X_reshaped)
        X_final = X_scaled.reshape(N, S, B)

        return X_final, y_all

    def denoise_signal(self, signal):
        """Smooth ``signal`` with a 5-tap moving average of the same length."""
        return np.convolve(signal, np.ones(5)/5, mode='same')

    def segment_beats(self, signal, peaks, labels):
        """Cut beats around annotated positions and group them into sequences.

        A window of exactly ``beat_len`` samples, starting ``beat_len // 2``
        samples before each annotation, is kept when it fits entirely inside
        ``signal`` and the annotation symbol is a key of ``CLASS_MAP``.

        Args:
            signal: 1-D array of signal samples.
            peaks: Sample indices of the annotations.
            labels: Annotation symbols, aligned with ``peaks``.

        Returns:
            ``(X_seq, y_target)``: a list of arrays of shape
            ``(seq_len, beat_len)`` and a list of class indices. Each target
            is the class of the beat immediately following its sequence.
        """
        beats = []
        beat_labels = []
        half_len = self.beat_len // 2

        for peak, symbol in zip(peaks, labels):
            # Derive the end from the start so the window is always beat_len
            # samples long, including when beat_len is odd.
            start = peak - half_len
            end = start + self.beat_len
            if start < 0 or end > len(signal):
                continue

            if symbol in CLASS_MAP:
                beats.append(signal[start:end])
                beat_labels.append(CLASS_MAP[symbol])

        # Sliding window of seq_len beats; the target is the next beat's class.
        X_seq = [
            np.array(beats[i : i + self.seq_len])
            for i in range(len(beats) - self.seq_len)
        ]
        y_target = beat_labels[self.seq_len:]

        return X_seq, y_target


In [None]:
import tensorflow as tf

if __name__ == "__main__":
    # Restrict TensorFlow's logger to errors so training output stays readable,
    # then run the whole pipeline.
    tf_logger = tf.get_logger()
    tf_logger.setLevel('ERROR')
    main()

Found 48 records. Loading...
Initial Train Shape: (74596, 3, 180)
Test Shape: (18650, 3, 180)
Initial Class Distribution (Train): Counter({np.int64(0): 59935, np.int64(1): 6451, np.int64(2): 5793, np.int64(3): 2035, np.int64(7): 183, np.int64(5): 120, np.int64(6): 66, np.int64(4): 13})
Oversampling Class 4 (AESC): 13 → 500
Oversampling Class 5 (ABERR): 120 → 500
Oversampling Class 6 (NPC): 66 → 500
Oversampling Class 7 (NESC): 183 → 500

Final Train Shape: (76214, 3, 180, 1)
Final Class Distribution: Counter({np.int64(0): 59935, np.int64(1): 6451, np.int64(2): 5793, np.int64(3): 2035, np.int64(6): 500, np.int64(4): 500, np.int64(5): 500, np.int64(7): 500})

Final Smoothed Class Weights:
Class 0 (N): Count=59935, Weight=1.0000
Class 1 (LBBB): Count=6451, Weight=3.2290
Class 2 (RBBB): Count=5793, Weight=3.3366
Class 3 (APC): Count=2035, Weight=4.3828
Class 4 (AESC): Count=500, Weight=5.7864
Class 5 (ABERR): Count=500, Weight=5.7864
Class 6 (NPC): Count=500, Weight=5.7864
Class 7 (NESC): 



[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m511s[0m 236ms/step - accuracy: 0.8633 - loss: 1.2744 - val_accuracy: 0.9539 - val_loss: 0.1607 - learning_rate: 1.0000e-04
Epoch 2/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 229ms/step - accuracy: 0.9399 - loss: 0.5271
Epoch 2: val_loss improved from 0.16072 to 0.15866, saving model to saved_models/CNN_LSTM_Oversampled_Aggressive.h5




[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m502s[0m 234ms/step - accuracy: 0.9399 - loss: 0.5270 - val_accuracy: 0.9511 - val_loss: 0.1587 - learning_rate: 1.0000e-04
Epoch 3/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step - accuracy: 0.9474 - loss: 0.4347
Epoch 3: val_loss improved from 0.15866 to 0.11715, saving model to saved_models/CNN_LSTM_Oversampled_Aggressive.h5




[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 232ms/step - accuracy: 0.9474 - loss: 0.4347 - val_accuracy: 0.9677 - val_loss: 0.1172 - learning_rate: 1.0000e-04
Epoch 4/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step - accuracy: 0.9554 - loss: 0.3735
Epoch 4: val_loss did not improve from 0.11715
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m497s[0m 232ms/step - accuracy: 0.9554 - loss: 0.3735 - val_accuracy: 0.9642 - val_loss: 0.1175 - learning_rate: 1.0000e-04
Epoch 5/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step - accuracy: 0.9615 - loss: 0.3338
Epoch 5: val_loss did not improve from 0.11715
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m499s[0m 230ms/step - accuracy: 0.9615 - loss: 0.3338 - val_accuracy: 0.9656 - val_loss: 0.1194 - learning_rate: 1.0000e-04
Epoch 6/50
[1m2144/2144



[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m502s[0m 231ms/step - accuracy: 0.9618 - loss: 0.3199 - val_accuracy: 0.9676 - val_loss: 0.1093 - learning_rate: 1.0000e-04
Epoch 7/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step - accuracy: 0.9680 - loss: 0.2835
Epoch 7: val_loss did not improve from 0.10928
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 231ms/step - accuracy: 0.9680 - loss: 0.2835 - val_accuracy: 0.9686 - val_loss: 0.1119 - learning_rate: 1.0000e-04
Epoch 8/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step - accuracy: 0.9689 - loss: 0.2727
Epoch 8: val_loss improved from 0.10928 to 0.09745, saving model to saved_models/CNN_LSTM_Oversampled_Aggressive.h5




[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m497s[0m 232ms/step - accuracy: 0.9689 - loss: 0.2727 - val_accuracy: 0.9727 - val_loss: 0.0974 - learning_rate: 1.0000e-04
Epoch 9/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step - accuracy: 0.9711 - loss: 0.2580
Epoch 9: val_loss improved from 0.09745 to 0.09741, saving model to saved_models/CNN_LSTM_Oversampled_Aggressive.h5




[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m496s[0m 231ms/step - accuracy: 0.9711 - loss: 0.2580 - val_accuracy: 0.9726 - val_loss: 0.0974 - learning_rate: 1.0000e-04
Epoch 10/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step - accuracy: 0.9720 - loss: 0.2447
Epoch 10: val_loss did not improve from 0.09741
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m499s[0m 233ms/step - accuracy: 0.9720 - loss: 0.2447 - val_accuracy: 0.9714 - val_loss: 0.1006 - learning_rate: 1.0000e-04
Epoch 11/50
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step - accuracy: 0.9719 - loss: 0.2425
Epoch 11: val_loss did not improve from 0.09741
[1m2144/2144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m500s[0m 232ms/step - accuracy: 0.9719 - loss: 0.2425 - val_accuracy: 0.9710 - val_loss: 0.0976 - learning_rate: 1.0000e-04
Epoch 12/50
[1m1155