# ECG Classification Without Augmentation
This notebook explores ECG heartbeat classification using deep learning, without applying any data augmentation techniques.


## Dataset Exploration

In [1]:
# Import necessary libraries

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical computations
from matplotlib import pyplot as plt  # For data visualization
from sklearn.model_selection import train_test_split  # For splitting the dataset into train and test sets


In [2]:
# Read the training and test CSV files in one pass.
# Neither file has a header row, so `header=None` keeps the first record as data
# and gives the columns integer labels. The test frame is kept for the final evaluation.
df_train, df_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("../Data/mitbih_train.csv", "../Data/mitbih_test.csv")
)

In [3]:
# Report the (rows, columns) size of each split as loaded from disk
train_shape, test_shape = df_train.shape, df_test.shape
print(f"Training dataset shape: {train_shape} (rows, columns)")
print(f"Test dataset shape: {test_shape} (rows, columns)")

Training dataset shape: (87554, 188) (rows, columns)
Test dataset shape: (21892, 188) (rows, columns)


In [4]:
# Count how often each value occurs in the last column of each frame
# (the last column is the one later used as the label)
train_label_counts = df_train.iloc[:, -1].value_counts()
test_label_counts = df_test.iloc[:, -1].value_counts()

print("🔹 Label distribution in Training Set:")
print(train_label_counts)

print("\n🔹 Label distribution in Test Set:")
print(test_label_counts)

🔹 Label distribution in Training Set:
187
0.0    72471
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: count, dtype: int64

🔹 Label distribution in Test Set:
187
0.0    18118
4.0     1608
2.0     1448
1.0      556
3.0      162
Name: count, dtype: int64


## Data Preprocessing

In [5]:
import sys

# Add the 'src' directory to the Python path so we can import custom utility functions
sys.path.append("../src")

# Import the stratified sampling function from utils.py
from utils import stratified_sample

# Split a validation set off the loaded training data.
# The result is bound to a NEW name instead of overwriting `df_train`, so re-running
# this cell always starts from the full training frame rather than shrinking it again.
# Presumably stratified_sample returns (sampled frame, remaining frame); the meaning of
# the second argument (3) is not visible from this notebook — TODO confirm in src/utils.py.
# NOTE(review): the recorded outputs show 525 validation rows for each of the five classes,
# yet only class 0.0 shrinks in the training counts (72471 -> 69846). Confirm that
# stratified_sample removes every sampled row from the returned training frame.
df_validation, df_train_remaining = stratified_sample(df_train, 3)

# Features are every column except the last; the last column holds the class label
X_train = df_train_remaining.iloc[:, :-1]
y_train = df_train_remaining.iloc[:, -1]

X_validation = df_validation.iloc[:, :-1]
y_validation = df_validation.iloc[:, -1]

# Class counts in the training split, ordered from rarest to most frequent
class_distribution = y_train.value_counts().sort_values()

# The three least represented classes (minority classes)
minority_classes = class_distribution.index[:3]

# Print the identified minority classes
print(f"Minority classes: {minority_classes.tolist()}")

# Display the class distribution in the training set to show how imbalanced it is
print("Class distribution in the training set:")
print(class_distribution)


Minority classes: [3.0, 1.0, 2.0]
Class distribution in the training set:
187
3.0      641
1.0     2223
2.0     5788
4.0     6431
0.0    69846
Name: count, dtype: int64


In [6]:
# Show per-class sample counts: the validation split first, then the training split.
# This makes it easy to compare how the sampling step distributed the classes.
for label_series in (y_validation, y_train):
    print(label_series.value_counts())

187
0.0    525
3.0    525
2.0    525
4.0    525
1.0    525
Name: count, dtype: int64
187
0.0    69846
4.0     6431
2.0     5788
1.0     2223
3.0      641
Name: count, dtype: int64


In [7]:
# Rank the training classes from rarest to most frequent and keep the three rarest
class_distribution = y_train.value_counts().sort_values()
minority_classes = class_distribution.iloc[:3].index

print(f"Minority classes: {minority_classes.tolist()}")
print("Class distribution in the training set:", class_distribution, sep="\n")

Minority classes: [3.0, 1.0, 2.0]
Class distribution in the training set:
187
3.0      641
1.0     2223
2.0     5788
4.0     6431
0.0    69846
Name: count, dtype: int64


In [8]:
#%% Prepare Data

import numpy as np
from cnn_lstm_classifier import CNNLSTMClassifier  # project module, expected under ../src
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold

# Convert the training frame to a NumPy array and append a trailing axis of size 1,
# giving (samples, timesteps, 1): the single-channel layout used as the model input
X_data = X_train.values[..., np.newaxis]
y_data = y_train.values

print(f"Input shape for model: {X_data.shape}")
print(f"Labels shape: {y_data.shape}")


Input shape for model: (84929, 187, 1)
Labels shape: (84929,)


## Model Selection & Implementation

In [9]:
from tensorflow import keras
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from cnn_lstm_classifier import CNNLSTMClassifier  # project module, expected under ../src
from data_augmentation import DataAugmentation  # project module, expected under ../src

# Empty containers for evaluation results: accuracies, text reports and confusion matrices
accuracy_scores, classification_reports, confusion_matrices = [], [], []


## Model Initialization and Training

In [10]:
# Libraries used for model evaluation and visualization
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow import keras

# Settings for the CNN + LSTM classifier, gathered in one place
classifier_settings = {
    "input_shape": (187, 1),                # 187 samples per signal, one channel
    "num_classes": np.unique(y_data).size,  # one output per distinct training label
    "learning_rate": 1e-3,
    "batch_size": 32,
    "epochs": 50,
}
model = CNNLSTMClassifier(**classifier_settings)

# Fit on the training arrays; the validation split is passed as the third and fourth arguments
print("\nStarting model training...")
model.fit(X_data, y_data, X_validation, y_validation)


NameError: name 'y_test' is not defined

## Model Evaluation

In [None]:
# Visualize the training run recorded by the classifier wrapper.
# Presumably plot_history draws the accuracy and loss curves, which helps spot
# underfitting or overfitting — TODO confirm against src/cnn_lstm_classifier.py.
model.plot_history()

# Evaluate the in-memory model on the validation split.
# This call is the last expression of the cell, so whatever it returns is shown as the cell output.
# NOTE(review): X_validation is passed as a DataFrame without the trailing channel axis
# that X_data has — confirm the wrapper accepts both layouts.
print("\nEvaluating on Validation Set...")
model.evaluate(X_validation, y_validation)

In [None]:
# Load the saved model from disk to make predictions.
# Presumably this file is the checkpoint written while training the classifier above —
# TODO confirm where 'no_augmentation_saved_model.h5' is saved.
best_model = keras.models.load_model('no_augmentation_saved_model.h5')

# Feed the network the same (samples, 187, 1) array layout used for the training inputs
# instead of the raw (samples, 187) DataFrame, so the input shape is explicit.
X_validation_data = np.expand_dims(X_validation.values, axis=-1)

# Predict on the validation set and take the highest-scoring class for each sample
y_val_pred = best_model.predict(X_validation_data)
y_val_pred_classes = np.argmax(y_val_pred, axis=1)

# Calculate and display accuracy
accuracy = accuracy_score(y_validation, y_val_pred_classes)
print(f"\nValidation Accuracy: {accuracy:.4f}")

# Generate and display classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_validation, y_val_pred_classes, digits=4))

print("\nConfusion Matrix:")
print(confusion_matrix(y_validation, y_val_pred_classes))


## Test Set Evaluation

In [None]:
#%% Evaluate the model with the test set

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow import keras

# Build the test inputs and labels from the test frame loaded at the top of the notebook.
# X_test / y_test were not defined anywhere before this cell, so they are derived here.
# Deriving them from df_test on every run (rather than re-expanding X_test in place)
# keeps the cell safe to re-execute.
X_test = np.expand_dims(df_test.iloc[:, :-1].values, axis=-1)  # (samples, 187, 1)
y_test = df_test.iloc[:, -1].values
print(f"Test set shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

# Predict on the test set with the in-memory model and keep the highest-scoring class.
# NOTE(review): this assumes model.predict returns per-class scores of shape
# (samples, classes) — confirm against src/cnn_lstm_classifier.py.
y_test_pred = model.predict(X_test)
y_test_pred_classes = np.argmax(y_test_pred, axis=1)

# Generate classification report
print("Classification Report on Test Set:")
print(classification_report(y_test, y_test_pred_classes, digits=4))

# Calculate accuracy
test_accuracy = accuracy_score(y_test, y_test_pred_classes)
print(f"Accuracy on Test Set: {test_accuracy:.4f}")

# Plot the confusion matrix and save a copy of the figure to disk
conf_matrix = confusion_matrix(y_test, y_test_pred_classes)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix on Test Set')
plt.savefig("Confusion Matrix on Test Set" , bbox_inches='tight', dpi=300)
plt.show()

In [None]:
import numpy as np
from data_augmentation import DataAugmentation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras

# Robustness check: evaluate the model on a noise-perturbed copy of a holdout set.
# NOTE(review): X_holdout and y_holdout are not defined anywhere in this notebook.
# The only other mention is commented-out code in the preprocessing cell, so this cell
# raises NameError on a fresh kernel. Decide which split (validation or test) is the
# intended holdout and define it before this cell.

# Configure the augmenter so that, judging by the parameter names, only noise is applied
# (semantics of DataAugmentation are defined in src/data_augmentation.py — TODO confirm)
augmenter = DataAugmentation(
    shift_max=0,       # No time shift
    noise_level=0.1,   # Noise magnitude passed to the augmenter
    scale_range=(1.0, 1.0), # No scaling
    expected_length=187
)

# Add a trailing channel axis to the holdout inputs before augmentation
X_holdout_prepared = np.expand_dims(X_holdout, axis=-1)  # Shape should be (samples, 187, 1)
# NOTE(review): self-assignment, has no effect
y_holdout = y_holdout

#%%
print(f"Prepared holdout set shape for augmentation: {X_holdout_prepared.shape}")

# Run the augmenter over the holdout inputs with augmentation_factor=1.0
# (presumably 100% of the samples — TODO confirm in augment_batch)
X_holdout_noisy = augmenter.augment_batch(X_holdout_prepared, augmentation_factor=1.0)

# Sample counts before and after augmentation, used to size the label vector
num_original_samples = len(y_holdout)
num_augmented_samples = X_holdout_noisy.shape[0]

# Build one label per augmented sample.
# NOTE(review): np.repeat repeats each label consecutively (a, a, b, b, ...). When both counts
# are equal the repeat factor is 1 and the labels are unchanged; if augment_batch returns more
# rows than it was given, confirm that its row order matches this label order.
y_holdout_noisy = np.repeat(y_holdout, np.ceil(num_augmented_samples / num_original_samples).astype(int))[:num_augmented_samples]

print(f"Noisy holdout set shape: {X_holdout_noisy.shape}")
print(f"Noisy holdout labels shape: {y_holdout_noisy.shape}")

# Predictions below use the in-memory `model`; no saved checkpoint is loaded here

# Evaluate on the noisy holdout set, keeping the highest-scoring class per sample
print("\nEvaluating on the Noisy Holdout Set...")
y_holdout_pred = model.predict(X_holdout_noisy)
y_holdout_pred_classes = np.argmax(y_holdout_pred, axis=1)

# Calculate accuracy for the noisy holdout set
holdout_accuracy = accuracy_score(y_holdout_noisy, y_holdout_pred_classes)
print(f"Noisy Holdout Set Accuracy: {holdout_accuracy:.4f}")

# Generate classification report for the noisy holdout set
print("\nClassification Report for Noisy Holdout Set:")
print(classification_report(y_holdout_noisy, y_holdout_pred_classes, digits=4))

# Plot the confusion matrix for the noisy holdout set and save a copy of the figure to disk
conf_matrix_holdout = confusion_matrix(y_holdout_noisy, y_holdout_pred_classes)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_holdout, annot=True, cmap='Blues', fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Noisy Holdout Set')
plt.savefig("Confusion Matrix for Noisy Holdout Set" , bbox_inches='tight', dpi=300)
plt.show()

In [None]:
import sys

from tensorflow.keras.utils import plot_model

# Make sure the project sources are importable without adding a duplicate path entry
if '../src' not in sys.path:
    sys.path.append('../src')
from cnn_lstm_classifier import CNNLSTMClassifier  # project module, expected under ../src

# Create a fresh, untrained classifier just for drawing the architecture.
# It is bound to its own name so the trained `model` used by the evaluation cells
# is not replaced by an untrained network.
architecture_model = CNNLSTMClassifier(input_shape=(187, 1), num_classes=5)

# Build the underlying network explicitly so layer shapes are known before plotting
architecture_model.model.build(input_shape=(None, 187, 1))

# Write the architecture diagram to model_plot.png with layer names and shapes
plot_model(architecture_model.model, to_file="model_plot.png", show_shapes=True, show_layer_names=True)
