In [1]:
import numpy as np
import wfdb
from scipy.signal import butter, lfilter
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Define the normal and abnormal classes
normal_classes = ['N', 'L', 'R', 'e', 'j']
abnormal_classes = ['A', 'a', 'J', 'S']
beat_length = 50  # determined by inspection of data
mapping = {'N': 0, 'L': 0, 'R': 0, 'e': 0, 'j': 0, 'A': 1, 'a': 1, 'J': 1, 'S': 1}

samples = []          # list of beats (each beat is equal to 50 ECG readings)
sample_labels = []    # list of labels of each beat 

In [3]:
for j in range(100,235):
    path = "../MIT/{}".format(j)
    try:
        signals, fields = wfdb.rdsamp(path)

        annotation = wfdb.rdann(path, 'atr')

        signal = signals[:, 0]  
        labels = annotation.symbol
        #loop to separate and store beats with their labels
        for i in range(len(labels)):
            if labels[i] in normal_classes or labels[i] in abnormal_classes:
                
                beat_start = annotation.sample[i]
                beat_end = annotation.sample[i+1] if i+1 < len(annotation.sample) else len(signal)
                beat = signal[beat_start:beat_end]

                
                if len(beat) < beat_length:
                    padded_beat = np.pad(beat, (0, beat_length - len(beat)), mode='constant')
                    samples.append(padded_beat)
                else:
                    truncated_beat = beat[:beat_length]
                    samples.append(truncated_beat)
                sample_labels.append(mapping[labels[i]])

        # for i in range(len(labels)):
        #     if labels[i] in normal_classes or labels[i] in abnormal_classes:
            
        #         beat_start = annotation.sample[i]-int(beat_length/2)
        #         beat_end = annotation.sample[i]+int(beat_length/2)
        #         beat = signal[beat_start:beat_end]
        #         samples.append(beat)
        #         sample_labels.append(mapping[labels[i]])    
        
    
    except:
        continue


X = np.array(samples)
y = np.array(sample_labels)

In [4]:
samples[0]

array([ 0.84 ,  0.765,  0.52 ,  0.17 , -0.165, -0.365, -0.435, -0.425,
       -0.37 , -0.33 , -0.325, -0.335, -0.345, -0.33 , -0.325, -0.315,
       -0.31 , -0.32 , -0.335, -0.34 , -0.325, -0.345, -0.335, -0.33 ,
       -0.335, -0.33 , -0.325, -0.33 , -0.33 , -0.345, -0.355, -0.335,
       -0.325, -0.305, -0.32 , -0.32 , -0.33 , -0.34 , -0.335, -0.34 ,
       -0.345, -0.355, -0.355, -0.34 , -0.33 , -0.33 , -0.33 , -0.34 ,
       -0.35 , -0.325])

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
samples[0]


array([ 0.84 ,  0.765,  0.52 ,  0.17 , -0.165, -0.365, -0.435, -0.425,
       -0.37 , -0.33 , -0.325, -0.335, -0.345, -0.33 , -0.325, -0.315,
       -0.31 , -0.32 , -0.335, -0.34 , -0.325, -0.345, -0.335, -0.33 ,
       -0.335, -0.33 , -0.325, -0.33 , -0.33 , -0.345, -0.355, -0.335,
       -0.325, -0.305, -0.32 , -0.32 , -0.33 , -0.34 , -0.335, -0.34 ,
       -0.345, -0.355, -0.355, -0.34 , -0.33 , -0.33 , -0.33 , -0.34 ,
       -0.35 , -0.325])

In [7]:
len(sample_labels)

93412

# Random Forest 

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [16]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)

clf.fit(X_train_pca, y_train)

y_pred = clf.predict(X_test_pca)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     18150
           1       0.86      0.53      0.65       533

    accuracy                           0.98     18683
   macro avg       0.93      0.76      0.82     18683
weighted avg       0.98      0.98      0.98     18683



# XgBoost

In [8]:
import xgboost as xgb

In [9]:
import xgboost as xgb
from sklearn.metrics import classification_report

# Initialize the XGBClassifier
xgb_clf = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Fit the model on the training data
xgb_clf.fit(X_train, y_train)

# Predict on the test data
y_pred = xgb_clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99     18150
           1       0.85      0.56      0.68       533

    accuracy                           0.98     18683
   macro avg       0.92      0.78      0.84     18683
weighted avg       0.98      0.98      0.98     18683



# LGBM

In [10]:
import lightgbm as lgb

In [11]:
# Train and evaluate the classifier using LightGBM
dtrain = lgb.Dataset(X_train, label=y_train)
dtest = lgb.Dataset(X_test, label=y_test, reference=dtrain)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 3,
    'learning_rate': 0.1,
    'seed': 42
}

callbacks = [lgb.early_stopping(stopping_rounds=10)]

bst = lgb.train(params, dtrain, num_boost_round=100, valid_sets=[dtest], callbacks=callbacks)
y_pred_prob = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred = (y_pred_prob > 0.5).astype(int)

# Generate the classification report
report = classification_report(y_test, y_pred, target_names=['Normal', 'Abnormal'])
print(report)

[LightGBM] [Info] Number of positive: 2248, number of negative: 72481
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 74729, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.030082 -> initscore=-3.473284
[LightGBM] [Info] Start training from score -3.473284
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.0694284
              precision    recall  f1-score   support

      Normal       0.98      1.00      0.99     18150
    Abnormal       0.87      0.43      0.57       533

    accuracy                           0.98     18683
   macro avg       0.93      0.71      0.78     18683
weighted avg       0.98      0.98      0.98     18683



# CNN

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [13]:
# Reshape data for CNN input
X = X.reshape(X.shape[0], X.shape[1], 1)

# Convert labels to categorical format
y = to_categorical(y, num_classes=2)

In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the CNN model
model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(beat_length, 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Generate the classification report
report = classification_report(y_test_labels, y_pred, target_names=['Normal', 'Abnormal'])
print(report)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1869/1869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - accuracy: 0.9640 - loss: 0.1381 - val_accuracy: 0.9742 - val_loss: 0.0945
Epoch 2/10
[1m1869/1869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9762 - loss: 0.0854 - val_accuracy: 0.9789 - val_loss: 0.0827
Epoch 3/10
[1m1869/1869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.9787 - loss: 0.0818 - val_accuracy: 0.9803 - val_loss: 0.0797
Epoch 4/10
[1m1869/1869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9798 - loss: 0.0783 - val_accuracy: 0.9777 - val_loss: 0.0803
Epoch 5/10
[1m1869/1869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.9793 - loss: 0.0771 - val_accuracy: 0.9806 - val_loss: 0.0765
Epoch 6/10
[1m1869/1869[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 6ms/step - accuracy: 0.9803 - loss: 0.0745 - val_accuracy: 0.9813 - val_loss: 0.0732
Epoch 7/10
[

In [17]:
from sklearn.metrics import classification_report


In [18]:
import joblib

# Assuming xgb_clf is your trained XGBClassifier
joblib.dump(xgb_clf, 'xgb_model.pkl')


['xgb_model.pkl']