In [23]:
# Clone the repository

# Import necessary libraries
import zipfile
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn

# Scikit-learn modules
from sklearn import datasets as ds, metrics, model_selection, feature_selection, preprocessing, neighbors, svm
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

def colorplot(clf, ax, x, y, h=100, precomputer=None):
    '''
    Overlay the decision areas as colors in an axes.

    Input:
        clf: trained classifier
        ax: axis to overlay color mesh on
        x: feature on x-axis
        y: feature on y-axis
        h(optional): steps in the mesh
    '''
    # Create a meshgrid the size of the axis
    xstep = (x.max() - x.min()) / 20.0
    ystep = (y.max() - y.min()) / 20.0
    x_min, x_max = x.min() - xstep, x.max() + xstep
    y_min, y_max = y.min() - ystep, y.max() + ystep
    h = max((x_max - x_min, y_max - y_min)) / h
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    
    features = np.c_[xx.ravel(), yy.ravel()]
    if precomputer is not None:
        if type(precomputer) is RBFSampler:
            features = precomputer.transform(features)
        elif precomputer is rbf_kernel:
            features = rbf_kernel(features, X)
    
    # Compute decision boundary
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(features)
    else:
        Z = clf.predict_proba(features)
    if len(Z.shape) > 1:
        Z = Z[:, 1]
    
    # Put the result into a color plot
    cm = plt.cm.RdBu_r
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
    
    del xx, yy, x_min, x_max, y_min, y_max, Z, cm

# Extract ECG data
zip_path = 'ecg_data.zip'
extract_path = '.'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Load dataset
data_path = os.path.join(extract_path, 'ecg_data.csv')
data = pd.read_csv(data_path, index_col=0)

# Print dataset info
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')


The number of samples: 827
The number of columns: 9001


In [24]:
# Extracting features and labels
x = data.iloc[:, 0:-1].values  # Selecting all columns except last
y = data.iloc[:, -1].values  # Selecting last column as labels

In [81]:
# Build a forest and compute the feature importances
forest = RandomForestClassifier(n_estimators=100)

forest.fit(x, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

# for f in range(x.shape[1]):
#     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Select only the top 300 features
valuable_x = x[:, indices[:100]]
# valuable_x = pd.DataFrame(valuable_x, columns=[f'feature_{i}' for i in indices[:300]])



Feature ranking:


In [89]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, f_classif

# Repeat the experiment 20 times
sss = StratifiedShuffleSplit(n_splits=20, test_size=0.5, random_state=0)
train_scores = []
test_scores = []

# Adjusted hyperparameter grid
param_dist = {
    'n_estimators': [50, 100, 200],  # Reduced n_estimators
    'max_depth': [5, 10, 15],  # Reduced max_depth range
    'min_samples_split': [2, 5, 10, 20],  # Keep min_samples_split moderate
    'min_samples_leaf': [5, 10, 20],  # Increased min_samples_leaf
    'max_features': ['sqrt', 'log2']  # Removed None to focus on these options
}

for train_index, test_index in tqdm(sss.split(valuable_x, y), total=20, desc="Training and Testing", unit="split"):
    split_X_train, split_y_train = valuable_x[train_index], y[train_index]
    split_X_test, split_y_test = valuable_x[test_index], y[test_index]

    # Perform RandomizedSearchCV to tune hyperparameters
    base_clf = RandomForestClassifier()
    clf = RandomizedSearchCV(base_clf, param_dist, n_iter=30, cv=3, n_jobs=-1, scoring='balanced_accuracy')
    clf.fit(split_X_train, split_y_train)
    best_clf = clf.best_estimator_

    # Loop through all the possible number of features (1 to max features)
    for num_features in range(1, split_X_train.shape[1] + 1):  # Looping over all feature counts
        # Select the top k features dynamically
        feature_selector = SelectKBest(f_classif, k=num_features)
        split_X_train_selected = feature_selector.fit_transform(split_X_train, split_y_train)
        split_X_test_selected = feature_selector.transform(split_X_test)

        # Train the classifier on the selected features
        best_clf.fit(split_X_train_selected, split_y_train)

        # Predictions
        y_pred_train = best_clf.predict(split_X_train_selected)
        y_pred_test = best_clf.predict(split_X_test_selected)

        # Compute scores for training set
        y_score_train = best_clf.predict_proba(split_X_train_selected)[:, 1] if hasattr(best_clf, 'predict_proba') else y_pred_train
        auc_train = metrics.roc_auc_score(split_y_train, y_score_train)
        accuracy_train = metrics.accuracy_score(split_y_train, y_pred_train)
        F1_train = metrics.f1_score(split_y_train, y_pred_train)
        precision_train = metrics.precision_score(split_y_train, y_pred_train)
        recall_train = metrics.recall_score(split_y_train, y_pred_train)
        train_scores.append((accuracy_train, auc_train, F1_train, precision_train, recall_train))

        # Compute scores for test set
        y_score_test = best_clf.predict_proba(split_X_test_selected)[:, 1] if hasattr(best_clf, 'predict_proba') else y_pred_test
        auc_test = metrics.roc_auc_score(split_y_test, y_score_test)
        accuracy_test = metrics.accuracy_score(split_y_test, y_pred_test)
        F1_test = metrics.f1_score(split_y_test, y_pred_test)
        precision_test = metrics.precision_score(split_y_test, y_pred_test)
        recall_test = metrics.recall_score(split_y_test, y_pred_test)
        test_scores.append((accuracy_test, auc_test, F1_test, precision_test, recall_test))

# Print evaluation metrics for the last split
print(f'Best Parameters: {clf.best_params_}')
print(f'Train Accuracy: {accuracy_train}')
print(f'Train AUC: {auc_train}')
print(f'Train F1: {F1_train}')
print(f'Train Precision: {precision_train}')
print(f'Train Recall: {recall_train}')
print(f'Test Accuracy: {accuracy_test}')
print(f'Test AUC: {auc_test}')
print(f'Test F1: {F1_test}')
print(f'Test Precision: {precision_test}')
print(f'Test Recall: {recall_test}')


Training and Testing: 100%|██████████| 20/20 [03:22<00:00, 10.14s/split]

Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 10}
Train Accuracy: 0.9152542372881356
Train AUC: 0.9973408541498792
Train F1: 0.6846846846846847
Train Precision: 1.0
Train Recall: 0.5205479452054794
Test Accuracy: 0.8695652173913043
Test AUC: 0.8731370264733057
Test F1: 0.4489795918367347
Test Precision: 0.88
Test Recall: 0.3013698630136986





In [83]:
from sklearn.preprocessing import RobustScaler
import pandas as pd

x = data.iloc[:, 0:-1].values  # Selecting all columns except the first and last
y = data.iloc[:, -1].values  # Selecting last column as labels

scaler = RobustScaler(quantile_range=(25,75))
scaled_valuable_x = scaler.fit_transform(valuable_x)

# Manually specify column names (assuming you know the original column names)
# scaled_valuable_x = pd.DataFrame(scaled_data, columns=data.columns[0:-1])  # Use the same column names from data

print(scaled_valuable_x.shape)

(827, 100)


In [88]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn import metrics

# Repeat the experiment 20 times
sss = StratifiedShuffleSplit(n_splits=20, test_size=0.5, random_state=0)
train_scores = []
test_scores = []

# Adjusted hyperparameter grid for SVM
param_dist = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf'],  # Different kernels for SVM
    'degree': [2, 3, 4],  # Degree of the polynomial kernel (if using 'poly')
    'gamma': ['scale', 'auto'],  # Kernel coefficient for 'rbf' and 'poly'
}

for train_index, test_index in tqdm(sss.split(scaled_valuable_x, y), total=20, desc="Training and Testing", unit="split"):
    split_X_train, split_y_train = scaled_valuable_x[train_index], y[train_index]
    split_X_test, split_y_test = scaled_valuable_x[test_index], y[test_index]

    # Hyperparameter tuning with RandomizedSearchCV for SVM
    base_clf = SVC()
    clf = RandomizedSearchCV(base_clf, param_dist, n_iter=30, cv=3, n_jobs=-1, scoring='balanced_accuracy')  # Changed to balanced_accuracy
    clf.fit(split_X_train, split_y_train)
    best_clf = clf.best_estimator_

    y_pred_train = best_clf.predict(split_X_train)
    y_pred_test = best_clf.predict(split_X_test)

    # Compute scores for training set
    y_score_train = best_clf.decision_function(split_X_train) if hasattr(best_clf, 'decision_function') else y_pred_train
    auc_train = metrics.roc_auc_score(split_y_train, y_score_train)
    accuracy_train = metrics.accuracy_score(split_y_train, y_pred_train)
    F1_train = metrics.f1_score(split_y_train, y_pred_train)
    precision_train = metrics.precision_score(split_y_train, y_pred_train)
    recall_train = metrics.recall_score(split_y_train, y_pred_train)
    train_scores.append((accuracy_train, auc_train, F1_train, precision_train, recall_train))

    # Compute scores for test set
    y_score_test = best_clf.decision_function(split_X_test) if hasattr(best_clf, 'decision_function') else y_pred_test
    auc_test = metrics.roc_auc_score(split_y_test, y_score_test)
    accuracy_test = metrics.accuracy_score(split_y_test, y_pred_test)
    F1_test = metrics.f1_score(split_y_test, y_pred_test)
    precision_test = metrics.precision_score(split_y_test, y_pred_test)
    recall_test = metrics.recall_score(split_y_test, y_pred_test)
    test_scores.append((accuracy_test, auc_test, F1_test, precision_test, recall_test))

# Print evaluation metrics for the last split
print(type(best_clf))
print('Best Parameters:', clf.best_params_)
print('Train Acc:', accuracy_train)
print('Train AUC:', auc_train)
print('Train F1:', F1_train)
print('Train Precision:', precision_train)
print('Train Recall:', recall_train)
print('Test Acc:', accuracy_test)
print('Test AUC:', auc_test)
print('Test F1:', F1_test)
print('Test Precision:', precision_test)
print('Test Recall:', recall_test)


Training and Testing: 100%|██████████| 20/20 [00:05<00:00,  3.43split/s]

<class 'sklearn.svm._classes.SVC'>
Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'degree': 2, 'C': 10}
Train Acc: 0.9539951573849879
Train AUC: 0.9649073327961322
Train F1: 0.8503937007874016
Train Precision: 1.0
Train Recall: 0.7397260273972602
Test Acc: 0.8647342995169082
Test AUC: 0.8412806813160327
Test F1: 0.5254237288135594
Test Precision: 0.6888888888888889
Test Recall: 0.4246575342465753





In [None]:
## XGB Classifier
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit, RandomizedSearchCV
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np

# Repeat the experiment 20 times
sss = StratifiedShuffleSplit(n_splits=20, test_size=0.5, random_state=0)
train_scores = []
test_scores = []

# Adjusted hyperparameter grid with stronger regularization
param_dist = {
    'n_estimators': [100, 200, 300],  # Lowered n_estimators
    'max_depth': [2, 3, 4],  # Keeping depth shallow
    'learning_rate': [0.05, 0.1, 0.2],  # Slightly varying learning rate
    'subsample': [0.7, 0.8, 0.9],  # Reduce subsampling slightly
    'colsample_bytree': [0.7, 0.8],  # Feature subsampling
    'gamma': [0.1, 0.5, 1.0],  # Higher regularization
    'lambda': [0.1, 0.5, 1.0],  # L2 regularization
    'alpha': [0.1, 0.5, 1.0]  # L1 regularization
}

for train_index, test_index in tqdm(sss.split(valuable_x, y), total=20, desc="Training and Testing", unit="split"):
    split_X_train, split_y_train = valuable_x[train_index], y[train_index]
    split_X_test, split_y_test = valuable_x[test_index], y[test_index]

    # Perform RandomizedSearchCV to tune hyperparameters
    base_clf = XGBClassifier(use_label_encoder=False, verbosity=0)
    clf = RandomizedSearchCV(base_clf, param_dist, n_iter=30, cv=3, n_jobs=-1, scoring='balanced_accuracy')
    clf.fit(split_X_train, split_y_train)
    best_clf = clf.best_estimator_

    # Instead of looping over all features, test only meaningful fractions
    feature_fractions = [0.25, 0.5, 0.75]  # Use only 25%, 50%, 75% of features
    for frac in feature_fractions:
        num_features = int(split_X_train.shape[1] * frac)
        num_features = max(1, num_features)  # Ensure at least 1 feature

        feature_selector = SelectKBest(f_classif, k=num_features)
        split_X_train_selected = feature_selector.fit_transform(split_X_train, split_y_train)
        split_X_test_selected = feature_selector.transform(split_X_test)

        # Train the classifier on the selected features
        best_clf.fit(split_X_train_selected, split_y_train)

        # Predictions
        y_pred_train = best_clf.predict(split_X_train_selected)
        y_pred_test = best_clf.predict(split_X_test_selected)

        # Compute scores for training set
        y_score_train = best_clf.predict_proba(split_X_train_selected)[:, 1] if hasattr(best_clf, 'predict_proba') else y_pred_train
        auc_train = metrics.roc_auc_score(split_y_train, y_score_train)
        accuracy_train = metrics.accuracy_score(split_y_train, y_pred_train)
        F1_train = metrics.f1_score(split_y_train, y_pred_train)
        precision_train = metrics.precision_score(split_y_train, y_pred_train)
        recall_train = metrics.recall_score(split_y_train, y_pred_train)
        train_scores.append((accuracy_train, auc_train, F1_train, precision_train, recall_train))

        # Compute scores for test set
        y_score_test = best_clf.predict_proba(split_X_test_selected)[:, 1] if hasattr(best_clf, 'predict_proba') else y_pred_test
        auc_test = metrics.roc_auc_score(split_y_test, y_score_test)
        accuracy_test = metrics.accuracy_score(split_y_test, y_pred_test)
        F1_test = metrics.f1_score(split_y_test, y_pred_test)
        precision_test = metrics.precision_score(split_y_test, y_pred_test)
        recall_test = metrics.recall_score(split_y_test, y_pred_test)
        test_scores.append((accuracy_test, auc_test, F1_test, precision_test, recall_test))

# Print evaluation metrics for the last split
print(f'Best Parameters: {clf.best_params_}')
print(f'Train Accuracy: {accuracy_train}')
print(f'Train AUC: {auc_train}')
print(f'Train F1: {F1_train}')
print(f'Train Precision: {precision_train}')
print(f'Train Recall: {recall_train}')
print(f'Test Accuracy: {accuracy_test}')
print(f'Test AUC: {auc_test}')
print(f'Test F1: {F1_test}')
print(f'Test Precision: {precision_test}')
print(f'Test Recall: {recall_test}')


Training and Testing: 100%|██████████| 20/20 [00:35<00:00,  1.80s/split]

Best Parameters: {'subsample': 0.9, 'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.1, 'lambda': 1.0, 'gamma': 0.5, 'colsample_bytree': 0.7, 'alpha': 1.0}
Train Accuracy: 0.9927360774818402
Train AUC: 1.0
Train F1: 0.9790209790209791
Train Precision: 1.0
Train Recall: 0.958904109589041
Test Accuracy: 0.8695652173913043
Test AUC: 0.8332061222030289
Test F1: 0.5263157894736842
Test Precision: 0.7317073170731707
Test Recall: 0.410958904109589





In [87]:
## XGBClassifier
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np

# Fixed best parameters
best_params = {
    'subsample': 0.9,
    'n_estimators': 200,
    'max_depth': 2,
    'learning_rate': 0.1,
    'lambda': 1.0,
    'gamma': 0.5,
    'colsample_bytree': 0.7,
    'alpha': 1.0
}

# Repeat the experiment 20 times
sss = StratifiedShuffleSplit(n_splits=20, test_size=0.5, random_state=0)
train_scores = []
test_scores = []

for train_index, test_index in tqdm(sss.split(valuable_x, y), total=20, desc="Training and Testing", unit="split"):
    split_X_train, split_y_train = valuable_x[train_index], y[train_index]
    split_X_test, split_y_test = valuable_x[test_index], y[test_index]

    # Initialize classifier with fixed best parameters
    best_clf = XGBClassifier(**best_params, use_label_encoder=False, verbosity=0)
    
    # Feature selection (using 50% of features as an example)
    num_features = int(split_X_train.shape[1] * 0.5)
    num_features = max(1, num_features)  # Ensure at least 1 feature

    feature_selector = SelectKBest(f_classif, k=num_features)
    split_X_train_selected = feature_selector.fit_transform(split_X_train, split_y_train)
    split_X_test_selected = feature_selector.transform(split_X_test)

    # Train classifier on selected features
    best_clf.fit(split_X_train_selected, split_y_train)

    # Predictions
    y_pred_train = best_clf.predict(split_X_train_selected)
    y_pred_test = best_clf.predict(split_X_test_selected)

    # Compute scores for training set
    y_score_train = best_clf.predict_proba(split_X_train_selected)[:, 1]
    auc_train = metrics.roc_auc_score(split_y_train, y_score_train)
    accuracy_train = metrics.accuracy_score(split_y_train, y_pred_train)
    F1_train = metrics.f1_score(split_y_train, y_pred_train)
    precision_train = metrics.precision_score(split_y_train, y_pred_train)
    recall_train = metrics.recall_score(split_y_train, y_pred_train)
    train_scores.append((accuracy_train, auc_train, F1_train, precision_train, recall_train))

    # Compute scores for test set
    y_score_test = best_clf.predict_proba(split_X_test_selected)[:, 1]
    auc_test = metrics.roc_auc_score(split_y_test, y_score_test)
    accuracy_test = metrics.accuracy_score(split_y_test, y_pred_test)
    F1_test = metrics.f1_score(split_y_test, y_pred_test)
    precision_test = metrics.precision_score(split_y_test, y_pred_test)
    recall_test = metrics.recall_score(split_y_test, y_pred_test)
    test_scores.append((accuracy_test, auc_test, F1_test, precision_test, recall_test))

# Print evaluation metrics for the last split
print(f'Train Accuracy: {accuracy_train}')
print(f'Train AUC: {auc_train}')
print(f'Train F1: {F1_train}')
print(f'Train Precision: {precision_train}')
print(f'Train Recall: {recall_train}')
print(f'Test Accuracy: {accuracy_test}')
print(f'Test AUC: {auc_test}')
print(f'Test F1: {F1_test}')
print(f'Test Precision: {precision_test}')
print(f'Test Recall: {recall_test}')


Training and Testing: 100%|██████████| 20/20 [00:01<00:00, 10.42split/s]

Train Accuracy: 0.9903147699757869
Train AUC: 0.9997179693795326
Train F1: 0.971830985915493
Train Precision: 1.0
Train Recall: 0.9452054794520548
Test Accuracy: 0.857487922705314
Test AUC: 0.8309163218575503
Test F1: 0.48695652173913045
Test Precision: 0.6666666666666666
Test Recall: 0.3835616438356164





(827, 100)
