# TM10007 Assignment template -- ECG data

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [None]:
# Run this to use from Colab environment
!git clone https://github.com/jveenland/tm10007_ml.git

import os
import random
import time
import zipfile

import numpy as np
import pandas as pd
from scipy.stats import uniform, loguniform
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from tqdm import tqdm

# Extract dataset
# The archive is unpacked into the directory it lives in; the same directory is
# reused below so the CSV is read from where it was just extracted.
ecg_dir = '/content/tm10007_ml/ecg'
with zipfile.ZipFile(os.path.join(ecg_dir, 'ecg_data.zip'), 'r') as zip_ref:
    zip_ref.extractall(ecg_dir)


# Load dataset
# Prefer the freshly extracted copy. Fall back to the current working directory
# (the only place this script used to look) in case the archive does not hold
# ecg_data.csv at its top level.
data_path = os.path.join(ecg_dir, 'ecg_data.csv')
if not os.path.exists(data_path):
    data_path = os.path.join(os.getcwd(), 'ecg_data.csv')
data = pd.read_csv(data_path, index_col=0)  # first CSV column becomes the row index

print(f'The number of samples: {len(data.index)}')
print(f'The number of features: {len(data.columns) - 1}')  # Excluding label column

# Split the table into a feature matrix (every column but the last)
# and a label vector (the last column)
feature_columns = data.iloc[:, :-1]
label_column = data.iloc[:, -1]
X, y = feature_columns.values, label_column.values

# Robustly scale the features, using the 30th-70th percentile range as the
# spread estimate; the scaler is fitted on every row of X
scaler = RobustScaler(quantile_range=(30, 70))
scaler.fit(X)
X_scaled = scaler.transform(X)

# Define outer and inner loop random seeds
outer = range(0, 5)  # Outer loop (train-test variations)
# NOTE(review): `inner` is not read by any code visible in this section; the
# hyperparameter loop iterates over `fixed_hyperparam_sets` instead. It is kept
# in case another cell relies on it.
inner = range(0, 5)  # Inner loop (hyperparameter tuning)

best_params_list = {}  # Maps an "Outer i - Inner j" label to hyperparameters plus metrics
hyperparam_accuracies = {}  # Maps str(hyperparameter set) to the list of accuracies it achieved

# Generate a fixed set of random hyperparameter sets (same for all outer loops).
# Dedicated, seeded generators make the sets identical from run to run without
# touching the global `random` / NumPy random state.
HYPERPARAM_SEED = 42
N_HYPERPARAM_SETS = 10
_np_rng = np.random.RandomState(HYPERPARAM_SEED)
_py_rng = random.Random(HYPERPARAM_SEED)

# Frozen distributions are built once instead of once per draw.
_c_dist = loguniform(0.01, 1000)
_gamma_dist = loguniform(0.0001, 100)
# scipy's uniform takes (loc, scale) and samples from [loc, loc + scale], so the
# scale is the width of the interval: this samples tol from [1e-5, 1e-2].
_tol_dist = uniform(loc=1e-5, scale=1e-2 - 1e-5)

fixed_hyperparam_sets = []
for _ in range(N_HYPERPARAM_SETS):
    # Values are converted to plain floats so the string form used as a
    # dictionary key (and in printed output) does not depend on NumPy's repr.
    param_set = {
        'C': float(_c_dist.rvs(random_state=_np_rng)),
        'gamma': float(_gamma_dist.rvs(random_state=_np_rng)),
        'class_weight': _py_rng.choice(['balanced', None]),
        'shrinking': _py_rng.choice([True, False]),
        'tol': float(_tol_dist.rvs(random_state=_np_rng)),
    }
    fixed_hyperparam_sets.append(param_set)
    hyperparam_accuracies[str(param_set)] = []  # Initialize list for accuracy tracking

# Outer loop for train-test split variations
#
# Evaluation protocol implemented below:
#   * each outer iteration holds out 20% of the samples as a test set;
#   * the scaler and the PCA are fitted on the remaining training rows only,
#     so no test-set statistics reach the preprocessing;
#   * each hyperparameter set is trained on a sub-split of the training rows
#     and scored on the validation part of that sub-split;
#   * the set with the highest validation F1 is refitted on all training rows
#     and scored once on the test set.


def _score_binary_classifier(model, X_eval, y_eval):
    """Return accuracy, sensitivity, specificity, AUC and F1 on one evaluation set.

    ``model`` must already be fitted and expose ``predict`` and
    ``predict_proba``. The confusion matrix is unpacked into exactly four
    cells, so this only works for a two-class problem.
    """
    y_pred = model.predict(X_eval)
    # True negatives, False positives, False negatives, True positives
    tn, fp, fn, tp = confusion_matrix(y_eval, y_pred).ravel()
    return {
        'accuracy': accuracy_score(y_eval, y_pred),
        'sensitivity': tp / (tp + fn),
        'specificity': tn / (tn + fp),
        # Column 1 of predict_proba is used as the positive-class score
        'auc': roc_auc_score(y_eval, model.predict_proba(X_eval)[:, 1]),
        'f1-score': f1_score(y_eval, y_pred),
    }


# Test-set metrics of the hyperparameter set selected in each outer iteration
outer_test_results = {}

for outer_rand in tqdm(outer, desc='Outer Loop'):
    # Split the UNSCALED features: the row selection depends only on
    # random_state and the stratification labels, not on the feature values.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=outer_rand, stratify=y  # Maintain class proportions
    )

    # Re-fit the scaler on the training rows only, then apply it to both parts
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Apply PCA to reduce dimensionality (fit only on training data)
    pca = PCA(n_components=0.99)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Print PCA variance explained
    print(f"Explained Variance by PCA Components: {pca.explained_variance_ratio_.sum()}")

    best_val_f1 = -1
    best_param_set = None

    # Inner loop for hyperparameter tuning
    for inner_rand, param_set in tqdm(
        enumerate(fixed_hyperparam_sets),
        total=len(fixed_hyperparam_sets),  # enumerate() has no length, so tell tqdm explicitly
        desc='Inner Loop',
        leave=True,
    ):
        # NOTE: every hyperparameter set gets its own validation split because
        # the split seed is the set's position in the list.
        X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(
            X_train_pca, y_train, test_size=0.15, random_state=inner_rand, stratify=y_train  # Maintain class proportions
        )

        # Initialize SVM model with RBF kernel
        svm = SVC(kernel='rbf', **param_set, probability=True)  # Enable probability for AUC calculation

        # Train and time the model
        start_time = time.time()
        svm.fit(X_train_train, y_train_train)
        elapsed_time = time.time() - start_time  # Calculate elapsed time

        # Score on the validation split; the test set is left untouched here so
        # that it is not used for choosing hyperparameters.
        val_metrics = _score_binary_classifier(svm, X_train_val, y_train_val)

        # Store results
        param_key = str(param_set)  # Convert to string for dict key
        hyperparam_accuracies.setdefault(param_key, []).append(val_metrics['accuracy'])
        best_params_list[f"Outer {outer_rand} - Inner {inner_rand}"] = {**param_set, **val_metrics}

        # Remember the best set of this outer iteration (first one wins on ties)
        if val_metrics['f1-score'] > best_val_f1:
            best_val_f1 = val_metrics['f1-score']
            best_param_set = param_set

        print(f"\nOuter {outer_rand}, Inner {inner_rand} -> Hyperparameters: {param_set}")
        print(f"Validation Accuracy: {val_metrics['accuracy']:.4f}")
        print(f"Sensitivity: {val_metrics['sensitivity']:.4f}")
        print(f"Specificity: {val_metrics['specificity']:.4f}")
        print(f"AUC: {val_metrics['auc']:.4f}")
        print(f"F1 Score: {val_metrics['f1-score']:.4f}")
        print(f"Time taken: {elapsed_time:.2f} seconds")

    # Nothing to evaluate when no hyperparameter set was available
    if best_param_set is None:
        continue

    # Refit the selected set on all training rows and score it once on the test set
    final_svm = SVC(kernel='rbf', **best_param_set, probability=True)
    final_svm.fit(X_train_pca, y_train)
    test_metrics = _score_binary_classifier(final_svm, X_test_pca, y_test)
    outer_test_results[outer_rand] = {**best_param_set, **test_metrics}

    print(f"\nOuter {outer_rand} -> Selected hyperparameters: {best_param_set}")
    print(f"Test Set Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"Test Set Sensitivity: {test_metrics['sensitivity']:.4f}")
    print(f"Test Set Specificity: {test_metrics['specificity']:.4f}")
    print(f"Test Set AUC: {test_metrics['auc']:.4f}")
    print(f"Test Set F1 Score: {test_metrics['f1-score']:.4f}")

# Scan every stored run and keep the one with the highest F1 score.
# A run only replaces the current best when its F1 is strictly higher,
# so the earliest run wins a tie.
best_classifier, best_metrics, best_f1_score = None, {}, -1

for run_label, run_metrics in best_params_list.items():
    run_f1 = run_metrics['f1-score']
    if run_f1 > best_f1_score:
        best_classifier, best_metrics, best_f1_score = run_label, run_metrics, run_f1

# Report which run won and its F1 score
print(f"\n🏆 Best Classifier Based on F1 Score: {best_classifier}")
print(f"F1 Score: {best_f1_score:.4f}")

# Report the remaining metrics stored alongside the winning run
print(f"\nDetailed Metrics of the Best Classifier:")
print(f"Accuracy: {best_metrics['accuracy']:.4f}")
print(f"Sensitivity: {best_metrics['sensitivity']:.4f}")
print(f"Specificity: {best_metrics['specificity']:.4f}")
print(f"AUC: {best_metrics['auc']:.4f}")
print(f"F1 Score: {best_metrics['f1-score']:.4f}")


fatal: destination path 'tm10007_ml' already exists and is not an empty directory.
The number of samples: 827
The number of features: 9000
