# TM10007 Assignment template -- ECG data

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [None]:
# Run this to use from Colab environment
!git clone https://github.com/jveenland/tm10007_ml.git

import time
import zipfile

import numpy as np
import pandas as pd
from scipy.stats import loguniform, uniform
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVC
from tqdm import tqdm  # Progress bar

# Unpack the ECG archive that ships with the cloned course repository.
with zipfile.ZipFile('/content/tm10007_ml/ecg/ecg_data.zip', 'r') as archive:
    archive.extractall('/content/tm10007_ml/ecg')


# Read the table; the first CSV column is used as the row index.
# NOTE(review): the archive is extracted to /content/tm10007_ml/ecg, but the
# CSV is opened relative to the current working directory -- confirm that
# 'ecg_data.csv' really is present there in the environment this runs in.
data = pd.read_csv('ecg_data.csv', index_col=0)

print(f'The number of samples: {len(data.index)}')
print(f'The number of features: {len(data.columns) - 1}')  # Excluding label column

# Separate the table into a feature matrix and a label vector:
# every column but the last is a feature, the last column is the label.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

# Robust scaling: centre on the median and divide by the 30th-70th
# percentile range instead of the default inter-quartile range.
scaler = RobustScaler(quantile_range=(30, 70))
X_scaled = scaler.fit(X).transform(X)

# Keep just enough principal components to explain 99% of the variance.
# NOTE: both the scaler and the PCA here are fitted on every sample in the
# dataset, so any evaluation on a held-out subset of X_pca has already
# "seen" that subset during preprocessing.
pca = PCA(n_components=0.99)
X_pca = pca.fit_transform(X_scaled)

print(f'Reduced number of features after PCA: {X_pca.shape[1]}')

# Repeated random-search tuning of an RBF-kernel SVM.
#
# For every outer seed the data is split 80/20 into train/test; for every
# inner seed a further 85% subset of the training part is tuned with a
# 5-fold cross-validated randomized search, and the winning model is scored
# on the outer test split.

# Random seeds for the outer and inner loops
outer = range(0, 5)  # Outer loop (train-test variations)
inner = range(0, 5)  # Inner loop (hyperparameter tuning)

# Maps "Outer <seed> - Inner <seed>" to the best hyperparameters of that run
# plus an 'accuracy' entry holding the test-set accuracy.
# (It is a dict despite its name; the name is kept for compatibility.)
best_params_list = {}

# The search space and the base estimator do not depend on the loop
# variables, so they are built once. RandomizedSearchCV clones the estimator
# and draws from the distributions using its own random_state.
param_distributions = {
    'C': loguniform(0.01, 500),  # Log-uniform for better coverage
    'gamma': loguniform(0.0001, 10),
    'class_weight': ['balanced', None],
    'shrinking': [True, False],
    # scipy's uniform(loc, scale) samples from [loc, loc + scale]
    'tol': uniform(1e-5, 1e-2),
}
svm = SVC(kernel='rbf')

# Outer loop for train-test split variations
for outer_rand in tqdm(outer, desc='Outer Loop'):
    # Split the RAW features (X, y from the loading step). The globally
    # fitted X_pca is deliberately not used here: its scaler and PCA were
    # fitted on all samples, which would leak test-set statistics into
    # training. train_test_split selects rows by index, so the same seed
    # yields the same sample partition as splitting X_pca would.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=outer_rand
    )

    # Fit the preprocessing on the training rows only, then apply the
    # fitted transforms unchanged to the test rows.
    split_scaler = RobustScaler(quantile_range=(30, 70))
    split_pca = PCA(n_components=0.99)
    X_train = split_pca.fit_transform(split_scaler.fit_transform(X_train_raw))
    X_test = split_pca.transform(split_scaler.transform(X_test_raw))

    # Inner loop for hyperparameter tuning
    for inner_rand in tqdm(inner, desc='Inner Loop', leave=True):
        # NOTE: the 15% hold-out (X_train_val / y_train_val) is not used
        # below -- model selection relies on the search's own 5-fold CV.
        # The split is kept because it determines the tuning subset.
        X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(
            X_train, y_train, test_size=0.15, random_state=inner_rand
        )

        # Perform Randomized Search
        random_search = RandomizedSearchCV(
            estimator=svm,
            param_distributions=param_distributions,
            n_iter=10,  # Sample 10 hyperparameter combinations
            scoring='accuracy',
            cv=5,
            n_jobs=-1,
            random_state=inner_rand,
        )

        # Time the search with a monotonic, high-resolution clock
        start_time = time.perf_counter()
        random_search.fit(X_train_train, y_train_train)
        elapsed_time = time.perf_counter() - start_time

        # Score the refitted best model on the outer test split
        best_params = random_search.best_params_
        best_model = random_search.best_estimator_
        y_pred = best_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Store a COPY extended with the accuracy, so that the search
        # object's own best_params_ dict is not mutated and the
        # hyperparameter print below shows hyperparameters only.
        best_params_list[f"Outer {outer_rand} - Inner {inner_rand}"] = {
            **best_params,
            'accuracy': accuracy,
        }

        print(f"\nOuter {outer_rand}, Inner {inner_rand} -> Best Hyperparameters: {best_params}")
        print(f"Test Set Accuracy: {accuracy:.4f}")
        print(f"Time taken: {elapsed_time:.2f} seconds")

# Print all best parameters
print("\nBest Parameters Summary:")
for key, val in best_params_list.items():
    print(f"{key}: {val}")

print(best_params_list)


fatal: destination path 'tm10007_ml' already exists and is not an empty directory.
The number of samples: 827
The number of features: 9000
