# TM10007 Assignment template -- ECG data

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [None]:
# Run this to use from Colab environment
# NOTE: this is an IPython shell escape, not plain Python. When the cell is
# re-run, git reports that 'tm10007_ml' already exists and does not clone
# again; the rest of the cell still runs against the existing checkout.
!git clone https://github.com/jveenland/tm10007_ml.git

import zipfile
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler

# Directory that holds the zipped ECG data inside the cloned repository.
ECG_DIR = '/content/tm10007_ml/ecg'

# Extract dataset
with zipfile.ZipFile(os.path.join(ECG_DIR, 'ecg_data.zip'), 'r') as zip_ref:
    zip_ref.extractall(ECG_DIR)

# Load dataset from the directory it was just extracted into, so the read does
# not depend on the notebook's current working directory.
# NOTE(review): assumes the archive stores 'ecg_data.csv' at its top level --
# confirm against the zip contents.
data = pd.read_csv(os.path.join(ECG_DIR, 'ecg_data.csv'), index_col=0)

print(f'The number of samples: {len(data.index)}')
print(f'The number of features: {len(data.columns) - 1}')  # Excluding label column

# Extract features and labels
X = data.iloc[:, :-1].values  # All columns except the last one
y = data.iloc[:, -1].values   # Last column as labels
# NOTE(review): the assignment text warns about missing values; nothing here
# imputes or drops them -- confirm the data is complete before scaling/PCA.

# Decide the training (80%) / testing (20%) partition BEFORE fitting any
# transform, so that test samples never influence the scaler or the PCA basis.
# The shuffle depends only on the number of samples, test_size and
# random_state, so splitting row indices selects the same rows as splitting
# the transformed matrix would.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

# Scale the features: centre/scale statistics come from the training rows only
scaler = RobustScaler(quantile_range=(30, 70))
scaler.fit(X[train_idx])
X_scaled = scaler.transform(X)

# Apply PCA to reduce dimensionality while preserving 99% of the variance;
# the components are estimated on the training rows only.
pca = PCA(n_components=0.99)  # Retain 99% of the (training) variance
pca.fit(X_scaled[train_idx])
X_pca = pca.transform(X_scaled)

print(f'Reduced number of features after PCA: {X_pca.shape[1]}')

print("Before PCA: Mean & Variance")
print(np.mean(X_scaled, axis=0)[:10])  # Print first 10 features
print(np.var(X_scaled, axis=0)[:10])

print("\nAfter PCA: Mean & Variance")
print(np.mean(X_pca, axis=0)[:10])
print(np.var(X_pca, axis=0)[:10])


# Materialise the training (80%) and testing (20%) sets from the fixed partition
X_train, X_test = X_pca[train_idx], X_pca[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Search space sampled by the randomized search
param_distributions = dict(
    C=uniform(0.1, 10),                       # regularization strength
    gamma=['scale', 'auto', 0.01, 0.1, 1],    # kernel coefficient
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],  # candidate kernel types
)

# Base estimator whose hyperparameters are tuned
svm = SVC()

# Randomized hyperparameter search: 20 sampled settings, each scored by
# accuracy with 5-fold cross-validation, running on all available cores.
random_search = RandomizedSearchCV(
    svm,
    param_distributions,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Cross-validation happens inside the training split only
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validation score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation accuracy: ", random_search.best_score_)

# Score the refitted best estimator on the held-out test split
best_model = random_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test set accuracy: ", test_accuracy)


fatal: destination path 'tm10007_ml' already exists and is not an empty directory.
The number of samples: 827
The number of features: 9000
