# TM10007 Assignment template -- ECG data

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [5]:
# Run this to use from Colab environment
!git clone https://github.com/jveenland/tm10007_ml.git

import zipfile
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Extract dataset
with zipfile.ZipFile('/content/tm10007_ml/ecg/ecg_data.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/tm10007_ml/ecg')

data = pd.read_csv('/content/tm10007_ml/ecg/ecg_data.csv', index_col=0)

print(f'The number of samples: {len(data.index)}')
print(f'The number of features: {len(data.columns) - 1}')  # Excluding label column

# Extract features and labels
X = data.iloc[:, :-1].values  # All columns except the last one
y = data.iloc[:, -1].values   # Last column as labels

# Scale the features
scaler = RobustScaler(quantile_range=(30,70))
X_scaled = scaler.fit_transform(X)

# Split into training (80%) and testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize SVM classifiers with different kernels
svm_linear = SVC(kernel='linear', gamma='scale')
svm_rbf = SVC(kernel='rbf', gamma='scale')
svm_poly = SVC(kernel='poly', degree=3, gamma='scale')

classifiers = {
    "Linear SVM": svm_linear,
    "RBF SVM": svm_rbf,
    "Polynomial SVM": svm_poly
}

# Construct classifiers
degrees = [5, 10, 15]
coef0s = [0.01, 0.5, 1]
slacks = [0.01, 0.5, 1]

# Train and evaluate each SVM model
for degree in degrees:
    for coef0 in coef0s:
        for slack in slacks:
            clf = SVC(kernel='poly', degree=degree, coef0=coef0, C=slack, gamma='scale')
            clf.fit(X_train, y_train)
            y_train_pred = clf.predict(X_train)
            y_test_pred = clf.predict(X_test)

            train_acc = accuracy_score(y_train, y_train_pred)
            test_acc = accuracy_score(y_test, y_test_pred)

            print(f"{name}:")
            print(f"  Training Accuracy: {train_acc:.4f}")
            print(f"  Testing Accuracy: {test_acc:.4f}")
            print("-" * 40)

fatal: destination path 'tm10007_ml' already exists and is not an empty directory.
The number of samples: 827
The number of features: 9000
Polynomial SVM:
  Training Accuracy: 0.8321
  Testing Accuracy: 0.8253
----------------------------------------
Polynomial SVM:
  Training Accuracy: 0.8472
  Testing Accuracy: 0.8253
----------------------------------------
Polynomial SVM:
  Training Accuracy: 0.8502
  Testing Accuracy: 0.8193
----------------------------------------
Polynomial SVM:
  Training Accuracy: 0.8366
  Testing Accuracy: 0.8313
----------------------------------------
Polynomial SVM:
  Training Accuracy: 0.8654
  Testing Accuracy: 0.8133
----------------------------------------
Polynomial SVM:
  Training Accuracy: 0.8729
  Testing Accuracy: 0.8193
----------------------------------------
Polynomial SVM:
  Training Accuracy: 0.8396
  Testing Accuracy: 0.8253
----------------------------------------
Polynomial SVM:
  Training Accuracy: 0.9032
  Testing Accuracy: 0.8193
------