In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedGroupKFold, cross_validate


In [None]:
# Load the processed feature table. Later cells read its `label` and
# `patient_id` columns and treat every remaining column as a model input.
DATA_PATH = "data/processed/windows_21_10_balanced_avg_energy.csv"
data = pd.read_csv(DATA_PATH)
data

In [None]:
# Split the loaded table into the three pieces used for cross-validation:
#   X      - every column except the target and the patient identifier
#   y      - binary target encoded as 'bckg' -> 0, 'seiz' -> 1
#   groups - patient_id, passed to the CV splitter as the grouping key
X = data.drop(columns=["label", "patient_id"])
y = data["label"].map({'bckg': 0, 'seiz': 1})
groups = data["patient_id"]

# Series.map yields NaN for any value that is absent from the mapping.
# Fail here with a clear message instead of letting NaN targets reach the
# estimators and surface as an obscure error (or silently skew the counts).
unexpected_labels = data.loc[y.isna(), "label"].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unmapped values in 'label' column: {list(unexpected_labels)}"
    )

In [None]:
# Class-balance check: non-null row count for each encoded label,
# shown as a (count for 0, count for 1) tuple.
y_frame = y.to_frame()
tuple(y_frame[y_frame["label"] == code].count() for code in (0, 1))

In [None]:
def remove_highly_correlated_features(X, threshold=0.95):
    """Drop features that are strongly correlated with an earlier column.

    For every pair of columns the absolute correlation (``DataFrame.corr``
    default method) is inspected once; the later column of a pair is dropped
    when that value is strictly greater than ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is not modified.
    threshold : float, default 0.95
        Absolute-correlation cut-off above which a column is removed.

    Returns
    -------
    X_reduced : pandas.DataFrame
        Copy of ``X`` without the dropped columns.
    to_drop : list
        Names of the removed columns, in their original column order.
    """
    abs_corr = X.corr().abs()

    # True strictly above the diagonal: each pair is considered once and a
    # column's correlation with itself is ignored.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    pairwise = abs_corr.where(above_diagonal)

    # A column is redundant if any column to its left exceeds the threshold.
    is_redundant = (pairwise > threshold).any(axis=0)
    to_drop = is_redundant[is_redundant].index.tolist()

    X_reduced = X.drop(columns=to_drop)
    return X_reduced, to_drop

# Prune redundant features (absolute correlation above 0.95) and keep the
# names of the removed columns available for inspection.
# The list is bound to a real name rather than `_`: assigning to `_` in
# IPython stops the shell from updating it with the last cell output.
# NOTE(review): the filter is computed on all rows before cross-validation,
# so held-out folds influence which features are kept - consider whether it
# should be moved inside the pipeline.
X, dropped_features = remove_highly_correlated_features(X, 0.95)
X

In [None]:
# Five folds that keep each group in a single fold while preserving the
# class ratio as far as the grouping allows.
cv = StratifiedGroupKFold(n_splits=5)

# Metrics reported for every model.
# NOTE: with average='weighted', recall is mathematically identical to
# accuracy (sum of per-class true positives over all samples).
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted'),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted'),
    # ROC AUC must be computed from continuous scores. A plain
    # make_scorer(roc_auc_score, ...) feeds it the hard labels returned by
    # predict(), which collapses the curve to a single operating point.
    # The predefined 'roc_auc' scorer uses decision_function / predict_proba
    # instead; the target here is binary, so no multi-class options apply.
    'roc_auc': 'roc_auc',
}

def cross_validate_pipeline(pipeline):
    """Cross-validate ``pipeline`` on the notebook-level ``X``, ``y`` and ``groups``.

    Uses the module-level ``cv`` splitter and ``scoring`` dictionary; train
    scores are not computed.

    Parameters
    ----------
    pipeline : estimator
        Estimator or pipeline handed to ``cross_validate``.

    Returns
    -------
    results : dict
        Raw output of ``cross_validate`` (one array per key).
    avg_results : dict
        Mean of every array in ``results``, under the same keys. Every key
        returned by ``cross_validate`` is averaged, not only the metrics.
    """
    results = cross_validate(
        pipeline,
        X,
        y,
        groups=groups,
        cv=cv,
        scoring=scoring,
        return_train_score=False,
    )

    avg_results = {}
    for metric, values in results.items():
        avg_results[metric] = np.mean(values)

    return results, avg_results

In [None]:
# Random forest: standardise the features, then fit 400 trees with a fixed
# seed so repeated runs give the same forest.
rf_pipeline = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('rf', RandomForestClassifier(n_estimators=400, random_state=42)),
    ]
)

# Per-fold results followed by their means.
rf_results, rf_avg_results = cross_validate_pipeline(rf_pipeline)
rf_results, rf_avg_results

In [None]:
# RBF-kernel SVM on standardised features (C=1, gamma=0.001).
# probability=True makes SVC fit an additional calibration step so that
# predict_proba is available; random_state fixes that step's shuffling.
svm_pipeline = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('svm', SVC(kernel='rbf', C=1, gamma=0.001, probability=True, random_state=42)),
    ]
)

# Per-fold results followed by their means.
svm_results, svm_avg_results = cross_validate_pipeline(svm_pipeline)
svm_results, svm_avg_results

In [None]:
# k-nearest neighbours (k=10) on standardised features; scaling matters
# here because the classifier is distance-based.
knn_pipeline = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('knn', KNeighborsClassifier(n_neighbors=10)),
    ]
)

# Per-fold results followed by their means.
knn_results, knn_avg_results = cross_validate_pipeline(knn_pipeline)
knn_results, knn_avg_results