In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.metrics import confusion_matrix

from sklearn.neighbors import KNeighborsClassifier


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import optuna

In [2]:
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fp) if (tn + fp) > 0 else 0

def evaluate_model_on_folds(model, folds=5, prefix="data/fold_", scaler="MinMaxScaler",val=0,disp=0):
    metrics = {
        'acc': [],
        'f1': [],
        'precision': [],
        'recall': [],
        'specificity': [],
        'mcc': [],
        'nMCC': [],
        'fc': []
    }

    for i in range(folds):
        train = pd.read_csv(f"{prefix}{i+1}_train.csv")
        test = pd.read_csv(f"{prefix}{i+1}_test.csv")

        # Binarization
        train['arrytmia'] = train['arrytmia'].apply(lambda x: 1 if x > 1 else x)
        test['arrytmia'] = test['arrytmia'].apply(lambda x: 1 if x > 1 else x)

        if scaler == "MinMaxScaler":
            scaler_obj = MinMaxScaler()
        else:
            test['arrytmia'] = test['arrytmia'].apply(lambda x: -1 if x == 0 else x)
            train['arrytmia'] = train['arrytmia'].apply(lambda x: -1 if x == 0 else x)
            scaler_obj = StandardScaler()


        X_train = train.iloc[:, 1:]
        y_train = train['arrytmia'].values
        if val==0:
            X_test = test.iloc[:, 1:]
            y_test = test['arrytmia'].values
        if val==1:
            X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.2, random_state=42)
            X_test=X_val
            y_test=y_val

        X_train = pd.DataFrame(scaler_obj.fit_transform(X_train), columns=X_train.columns)
        X_test = pd.DataFrame(scaler_obj.transform(X_test), columns=X_test.columns)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()

        acc = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0

        mcc = matthews_corrcoef(y_test, y_pred)
        nMCC = (mcc + 1) / 2

        fc = 0.5 * f1 + 0.3 * specificity + 0.2 * nMCC

        metrics['acc'].append(acc)
        metrics['f1'].append(f1)
        metrics['precision'].append(precision)
        metrics['recall'].append(recall)
        metrics['specificity'].append(specificity)
        metrics['mcc'].append(mcc)
        metrics['nMCC'].append(nMCC)
        metrics['fc'].append(fc)

        if disp:
            def mean_std_str(metric_list):
                mean = np.mean(metric_list)
                std = np.std(metric_list)
                return f"{mean:.2f} $\pm$ {std:.2f}"
            print(f"Accuracy:     {mean_std_str(metrics['acc'])}")
            print(f"Specificity:  {mean_std_str(metrics['specificity'])}")
            print(f"F1-score:     {mean_std_str(metrics['f1'])}")
            # print(f"Precision:    {mean_std_str(metrics['precision'])}")
            # print(f"Sensitivity:  {mean_std_str(metrics['recall'])}")
            # print(f"MCC:          {mean_std_str(metrics['mcc'])}")
            print(f"nMCC:         {mean_std_str(metrics['nMCC'])}")
            print(f"Funkcja celu: {mean_std_str(metrics['fc'])}")

        return fc


    

## KNN

In [4]:
n_trials=30

In [None]:
def objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 30, 300) # Liczba sąsiadów
    p = trial.suggest_int('p', 1, 5)           # Liczba sąsiadów
    
    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        p=p,
    )

    score=evaluate_model_on_folds(model,val=1)
    return round(score,4) 

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

best_params = study.best_params
print("Najlepsze parametry:", best_params)

[I 2025-08-06 18:47:32,808] A new study created in memory with name: no-name-bb69b782-9bf1-47be-9606-2363862332b1
[I 2025-08-06 18:47:33,845] Trial 0 finished with value: 0.864 and parameters: {'n_neighbors': 33, 'p': 2}. Best is trial 0 with value: 0.864.
[I 2025-08-06 18:47:59,412] Trial 1 finished with value: 0.7838 and parameters: {'n_neighbors': 298, 'p': 5}. Best is trial 0 with value: 0.864.
[I 2025-08-06 18:48:00,892] Trial 2 finished with value: 0.79 and parameters: {'n_neighbors': 226, 'p': 1}. Best is trial 0 with value: 0.864.
[I 2025-08-06 18:48:26,465] Trial 3 finished with value: 0.8042 and parameters: {'n_neighbors': 195, 'p': 5}. Best is trial 0 with value: 0.864.
[I 2025-08-06 18:48:51,912] Trial 4 finished with value: 0.8198 and parameters: {'n_neighbors': 137, 'p': 5}. Best is trial 0 with value: 0.864.
[I 2025-08-06 18:49:17,848] Trial 5 finished with value: 0.7905 and parameters: {'n_neighbors': 251, 'p': 4}. Best is trial 0 with value: 0.864.
[I 2025-08-06 18:49:

Najlepsze parametry: {'n_neighbors': 22, 'p': 4}


In [6]:
best_model = KNeighborsClassifier(**best_params)
score=evaluate_model_on_folds(best_model,disp=1)

Accuracy:     0.83 $\pm$ 0.00
Specificity:  0.73 $\pm$ 0.00
F1-score:     0.86 $\pm$ 0.00
nMCC:         0.82 $\pm$ 0.00
Funkcja celu: 0.82 $\pm$ 0.00


## DecisionTree

In [None]:
def objective(trial):
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy']) 
    max_depth = trial.suggest_int('max_depth', 3, 20) 
    min_samples_split = trial.suggest_int('min_samples_split', 1, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    model = DecisionTreeClassifier(
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )


    score=evaluate_model_on_folds(model)
    return round(score,4)  

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

best_params = study.best_params
print("Najlepsze parametry:", best_params)

[I 2025-04-13 22:43:16,247] A new study created in memory with name: no-name-65ba793c-fa6f-4d3f-967a-272cc1abee2e
[I 2025-04-13 22:43:17,580] Trial 0 finished with value: 0.939 and parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 4, 'min_samples_split': 13, 'min_samples_leaf': 14, 'max_features': 'log2'}. Best is trial 0 with value: 0.939.
[I 2025-04-13 22:43:18,617] Trial 1 finished with value: 0.943 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 5, 'min_samples_split': 12, 'min_samples_leaf': 19, 'max_features': None}. Best is trial 1 with value: 0.943.
[I 2025-04-13 22:43:25,109] Trial 2 finished with value: 0.9666 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 12, 'min_samples_split': 17, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 2 with value: 0.9666.
[I 2025-04-13 22:43:26,046] Trial 3 finished with value: 0.8862 and parameters: {'criterion': 'log_loss', 'splitter': 'random', 'max_depth': 6, '

Najlepsze parametry: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 20, 'min_samples_split': 18, 'min_samples_leaf': 10, 'max_features': None}


In [None]:
best_model = DecisionTreeClassifier(**best_params, random_state=42)
score=evaluate_model_on_folds(best_model,disp=1)

F1:           0.9674
Accuracy:     0.9513
Precision:    0.9674
Sensitivity:  0.9674
Specificity:  0.9034
TP:           46810
FP:           1578
TN:           14762
FN:           1577
         Pred 1    Pred 0
True 1    46810     1577
True 0    1578     14762


## RandomForestClassifier

In [None]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 1, 200) # liczba drzew w lesie
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    max_depth = trial.suggest_int('max_depth', 5, 50) # Maksymalna głebokość drzewa
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20) # Minimalna liczba próbek w węźle wymagana do podzielenia
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20) # minimalna liczba próbek w liściu
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None]) # Liczba cech rozważanych podczas rodzielania węzła
    
    model = RandomForestClassifier(
        criterion=criterion,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    
    score=evaluate_model_on_folds(model)
    return round(score,4)   

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

best_params = study.best_params
print("Najlepsze parametry:", best_params)


[I 2025-04-13 22:44:34,014] A new study created in memory with name: no-name-2241ecb4-c471-4a91-9b52-d6bc72d24f3b
[I 2025-04-13 22:45:10,184] Trial 0 finished with value: 0.9772 and parameters: {'n_estimators': 59, 'criterion': 'entropy', 'max_depth': 33, 'min_samples_split': 12, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 0.9772.
[I 2025-04-13 22:46:11,677] Trial 1 finished with value: 0.9787 and parameters: {'n_estimators': 80, 'criterion': 'entropy', 'max_depth': 48, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9787.
[I 2025-04-13 22:48:29,540] Trial 2 finished with value: 0.9777 and parameters: {'n_estimators': 188, 'criterion': 'gini', 'max_depth': 24, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.9787.
[I 2025-04-13 22:50:52,282] Trial 3 finished with value: 0.9771 and parameters: {'n_estimators': 194, 'criterion': 'log_loss', 'max_dept

Najlepsze parametry: {'n_estimators': 80, 'criterion': 'entropy', 'max_depth': 48, 'min_samples_split': 9, 'min_samples_leaf': 1, 'max_features': 'sqrt'}


In [None]:
best_model = RandomForestClassifier(**best_params, random_state=42)
score=evaluate_model_on_folds(best_model,disp=1)

F1:           0.9787
Accuracy:     0.9679
Precision:    0.9717
Sensitivity:  0.9858
Specificity:  0.9150
TP:           47700
FP:           1389
TN:           14951
FN:           687
         Pred 1    Pred 0
True 1    47700      687
True 0    1389     14951


## SVM

In [None]:
def objective(trial):
    C = trial.suggest_float('C', 1e-5, 1e5,log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    
    
    if kernel == "linear":
        model = SVC(C=C, kernel=kernel, random_state=42)
    
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    
    if kernel == 'poly':
        degree = trial.suggest_int('degree', 2, 5)
        coef0 = trial.suggest_float ('coef0', -1, 1)
        model = SVC(C=C, kernel=kernel, gamma=gamma, degree=degree, coef0=coef0, random_state=42)
    if kernel == 'sigmoid':
        coef0 = trial.suggest_float ('coef0', -1, 1)
        model = SVC(C=C, kernel=kernel, gamma=gamma, coef0=coef0, random_state=42)
    else:
        model = SVC(C=C, kernel=kernel, gamma=gamma, random_state=42)
    
    
    score=evaluate_model_on_folds(model)
    return round(score,4)  

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=n_trials)

best_params = study.best_params
print("Najlepsze parametry:", best_params)


[I 2025-04-13 23:14:53,212] A new study created in memory with name: no-name-8cfcef7d-c57d-468b-95d0-8e244fb9eccb
[I 2025-04-13 23:18:29,592] Trial 0 finished with value: 0.8555 and parameters: {'C': 1.2770386725976239e-05, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 0.8555.
[I 2025-04-13 23:20:11,291] Trial 1 finished with value: 0.9675 and parameters: {'C': 135.884689711956, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 1 with value: 0.9675.
[I 2025-04-13 23:24:06,309] Trial 2 finished with value: 0.9244 and parameters: {'C': 0.011970328441774924, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 1 with value: 0.9675.
[I 2025-04-13 23:30:55,127] Trial 3 finished with value: 0.7108 and parameters: {'C': 0.04141499356569666, 'kernel': 'sigmoid', 'gamma': 'scale'}. Best is trial 1 with value: 0.9675.
[I 2025-04-13 23:31:49,844] Trial 4 finished with value: 0.9628 and parameters: {'C': 0.947261534801931, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 1

Najlepsze parametry: {'C': 17431.85057616861, 'kernel': 'poly', 'gamma': 'auto', 'degree': 3, 'coef0': 0.3860570320218555}


In [None]:
best_model = SVC(**best_params, random_state=42)
score=evaluate_model_on_folds(best_model,disp=1)

F1:           0.9822
Accuracy:     0.9734
Precision:    0.9796
Sensitivity:  0.9849
Specificity:  0.9392
TP:           47656
FP:           993
TN:           15347
FN:           731
         Pred 1    Pred 0
True 1    47656      731
True 0     993     15347
