In [26]:
%pip install -q pandas scikit-learn numpy

In [27]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score

In [28]:
path_data = "dataset"
path_dataset_download = "dataset/gallstone-1.zip"
path_dataset_file = "dataset/Gallstone_csv.csv"

os.makedirs(path_data, exist_ok=True)
if not os.path.exists(path_dataset_download):
    !curl -L "https://www.kaggle.com/api/v1/datasets/download/yasserhessein/gallstone" -o "{path_dataset_download}"
zipfile.ZipFile(path_dataset_download, 'r').extractall(path_data)

In [29]:
df = pd.read_csv(path_dataset_file)
df.head()

Unnamed: 0,Gallstone Status,Age,Gender,Comorbidity,Coronary Artery Disease (CAD),Hypothyroidism,Hyperlipidemia,Diabetes Mellitus (DM),Height,Weight,...,High Density Lipoprotein (HDL),Triglyceride,Aspartat Aminotransferaz (AST),Alanin Aminotransferaz (ALT),Alkaline Phosphatase (ALP),Creatinine,Glomerular Filtration Rate (GFR),C-Reactive Protein (CRP),Hemoglobin (HGB),Vitamin D
0,0,50,0,0,0,0,0,0,185,92.8,...,40.0,134.0,20.0,22.0,87.0,0.82,112.47,0.0,16.0,33.0
1,0,47,0,1,0,0,0,0,176,94.5,...,43.0,103.0,14.0,13.0,46.0,0.87,107.1,0.0,14.4,25.0
2,0,61,0,0,0,0,0,0,171,91.1,...,43.0,69.0,18.0,14.0,66.0,1.25,65.51,0.0,16.2,30.2
3,0,41,0,0,0,0,0,0,168,67.7,...,59.0,53.0,20.0,12.0,34.0,1.02,94.1,0.0,15.4,35.4
4,0,42,0,0,0,0,0,0,178,89.6,...,30.0,326.0,27.0,54.0,71.0,0.82,112.47,0.0,16.8,40.6


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 39 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Gallstone Status                                319 non-null    int64  
 1   Age                                             319 non-null    int64  
 2   Gender                                          319 non-null    int64  
 3   Comorbidity                                     319 non-null    int64  
 4   Coronary Artery Disease (CAD)                   319 non-null    int64  
 5   Hypothyroidism                                  319 non-null    int64  
 6   Hyperlipidemia                                  319 non-null    int64  
 7   Diabetes Mellitus (DM)                          319 non-null    int64  
 8   Height                                          319 non-null    int64  
 9   Weight                                     

## Separando rotulo e atributos

In [31]:
X,y = df.drop(columns=["Gallstone Status"]), df["Gallstone Status"]

# Stratified K-Fold Cross Validation


In [32]:
skf = StratifiedKFold(n_splits=5, shuffle=True)

knn = KNeighborsClassifier(n_neighbors=5)

In [67]:
NUM_TEST = 10
metrics = {
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1_score": [],
    "kappa": []
}

for i in range(NUM_TEST):
    print(f"Test Split {i + 1}")

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.1, shuffle=True
    )

    accuracy_cv = cross_val_score(knn, X_train_val, y_train_val, cv=skf, scoring="accuracy")
    precision_cv = cross_val_score(knn, X_train_val, y_train_val, cv=skf, scoring="precision")
    recall_cv = cross_val_score(knn, X_train_val, y_train_val, cv=skf, scoring="recall")

    y_pred = knn.fit(X_train_val, y_train_val).predict(X_test)

    metrics["accuracy"].append(accuracy_cv.mean())
    metrics["precision"].append(precision_cv.mean())
    metrics["recall"].append(recall_cv.mean())

    metrics["f1_score"].append(f1_score(y_test, y_pred))
    metrics["kappa"].append(cohen_kappa_score(y_test, y_pred))

Test Split 1
Test Split 2
Test Split 3
Test Split 4
Test Split 5
Test Split 6
Test Split 7
Test Split 8
Test Split 9
Test Split 10


In [68]:
for metric_name, scores in metrics.items():
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    print(f"{metric_name.capitalize()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")

Accuracy: Mean = 0.5272, Std = 0.0231
Precision: Mean = 0.5146, Std = 0.0178
Recall: Mean = 0.4669, Std = 0.0342
F1_score: Mean = 0.5220, Std = 0.0735
Kappa: Mean = 0.0696, Std = 0.1545
