# Setup

In [1]:
%pip install -q pandas scikit-learn numpy

Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,LeaveOneOut
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.model_selection import KFold


In [11]:
path_data = "dataset"
path_dataset_download = "dataset/gallstone-1.zip"
path_dataset_file = "dataset/Gallstone_csv.csv"

os.makedirs(path_data, exist_ok=True)
if not os.path.exists(path_dataset_download):
    !curl -L "https://www.kaggle.com/api/v1/datasets/download/yasserhessein/gallstone" -o "{path_dataset_download}"
zipfile.ZipFile(path_dataset_download, 'r').extractall(path_data)

In [12]:
df = pd.read_csv(path_dataset_file)
df.head()

Unnamed: 0,Gallstone Status,Age,Gender,Comorbidity,Coronary Artery Disease (CAD),Hypothyroidism,Hyperlipidemia,Diabetes Mellitus (DM),Height,Weight,...,High Density Lipoprotein (HDL),Triglyceride,Aspartat Aminotransferaz (AST),Alanin Aminotransferaz (ALT),Alkaline Phosphatase (ALP),Creatinine,Glomerular Filtration Rate (GFR),C-Reactive Protein (CRP),Hemoglobin (HGB),Vitamin D
0,0,50,0,0,0,0,0,0,185,92.8,...,40.0,134.0,20.0,22.0,87.0,0.82,112.47,0.0,16.0,33.0
1,0,47,0,1,0,0,0,0,176,94.5,...,43.0,103.0,14.0,13.0,46.0,0.87,107.1,0.0,14.4,25.0
2,0,61,0,0,0,0,0,0,171,91.1,...,43.0,69.0,18.0,14.0,66.0,1.25,65.51,0.0,16.2,30.2
3,0,41,0,0,0,0,0,0,168,67.7,...,59.0,53.0,20.0,12.0,34.0,1.02,94.1,0.0,15.4,35.4
4,0,42,0,0,0,0,0,0,178,89.6,...,30.0,326.0,27.0,54.0,71.0,0.82,112.47,0.0,16.8,40.6


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 39 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Gallstone Status                                319 non-null    int64  
 1   Age                                             319 non-null    int64  
 2   Gender                                          319 non-null    int64  
 3   Comorbidity                                     319 non-null    int64  
 4   Coronary Artery Disease (CAD)                   319 non-null    int64  
 5   Hypothyroidism                                  319 non-null    int64  
 6   Hyperlipidemia                                  319 non-null    int64  
 7   Diabetes Mellitus (DM)                          319 non-null    int64  
 8   Height                                          319 non-null    int64  
 9   Weight                                     

In [7]:
df.describe()

Unnamed: 0,Gallstone Status,Age,Gender,Comorbidity,Coronary Artery Disease (CAD),Hypothyroidism,Hyperlipidemia,Diabetes Mellitus (DM),Height,Weight,...,High Density Lipoprotein (HDL),Triglyceride,Aspartat Aminotransferaz (AST),Alanin Aminotransferaz (ALT),Alkaline Phosphatase (ALP),Creatinine,Glomerular Filtration Rate (GFR),C-Reactive Protein (CRP),Hemoglobin (HGB),Vitamin D
count,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,...,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0,319.0
mean,0.495298,48.068966,0.492163,0.335423,0.037618,0.028213,0.025078,0.134796,167.15674,80.56489,...,49.475549,144.502163,21.684953,26.855799,73.112539,0.800611,100.818903,1.853856,14.418182,21.401411
std,0.500763,12.114558,0.500724,0.51734,0.190568,0.165841,0.156609,0.342042,10.05303,15.709069,...,17.718701,97.904493,16.697605,27.884413,24.181069,0.176433,16.971396,4.989591,1.775815,9.981659
min,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,145.0,42.9,...,25.0,1.39,8.0,3.0,7.0,0.46,10.6,0.0,8.5,3.5
25%,0.0,38.5,0.0,0.0,0.0,0.0,0.0,0.0,159.5,69.6,...,40.0,83.0,15.0,14.25,58.0,0.65,94.17,0.0,13.3,13.25
50%,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,168.0,78.8,...,46.5,119.0,18.0,19.0,71.0,0.79,104.0,0.215,14.4,22.0
75%,1.0,56.0,1.0,1.0,0.0,0.0,0.0,0.0,175.0,91.25,...,56.0,172.0,23.0,30.0,86.0,0.92,110.745,1.615,15.7,28.06
max,1.0,96.0,1.0,3.0,1.0,1.0,1.0,1.0,191.0,143.5,...,273.0,838.0,195.0,372.0,197.0,1.46,132.0,43.4,18.8,53.1


In [51]:
correlation_matrix = df.corr()
correlation_matrix["Gallstone Status"].sort_values(ascending=False)[:5]

Gallstone Status                   1.000000
C-Reactive Protein (CRP)           0.281995
Total Body Fat Ratio (TBFR) (%)    0.225470
Total Fat Content (TFC)            0.170158
Hyperlipidemia                     0.161901
Name: Gallstone Status, dtype: float64

## Separando rotulo e atributos

In [13]:
X,y = df.drop(columns=["Gallstone Status"]), df["Gallstone Status"]

# Stratified K-Fold Cross Validation

In [14]:
scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=5)
pipe = make_pipeline(scaler, knn)

In [None]:
NUM_TEST = 10
metrics = { "accuracy": [], "precision": [], "recall": [], "f1_score": [], "kappa": [] }
scoring = {'accuracy': 'accuracy', 'precision': 'precision', 'recall': 'recall','f1':'f1'}
skf = StratifiedKFold(n_splits=5, shuffle=True)

for i in range(NUM_TEST):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)
    scores_cv = cross_validate(pipe, X_train_val, y_train_val, cv=skf, scoring=scoring)
    pipe.fit(X_train_val, y_train_val)
    y_pred = pipe.predict(X_test)

    metrics["accuracy"].append(scores_cv["test_accuracy"].mean())
    metrics["precision"].append(scores_cv["test_precision"].mean())
    metrics["recall"].append(scores_cv["test_recall"].mean())
    metrics["f1_score"].append(scores_cv["test_f1"].mean())
    metrics["kappa"].append(cohen_kappa_score(y_test, y_pred))

df_metrics = pd.DataFrame(metrics)
df_metrics.describe()

Unnamed: 0,accuracy,precision,recall,f1_score,kappa
count,10.0,10.0,10.0,10.0,10.0
mean,0.51778,0.514392,0.474315,0.490962,0.105685
std,0.015458,0.024278,0.030182,0.026071,0.186423
min,0.49141,0.468986,0.428042,0.446333,-0.142857
25%,0.512129,0.508488,0.459667,0.48127,-0.060441
50%,0.51582,0.512716,0.466626,0.487896,0.149657
75%,0.525045,0.526519,0.490025,0.499277,0.272
max,0.543436,0.55579,0.533333,0.543346,0.323077


# Hold-out

In [55]:
NUM_TEST = 10
metrics = { "accuracy": [], "precision": [], "recall": [], "f1_score": [], "kappa": [] }
scoring = {'accuracy': 'accuracy', 'precision': 'precision', 'recall': 'recall','f1':'f1'}

for i in range(NUM_TEST):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    pipe.fit(X_train_val, y_train_val)
    y_pred = pipe.predict(X_test)

    metrics["accuracy"].append(accuracy_score(y_test, y_pred))
    metrics["precision"].append(precision_score(y_test, y_pred, average='weighted'))
    metrics["recall"].append(recall_score(y_test, y_pred, average='weighted'))
    metrics["f1_score"].append(f1_score(y_test, y_pred, average='weighted'))
    metrics["kappa"].append(cohen_kappa_score(y_test, y_pred))

df_metrics = pd.DataFrame(metrics)
df_metrics.describe()

Unnamed: 0,accuracy,precision,recall,f1_score,kappa
count,10.0,10.0,10.0,10.0,10.0
mean,0.534375,0.537772,0.534375,0.532107,0.069714
std,0.066618,0.069327,0.066618,0.069534,0.132389
min,0.421875,0.425338,0.421875,0.410875,-0.136276
25%,0.492188,0.494019,0.492188,0.490397,-0.013748
50%,0.539062,0.538597,0.539062,0.538228,0.068908
75%,0.5625,0.571721,0.5625,0.563248,0.129444
max,0.65625,0.661397,0.65625,0.65625,0.315175


# Leave-One-Out Cross Validation

In [17]:
scoring = {'accuracy': 'accuracy', 'precision': 'precision', 'recall': 'recall', 'f1': 'f1'}
metrics = {"accuracy": [], "precision": [], "recall": [], "f1_score": [], "kappa": []}
loo = LeaveOneOut()

y_predicts = []
y_trues = []

for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    
    y_predicts.append(y_pred[0])
    y_trues.append(y_test.values[0])
    
metrics["accuracy"].append(accuracy_score(y_trues, y_predicts))
metrics["precision"].append(precision_score(y_trues, y_predicts, average='weighted'))
metrics["recall"].append(recall_score(y_trues, y_predicts, average='weighted'))
metrics["f1_score"].append(f1_score(y_trues, y_predicts, average='weighted'))
metrics["kappa"].append(cohen_kappa_score(y_trues, y_predicts))

df_metrics = pd.DataFrame(metrics)
df_metrics

Unnamed: 0,accuracy,precision,recall,f1_score,kappa
0,0.664577,0.680763,0.664577,0.655949,0.32715


# K-Fold Cross Validation

In [None]:
# o meu só funciona assim divos 
import os
os.environ['LOKY_MAX_CPU_COUNT'] = '4' 

In [19]:

NUM_TEST = 10
metrics = { "accuracy": [], "precision": [], "recall": [], "f1_score": [], "kappa": [] }
scoring = {'accuracy': 'accuracy', 'precision': 'precision', 'recall': 'recall','f1':'f1'}
kf = KFold(n_splits=5, shuffle=True)

for i in range(NUM_TEST):
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)
    scores_cv = cross_validate(pipe, X_train_val, y_train_val, cv=kf, scoring=scoring)
    pipe.fit(X_train_val, y_train_val)
    y_pred = pipe.predict(X_test)

    metrics["accuracy"].append(scores_cv["test_accuracy"].mean())
    metrics["precision"].append(scores_cv["test_precision"].mean())
    metrics["recall"].append(scores_cv["test_recall"].mean())
    metrics["f1_score"].append(scores_cv["test_f1"].mean())
    metrics["kappa"].append(cohen_kappa_score(y_test, y_pred))

df_metrics = pd.DataFrame(metrics)
df_metrics.describe()

Unnamed: 0,accuracy,precision,recall,f1_score,kappa
count,10.0,10.0,10.0,10.0,10.0
mean,0.638645,0.694292,0.483423,0.563765,0.359169
std,0.030543,0.047632,0.034848,0.034636,0.105664
min,0.592196,0.594673,0.426779,0.518006,0.173432
25%,0.611419,0.673659,0.465756,0.528076,0.330208
50%,0.648064,0.697528,0.491189,0.575894,0.357869
75%,0.655883,0.728599,0.511455,0.593682,0.417582
max,0.683061,0.75617,0.522942,0.601201,0.517241
