In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
import kagglehub

import os

path = kagglehub.dataset_download("rabieelkharoua/alzheimers-disease-dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
files = os.listdir(path)
print("Content of", files)

csv_file = files[0]
csv_path = os.path.join(path, csv_file)

# Load DataFrame
df = pd.read_csv(csv_path)
df = df.drop(columns=["DoctorInCharge"])  # Drop useless column


# Display the content of DataFrame
df.head().T

Content of ['alzheimers_disease_data.csv']


Unnamed: 0,0,1,2,3,4
PatientID,4751.0,4752.0,4753.0,4754.0,4755.0
Age,73.0,89.0,73.0,74.0,89.0
Gender,0.0,0.0,0.0,1.0,0.0
Ethnicity,0.0,0.0,3.0,0.0,0.0
EducationLevel,2.0,0.0,1.0,1.0,0.0
BMI,22.927749,26.827681,17.795882,33.800817,20.716974
Smoking,0.0,0.0,0.0,1.0,0.0
AlcoholConsumption,13.297218,4.542524,19.555085,12.209266,18.454356
PhysicalActivity,6.327112,7.619885,7.844988,8.428001,6.310461
DietQuality,1.347214,0.518767,1.826335,7.435604,0.795498


<!-- @format -->

# First evaluation


In [4]:
from sklearn.model_selection import train_test_split
from functions.data_prep import data_preprocessing

X = np.array(df.drop(columns=["Diagnosis"]))
y = np.array(df["Diagnosis"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train_scaled, X_test_scaled = data_preprocessing(X_train, X_test)

<!-- @format -->

## Logisitc Regression


In [10]:
from models.logistic_regression import Logistic_Regression_

clf = Logistic_Regression_()
print(clf.bayesian_opt(X_train_scaled, y_train))

[I 2025-01-28 01:10:47,434] A new study created in memory with name: no-name-11b46623-ce55-4ed6-84f9-917f759daaa7
[I 2025-01-28 01:10:47,459] Trial 0 finished with value: 0.8359329446064139 and parameters: {'solver': 'newton-cg', 'penalty': 'l1', 'C': 1}. Best is trial 0 with value: 0.8359329446064139.
[I 2025-01-28 01:10:47,486] Trial 1 finished with value: 0.8359329446064139 and parameters: {'solver': 'liblinear', 'penalty': None, 'C': 100}. Best is trial 0 with value: 0.8359329446064139.
[I 2025-01-28 01:10:47,510] Trial 2 finished with value: 0.8359329446064139 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'C': 1}. Best is trial 0 with value: 0.8359329446064139.
[I 2025-01-28 01:10:47,536] Trial 3 finished with value: 0.8359329446064139 and parameters: {'solver': 'newton-cg', 'penalty': 'l1', 'C': 1}. Best is trial 0 with value: 0.8359329446064139.
[I 2025-01-28 01:10:47,604] Trial 4 finished with value: 0.8341870635297308 and parameters: {'solver': 'sag', 'penalty': Non

0.8423333785341379


<!-- @format -->

Do zrobienia

- Dodanie opcji sprawdzenia wszystkich wyników


<!-- @format -->


<!-- @format -->

## Suppor Vector Machines


In [11]:
from models.svm import SVM

svm_model = SVM()
print(svm_model.bayesian_opt(X_train_scaled, y_train))

[I 2025-01-27 20:30:03,737] A new study created in memory with name: no-name-c2dfdaa6-6dd8-401d-ade9-25d962c83d20
[I 2025-01-27 20:30:03,847] Trial 0 finished with value: 0.7870855148342059 and parameters: {'C': 0.054954153336145506, 'gamma': 'auto', 'kernel': 'poly', 'degree': 4, 'coef0': 0.5754418268326502}. Best is trial 0 with value: 0.7870855148342059.
[I 2025-01-27 20:30:04,108] Trial 1 finished with value: 0.8417684700407214 and parameters: {'C': 1.6090077237438831, 'gamma': 'scale', 'kernel': 'linear', 'degree': 3, 'coef0': 0.2983939196218889}. Best is trial 1 with value: 0.8417684700407214.
[I 2025-01-27 20:30:06,924] Trial 2 finished with value: 0.8406050029086679 and parameters: {'C': 24.377422429997598, 'gamma': 'auto', 'kernel': 'linear', 'degree': 2, 'coef0': 0.06822811213005275}. Best is trial 1 with value: 0.8417684700407214.
[I 2025-01-27 20:30:06,994] Trial 3 finished with value: 0.837696335078534 and parameters: {'C': 0.017855842857729856, 'gamma': 'scale', 'kernel':

0.8452588714368819


<!-- @format -->

## XGBoost


In [6]:
import optuna
import xgboost as xgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)  # Walidacja 3-fold


def objective(trial):

    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    accuracies = []

    # 🔹 Ręczna walidacja K-Fold
    for train_idx, val_idx in cv.split(X_train_scaled, y_train):
        X_train_fold, X_val_fold = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

        # Tworzenie modelu i trening
        model = xgb.XGBClassifier(eval_metric="auc", **params)
        model.fit(X_train_fold, y_train_fold)

        # Predykcja i ocena
        y_pred = model.predict(X_val_fold)
        acc = accuracy_score(y_val_fold, y_pred)
        accuracies.append(acc)

    return np.mean(accuracies)  # Zwracamy średnią dokładność z walidacji krzyżowej


# 🔹 Optymalizacja
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=-1)

# 🔹 Najlepsze parametry
best_params = study.best_params
print("Najlepsze hiperparametry:", best_params)

# 🔹 Trening modelu na całym zbiorze treningowym
best_model = xgb.XGBClassifier(eval_metric="auc", **best_params)
best_model.fit(X_train_scaled, y_train)

# 🔹 Predykcja i raport klasyfikacji
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

[I 2025-01-28 04:12:20,477] A new study created in memory with name: no-name-5e1bb230-57a6-4ddf-9b7f-c9710902bb26
[I 2025-01-28 04:12:30,607] Trial 2 finished with value: 0.9383362420011635 and parameters: {'max_depth': 10, 'learning_rate': 0.16279238388074493, 'n_estimators': 365, 'subsample': 0.6548286734691469, 'colsample_bytree': 0.7006588161700709, 'gamma': 0.7423821713783588, 'lambda': 0.20178279280588515, 'alpha': 2.4729507525354387e-06, 'min_child_weight': 3}. Best is trial 2 with value: 0.9383362420011635.
[I 2025-01-28 04:12:32,422] Trial 10 finished with value: 0.9360093077370565 and parameters: {'max_depth': 4, 'learning_rate': 0.04587477821429255, 'n_estimators': 314, 'subsample': 0.5201497930418895, 'colsample_bytree': 0.5202532563917619, 'gamma': 1.173750067273844e-08, 'lambda': 2.050765467960528e-06, 'alpha': 0.007696555395693562, 'min_child_weight': 10}. Best is trial 2 with value: 0.9383362420011635.
[I 2025-01-28 04:12:32,539] Trial 5 finished with value: 0.937754508

KeyboardInterrupt: 

In [24]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, recall_score

In [23]:
def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
    }

    model = xgb.XGBClassifier(eval_metric="auc", **params)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy  # Maksymalizujemy dokładność


# Przeprowadzenie optymalizacji
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, n_jobs=-1)

# Najlepsze parametry
best_params = study.best_params
print("Najlepsze hiperparametry:", best_params)

# Trening modelu z najlepszymi parametrami
best_model = xgb.XGBClassifier(eval_metric="auc", **best_params)
best_model.fit(X_train_scaled, y_train)

# Predykcja i raport klasyfikacji
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

[I 2025-01-28 01:23:16,284] A new study created in memory with name: no-name-64e48396-af9c-46df-b0f1-2b5f8abbc060
[I 2025-01-28 01:23:16,744] Trial 0 finished with value: 0.9488372093023256 and parameters: {'max_depth': 3, 'learning_rate': 0.012569794538313708, 'n_estimators': 581, 'subsample': 0.7480536912214921, 'colsample_bytree': 0.9963691504499166, 'gamma': 0.0011726706403192928, 'lambda': 0.09591123114985056, 'alpha': 0.0005903896817027807, 'min_child_weight': 5}. Best is trial 0 with value: 0.9488372093023256.
[I 2025-01-28 01:23:21,779] Trial 1 finished with value: 0.9534883720930233 and parameters: {'max_depth': 10, 'learning_rate': 0.09267977595207047, 'n_estimators': 454, 'subsample': 0.5212669652601266, 'colsample_bytree': 0.5946871406605096, 'gamma': 2.6828112489807745e-06, 'lambda': 1.2207536280771798e-05, 'alpha': 0.005687693093541859, 'min_child_weight': 2}. Best is trial 1 with value: 0.9534883720930233.
[I 2025-01-28 01:23:33,271] Trial 2 finished with value: 0.958139

Najlepsze hiperparametry: {'max_depth': 10, 'learning_rate': 0.03631255132341192, 'n_estimators': 326, 'subsample': 0.9735590026070371, 'colsample_bytree': 0.7702392369999069, 'gamma': 0.0610040588424025, 'lambda': 9.530096299713464e-05, 'alpha': 0.002593056608028743, 'min_child_weight': 2}
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       277
           1       0.95      0.96      0.95       153

    accuracy                           0.97       430
   macro avg       0.96      0.97      0.96       430
weighted avg       0.97      0.97      0.97       430



<!-- @format -->

## Random Forest


In [10]:
from models.random_forest import Random_Forest_

<!-- @format -->

## Decision trees
