# Opis
Celem jest stworzenie modelu, który na podstawie danych medycznych pacjentów, może stwierdzić czy jest ryzyko choroby serca. Do osiągnięcia tego celu wykorzystam dataset "Heart Failure Prediction Dataset", który można znaleźć na Kaggle (link: https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)

W datasecie znajdują się następujące atrybuty:
1. Wiek
2. Płeć
3. Typ bólu w klatce piersiowej
4. Spoczynkowe ciśnienie krwi
5. Cholesterol
6. Poziom cukru we krwi na czczo
7. EKG spoczynkowe
8. Maksymalne tętno
9. Dławica wywołana wysiłkiem fizycznym
10. Oldpeak
11. Nachylenie szczytowego odcinka ST podczas ćwiczenia
12. Choroba Serca

In [99]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline

## Wczytanie danych

In [100]:
df = pd.read_csv('./heart.csv')

In [101]:
df.head(10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
5,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0
6,45,F,ATA,130,237,0,Normal,170,N,0.0,Up,0
7,54,M,ATA,110,208,0,Normal,142,N,0.0,Up,0
8,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1
9,48,F,ATA,120,284,0,Normal,120,N,0.0,Up,0


## Oczyszczanie danych

In [102]:
df.isnull().any() # Sprawdzam czy są puste komórki w tabeli

Age               False
Sex               False
ChestPainType     False
RestingBP         False
Cholesterol       False
FastingBS         False
RestingECG        False
MaxHR             False
ExerciseAngina    False
Oldpeak           False
ST_Slope          False
HeartDisease      False
dtype: bool

In [103]:
# jako że nie ma brakujących danych tutaj jest przykład w przypadku gdyby
# w kolumnie 'MaxHR' był mały procent brakujących komórek
df = df.dropna(subset=['MaxHR'])

In [104]:
# zamień kolumny kategoryczne na kolumny typu string
string_col = df.select_dtypes(include="object").columns
df[string_col] = df[string_col].astype("string")

In [105]:
string_col = df.select_dtypes("string").columns.to_list()
num_col = df.columns.to_list()
for col in string_col:
    num_col.remove(col)
num_col.remove("HeartDisease")

In [106]:
print(df.dtypes)

Age                        int64
Sex               string[python]
ChestPainType     string[python]
RestingBP                  int64
Cholesterol                int64
FastingBS                  int64
RestingECG        string[python]
MaxHR                      int64
ExerciseAngina    string[python]
Oldpeak                  float64
ST_Slope          string[python]
HeartDisease               int64
dtype: object


## Dostosowanie danych

In [107]:
df = pd.get_dummies(df, columns=string_col, drop_first=True)

In [108]:
scaler = StandardScaler()
df[num_col] = scaler.fit_transform(df[num_col])

In [109]:
df.head(10)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,0.410909,0.82507,-0.551341,1.382928,-0.832432,0,True,True,False,False,True,False,False,False,True
1,-0.478484,1.491752,-0.171961,-0.551341,0.754157,0.105664,1,False,False,True,False,True,False,False,True,False
2,-1.751359,-0.129513,0.770188,-0.551341,-1.525138,-0.832432,0,True,True,False,False,False,True,False,False,True
3,-0.584556,0.302825,0.13904,-0.551341,-1.132156,0.574711,1,False,False,False,False,True,False,True,True,False
4,0.051881,0.951331,-0.034755,-0.551341,-0.581981,-0.832432,0,True,False,True,False,True,False,False,False,True
5,-1.539213,-0.669935,1.282424,-0.551341,1.304332,-0.832432,0,True,False,True,False,True,False,False,False,True
6,-0.902775,-0.129513,0.349422,-0.551341,1.304332,-0.832432,0,False,True,False,False,True,False,False,False,True
7,0.051881,-1.210356,0.084157,-0.551341,0.203982,-0.832432,0,True,True,False,False,True,False,False,False,True
8,-1.751359,0.410909,0.07501,-0.551341,-0.267596,0.574711,1,True,False,False,False,True,False,True,True,False
9,-0.584556,-0.669935,0.779335,-0.551341,-0.660578,-0.832432,0,False,True,False,False,True,False,False,False,True


## Podział na dane do treningu i do testowania

In [110]:
X = df.drop(columns=["HeartDisease"])
y = df["HeartDisease"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Wybór i trening modelu



In [111]:
# Wybieramy kilka modeli dla porównania
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}

## Wyszukiwanie najlepszych hiperparametrów

In [112]:
# testowanie kilku hiperparametrów dla danego modelu oraz dobranie najlepszej kombinacji
param_grids = {
    "Logistic Regression": {"C": [0.1, 1, 10, 100]},
    "Random Forest": {"n_estimators": [10, 50, 100, 200], "criterion": ["gini", "entropy"]},
    "XGBoost": {"learning_rate": [0.01, 0.1, 0.2, 0.3], "max_depth": [3, 5, 7, 9], "n_estimators": [50, 100, 200]}
}
# Stworzenie i dopasowanie potoku dla Logistic Regression
pipeline_lr = make_pipeline(StandardScaler(), LogisticRegression())
param_grid_lr = {
    'logisticregression__C': [0.1, 1, 10, 100]
}
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=StratifiedKFold(n_splits=5), scoring='roc_auc', n_jobs=-1)
grid_search_lr.fit(X_train, y_train)
best_model_lr = grid_search_lr.best_estimator_
print(f"Najlepsze parametry dla: Logistic Regression: {grid_search_lr.best_params_}")

best_models = {}
for model_name, model in models.items():
    param_grid = param_grids.get(model_name, {})
    grid_search = GridSearchCV(model, param_grid, cv=StratifiedKFold(n_splits=5), scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Najlepsze parametry dla: {model_name}: {grid_search.best_params_}")

Najlepsze parametry dla: Logistic Regression: {'logisticregression__C': 0.1}
Najlepsze parametry dla: Logistic Regression: {'C': 10}
Najlepsze parametry dla: Random Forest: {'criterion': 'entropy', 'n_estimators': 50}
Najlepsze parametry dla: XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}


## Sprawdzenie modeli

In [113]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")
    print(f"Balanced_accuracy: {balanced_accuracy}")
    print(f"Precision: {precision}")
    print(classification_report(y_test, y_pred))
    return accuracy, precision

results = {}
for model_name, model in best_models.items():
    if model_name == "Logistic Regression":
      continue
    print(f"\nEvaluating {model_name}")
    results[model_name] = evaluate_model(model, X_test, y_test)


Evaluating Random Forest
Accuracy: 0.8858695652173914
Balanced_accuracy: 0.8827116212338594
Precision: 0.8857142857142857
              precision    recall  f1-score   support

           0       0.89      0.85      0.87        82
           1       0.89      0.91      0.90       102

    accuracy                           0.89       184
   macro avg       0.89      0.88      0.88       184
weighted avg       0.89      0.89      0.89       184


Evaluating XGBoost
Accuracy: 0.8967391304347826
Balanced_accuracy: 0.8925155428024869
Precision: 0.8878504672897196
              precision    recall  f1-score   support

           0       0.91      0.85      0.88        82
           1       0.89      0.93      0.91       102

    accuracy                           0.90       184
   macro avg       0.90      0.89      0.89       184
weighted avg       0.90      0.90      0.90       184

