In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("cherngs/heart-disease-cleveland-uci")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/cherngs/heart-disease-cleveland-uci?dataset_version_number=1...


100%|██████████| 3.33k/3.33k [00:00<00:00, 2.83MB/s]

Extracting files...
Path to dataset files: C:\Users\OLUSOLADE EMMANUEL\.cache\kagglehub\datasets\cherngs\heart-disease-cleveland-uci\versions\1





In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, roc_auc_score

In [14]:
os.listdir(path)

['heart_cleveland_upload.csv']

In [15]:
heart_disease = pd.read_csv(os.path.join(path, 'heart_cleveland_upload.csv'))

In [16]:
heart_disease.head(15)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
5,64,1,0,170,227,0,2,155,0,0.6,1,0,2,0
6,63,1,0,145,233,1,2,150,0,2.3,2,0,1,0
7,61,1,0,134,234,0,0,145,0,2.6,1,2,0,1
8,60,0,0,150,240,0,0,171,0,0.9,0,0,0,0
9,59,1,0,178,270,0,2,145,0,4.2,2,0,2,0


In [20]:
heart_disease['condition'].value_counts()

condition
0    160
1    137
Name: count, dtype: int64

In [17]:
heart_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [19]:
X = heart_disease.drop('condition', axis=1)
y = heart_disease['condition']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [24]:
#create models
log_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('log', LogisticRegression(max_iter=1000, random_state=42))
])

forest = RandomForestClassifier(n_estimators=100, random_state=42)

lgbm = LGBMClassifier(random_state=42)

In [27]:
#conbine models to loop through easily
models = {
    'LogisticRegression': log_reg,
    'RandomForest': forest,
    'LightGBM': lgbm
}

In [33]:
#define evaluation metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'recall': make_scorer(recall_score),
    'roc_auc': make_scorer(roc_auc_score)
}

In [34]:
#perform cross_validation for each model
result = {}
for name, model in models.items():
    cv_res = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False)

[LightGBM] [Info] Number of positive: 72, number of negative: 86
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 158, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.455696 -> initscore=-0.177681
[LightGBM] [Info] Start training from score -0.177681
[LightGBM] [Info] Number of positive: 73, number of negative: 85
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 158, number of used features: 13
[LightGBM] [Info] [binary:BoostFromS

In [36]:
#summarize result
summary = {metric: (cv_res[f'test_{metric}'].mean(), cv_res[f'test_{metric}'].std()) for metric in scoring}
result[name] = summary

In [None]:
#display tresult
for name, summary in results.items():
    print(f'\n=== {name} ===')
    for metric, (mean_, std_) in summary.items