In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("cherngs/heart-disease-cleveland-uci")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/cherngs/heart-disease-cleveland-uci?dataset_version_number=1...


100%|██████████| 3.33k/3.33k [00:00<00:00, 2.83MB/s]

Extracting files...
Path to dataset files: C:\Users\OLUSOLADE EMMANUEL\.cache\kagglehub\datasets\cherngs\heart-disease-cleveland-uci\versions\1





In [53]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, roc_auc_score, classification_report, confusion_matrix

In [14]:
os.listdir(path)

['heart_cleveland_upload.csv']

In [15]:
heart_disease = pd.read_csv(os.path.join(path, 'heart_cleveland_upload.csv'))

In [16]:
heart_disease.head(15)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0
5,64,1,0,170,227,0,2,155,0,0.6,1,0,2,0
6,63,1,0,145,233,1,2,150,0,2.3,2,0,1,0
7,61,1,0,134,234,0,0,145,0,2.6,1,2,0,1
8,60,0,0,150,240,0,0,171,0,0.9,0,0,0,0
9,59,1,0,178,270,0,2,145,0,4.2,2,0,2,0


In [20]:
heart_disease['condition'].value_counts()

condition
0    160
1    137
Name: count, dtype: int64

In [17]:
heart_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        297 non-null    int64  
 1   sex        297 non-null    int64  
 2   cp         297 non-null    int64  
 3   trestbps   297 non-null    int64  
 4   chol       297 non-null    int64  
 5   fbs        297 non-null    int64  
 6   restecg    297 non-null    int64  
 7   thalach    297 non-null    int64  
 8   exang      297 non-null    int64  
 9   oldpeak    297 non-null    float64
 10  slope      297 non-null    int64  
 11  ca         297 non-null    int64  
 12  thal       297 non-null    int64  
 13  condition  297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [19]:
X = heart_disease.drop('condition', axis=1)
y = heart_disease['condition']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [24]:
#create models
log_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('log', LogisticRegression(max_iter=1000, random_state=42))
])

forest = RandomForestClassifier(n_estimators=100, random_state=42)

lgbm = LGBMClassifier(random_state=42)

In [27]:
#conbine models to loop through easily
models = {
    'LogisticRegression': log_reg,
    'RandomForest': forest,
    'LightGBM': lgbm
}

In [33]:
#define evaluation metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'recall': make_scorer(recall_score),
    'roc_auc': make_scorer(roc_auc_score)
}

In [39]:
#perform cross_validation for each model
result = {}
for name, model in models.items():
    cv_res = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False)
    #summarize result
    summary = {metric: (cv_res[f'test_{metric}'].mean(), cv_res[f'test_{metric}'].std()) for metric in scoring}
    result[name] = summary

[LightGBM] [Info] Number of positive: 72, number of negative: 86
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000117 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 158, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.455696 -> initscore=-0.177681
[LightGBM] [Info] Start training from score -0.177681
[LightGBM] [Info] Number of positive: 73, number of negative: 85
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 158, number of used features: 13
[LightGBM] [Info] [binary:BoostFromS

In [40]:
#display tresult
for name, summary in result.items():
    print(f'\n=== {name} ===')
    for metric, (mean_, std_) in summary.items():
        print(f'{metric:7s} : {mean_:.4f} ~ {std_:.4f}')


=== LogisticRegression ===
accuracy : 0.8608 ~ 0.0207
f1      : 0.8451 ~ 0.0230
recall  : 0.8261 ~ 0.0320
roc_auc : 0.8583 ~ 0.0211

=== RandomForest ===
accuracy : 0.8481 ~ 0.0310
f1      : 0.8290 ~ 0.0330
recall  : 0.7985 ~ 0.0317
roc_auc : 0.8445 ~ 0.0306

=== LightGBM ===
accuracy : 0.8017 ~ 0.0239
f1      : 0.7832 ~ 0.0275
recall  : 0.7795 ~ 0.0249
roc_auc : 0.8001 ~ 0.0240


In [55]:
log = LogisticRegression(max_iter=1000, class_weight='balanced')

In [56]:
scaler = StandardScaler()

In [57]:
X_train_scaled = scaler.fit_transform(X_train)

In [58]:
X_test_scaled = scaler.fit_transform(X_test)

In [59]:
log.fit(X_train_scaled, y_train)

In [60]:
pred = log.predict(X_test_scaled)

In [61]:
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

              precision    recall  f1-score   support

           0       0.77      0.72      0.74        32
           1       0.70      0.75      0.72        28

    accuracy                           0.73        60
   macro avg       0.73      0.73      0.73        60
weighted avg       0.74      0.73      0.73        60

[[23  9]
 [ 7 21]]


In [62]:
lgbm = LGBMClassifier()

In [65]:
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 109, number of negative: 128
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 235
[LightGBM] [Info] Number of data points in the train set: 237, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.459916 -> initscore=-0.160682
[LightGBM] [Info] Start training from score -0.160682


In [66]:
pred2 = lgbm.predict(X_test)

In [67]:
print(classification_report(y_test, pred2))
print(confusion_matrix(y_test, pred2))

              precision    recall  f1-score   support

           0       0.71      0.69      0.70        32
           1       0.66      0.68      0.67        28

    accuracy                           0.68        60
   macro avg       0.68      0.68      0.68        60
weighted avg       0.68      0.68      0.68        60

[[22 10]
 [ 9 19]]


In [70]:
from sklearn.model_selection import GridSearchCV

In [75]:
params = [
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10]
    },
    {
        'solver': ['lbfgs', 'newton-cg', 'sag'],
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10]
    },
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'C': [0.01, 0.1, 1, 10],
        'l1_ratio': [0.1, 0.5, 0.9]  # only used for elasticnet
    }
]

In [76]:
grid = GridSearchCV(LogisticRegression(max_iter=1000), params, cv=3, scoring='accuracy')

In [77]:
grid.fit(X_train_scaled, y_train)



In [78]:
grid.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}

In [79]:
grid_pred = grid.predict(X_test_scaled)

In [80]:
print(classification_report(y_test, grid_pred))
print(confusion_matrix(y_test, grid_pred))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77        32
           1       0.74      0.71      0.73        28

    accuracy                           0.75        60
   macro avg       0.75      0.75      0.75        60
weighted avg       0.75      0.75      0.75        60

[[25  7]
 [ 8 20]]


In [81]:
grid_pred

array([1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1])

In [82]:
import joblib

In [94]:
joblib.dump(grid, 'grid.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [89]:
pred_table = pd.DataFrame({'Y_Test':y_test, 'Prediction': grid_pred})

In [90]:
pred_table

Unnamed: 0,Y_Test,Prediction
167,1,1
211,1,1
63,0,0
154,0,0
5,0,0
77,0,0
183,1,1
158,1,1
9,0,1
139,0,0


In [92]:
X.iloc[24]

age          71.0
sex           0.0
cp            1.0
trestbps    160.0
chol        302.0
fbs           0.0
restecg       0.0
thalach     162.0
exang         0.0
oldpeak       0.4
slope         0.0
ca            2.0
thal          0.0
Name: 24, dtype: float64

In [95]:
heart_disease.iloc[24]

age           71.0
sex            0.0
cp             1.0
trestbps     160.0
chol         302.0
fbs            0.0
restecg        0.0
thalach      162.0
exang          0.0
oldpeak        0.4
slope          0.0
ca             2.0
thal           0.0
condition      0.0
Name: 24, dtype: float64