In [2]:
import zipfile
import numpy as np
import pandas as pd
import os

from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import (
    accuracy_score, classification_report, recall_score, confusion_matrix,
    roc_auc_score, precision_score, f1_score, roc_curve, auc
)
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostClassifier, Pool

In [5]:
!kaggle datasets download -d mexwell/heart-disease-dataset

Dataset URL: https://www.kaggle.com/datasets/mexwell/heart-disease-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
heart-disease-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
filepath = "heart-disease-dataset.zip"
unzip = zipfile.ZipFile(filepath)
unzip.extractall("data")

In [7]:
data = pd.read_csv("data/heart_statlog_cleveland_hungary_final.csv")
print(data.head())

   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0   
1   49    0                3           160          180                    0   
2   37    1                2           130          283                    0   
3   48    0                4           138          214                    0   
4   54    1                3           150          195                    0   

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target  
0            0             172                0      0.0         1       0  
1            0             156                0      1.0         2       1  
2            1              98                0      0.0         1       0  
3            0             108                1      1.5         2       1  
4            0             122                0      0.0         1       0  


In [8]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  target               1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB
None


In [9]:
print(data["target"].value_counts())

target
1    629
0    561
Name: count, dtype: int64


In [10]:
labels = data["target"]
data = data.drop("target",axis=1)
data.describe()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(data,labels,test_size=0.2,random_state=42)

In [12]:
input_shape=X_train.shape[-1]
print(f"X TRAIN SHAPE: {X_train.shape}")
print(f"X TEST SHAPE: {X_test.shape}")
print(f"Y TRAIN SHAPE: {y_train.shape}")
print(f"Y TEST SHAPE: {y_test.shape}")
print(f"INPUT SHAPE: {input_shape}")


X TRAIN SHAPE: (952, 11)
X TEST SHAPE: (238, 11)
Y TRAIN SHAPE: (952,)
Y TEST SHAPE: (238,)
INPUT SHAPE: 11


In [13]:
categorical_columns = data.select_dtypes(include=['object']).columns.tolist()

In [14]:
cat_model = CatBoostClassifier(verbose=False, random_state=0, scale_pos_weight=3)
cat_model.fit(X_train, y_train, cat_features=categorical_columns, eval_set=(X_test, y_test))

# Predict on test set
y_pred = cat_model.predict(X_test)

# Calculate evaluation metrics
accuracy, recall, roc_auc, precision = [round(metric(y_test, y_pred), 4) for metric in [accuracy_score, recall_score, roc_auc_score, precision_score]]


In [15]:
y_pred = cat_model.predict(X_test)

In [16]:
model_names = ['CatBoost_Model']
result = pd.DataFrame({'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision}, index=model_names)

# Print results
print(result)

                Accuracy  Recall  Roc_Auc  Precision
CatBoost_Model    0.9454  0.9847    0.941     0.9214


In [17]:
model_dir = "model"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

cat_model.save_model("model/cat_heart_model.cbm")

In [18]:
X_train, X_test, y_train, y_test = train_test_split(data,labels,test_size=0.2,random_state=42)
X_train.to_pickle("data/X_train.pkl")
X_test.to_pickle("data/X_test.pkl")
y_train.to_pickle("data/y_train.pkl")
y_test.to_pickle("data/y_test.pkl")

In [19]:
type(X_train)

pandas.core.frame.DataFrame

In [22]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import shap
import pandas as pd
import numpy as np
import streamlit as st
from matplotlib import pyplot as plt
from pyarrow import parquet as pq
from catboost import CatBoostClassifier, Pool
import joblib


def load_x_y(file_path):
    data = joblib.load(file_path)
    data.reset_index(drop=True, inplace=True)
    return data

def load_data():
    data = pd.read_csv(DATA_PATH)
    return data

def load_model():
    model = CatBoostClassifier()
    model.load_model(MODEL_PATH)
    return model

MODEL_PATH = "model/cat_heart_model.cbm"
DATA_PATH = "data/heart_statlog_cleveland_hungary_final.csv"

In [23]:
model = load_model()
data = load_data()

X_train = load_x_y("data/X_train.pkl")
X_test = load_x_y("data/X_test.pkl")
y_train = load_x_y("data/y_train.pkl")
y_test = load_x_y("data/y_test.pkl")


In [24]:
explainer = shap.TreeExplainer(model)
shap_values_cat_train = explainer.shap_values(X_train)

In [25]:
shap_values_cat_train[0]

array([ 0.37484048,  0.32963571,  0.75435997,  0.02100143,  0.50106484,
        0.54923338,  0.90873415, -0.09080419,  0.69458317,  0.89778111,
       -1.46553645])