In [None]:
# =========================
# 1. IMPORTS
# =========================
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score



In [None]:
# =========================
# 2. LOAD DATA
# =========================
# Read the spreadsheet into a DataFrame; the path is relative, so it is
# resolved against the notebook's working directory.
data = pd.read_excel(io="heart_disease_data.xlsx")

# Bare last expression -> rich table preview of the loaded frame
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restceg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0,7,1
299,68,1,4,144,193,1,0,141,0,3.4,2,2,7,2
300,57,1,4,130,131,0,0,115,1,1.2,2,1,7,3
301,57,0,2,130,236,0,2,174,0,0.0,2,1,3,1


In [None]:
# Inspect column dtypes and non-null counts; any column reported as `object`
# holds non-numeric entries that must be cleaned before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restceg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(1), int64(11), object(2)
memory usage: 33.3+ KB


In [None]:
# Distinct raw values of `ca`, to see which non-numeric entries it contains
pd.unique(data['ca'])

array([0, 3, 2, 1, '?'], dtype=object)

In [None]:
# Distinct raw values of `thal`, to see which non-numeric entries it contains
data.loc[:, 'thal'].unique()

array([6, 3, 7, '?'], dtype=object)

In [1]:
# =========================
# 3. FIX DIRTY VALUES (CRITICAL)
# =========================
# `ca` and `thal` are loaded as object columns because missing entries are
# stored as the string '?'. Converting just these two columns with
# errors='coerce' turns every non-numeric entry (the '?' markers) into NaN and
# leaves proper float columns behind.
#
# This single conversion replaces the earlier frame-wide
# replace('?') + infer_objects(copy=False) pass: it produces the same result,
# touches only the affected columns, and no longer relies on the `copy=`
# keyword that newer pandas releases deprecate.
for col in ['ca', 'thal']:
    data[col] = pd.to_numeric(data[col], errors='coerce')


NameError: name 'data' is not defined

In [None]:
# Binary target: 1 when `num` is greater than 0, otherwise 0
data['disease'] = data['num'].gt(0).astype('int')

In [None]:
# =========================
# 3. STRATIFIED SPLIT
# =========================
# A single stratified shuffle split holding out 20% of the rows for testing;
# stratifying on `disease` keeps the class ratio the same in both subsets and
# random_state pins the shuffle so the split is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while `data` still has its default
# 0..n-1 index and would silently pick wrong rows (or fail) after any
# filtering or re-indexing.
train_idx, test_idx = next(split.split(data, data['disease']))
train = data.iloc[train_idx]
test = data.iloc[test_idx]

# Features exclude both the original `num` label and the derived binary target
X_train = train.drop(columns=['num', 'disease'])
y_train = train['disease']

X_test = test.drop(columns=['num', 'disease'])
y_test = test['disease']


In [None]:
# =========================
# 4. FEATURE GROUPS
# =========================
# Columns that are standardised as continuous values
numeric_features = [
    'age',
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
]

# Columns that are imputed and one-hot encoded as categories
# (names must match the DataFrame columns exactly, including `restceg`)
categorical_features = [
    'sex',
    'cp',
    'fbs',
    'restceg',
    'exang',
    'slope',
    'ca',
    'thal',
]


In [None]:
# =========================
# 5. PREPROCESSOR
# =========================
# Numeric columns: standardise only (this branch has no imputation step).
numeric_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode with the first level of every feature dropped.
# NOTE(review): with drop='first' combined with handle_unknown='ignore', a
# category unseen during fit is encoded as all zeros, i.e. the same as the
# dropped level -- confirm that is acceptable at inference time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Route each feature group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
])


In [None]:
# =========================
# 9. FINAL MODEL PIPELINE
# =========================
# Preprocessing and classifier chained in one estimator, so the exact same
# transformations are applied when fitting and when predicting.
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', max_iter=1000)),
])


# =========================
# 10. TRAIN
# =========================
# Fit the preprocessing steps and the classifier on the training split only
model.fit(X_train, y_train)
 

In [None]:
# =========================
# 11. EVALUATE
# =========================
# Hard class predictions for the held-out rows
y_pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Scalar metrics, printed one per line as "<label> <value>"
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred))


Confusion Matrix:
 [[29  4]
 [ 3 25]]
Accuracy: 0.8852459016393442
Precision: 0.8620689655172413
Recall: 0.8928571428571429
F1 Score: 0.8771929824561403


In [None]:
# Second column of predict_proba: the score for class 1
y_proba = model.predict_proba(X_test)[:, 1]

# F1 is computed from the hard labels, ROC-AUC from the continuous scores
print("F1:", f1_score(y_true=y_test, y_pred=y_pred))
print("ROC-AUC:", roc_auc_score(y_true=y_test, y_score=y_proba))

# Per-class precision / recall / F1 table
print(classification_report(y_true=y_test, y_pred=y_pred))

F1: 0.8771929824561403
ROC-AUC: 0.9653679653679654
              precision    recall  f1-score   support

           0       0.91      0.88      0.89        33
           1       0.86      0.89      0.88        28

    accuracy                           0.89        61
   macro avg       0.88      0.89      0.88        61
weighted avg       0.89      0.89      0.89        61



In [None]:
# Pull the fitted classifier step out of the pipeline by its step name
log_reg = model['classifier']


In [None]:
# Display the fitted coefficient array of the logistic regression
log_reg.coef_

array([[ 1.01423117e-03,  2.50832087e-01,  1.51064576e-01,
        -4.27082000e-01,  3.54453707e-01,  9.15037377e-01,
        -4.35619413e-02, -6.37444809e-01,  9.12135917e-01,
        -5.62677640e-02, -1.19237226e-01,  2.50039121e-01,
         4.15839202e-01,  5.75670996e-01, -2.42658436e-01,
         1.36977239e+00,  1.27732406e+00,  1.06513218e+00,
         9.02007729e-02,  1.20337298e+00]])

In [None]:
# Output column names of the fitted preprocessing step
feature_names = model.named_steps.preprocess.get_feature_names_out()
feature_names

array(['num__age', 'num__trestbps', 'num__chol', 'num__thalach',
       'num__oldpeak', 'cat__sex_1.0', 'cat__cp_2.0', 'cat__cp_3.0',
       'cat__cp_4.0', 'cat__fbs_1.0', 'cat__restceg_1.0',
       'cat__restceg_2.0', 'cat__exang_1.0', 'cat__slope_2.0',
       'cat__slope_3.0', 'cat__ca_1.0', 'cat__ca_2.0', 'cat__ca_3.0',
       'cat__thal_6.0', 'cat__thal_7.0'], dtype=object)

In [None]:
# Pair every encoded feature with its coefficient and list them from the
# largest coefficient down to the smallest.
coef_df = (
    pd.DataFrame({"feature": feature_names})
    .assign(coefficient=log_reg.coef_[0])
    .sort_values("coefficient", ascending=False)
)

coef_df

Unnamed: 0,feature,coefficient
15,cat__ca_1.0,1.369772
16,cat__ca_2.0,1.277324
19,cat__thal_7.0,1.203373
17,cat__ca_3.0,1.065132
5,cat__sex_1.0,0.915037
8,cat__cp_4.0,0.912136
13,cat__slope_2.0,0.575671
12,cat__exang_1.0,0.415839
4,num__oldpeak,0.354454
1,num__trestbps,0.250832


In [None]:
# Class probabilities for the test rows; keep only the column for class 1
class_probabilities = model.predict_proba(X_test)
y_pr = class_probabilities[:, 1]
y_pr

array([0.24012507, 0.66533124, 0.04770177, 0.07898365, 0.35147895,
       0.05606242, 0.10644573, 0.42949879, 0.6932451 , 0.36629023,
       0.93971459, 0.10560643, 0.04482219, 0.76882256, 0.95240225,
       0.94712404, 0.67715935, 0.73646877, 0.63218709, 0.96411003,
       0.68622748, 0.06153482, 0.88094195, 0.15325139, 0.8342186 ,
       0.05348459, 0.11652641, 0.69861727, 0.08640981, 0.95781972,
       0.27125927, 0.89331342, 0.47832312, 0.48327177, 0.17225506,
       0.80748465, 0.18114494, 0.29206787, 0.05495796, 0.81985112,
       0.71354861, 0.96969542, 0.9603898 , 0.89880429, 0.18826508,
       0.06577064, 0.91756897, 0.05238362, 0.9494345 , 0.08564933,
       0.03546227, 0.97914741, 0.45290488, 0.07061313, 0.46520779,
       0.95744319, 0.79699894, 0.71020589, 0.02861673, 0.07769904,
       0.93451848])

In [None]:

# Candidate decision cut-offs, from the default 0.5 downwards
thresholds = [0.5, 0.4, 0.3, 0.2]

# (label, scorer) pairs in the order they are printed for every cut-off
threshold_metrics = (
    ("Recall:", recall_score),
    ("acc:", accuracy_score),
    ("Pre:", precision_score),
    ("f1:", f1_score),
)

for threshold in thresholds:
    # Label a row positive when its class-1 probability reaches the cut-off
    y_pred_t = (y_pr >= threshold).astype(int)
    print(f"\nThreshold: {threshold}")
    print(confusion_matrix(y_test, y_pred_t))
    for label, scorer in threshold_metrics:
        print(label, scorer(y_test, y_pred_t))
    


Threshold: 0.5
[[29  4]
 [ 3 25]]
Recall: 0.8928571428571429
acc: 0.8852459016393442
Pre: 0.8620689655172413
f1: 0.8771929824561403

Threshold: 0.4
[[26  7]
 [ 1 27]]
Recall: 0.9642857142857143
acc: 0.8688524590163934
Pre: 0.7941176470588235
f1: 0.8709677419354839

Threshold: 0.3
[[24  9]
 [ 1 27]]
Recall: 0.9642857142857143
acc: 0.8360655737704918
Pre: 0.75
f1: 0.84375

Threshold: 0.2
[[22 11]
 [ 0 28]]
Recall: 1.0
acc: 0.819672131147541
Pre: 0.717948717948718
f1: 0.835820895522388


In [None]:
# Persist the whole fitted pipeline (preprocessing + classifier) as a single
# artifact, so inference only needs this one file.
# The call is the cell's last expression, so the list of written file names
# is shown as the output.
joblib.dump(model, "heart_disease_pipeline.pkl")


['heart_disease_pipeline.pkl']

In [None]:
# Read the saved pipeline back from disk.
# NOTE: joblib files are pickles -- only load artifacts from a trusted source.
loaded_model = joblib.load(filename="heart_disease_pipeline.pkl")
print("Model loaded successfully")


Model loaded successfully
