In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib


In [26]:
df = pd.read_csv("../data/heart_disease.csv")

df["ca"].fillna(df["ca"].mean(), inplace=True)
df["thal"].fillna(df["thal"].mean(), inplace=True)
X = df.drop("num", axis=1)
y = df["num"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

print(type(X_train))  
print(X_train.head())


<class 'pandas.core.frame.DataFrame'>
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
97    60    0   4       150   258    0        2      157      0      2.6   
82    39    1   3       140   321    0        2      182      0      0.0   
167   54    0   2       132   288    1        2      159      1      0.0   
288   56    1   2       130   221    0        2      163      0      0.0   
71    67    1   4       125   254    1        0      163      0      0.2   

     slope   ca  thal  
97       2  2.0   7.0  
82       1  0.0   3.0  
167      1  1.0   3.0  
288      1  0.0   7.0  
71       2  2.0   7.0  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["ca"].fillna(df["ca"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["thal"].fillna(df["thal"].mean(), inplace=True)


In [27]:
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


In [28]:
models = {
    "LogisticRegression": (
        LogisticRegression(max_iter=5000),
        {"model__C": [0.01, 0.1, 1, 10], "model__solver": ["lbfgs", "liblinear"]}
    ),
    "SVM": (
        SVC(probability=True),
        {"model__C": [0.1, 1, 10], "model__kernel": ["linear", "rbf"]}
    ),
    "RandomForest": (
        RandomForestClassifier(),
        {"model__n_estimators": [100, 200], "model__max_depth": [None, 5, 10]}
    ),
    "XGBoost": (
        XGBClassifier(eval_metric="mlogloss", use_label_encoder=False),
        {"model__n_estimators": [100, 200], "model__max_depth": [3, 5, 7]}
    )
}


In [29]:
best_models = {}

for name, (model, params) in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor),
                           ("model", model)])
    grid = GridSearchCV(pipe, param_grid=params, cv=3, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f"\n{name} Best Params: {grid.best_params_}")
    print(f"{name} Accuracy: {acc}")
    print(classification_report(y_test, y_pred))
    
    best_models[name] = (grid.best_estimator_, acc)





LogisticRegression Best Params: {'model__C': 1, 'model__solver': 'liblinear'}
LogisticRegression Accuracy: 0.6229508196721312
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        33
           1       0.38      0.27      0.32        11
           2       0.17      0.14      0.15         7
           3       0.43      0.43      0.43         7
           4       0.00      0.00      0.00         3

    accuracy                           0.62        61
   macro avg       0.35      0.36      0.35        61
weighted avg       0.57      0.62      0.59        61


SVM Best Params: {'model__C': 1, 'model__kernel': 'rbf'}
SVM Accuracy: 0.5409836065573771
              precision    recall  f1-score   support

           0       0.76      0.94      0.84        33
           1       0.17      0.18      0.17        11
           2       0.00      0.00      0.00         7
           3       0.00      0.00      0.00         7
           4       0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



RandomForest Best Params: {'model__max_depth': None, 'model__n_estimators': 100}
RandomForest Accuracy: 0.5245901639344263
              precision    recall  f1-score   support

           0       0.79      0.91      0.85        33
           1       0.00      0.00      0.00        11
           2       0.00      0.00      0.00         7
           3       0.22      0.29      0.25         7
           4       0.00      0.00      0.00         3

    accuracy                           0.52        61
   macro avg       0.20      0.24      0.22        61
weighted avg       0.45      0.52      0.49        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Best Params: {'model__max_depth': 3, 'model__n_estimators': 200}
XGBoost Accuracy: 0.5081967213114754
              precision    recall  f1-score   support

           0       0.82      0.82      0.82        33
           1       0.13      0.18      0.15        11
           2       0.17      0.14      0.15         7
           3       0.17      0.14      0.15         7
           4       0.00      0.00      0.00         3

    accuracy                           0.51        61
   macro avg       0.26      0.26      0.26        61
weighted avg       0.50      0.51      0.51        61



In [35]:
best_model_name = max(best_models, key=lambda x: best_models[x][1])
best_model, best_acc = best_models[best_model_name]

print(f"\nBest Model: {best_model_name} with Accuracy {best_acc}")

# Save the pipeline (preprocessing + model)
joblib.dump(best_model, "../models/final_model.pkl")



Best Model: LogisticRegression with Accuracy 0.6229508196721312


['../models/final_model.pkl']

In [37]:
import os
from sklearn.metrics import classification_report, accuracy_score
# Evaluate model
y_pred = grid.best_estimator_.predict(X_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Save metrics to file
with open("../results/evaluation_metrics.txt", "w") as f:
    f.write("Best Model Parameters:\n")
    f.write(str(grid.best_params_))
    f.write("\n\n")
    f.write(f"Accuracy: {acc:.4f}\n\n")
    f.write("Classification Report:\n")
    f.write(report)

print("Metrics saved in results/evaluation_metrics.txt")


Metrics saved in results/evaluation_metrics.txt
