<a href="https://colab.research.google.com/github/Malkhedchetan/heart-check-ai/blob/main/heart_disease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [8]:
# Read the heart-disease records from heart.csv (resolved against the
# current working directory) and preview the first rows.
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,32,0,0,131,266,0,0,191,0,0.8,1,0,2,0
1,39,0,1,133,213,0,0,203,0,1.3,1,1,2,0
2,46,0,2,130,232,0,1,199,1,0.7,0,3,2,1
3,63,1,2,137,293,0,0,185,0,0.2,1,0,2,1
4,42,0,3,152,147,0,1,191,1,0.9,0,1,3,1


In [None]:


from sklearn.preprocessing import StandardScaler

# Build the modelling table from complete rows only. The dropna cells further
# down rebind `df` *after* X and y exist, so on a fresh Restart & Run All they
# would not remove incomplete rows from the training data. Filtering here
# keeps X and y aligned with the cleaned frame regardless of execution order.
df_model = df.dropna()

# Separate features & target
X = df_model.drop(columns=["target"])
y = df_model["target"]

# Categorical columns for one-hot
cat_cols = ["cp", "restecg", "slope", "thal"]

# Convert categorical to dummies; drop_first=True drops the first level of
# each column so it acts as the implicit baseline category.
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Continuous columns that get standardized
num_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

# NOTE(review): the scaler is fitted on every row before the train/test split,
# so test-set statistics leak into the training features. Fitting on the
# training rows only would avoid that, but requires moving this step after the
# split cell.
# Anything that later feeds the model must apply this same fitted `scaler`
# to `num_cols`.
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

print("Final X shape =", X.shape)


Final X shape = (56748, 18)


In [None]:
# Report how many labels are missing before discarding incomplete rows.
# (A bare expression is only displayed when it is the last line of a cell,
# so the count has to be printed explicitly here.)
print("Rows with missing target:", df["target"].isna().sum())

# Drop every row that has a missing value in any column.
df = df.dropna()


In [None]:
# Keep only the rows whose label is present.
has_label = df["target"].notna()
df = df.loc[has_label]


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# partition is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(X_train.shape, X_test.shape)


(45398, 18) (11350, 18)


In [None]:
!pip install xgboost

from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss"
)

model.fit(X_train, y_train)




In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)


Accuracy: 0.9845814977973568

Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.98      0.98      5661
         1.0       0.98      0.99      0.98      5689

    accuracy                           0.98     11350
   macro avg       0.98      0.98      0.98     11350
weighted avg       0.98      0.98      0.98     11350



In [None]:
print("\n====== Training Multiple Models ======\n")

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate estimators, keyed by display name. They are only constructed here;
# fitting happens in the training-loop cell.
models = {
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss"
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        # Without a seed the forest's bootstrap samples and feature draws
        # differ on every run, so its accuracy (and potentially the selected
        # "best" model) would not be reproducible. Same seed as the split.
        random_state=42
    ),
    "Logistic Regression": LogisticRegression(max_iter=500)
}






In [None]:
# Maps each model's display name to its accuracy on the held-out split.
results = {}

for model_label, estimator in models.items():
    print(f"\nTraining {model_label}...")
    estimator.fit(X_train, y_train)

    # Evaluate on the test split and record the score under the model's name.
    preds = estimator.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results[model_label] = acc

    print(f"{model_label} Accuracy: {acc:.4f}")



Training XGBoost...
XGBoost Accuracy: 0.9846

Training LightGBM...
[LightGBM] [Info] Number of positive: 22757, number of negative: 22641
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004817 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 569
[LightGBM] [Info] Number of data points in the train set: 45398, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501278 -> initscore=0.005110
[LightGBM] [Info] Start training from score 0.005110
LightGBM Accuracy: 0.9874

Training Random Forest...
Random Forest Accuracy: 0.9711

Training Logistic Regression...
Logistic Regression Accuracy: 0.9686


In [None]:
print("\n====== Final Model Accuracies ======\n")
for model_name, acc in results.items():
    print(f"{model_name}: {acc:.4f}")

# Pick the (name, accuracy) pair with the highest accuracy; on ties the
# first entry in insertion order wins.
best_model_name, best_accuracy = max(results.items(), key=lambda item: item[1])

print(f"\nðŸ”¥ BEST MODEL = {best_model_name} (Accuracy: {best_accuracy:.4f})")




XGBoost: 0.9846
LightGBM: 0.9874
Random Forest: 0.9711
Logistic Regression: 0.9686

ðŸ”¥ BEST MODEL = LightGBM (Accuracy: 0.9874)


In [None]:
import pickle

from google.colab import drive

# Attach Google Drive at /content/drive so the pickle is written to Drive.
drive.mount('/content/drive')

# Fitted estimator that scored highest in the accuracy comparison.
best_model1 = models[best_model_name]

# Destination file inside Google Drive.
save_path = "/content/drive/MyDrive/best_heart_model1.pkl"

# Serialize the estimator to disk.
with open(save_path, "wb") as model_file:
    pickle.dump(best_model1, model_file)

print("âœ… Model saved permanently at:", save_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
âœ… Model saved permanently at: /content/drive/MyDrive/best_heart_model1.pkl


In [None]:
import os
import pickle

# `best_model` is not defined by any earlier cell, so this cell raised a
# NameError on a fresh kernel. Look the winning estimator up from the
# comparison results instead.
best_model = models[best_model_name]

# Everything needed to score new data: the estimator, the scaler fitted during
# preprocessing, and the exact feature column order of the training matrix.
artifact = {
    "model": best_model,          # best-scoring fitted estimator
    "scaler": scaler,             # StandardScaler fitted on the numeric columns
    "columns": X.columns.tolist() # final feature order
}

save_path = "/content/drive/MyDrive/heart_artifact.pkl"

with open(save_path, "wb") as f:
    pickle.dump(artifact, f)

print("Saved artifact:", save_path)


Saved artifact: /content/drive/MyDrive/heart_artifact.pkl


In [None]:
import pickle

from google.colab import drive

drive.mount('/content/drive')

model_path = "/content/drive/MyDrive/best_heart_model1.pkl"  # <-- change if needed

# NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
with open(model_path, "rb") as f:
    model = pickle.load(f)

print("Model loaded!")

# Print the exact feature names used during training. Not every estimator
# exposes `feature_name_`; fall back to the feature count only when that
# attribute is missing, instead of hiding every possible error behind a
# bare `except`.
try:
    print(model.feature_name_)
except AttributeError:
    print("Model expects:", model.n_features_in_, "features")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model loaded!
['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca', 'cp_1', 'cp_2', 'cp_3', 'restecg_1', 'restecg_2', 'slope_1', 'slope_2', 'thal_2', 'thal_3']


In [None]:
import pickle
import numpy as np
import pandas as pd

# Column order used when building model inputs; it mirrors the feature names
# printed from the loaded model in the previous cell.
feature_names = [
    'age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang',
    'oldpeak', 'ca', 'cp_1', 'cp_2', 'cp_3', 'restecg_1', 'restecg_2',
    'slope_1', 'slope_2', 'thal_2', 'thal_3'
]

# Open the pickle through a context manager so the file handle is closed
# (the previous `pickle.load(open(...))` never closed it).
# NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
with open("/content/drive/MyDrive/best_heart_model1.pkl", "rb") as model_file:
    model = pickle.load(model_file)

def prepare_input_df(age, sex, trestbps, chol, fbs, thalach, exang, oldpeak, ca,
                     cp, restecg, slope, thal):
    """Build a one-row DataFrame laid out in `feature_names` column order.

    The first nine arguments are copied through unchanged. The four categorical
    arguments are expanded into 0/1 indicator columns:

    - cp      -> cp_1, cp_2, cp_3
    - restecg -> restecg_1, restecg_2
    - slope   -> slope_1, slope_2
    - thal    -> thal_2, thal_3

    A categorical value that matches none of its listed levels yields all-zero
    indicators for that group. No scaling is applied here.
    """
    # One indicator per encoded level of each categorical input.
    cp_flags = [int(cp == level) for level in (1, 2, 3)]
    restecg_flags = [int(restecg == level) for level in (1, 2)]
    slope_flags = [int(slope == level) for level in (1, 2)]
    thal_flags = [int(thal == level) for level in (2, 3)]

    # Pass-through values first, then the indicator groups, matching the
    # order of `feature_names`.
    row = [age, sex, trestbps, chol, fbs, thalach, exang, oldpeak, ca]
    row += cp_flags + restecg_flags + slope_flags + thal_flags

    return pd.DataFrame([row], columns=feature_names)

# Raw clinical values for one example patient, one-hot encoded.
sample_raw = prepare_input_df(35, 1, 120, 180, 0, 170, 0, 1.0, 0, 0, 0, 0, 1)

# The model was trained on standardized numeric columns, so the same fitted
# `scaler` (from the preprocessing cell) has to be applied to `num_cols`
# before predicting; raw values lie far outside the range the model saw
# during training.
sample_scaled = pd.DataFrame(
    scaler.transform(sample_raw[num_cols]),
    columns=num_cols,
    index=sample_raw.index,
)

# NOTE: this rebinds `X`, which previously held the training matrix; re-run
# the preprocessing cell before re-training.
X = sample_raw.assign(**{col: sample_scaled[col] for col in num_cols})

pred = model.predict(X)[0]

if pred == 1:
    print("ðŸ©º Person likely HAS heart disease")
else:
    print("ðŸ’š Person is HEALTHY")


ðŸ’š Person is HEALTHY


In [None]:
import pickle
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the persisted best model.
# NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
with open("/content/drive/MyDrive/best_heart_model1.pkl", "rb") as f:
    model = pickle.load(f)

# Extract model feature names (18 features)
columns = model.feature_name_

# The model was trained on standardized numeric columns, so the artifact must
# carry the scaler fitted in the preprocessing cell. Saving an identity
# ("dummy") scaler here would make every consumer feed raw values to the
# model and would overwrite the correctly scaled artifact saved earlier.
# `scaler` is therefore deliberately NOT re-created in this cell.
artifact = {
    "model": model,
    "scaler": scaler,      # fitted StandardScaler from preprocessing
    "columns": columns
}

# Save new artifact file
with open("/content/drive/MyDrive/heart_artifact.pkl", "wb") as f:
    pickle.dump(artifact, f)

print("heart_artifact.pkl created successfully!")


heart_artifact.pkl created successfully!


In [None]:
import pickle
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the persisted best model.
# NOTE: unpickling can execute arbitrary code; only load files you wrote yourself.
with open("/content/drive/MyDrive/best_heart_model1.pkl", "rb") as f:
    model = pickle.load(f)

# Extract model feature names (the 18 one-hot encoded columns)
columns = model.feature_name_

# Numeric columns that were standardized before training and that the
# consuming app is expected to scale.
# NOTE(review): the app code is not part of this notebook; confirm it scales exactly these columns.
num_cols = ["age","trestbps","chol","thalach","oldpeak"]

# The model was trained on standardized values of these columns. An identity
# scaler (fitted on zeros) would pass raw values straight through and
# invalidate the predictions, so fit the scaler on the actual data instead.
# NOTE(review): this must see the same rows as the preprocessing cell's
# scaler for the statistics to match - confirm `df` has not changed since.
scaler = StandardScaler()
scaler.fit(df[num_cols])

# Build FINAL artifact
artifact = {
    "model": model,
    "scaler": scaler,
    "columns": columns,
    "num_cols": num_cols   # columns the scaler applies to
}

with open("/content/drive/MyDrive/heart_artifact1.pkl", "wb") as f:
    pickle.dump(artifact, f)

# Message now names the file that is actually written (heart_artifact1.pkl).
print("ðŸŽ‰ heart_artifact1.pkl created successfully with num_cols included!")


ðŸŽ‰ heart_artifact.pkl created successfully with num_cols included!


In [None]:
# Scatter of age against maximum heart rate, drawn through an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(df["age"], df["thalach"], alpha=0.35, s=25)

ax.set_title("Age vs Maximum Heart Rate", fontsize=18, fontweight='bold')
ax.set_xlabel("Age", fontsize=14)
ax.set_ylabel("Thalach (Max Heart Rate)", fontsize=14)
ax.grid(alpha=0.25)
plt.show()


In [None]:
# Scatter of cholesterol against ST depression, drawn through an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(df["chol"], df["oldpeak"], alpha=0.35, s=25)

ax.set_title("Cholesterol vs ST Depression (Oldpeak)", fontsize=18, fontweight='bold')
ax.set_xlabel("Cholesterol", fontsize=14)
ax.set_ylabel("Oldpeak", fontsize=14)
ax.grid(alpha=0.25)
plt.show()


In [None]:
# Scatter of age against resting blood pressure, drawn through an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(df["age"], df["trestbps"], alpha=0.35, s=25)

ax.set_title("Age vs Resting Blood Pressure", fontsize=18, fontweight='bold')
ax.set_xlabel("Age", fontsize=14)
ax.set_ylabel("Resting Blood Pressure (trestbps)", fontsize=14)
ax.grid(alpha=0.25)
plt.show()


In [None]:
# Scatter of age against the binary label, drawn through an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(df["age"], df["target"], alpha=0.35, s=25)

ax.set_title("Age vs Heart Disease (0 = No, 1 = Yes)", fontsize=18, fontweight='bold')
ax.set_xlabel("Age", fontsize=14)
ax.set_ylabel("Heart Disease", fontsize=14)
ax.grid(alpha=0.25)
plt.show()
