In [4]:
import pandas as pd

# Read the Cleveland heart-disease CSV (path is relative to the notebook) into a DataFrame
df_heart = pd.read_csv(filepath_or_buffer="../datasets/Heart_disease_cleveland_new.csv")

# Show the first five rows as a quick sanity check of the load
df_heart.head(n=5)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,0,145,233,1,2,150,0,2.3,2,0,2,0
1,67,1,3,160,286,0,2,108,1,1.5,1,3,1,1
2,67,1,3,120,229,0,2,129,1,2.6,1,2,3,1
3,37,1,2,130,250,0,0,187,0,3.5,2,0,1,0
4,41,0,1,130,204,0,2,172,0,1.4,0,0,1,0


In [5]:
# Dimensions of the loaded frame as (rows, columns)
df_heart.shape


(303, 14)

In [6]:
# Column labels of the loaded frame (13 features plus the 'target' label)
df_heart.columns


Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [7]:
# Per-column non-null counts and dtypes, plus overall memory usage
df_heart.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [8]:
# Check target distribution: number of rows per class label, to see how
# balanced the two classes are before splitting and training
df_heart['target'].value_counts()


target
0    164
1    139
Name: count, dtype: int64

In [9]:
# Count missing (NaN/None) entries in every column
df_heart.isna().sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [10]:
# Count exact zeros in selected numeric columns
# (presumably to catch 0 used as a placeholder for a missing reading — confirm)
zero_cols = ['trestbps', 'chol', 'thalach']

df_heart[zero_cols].eq(0).sum()


trestbps    0
chol        0
thalach     0
dtype: int64

In [11]:
# Identify categorical columns
# Every column is reported as int64 or float64, so any categorical features in
# this file are stored as numeric codes rather than object/category dtype.
df_heart.dtypes


age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [12]:
# One-hot encode categorical columns.
# NOTE(review): without a `columns=` argument, pd.get_dummies only converts
# object/string/category columns. Every column of df_heart is int64 or float64,
# so this call currently returns the data unchanged (the feature matrix built
# from it still has the 13 original columns). Integer-coded fields such as
# 'cp' or 'thal' (presumably categorical — verify against the dataset
# description) are therefore fed to the models as plain numbers. If real
# one-hot encoding is intended, pass `columns=[...]` explicitly; that changes
# the feature schema saved for the backend, so confirm before switching.
df_heart_encoded = pd.get_dummies(df_heart, drop_first=True)


In [13]:
# The label is the 'target' column; the feature matrix is every other column
y = df_heart_encoded['target']
X = df_heart_encoded.drop(columns=['target'])


In [14]:
# Sanity check: X and y must have the same number of rows
X.shape, y.shape


((303, 13), (303,))

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [16]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, so nothing from
# the test rows leaks into preprocessing
scaler_heart = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler_heart.transform(X_train)
X_test_scaled = scaler_heart.transform(X_test)


In [17]:
# Sanity check: scaling must preserve the row and column counts of each split
X_train_scaled.shape, X_test_scaled.shape


((242, 13), (61, 13))

In [18]:
from sklearn.linear_model import LogisticRegression


In [19]:
# Create the logistic-regression classifier (iteration cap raised to 1000)
# and fit it on the scaled training features in one step
log_heart = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Echo the fitted estimator so the cell still renders its parameter summary
log_heart


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [20]:
# Class labels predicted by the logistic model for the held-out (scaled) rows
y_pred_log_heart = log_heart.predict(X=X_test_scaled)


In [21]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of held-out rows the logistic model classified correctly
log_heart_acc = accuracy_score(y_true=y_test, y_pred=y_pred_log_heart)
print("Heart Disease Logistic Regression Accuracy:", log_heart_acc)

# Per-class precision, recall, F1 and support
print(classification_report(y_true=y_test, y_pred=y_pred_log_heart))


Heart Disease Logistic Regression Accuracy: 0.8852459016393442
              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61



In [22]:
from sklearn.ensemble import RandomForestClassifier


In [23]:
# Build a 200-tree random forest with a fixed seed and fit it on the same
# scaled training features used for the logistic model
rf_heart = RandomForestClassifier(random_state=42, n_estimators=200).fit(X_train_scaled, y_train)

# Echo the fitted estimator so the cell still renders its parameter summary
rf_heart


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
# Class labels predicted by the random forest for the held-out (scaled) rows
y_pred_rf_heart = rf_heart.predict(X=X_test_scaled)


In [25]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of held-out rows the random forest classified correctly
rf_heart_acc = accuracy_score(y_true=y_test, y_pred=y_pred_rf_heart)
print("Heart Disease Random Forest Accuracy:", rf_heart_acc)

# Per-class precision, recall, F1 and support
print(classification_report(y_true=y_test, y_pred=y_pred_rf_heart))


Heart Disease Random Forest Accuracy: 0.8688524590163934
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        29
           1       0.90      0.84      0.87        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [26]:
import joblib


In [27]:
import os

# Create the output directory when it is missing so this cell also works on a
# fresh checkout; joblib.dump does not create parent directories and would
# otherwise fail with FileNotFoundError.
os.makedirs("../models", exist_ok=True)

# Save Logistic Regression heart model. The model was fitted on scaled
# features, so whoever loads it must apply the fitted scaler to inputs first.
# Kept as the last expression so the cell still displays the written file list.
joblib.dump(log_heart, "../models/heart_disease_model.pkl")


['../models/heart_disease_model.pkl']

In [28]:
# Persist the fitted scaler so the exact training-time scaling can be replayed at inference
joblib.dump(value=scaler_heart, filename="../models/heart_disease_scaler.pkl")


['../models/heart_disease_scaler.pkl']

In [29]:
# Feature column names, in the order the models were trained on, as a plain
# Python list (intended for the backend)
heart_feature_names = list(X.columns)
heart_feature_names


['age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal']

In [30]:
import json

# Serialise the ordered feature-name list to JSON alongside the saved model
with open("../models/heart_features.json", mode="w") as f:
    f.write(json.dumps(heart_feature_names))
