In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, r2_score
from xgboost import XGBClassifier


In [2]:
# Read the heart-disease indicators CSV into a DataFrame, then show (rows, columns).
DATA_FILE = 'heart_disease_health_indicators.csv'
df = pd.read_csv(DATA_FILE)
df.shape

(253680, 22)

In [3]:
# Count rows that exactly repeat an earlier row (every column equal).
df.duplicated().sum()

23899

## Insights:
- The dataset contains 23,899 duplicated rows

In [4]:
# Keep the first occurrence of each repeated row and discard the later copies.
df=df.drop_duplicates()

In [5]:
# Re-count duplicated rows to confirm the de-duplication left none behind (expect 0).
df.duplicated().sum()

0

## Duplicate rows removed

In [7]:
# (rows, columns) of the frame after de-duplication.
df.shape

(229781, 22)

In [8]:
# Preview the first five rows to eyeball the column names and value ranges.
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [9]:
# Separate the label column from the remaining predictor columns.
target_column = 'HeartDiseaseorAttack'
y = df[target_column]               # Target Variable
X = df.drop(columns=target_column)  # Feature Variable

In [10]:
# Report the dimensions of the feature matrix, then of the target vector.
for part in (X, y):
    print(part.shape)

(229781, 21)
(229781,)


In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Empty results table: one column for the model name plus one per evaluation metric.
metric_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'R2 Score']
metrics_df = pd.DataFrame(columns=metric_columns)
print(metrics_df)

Empty DataFrame
Columns: [Model, Accuracy, Precision, Recall, F1 Score, ROC AUC, R2 Score]
Index: []


In [13]:
# Define models
# Every estimator that consumes random numbers is given the same fixed seed (the
# value already used for the train/test split) so that Restart & Run All
# reproduces the reported metrics instead of drifting from run to run.
models = {
    'Logistic Regression': LogisticRegression(),  # default lbfgs solver does not use random_state
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),  # no random component to seed
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
}

In [14]:
# Train and evaluate each model
# Per-model results are collected in a plain list and the summary frame is rebuilt
# from that list. Re-running this cell therefore replaces the table instead of
# appending duplicate rows, and no empty frame is ever passed to pd.concat.
metric_records = []
for model_name, model in models.items():
    # Chain scaling and the classifier so the scaler is fitted on the training
    # split only and the same transform is applied to the test split.
    model_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)              # hard class labels
    y_prob = model_pipeline.predict_proba(X_test)[:, 1]  # probability column for the second class

    # Label-based metrics use y_pred; ranking/probability metrics use y_prob.
    metric_records.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob),
        # NOTE: r2_score is a regression metric. Applied to probabilities it is
        # 1 - (squared error of y_prob) / (squared deviation of y_test from its mean),
        # i.e. a skill score against a constant base-rate prediction.
        'R2 Score': r2_score(y_test, y_prob)
    })

    # Rebuild the summary from every record gathered so far (dict key order
    # fixes the column order) and show progress after each model.
    metrics_df = pd.DataFrame(metric_records)
    print(metrics_df)


                 Model  Accuracy  Precision    Recall  F1 Score   ROC AUC  \
0  Logistic Regression   0.90032   0.551591  0.122014  0.199825  0.836531   

   R2 Score  
0  0.172277  
                 Model  Accuracy  Precision    Recall  F1 Score   ROC AUC  \
0  Logistic Regression  0.900320   0.551591  0.122014  0.199825  0.836531   
1        Decision Tree  0.835194   0.231884  0.266212  0.247865  0.584003   

   R2 Score  
0  0.172277  
1 -0.806921  
                 Model  Accuracy  Precision    Recall  F1 Score   ROC AUC  \
0  Logistic Regression  0.900320   0.551591  0.122014  0.199825  0.836531   
1        Decision Tree  0.835194   0.231884  0.266212  0.247865  0.584003   
2        Random Forest  0.894249   0.428808  0.110495  0.175712  0.803937   

   R2 Score  
0  0.172277  
1 -0.806921  
2  0.110648  
                 Model  Accuracy  Precision    Recall  F1 Score   ROC AUC  \
0  Logistic Regression  0.900320   0.551591  0.122014  0.199825  0.836531   
1        Decision Tree  

In [15]:
# Show the final comparison table: one row per evaluated model, one column per metric.
print(metrics_df)

                 Model  Accuracy  Precision    Recall  F1 Score   ROC AUC  \
0  Logistic Regression  0.900320   0.551591  0.122014  0.199825  0.836531   
1        Decision Tree  0.835194   0.231884  0.266212  0.247865  0.584003   
2        Random Forest  0.894249   0.428808  0.110495  0.175712  0.803937   
3    Gradient Boosting  0.901038   0.580460  0.107722  0.181720  0.839132   
4  K-Nearest Neighbors  0.888222   0.378845  0.149744  0.214646  0.713725   
5              XGBoost  0.900385   0.554563  0.119241  0.196278  0.834515   

   R2 Score  
0  0.172277  
1 -0.806921  
2  0.110648  
3  0.177835  
4  0.005925  
5  0.168077  


## Conclusion
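- Gradient Boosting performs best on Accuracy, Precision, ROC AUC and R2 Score; however, its Recall is the lowest of the six models and the Decision Tree has the highest F1 Score, so most positive cases are still missed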
- Accuracy: 0.901038
- Precision: 0.580460
- Recall: 0.107722
- F1 Score: 0.181720
- ROC AUC: 0.839132
- R2 Score: 0.177835