In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

In [4]:
# Step 1: Data Acquisition
# Load the Heart Disease UCI dataset
data = pd.read_csv('heart.csv')
print("Data Sample:\n", data.head())

Data Sample:
    id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal  

In [None]:
# Step 2: Define Methodology and Objectives
# Objective: Predict heart disease risk based on health metrics to support SDG 3.
# Methodology: Explore data, preprocess, use classification models, and evaluate results.

In [5]:
# Step 3: Data Preprocessing
# 3.1 Check for missing values
print("\nMissing Values:\n", data.isnull().sum())

# 3.3 Encode categorical variables if present
data = pd.get_dummies(data, drop_first=True)

# 3.2 Fill missing values with column medians
data.fillna(data.median(), inplace=True)

# 3.4 Separate features and target variable
X = data.drop('num', axis=1)  # Features
y = data['num']  # Target variable

# Convert 'num' to binary classification: 0 (No disease), 1 (Disease)
y = y.apply(lambda x: 1 if x > 0 else 0)

# 3.5 Handle class imbalance using SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)

# 3.6 Standardize the features for better model performance
scaler = StandardScaler()
X_res = scaler.fit_transform(X_res)

print("\nMissing Values:\n", data.isnull().sum())


Missing Values:
 id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

Missing Values:
 id                          0
age                         0
trestbps                    0
chol                        0
thalch                      0
oldpeak                     0
ca                          0
num                         0
sex_Male                    0
dataset_Hungary             0
dataset_Switzerland         0
dataset_VA Long Beach       0
cp_atypical angina          0
cp_non-anginal              0
cp_typical angina           0
fbs_True                    0
restecg_normal              0
restecg_st-t abnormality    0
exang_True                  0
slope_flat                  0
slope_upsloping             0
thal_normal                 0
thal_reversable defect      0
dtype

In [6]:
# Step 4: Model Selection and Validation
# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Perform K-Fold Cross Validation to evaluate each model
print("\nModel Validation Results (Accuracy):")
for model_name, model in models.items():
    scores = cross_val_score(model, X_res, y_res, cv=10, scoring='accuracy')
    print(f"{model_name} - Accuracy: {scores.mean():.2f}")



Model Validation Results (Accuracy):
Logistic Regression - Accuracy: 0.80
Decision Tree - Accuracy: 0.72
Random Forest - Accuracy: 0.80


In [None]:
# Step 5: Comparing Results with Multiple Metrics
# Split the data into train and test sets for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train and evaluate each model on the test set
print("\nDetailed Model Evaluation:")
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Display results
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"Precision: {precision_score(y_test, y_pred):.2f}")
    print(f"Recall: {recall_score(y_test, y_pred):.2f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.2f}")
    
    # AUC Score - Only for models that support it
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        print(f"AUC Score: {roc_auc_score(y_test, y_proba):.2f}")


Detailed Model Evaluation:

Logistic Regression Results:
Accuracy: 0.85
Precision: 0.88
Recall: 0.83
F1-Score: 0.86
AUC Score: 0.94

Decision Tree Results:
Accuracy: 0.81
Precision: 0.84
Recall: 0.80
F1-Score: 0.82
AUC Score: 0.81

Random Forest Results:
Accuracy: 0.89
Precision: 0.91
Recall: 0.87
F1-Score: 0.89
AUC Score: 0.95
