In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

In [4]:
# Step 1: Data Acquisition
# Load the Heart Disease UCI dataset
data = pd.read_csv('heart.csv')
print("heart.csv", data.head())

heart.csv    age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


In [None]:
# Step 2: Define Methodology and Objectives
# Objective: Predict heart disease risk based on health metrics to support SDG 3.
# Methodology: Explore data, preprocess, use classification models, and evaluate results.

In [6]:
# Step 3: Data Preprocessing
# 3.1 Check for missing values
print("\nMissing Values:\n", data.isnull().sum())

# 3.3 Encode categorical variables if present
data = pd.get_dummies(data, drop_first=True)

# 3.2 Fill missing values with column medians
data.fillna(data.median(), inplace=True)

# 3.4 Separate features and target variable
X = data.drop('target', axis=1)  # Features
y = data['target']  # Target variable

# Convert 'num' to binary classification: 0 (No disease), 1 (Disease)
y = y.apply(lambda x: 1 if x > 0 else 0)

# 3.5 Handle class imbalance using SMOTE
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)

# 3.6 Standardize the features for better model performance
scaler = StandardScaler()
X_res = scaler.fit_transform(X_res)

print("\nMissing Values:\n", data.isnull().sum())


Missing Values:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Missing Values:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [7]:
# Step 4: Model Selection and Validation
# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Perform K-Fold Cross Validation to evaluate each model
print("\nModel Validation Results (Accuracy):")
for model_name, model in models.items():
    scores = cross_val_score(model, X_res, y_res, cv=10, scoring='accuracy')
    print(f"{model_name} - Accuracy: {scores.mean():.2f}")



Model Validation Results (Accuracy):
Logistic Regression - Accuracy: 0.85
Decision Tree - Accuracy: 0.99
Random Forest - Accuracy: 1.00


In [8]:
# Step 5: Comparing Results with Multiple Metrics
# Split the data into train and test sets for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train and evaluate each model on the test set
print("\nDetailed Model Evaluation:")
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Display results
    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"Precision: {precision_score(y_test, y_pred):.2f}")
    print(f"Recall: {recall_score(y_test, y_pred):.2f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.2f}")
    
    # AUC Score - Only for models that support it
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        print(f"AUC Score: {roc_auc_score(y_test, y_proba):.2f}")


Detailed Model Evaluation:

Logistic Regression Results:
Accuracy: 0.84
Precision: 0.83
Recall: 0.89
F1-Score: 0.86
AUC Score: 0.93

Decision Tree Results:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00
AUC Score: 1.00

Random Forest Results:
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-Score: 1.00
AUC Score: 1.00


In [1]:
import joblib
# Assuming your model variable is `model`
joblib.dump(model, 'model.pkl')

NameError: name 'model' is not defined