<a href="https://colab.research.google.com/github/MohammadAsad0/Medical-Diagnosis-Risk-Scoring-using-Bayesian-Networks/blob/main/PM_ML_Baseline_Heart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy scikit-learn ucimlrepo



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# Import for dataset
from ucimlrepo import fetch_ucirepo

In [None]:
# Fetch the dataset registered under UCI repository id 45 and pull out its
# feature table and target table.
heart_disease = fetch_ucirepo(id=45)
X, y = heart_disease.data.features, heart_disease.data.targets

# Report the dimensions of both tables.
for label, table in (("X shape: ", X), ("y shape: ", y)):
    print(label, table.shape)

# Summary statistics of the features, followed by the per-value counts of the
# target (the trailing expression is rendered as the cell's output).
print(X.describe())
y.value_counts()

X shape:  (303, 13)
y shape:  (303, 1)
              age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.438944    0.679868    3.158416  131.689769  246.693069    0.148515   
std      9.038662    0.467299    0.960126   17.599748   51.776918    0.356198   
min     29.000000    0.000000    1.000000   94.000000  126.000000    0.000000   
25%     48.000000    0.000000    3.000000  120.000000  211.000000    0.000000   
50%     56.000000    1.000000    3.000000  130.000000  241.000000    0.000000   
75%     61.000000    1.000000    4.000000  140.000000  275.000000    0.000000   
max     77.000000    1.000000    4.000000  200.000000  564.000000    1.000000   

          restecg     thalach       exang     oldpeak       slope          ca  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  299.000000   
mean     0.990099  149.607261    0.326733    1.039604    1.600660    

Unnamed: 0_level_0,count
num,Unnamed: 1_level_1
0,164
1,55
2,36
3,35
4,13


# Preprocessing

In [None]:
# Put features and target side by side, discard every row that has a missing
# value, and renumber the index from zero.
df = pd.concat([X, y], axis=1).dropna().reset_index(drop=True)

# Collapse the target to two classes: 1 when 'num' is above 0, otherwise 0.
df['num'] = df['num'].gt(0).astype(int)

# Separate the target column from the predictors.
y = df['num']
X = df.drop(columns='num')

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise the features (used by Logistic Regression): the scaler is fitted
# on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Binary Class

## Logistic Regression

In [None]:
# Fit logistic regression on the standardised training features.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Hard labels, plus the probability assigned to the positive class (column 1).
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Hold-out metrics on the test split.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_pred_proba_lr)
print(f"\nTest Accuracy: {lr_accuracy:.4f}")
print(f"AUC-ROC Score: {lr_auc:.4f}")

# 5-fold cross-validated accuracy on the training split.
# NOTE(review): X_train_scaled was standardised with statistics from the whole
# training split, so each fold's validation rows contributed to the scaling.
cv_scores_lr = cross_val_score(
    lr_model, X_train_scaled, y_train, scoring='accuracy', cv=5
)
print(f"\n5-Fold CV Accuracy: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")

# Per-class precision/recall/F1 and the raw confusion matrix.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=['No Disease', 'Disease']))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

# Rank the features by the absolute value of their fitted coefficient.
feature_importance_lr = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': np.abs(lr_model.coef_[0])})
    .sort_values('Coefficient', ascending=False)
)
print("\nTop 5 Important Features (Logistic Regression):")
print(feature_importance_lr.head())


Test Accuracy: 0.8333
AUC-ROC Score: 0.9498

5-Fold CV Accuracy: 0.8221 (+/- 0.0764)

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.82      0.88      0.85        32
     Disease       0.85      0.79      0.81        28

    accuracy                           0.83        60
   macro avg       0.83      0.83      0.83        60
weighted avg       0.83      0.83      0.83        60


Confusion Matrix:
[[28  4]
 [ 6 22]]

Top 5 Important Features (Logistic Regression):
    Feature  Coefficient
11       ca     0.958461
12     thal     0.738613
2        cp     0.507960
1       sex     0.484582
9   oldpeak     0.444296


## XGBoost

In [None]:
# Fit a gradient-boosted tree classifier on the unscaled training features.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Hard labels, plus the probability assigned to the positive class (column 1).
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Hold-out metrics on the test split.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_auc = roc_auc_score(y_test, y_pred_proba_xgb)
print(f"\nTest Accuracy: {xgb_accuracy:.4f}")
print(f"AUC-ROC Score: {xgb_auc:.4f}")

# 5-fold cross-validated accuracy on the training split.
cv_scores_xgb = cross_val_score(
    xgb_model, X_train, y_train, scoring='accuracy', cv=5
)
print(f"\n5-Fold CV Accuracy: {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std():.4f})")

# Per-class precision/recall/F1 and the raw confusion matrix.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['No Disease', 'Disease']))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))

# Rank the features by the importance score the fitted model reports.
feature_importance_xgb = (
    pd.DataFrame({'Feature': X.columns, 'Importance': xgb_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)
print("\nTop 5 Important Features (XGBoost):")
print(feature_importance_xgb.head())


Test Accuracy: 0.8333
AUC-ROC Score: 0.9141

5-Fold CV Accuracy: 0.8010 (+/- 0.0843)

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.82      0.88      0.85        32
     Disease       0.85      0.79      0.81        28

    accuracy                           0.83        60
   macro avg       0.83      0.83      0.83        60
weighted avg       0.83      0.83      0.83        60


Confusion Matrix:
[[28  4]
 [ 6 22]]

Top 5 Important Features (XGBoost):
   Feature  Importance
12    thal    0.268721
2       cp    0.189275
11      ca    0.098394
10   slope    0.083740
8    exang    0.063999


## Comparison

In [None]:
# Side-by-side summary of the two binary classifiers: one row per model,
# one column per metric computed in the cells above.
comparison = pd.DataFrame(
    [
        ('Logistic Regression', lr_accuracy, lr_auc, cv_scores_lr.mean()),
        ('XGBoost', xgb_accuracy, xgb_auc, cv_scores_xgb.mean()),
    ],
    columns=['Model', 'Test Accuracy', 'AUC-ROC', 'CV Accuracy'],
)

# Render as plain text without the row index.
print("\n", comparison.to_string(index=False))


               Model  Test Accuracy  AUC-ROC  CV Accuracy
Logistic Regression       0.833333 0.949777     0.822074
            XGBoost       0.833333 0.914062     0.800975


# Multi Class

## Preprocessing

In [None]:
# Start again from the untouched dataset so the target keeps all of its
# original values instead of the binarised version.
X, y = heart_disease.data.features, heart_disease.data.targets

# Put features and target side by side, discard every row that has a missing
# value, and renumber the index from zero.
df = pd.concat([X, y], axis=1).dropna().reset_index(drop=True)

# Separate the target column from the predictors.
y = df['num']
X = df.drop(columns='num')

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardise the features (used by Logistic Regression): the scaler is fitted
# on the training split only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Logistic Regression

In [None]:
# Train Logistic Regression with multiclass support
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    solver='lbfgs'
)
lr_model.fit(X_train_scaled, y_train)

# Predictions: hard labels and the full per-class probability matrix
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)

# Evaluation
lr_accuracy = accuracy_score(y_test, y_pred_lr)

print(f"\nTest Accuracy: {lr_accuracy:.4f}")

# Sorted class labels of the full target, computed once and passed explicitly
# to the report and the confusion matrix so both always cover every class,
# even one that happens to be absent from the test split and the predictions.
class_labels = sorted(y.unique())
class_names = [f'Class {i}' for i in class_labels]

# For multiclass, calculate AUC using ovr (one-vs-rest).
# Only ValueError is caught (what roc_auc_score raises for class/shape
# mismatches), so unrelated failures and interrupts are no longer swallowed.
try:
    lr_auc = roc_auc_score(y_test, y_pred_proba_lr, multi_class='ovr', average='weighted')
    print(f"Weighted AUC-ROC Score (OvR): {lr_auc:.4f}")
except ValueError as exc:
    # Reset the score so a value left over from an earlier cell is not
    # mistaken for this model's AUC.
    lr_auc = np.nan
    print("AUC-ROC: Not calculated (some classes may not be present in test set)")
    print(f"Reason: {exc}")

# Cross-validation (5-fold accuracy on the training split)
cv_scores_lr = cross_val_score(lr_model, X_train_scaled, y_train, cv=5, scoring='accuracy')
print(f"\n5-Fold CV Accuracy: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")

print("\nClassification Report:")
print(classification_report(
    y_test, y_pred_lr, labels=class_labels, target_names=class_names, zero_division=0
))

print("\nConfusion Matrix:")
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=class_labels)
print(cm_lr)
print("\nConfusion Matrix (rows=actual, columns=predicted)")
cm_df_lr = pd.DataFrame(cm_lr,
                         index=[f'True {i}' for i in class_labels],
                         columns=[f'Pred {i}' for i in class_labels])
print(cm_df_lr)

# Feature importance for Logistic Regression: mean absolute coefficient of
# each feature across the per-class coefficient rows
feature_importance_lr = pd.DataFrame({
    'Feature': X.columns,
    'Avg_Abs_Coefficient': np.abs(lr_model.coef_).mean(axis=0)
}).sort_values('Avg_Abs_Coefficient', ascending=False)

print("\nTop 5 Important Features (Logistic Regression):")
print(feature_importance_lr.head())



Test Accuracy: 0.6000
Weighted AUC-ROC Score (OvR): 0.8632

5-Fold CV Accuracy: 0.5908 (+/- 0.0357)

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.79      0.94      0.86        32
     Class 1       0.50      0.27      0.35        11
     Class 2       0.00      0.00      0.00         7
     Class 3       0.33      0.43      0.38         7
     Class 4       0.00      0.00      0.00         3

    accuracy                           0.60        60
   macro avg       0.32      0.33      0.32        60
weighted avg       0.55      0.60      0.57        60


Confusion Matrix:
[[30  1  0  0  1]
 [ 6  3  2  0  0]
 [ 2  1  0  4  0]
 [ 0  0  3  3  1]
 [ 0  1  0  2  0]]

Confusion Matrix (rows=actual, columns=predicted)
        Pred 0  Pred 1  Pred 2  Pred 3  Pred 4
True 0      30       1       0       0       1
True 1       6       3       2       0       0
True 2       2       1       0       4       0
True 3       0       0       3       

## XGBoost Classifier

In [None]:

# Train XGBoost with multiclass
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    objective='multi:softprob',  # Multiclass classification
    eval_metric='mlogloss'
)

xgb_model.fit(X_train, y_train)

# Predictions: hard labels and the full per-class probability matrix
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)

# Evaluation
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print(f"\nTest Accuracy: {xgb_accuracy:.4f}")

# Sorted class labels of the full target, derived here so this cell does not
# depend on a variable created by another model's cell. They are passed
# explicitly to the report and the confusion matrix so both always cover every
# class, even one absent from the test split and the predictions.
class_labels = sorted(y.unique())
class_names = [f'Class {i}' for i in class_labels]

# For multiclass, calculate AUC using ovr (one-vs-rest).
# Only ValueError is caught (what roc_auc_score raises for class/shape
# mismatches), so unrelated failures and interrupts are no longer swallowed.
try:
    xgb_auc = roc_auc_score(y_test, y_pred_proba_xgb, multi_class='ovr', average='weighted')
    print(f"Weighted AUC-ROC Score (OvR): {xgb_auc:.4f}")
except ValueError as exc:
    # Reset the score so a value left over from an earlier cell is not
    # mistaken for this model's AUC.
    xgb_auc = np.nan
    print("AUC-ROC: Not calculated (some classes may not be present in test set)")
    print(f"Reason: {exc}")

# Cross-validation (5-fold accuracy on the training split)
cv_scores_xgb = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"\n5-Fold CV Accuracy: {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std():.4f})")

print("\nClassification Report:")
print(classification_report(
    y_test, y_pred_xgb, labels=class_labels, target_names=class_names, zero_division=0
))

print("\nConfusion Matrix:")
cm_xgb = confusion_matrix(y_test, y_pred_xgb, labels=class_labels)
print(cm_xgb)
print("\nConfusion Matrix (rows=actual, columns=predicted)")
cm_df_xgb = pd.DataFrame(cm_xgb,
                          index=[f'True {i}' for i in class_labels],
                          columns=[f'Pred {i}' for i in class_labels])
print(cm_df_xgb)

# Feature importance for XGBoost, as reported by the fitted model
feature_importance_xgb = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 5 Important Features (XGBoost):")
print(feature_importance_xgb.head())


Test Accuracy: 0.5667
Weighted AUC-ROC Score (OvR): 0.8303

5-Fold CV Accuracy: 0.5652 (+/- 0.0430)

Classification Report:
              precision    recall  f1-score   support

     Class 0       0.85      0.91      0.88        32
     Class 1       0.31      0.36      0.33        11
     Class 2       0.17      0.14      0.15         7
     Class 3       0.00      0.00      0.00         7
     Class 4       0.00      0.00      0.00         3

    accuracy                           0.57        60
   macro avg       0.27      0.28      0.27        60
weighted avg       0.53      0.57      0.55        60


Confusion Matrix:
[[29  1  2  0  0]
 [ 4  4  1  2  0]
 [ 1  3  1  1  1]
 [ 0  4  2  0  1]
 [ 0  1  0  2  0]]

Confusion Matrix (rows=actual, columns=predicted)
        Pred 0  Pred 1  Pred 2  Pred 3  Pred 4
True 0      29       1       2       0       0
True 1       4       4       1       2       0
True 2       1       3       1       1       1
True 3       0       4       2       

## Comparison

In [None]:
# Side-by-side summary of the two multi-class classifiers: one row per model,
# with hold-out accuracy plus the mean and spread of the CV accuracy.
comparison = pd.DataFrame(
    [
        ('Logistic Regression', lr_accuracy, cv_scores_lr.mean(), cv_scores_lr.std()),
        ('XGBoost', xgb_accuracy, cv_scores_xgb.mean(), cv_scores_xgb.std()),
    ],
    columns=['Model', 'Test Accuracy', 'CV Accuracy', 'CV Std Dev'],
)

# Render as plain text without the row index.
print("\n", comparison.to_string(index=False))

# Section banner: a rule of 60 '=' characters above and below the title.
print("\n" + "="*60, "ADDITIONAL INSIGHTS", "="*60, sep="\n")

# How often each model predicted each class on the test split ...
print("\nClass distribution in predictions:")
for heading, predicted in (
    ("\nLogistic Regression predictions:", y_pred_lr),
    ("\nXGBoost predictions:", y_pred_xgb),
):
    print(heading)
    print(pd.Series(predicted).value_counts().sort_index())

# ... compared with how often each class actually occurs there.
print("\nActual test set distribution:")
print(y_test.value_counts().sort_index())


               Model  Test Accuracy  CV Accuracy  CV Std Dev
Logistic Regression       0.600000     0.590780    0.035702
            XGBoost       0.566667     0.565248    0.042985

ADDITIONAL INSIGHTS

Class distribution in predictions:

Logistic Regression predictions:
0    38
1     6
2     5
3     9
4     2
Name: count, dtype: int64

XGBoost predictions:
0    34
1    13
2     6
3     5
4     2
Name: count, dtype: int64

Actual test set distribution:
num
0    32
1    11
2     7
3     7
4     3
Name: count, dtype: int64
