<a href="https://colab.research.google.com/github/MohammadAsad0/Medical-Diagnosis-Risk-Scoring-using-Bayesian-Networks/blob/main/PM_ML_Baseline_Diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!pip install pandas numpy scikit-learn



In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
# Load the diabetes dataset from a remote CSV.
# Column labels are supplied here via `names`, in file order; the last
# column, 'Outcome', is the binary target used by the rest of the notebook.
cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
    'Outcome',
]
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"

df = pd.read_csv(url, header=None, names=cols)

# Quick sanity checks: dimensions, per-column summary statistics, class balance.
# NOTE(review): the recorded describe() output shows a minimum of 0 for
# Glucose, BloodPressure, SkinThickness, Insulin and BMI. Those zeros
# presumably encode missing measurements -- confirm, and consider imputing.
print("df shape: ", df.shape)

summary_stats = df.describe()
print(summary_stats)

# Last expression of the cell, so the class counts render as the cell output.
df["Outcome"].value_counts()

df shape:  (768, 9)
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigree         Age     Outcome  
count  768.000000        768.000000  768.000000  768.000000  
mean    31.992578          0.471876   33.240885    0.348958  
std      7.884160          0.331329   11.760232    0.476951  
min      0.000000          0.078000   21.000000

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


# Preprocessing

In [32]:
# Separate the predictors from the target column.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize features for Logistic Regression. The scaler learns its
# statistics from the training partition only and is then applied to both.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Binary Classification

## Logistic Regression

In [33]:
# Train Logistic Regression on the standardized training features.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Predictions on the held-out test set (hard labels and class-1 probabilities).
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# Evaluation
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_pred_proba_lr)

print(f"\nTest Accuracy: {lr_accuracy:.4f}")
print(f"AUC-ROC Score: {lr_auc:.4f}")

# Cross-validation
# X_train_scaled was standardized with statistics from the *whole* training
# set, so cross-validating on it lets every validation fold influence the
# preprocessing (data leakage). Scaling is therefore done inside a pipeline
# on the unscaled X_train, so each fold fits its own scaler on that fold's
# training part only. The classifier reuses lr_model's hyperparameters.
lr_cv_pipeline = Pipeline([
  ('scaler', StandardScaler()),
  ('classifier', LogisticRegression(**lr_model.get_params())),
])
cv_scores_lr = cross_val_score(lr_cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"\n5-Fold CV Accuracy: {cv_scores_lr.mean():.4f} (+/- {cv_scores_lr.std():.4f})")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_lr, target_names=['No Disease', 'Disease']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_lr))

# Feature importance for Logistic Regression
# Features were standardized, so absolute coefficient size is used to rank
# them; the sign (direction of the effect) is discarded here.
feature_importance_lr = pd.DataFrame({
  'Feature': X.columns,
  'Coefficient': np.abs(lr_model.coef_[0])
}).sort_values('Coefficient', ascending=False)

print("\nTop 5 Important Features (Logistic Regression):")
print(feature_importance_lr.head())


Test Accuracy: 0.7143
AUC-ROC Score: 0.8230

5-Fold CV Accuracy: 0.7785 (+/- 0.0126)

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.76      0.82      0.79       100
     Disease       0.61      0.52      0.56        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154


Confusion Matrix:
[[82 18]
 [26 28]]

Top 5 Important Features (Logistic Regression):
            Feature  Coefficient
1           Glucose     1.144151
5               BMI     0.713893
0       Pregnancies     0.373178
6  DiabetesPedigree     0.255527
2     BloodPressure     0.197637


## XGBoost

In [34]:
# Fit a gradient-boosted tree classifier on the unscaled features.
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Test-set predictions: class-1 probabilities and hard labels.
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
y_pred_xgb = xgb_model.predict(X_test)

# Headline metrics on the held-out test set.
xgb_auc = roc_auc_score(y_test, y_pred_proba_xgb)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)

print(f"\nTest Accuracy: {xgb_accuracy:.4f}")
print(f"AUC-ROC Score: {xgb_auc:.4f}")

# 5-fold cross-validated accuracy on the training partition.
cv_scores_xgb = cross_val_score(
    xgb_model,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
print(f"\n5-Fold CV Accuracy: {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std():.4f})")

# Per-class precision / recall / F1.
xgb_report = classification_report(
    y_test,
    y_pred_xgb,
    target_names=['No Disease', 'Disease'],
)
print("\nClassification Report:")
print(xgb_report)

xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
print("\nConfusion Matrix:")
print(xgb_confusion)

# Rank features by the importance scores exposed by the fitted model.
importance_table = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb_model.feature_importances_,
})
feature_importance_xgb = importance_table.sort_values('Importance', ascending=False)

print("\nTop 5 Important Features (XGBoost):")
print(feature_importance_xgb.head())


Test Accuracy: 0.7792
AUC-ROC Score: 0.8244

5-Fold CV Accuracy: 0.7476 (+/- 0.0437)

Classification Report:
              precision    recall  f1-score   support

  No Disease       0.82      0.85      0.83       100
     Disease       0.70      0.65      0.67        54

    accuracy                           0.78       154
   macro avg       0.76      0.75      0.75       154
weighted avg       0.78      0.78      0.78       154


Confusion Matrix:
[[85 15]
 [19 35]]

Top 5 Important Features (XGBoost):
            Feature  Importance
1           Glucose    0.298835
5               BMI    0.142809
7               Age    0.129001
0       Pregnancies    0.098524
6  DiabetesPedigree    0.089480


## Comparison

In [35]:
# Side-by-side summary of both models: held-out test accuracy, test AUC-ROC,
# and mean 5-fold cross-validated accuracy on the training partition.
comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'XGBoost'],
    'Test Accuracy': [lr_accuracy, xgb_accuracy],
    'AUC-ROC': [lr_auc, xgb_auc],
    'CV Accuracy': [cv_scores_lr.mean(), cv_scores_xgb.mean()]
})

# Concatenate instead of passing two arguments to print(): the default
# separator would put a space after the newline and push the header row
# one column out of alignment with the data rows.
print("\n" + comparison.to_string(index=False))


               Model  Test Accuracy  AUC-ROC  CV Accuracy
Logistic Regression       0.714286 0.822963     0.778529
            XGBoost       0.779221 0.824444     0.747608
