In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, roc_auc_score,
    classification_report
)


In [5]:
# Location of the DiaBD diabetes CSV; the path is relative, so it is resolved
# against the notebook's working directory.
CSV_PATH = "DiaBD_A Diabetes Dataset for Enhanced Risk Analysis and Research in Bangladesh.csv"

# Load the whole file into a single DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

# Report (rows, columns), then let the notebook render the first five rows.
print("Dataset Shape:", df.shape)
df.head(n=5)



Dataset Shape: (5288, 15)


Unnamed: 0,age,gender,pulse_rate,systolic_bp,diastolic_bp,glucose,height,weight,bmi,family_diabetes,hypertensive,family_hypertension,cardiovascular_disease,stroke,diabetic
0,42,Female,66,110,73,5.88,1.65,70.2,25.75,0,0,0,0,0,No
1,35,Female,60,125,68,5.71,1.47,42.5,19.58,0,0,0,0,0,No
2,62,Female,57,127,74,6.85,1.52,47.0,20.24,0,0,0,0,0,No
3,73,Male,55,193,112,6.28,1.63,57.4,21.72,0,0,0,0,0,No
4,68,Female,71,150,81,5.71,1.42,36.0,17.79,0,0,0,0,0,No


In [7]:
# Features: every column except the target.
X = df.drop(columns=["diabetic"])

# Target: encode the "No"/"Yes" strings as 0/1.
label_map = {"No": 0, "Yes": 1}
y = df["diabetic"].map(label_map)

# Series.map yields NaN for any value that is not a key of the mapping
# (different casing, stray whitespace, missing entries). Fail here with a
# clear message instead of letting NaN targets reach the split/fit steps.
unmapped = df.loc[y.isna(), "diabetic"].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected values in 'diabetic' column: {unmapped.tolist()}; "
        f"expected one of {list(label_map)}"
    )

# Class balance of the encoded target.
y.value_counts()


diabetic
0    4946
1     342
Name: count, dtype: int64

In [9]:
# "gender" is the only column handled as categorical; every other feature
# column in X goes through the numeric branch, in its original order.
categorical_cols = ["gender"]
numeric_cols = [col for col in X.columns if col not in categorical_cols]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its branch.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])


In [11]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.20,
    random_state=42,
)

# Show the (rows, columns) of each part.
for label, part in (("Training Size:", X_train), ("Testing Size:", X_test)):
    print(label, part.shape)


Training Size: (4230, 14)
Testing Size: (1058, 14)


In [13]:
# Registry of candidate classifiers, keyed by the display name used in reports.
# Insertion order determines the order in which they are trained.
models = {}

# Linear baseline; class weights are balanced and the iteration cap is raised.
models["Logistic Regression"] = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
)

# Seeded 300-tree forest, also with balanced class weights.
models["Random Forest"] = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced",
)

# Naive Bayes with default settings.
models["Gaussian Naive Bayes"] = GaussianNB()

def evaluate_model(name, estimator):
    """Fit one classifier on the training split and print its test-set metrics.

    Builds a pipeline from the notebook-level ``preprocess`` transformer and
    ``estimator``, fits it on ``X_train``/``y_train`` and evaluates it on
    ``X_test``/``y_test`` (all read from the notebook namespace).

    Parameters
    ----------
    name : str
        Label printed above the report.
    estimator : scikit-learn classifier
        Model placed at the end of the pipeline; it is fitted by this call.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted preprocessing + model pipeline.
    """
    # NOTE: every pipeline built here wraps the same ``preprocess`` object, so
    # each call refits that shared instance on X_train.
    pipe = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", estimator)
    ])

    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # ROC AUC needs scores for the positive class, so it is only computed for
    # models that expose predict_proba; column 1 is the probability of class 1.
    auc = None
    if hasattr(pipe.named_steps["model"], "predict_proba"):
        y_proba = pipe.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)

    print("\nModel:", name)
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    # Previously the AUC was computed and then discarded; report it so the
    # threshold-independent metric is visible next to the other scores.
    if auc is not None:
        print(f"ROC AUC: {auc:.4f}")

    return pipe


In [15]:
# Train and report every registered model in registry order, keeping each
# fitted pipeline under its display name.
trained_models = {
    name: evaluate_model(name, model)
    for name, model in models.items()
}



Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.97      0.86      0.91       990
           1       0.24      0.65      0.35        68

    accuracy                           0.84      1058
   macro avg       0.60      0.75      0.63      1058
weighted avg       0.93      0.84      0.87      1058

Confusion Matrix:
 [[847 143]
 [ 24  44]]

Model: Random Forest
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       990
           1       0.50      0.09      0.15        68

    accuracy                           0.94      1058
   macro avg       0.72      0.54      0.56      1058
weighted avg       0.91      0.94      0.91      1058

Confusion Matrix:
 [[984   6]
 [ 62   6]]

Model: Gaussian Naive Bayes
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       990
           1       0.27      0.43      0.33        68

    accuracy       

In [17]:
# Use the fitted logistic-regression pipeline for the demo prediction.
best_model = trained_models["Logistic Regression"]

# One-row DataFrame (double brackets keep it 2-D) taken from the first row of
# the full feature table X.
sample = X.iloc[[0]]
predicted_labels = best_model.predict(sample)
prediction = predicted_labels[0]

# Turn the 0/1 class into a readable label.
if prediction == 1:
    result = "Yes (Diabetic)"
else:
    result = "No (Non-diabetic)"

print("Sample Input:")
print(sample)
print("Prediction:", result)


Sample Input:
   age  gender  pulse_rate  systolic_bp  diastolic_bp  glucose  height  \
0   42  Female          66          110            73     5.88    1.65   

   weight    bmi  family_diabetes  hypertensive  family_hypertension  \
0    70.2  25.75                0             0                    0   

   cardiovascular_disease  stroke  
0                       0       0  
Prediction: No (Non-diabetic)
