In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix



In [70]:
df = pd.read_csv("diabetes.csv")
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [71]:
print(Counter(df["Outcome"]))

Counter({0: 500, 1: 268})


In [72]:
(df == 0).sum()  # Count zeros per column


Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [73]:
columns_with_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[columns_with_zeros] = df[columns_with_zeros].replace(0, np.nan)
print(df.isnull().sum()) 


Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [74]:
X = df.drop(columns=["Outcome"])
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [75]:
def cal_median(X_train, y_train, target_col): #Function to calculate median per varible
    temp_df = X_train.copy()
    temp_df["Outcome"] = y_train 
    medians = temp_df.groupby("Outcome")[target_col].median()
    return medians

def median_imputation(X, y, target_col, class_medians): #Function to apply median imputation
    for cls, median_value in class_medians.items():
        X.loc[(X[target_col].isnull()) & (y == cls), target_col] = median_value
    return X

#Compute medians based on the training set
medians_glucose = cal_median(X_train, y_train, "Glucose")
medians_bp = cal_median(X_train, y_train, "BloodPressure")
medians_skin = cal_median(X_train, y_train, "SkinThickness")
medians_insulin = cal_median(X_train, y_train, "Insulin")
medians_bmi = cal_median(X_train, y_train, "BMI")

#Impute the missing values for the training set and the test set based on the median from the training set
X_train = median_imputation(X_train, y_train, "Glucose", medians_glucose)
X_train = median_imputation(X_train, y_train, "BloodPressure", medians_bp)
X_train = median_imputation(X_train, y_train, "SkinThickness", medians_skin)
X_train = median_imputation(X_train, y_train, "Insulin", medians_insulin)
X_train = median_imputation(X_train, y_train, "BMI", medians_bmi)

X_test = median_imputation(X_test, y_test, "Glucose", medians_glucose)
X_test = median_imputation(X_test, y_test, "BloodPressure", medians_bp)
X_test = median_imputation(X_test, y_test, "SkinThickness", medians_skin)
X_test = median_imputation(X_test, y_test, "Insulin", medians_insulin)
X_test = median_imputation(X_test, y_test, "BMI", medians_bmi)


In [76]:
class_counts = Counter(y_train)
scale_pos_weight = class_counts[0] / class_counts[1] #Calculating the class imbalance

xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss", max_depth=5, min_child_weight=1, gamma=0, scale_pos_weight=scale_pos_weight, random_state=42)
xgb.fit(X_train, y_train)
y_pred_prob = xgb.predict_proba(X_test)[:, 1]

new_threshold = 0.4  #Adjusting the classification threshold to achieve higher recall without harming accuracy too much
y_pred_xgb = (y_pred_prob > new_threshold).astype(int)

print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))


XGBoost Confusion Matrix:
 [[91  9]
 [ 8 46]]
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.91      0.91       100
           1       0.84      0.85      0.84        54

    accuracy                           0.89       154
   macro avg       0.88      0.88      0.88       154
weighted avg       0.89      0.89      0.89       154



Parameters: { "use_label_encoder" } are not used.

