# Diabetes Prediction

## Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import ConfusionMatrixDisplay
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('Data/diabetes.csv')

In [3]:
# Preview the first five rows to sanity-check the column layout.
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Show each column's dtype and non-null count for the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [5]:
# Summary statistics per column. Check the `min` row: a minimum of 0 in
# Glucose, BloodPressure, SkinThickness, Insulin or BMI indicates zeros that
# are treated as missing values further down.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# Count explicit NaN entries per column (zeros used as placeholders are not
# counted here).
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# Columns in which a stored 0 is treated as "value missing".
cols_with_zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Build a new frame (the raw `data` stays untouched) in which every 0 in the
# columns above is replaced by NaN; all other columns are carried over as-is.
df_pima_nan = data.assign(
    **{
        name: data[name].replace(0, np.nan)
        for name in cols_with_zero_as_missing
    }
)


## Split data

In [8]:
# Separate the target column from the feature matrix.
target_col = "Outcome"
y = df_pima_nan[target_col]
X = df_pima_nan.drop(target_col, axis=1)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [9]:
# Fit the KNN imputer on the training rows only, then fill the NaNs of both
# partitions from that fitted state so the test rows never influence it.
imputer = KNNImputer(n_neighbors=5).fit(X_train)
X_train_imp, X_test_imp = (imputer.transform(part) for part in (X_train, X_test))

In [10]:
# Min-max scale the imputed features. The scaler is fitted on the training
# rows only and then re-used for the test rows.
scaler = MinMaxScaler()

# Wrap the scaled arrays back into DataFrames. Besides the column names, the
# original row index is carried over: y_train / y_test still use the shuffled
# index produced by train_test_split, and a fresh RangeIndex here would make
# any index-aligned pandas operation between X and y silently misalign rows.
# Each name is bound exactly once, so the cell is safe to re-run.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_imp),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_imp),
    columns=X_test.columns,
    index=X_test.index,
)




In [11]:
# Rebalance the classes of the training partition only (the test partition is
# never resampled). SMOTETomek is seeded so the result is repeatable.
smote = SMOTETomek(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X=X_train_scaled, y=y_train)


In [12]:
# Gradient-boosted model whose feature importances drive the selection below.
xgb_selector_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    eval_metric="logloss",
    random_state=42,
)

# Fit the selector on the resampled training data; with threshold="median"
# the cut-off is the median of the fitted importances.
selector = SelectFromModel(estimator=xgb_selector_model, threshold="median").fit(
    X_train_res, y_train_res
)

# Apply the same fitted column mask to the resampled training features and to
# the (scaled, not resampled) test features.
X_train_sel, X_test_sel = map(selector.transform, (X_train_res, X_test_scaled))

# Report the feature count before and after selection.
print("Jumlah fitur awal:", X.shape[1])
print("Jumlah fitur terpilih:", X_train_sel.shape[1])

Jumlah fitur awal: 8
Jumlah fitur terpilih: 4


In [None]:
# Hyper-parameter search space for logistic regression: regularisation
# strength on a log scale, both penalties, and two solvers.
param_lr = dict(
    C=np.logspace(-3, 3, 20),
    penalty=["l1", "l2"],
    solver=["liblinear", "saga"],
    max_iter=[200, 300, 500],
)

lr = LogisticRegression(random_state=42)

# Randomised search: 30 sampled configurations, 5-fold CV, ranked by F1.
# NOTE(review): X_train_sel / y_train_res were resampled and feature-selected
# before this search, so every CV fold is scored on data that already went
# through those steps. The fold scores are therefore likely optimistic
# compared with the untouched test split.
lr_tuning = RandomizedSearchCV(
    lr,
    param_lr,
    n_iter=30,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
best_lr = lr_tuning.fit(X_train_sel, y_train_res).best_estimator_

print("\nBest LR Params:", lr_tuning.best_params_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best LR Params: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 500, 'C': np.float64(0.3359818286283781)}


In [14]:
# Predict on the held-out test features using the tuned model.
y_pred = best_lr.predict(X_test_sel)

# Print each test metric on its own line; labels are padded so the values
# line up in the output.
print("\n==== EVALUASI TEST ====")
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1-score :", f1_score),
):
    print(label, metric(y_test, y_pred))



==== EVALUASI TEST ====
Accuracy : 0.7012987012987013
Precision: 0.5606060606060606
Recall   : 0.6851851851851852
F1-score : 0.6166666666666667
