# Diabetes Prediction

## Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import ConfusionMatrixDisplay
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.impute import KNNImputer

In [None]:
# Load the diabetes dataset from the project-relative CSV file.
data = pd.read_csv(filepath_or_buffer='Data/diabetes.csv')

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Print a concise summary of the raw frame (columns, non-null counts, dtypes).
data.info()

In [None]:
# Show descriptive statistics for the columns of the raw frame.
data.describe()

In [None]:
# Count the explicitly missing (NaN/None) entries in each column.
data.isna().sum()

In [None]:
# Columns in which this notebook treats a zero as a missing reading.
cols_with_zero_as_missing = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
]

# Work on a copy so the raw `data` frame stays untouched, then swap every
# zero in the listed columns for NaN in a single vectorized call.
df_pima_nan = data.copy()
df_pima_nan[cols_with_zero_as_missing] = (
    df_pima_nan[cols_with_zero_as_missing].replace(0, np.nan)
)

## Split data

In [None]:
# Separate the target column from the feature columns.
y = df_pima_nan["Outcome"]
X = df_pima_nan.drop("Outcome", axis=1)



In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Fill the NaN entries of the training features with a 5-nearest-neighbour
# imputer that is fitted on the training split.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(X_train)

# Write the imputed values into a copy so X_train itself keeps its NaNs.
X_train2 = X_train.copy()
X_train2[X_train.columns] = imputed_values

print(X_train2.head())

In [None]:
# Min-max scale the imputed training features (scaler fitted on that data).
scaler = MinMaxScaler()

# fit_transform yields a NumPy array, so wrap it straight back into a
# DataFrame carrying the original column names. No index is passed, so the
# resulting frame gets a fresh default index.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_train2),
    columns=X_train2.columns,
)


In [None]:
# Rebalance the scaled training set with SMOTE over-sampling combined with
# Tomek-link cleaning; the seed keeps the synthetic samples reproducible.
smote_tomek = SMOTETomek(random_state=42)

# fit_resample requires the labels as well as the features and returns the
# resampled (X, y) pair. The previous call passed only X, which fails because
# `y` is a required argument. X_scaled and y_train are matched by row
# position: both keep the row order of the training split.
X_resampled, y_resampled = smote_tomek.fit_resample(X_scaled, y_train)

In [None]:
# Hyper-parameter candidates sampled by the randomized search.
param_rf = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[3, 5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

rf = RandomForestClassifier(random_state=42)

# 20 sampled configurations, each scored by F1 under 10-fold
# cross-validation, with n_jobs=-1 to run the fits in parallel.
rf_tuning = RandomizedSearchCV(
    rf,
    param_rf,
    n_iter=20,
    scoring='f1',
    cv=10,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# NOTE(review): the search is fitted on X_train / y_train, not on the
# imputed (X_train2), scaled (X_scaled) or resampled frames built in the
# earlier cells - confirm that this is intended.
rf_tuning.fit(X_train, y_train)
best_rf = rf_tuning.best_estimator_
print("Best RF Params:", rf_tuning.best_params_)

In [None]:
# Models to evaluate, keyed by the label used in the printed report.
models = {"Random Forest": best_rf}

for name, model in models.items():
    # Shuffled 5-fold split over the full feature frame, with a fixed seed.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accs, pres, recs, f1s = [], [], [], []

    for train_idx, test_idx in kf.split(X):
        # NOTE: these assignments rebind the notebook-level X_train / X_test /
        # y_train / y_test names to the current fold, replacing the earlier
        # hold-out split; after the loop they refer to the last fold.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # Refit the model on this fold's training rows and predict the rest.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Record each metric for this fold, in a fixed order.
        for scores, metric in (
            (accs, accuracy_score),
            (pres, precision_score),
            (recs, recall_score),
            (f1s, f1_score),
        ):
            scores.append(metric(y_test, y_pred))

    # Report the mean of every metric across the folds.
    print(f"\n===== {name} =====")
    for label, scores in (
        ("Accuracy :", accs),
        ("Precision:", pres),
        ("Recall   :", recs),
        ("F1-score :", f1s),
    ):
        print(label, np.mean(scores))

## Evaluation & Visualization

In [None]:
# Plot the confusion matrix of `model` on X_test / y_test. All three names
# hold whatever the cross-validation loop bound last, i.e. the final model
# refit and the final fold's test rows, not the initial hold-out split.
ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test)
plt.show()