In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import joblib

In [2]:
# Output directory for trained artifacts and location of the cleaned dataset
MODELS_PATH = "../models/model"
CLEAN_DATA_PATH = "../data/processed/diabetes_clean.csv"

# Create the artifact directory up front; exist_ok keeps this cell re-runnable.
os.makedirs(name=MODELS_PATH, exist_ok=True)

In [3]:
# Load the cleaned dataset
def load_clean_data(path=CLEAN_DATA_PATH):
    """Read the preprocessed dataset from a CSV file.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to ``CLEAN_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV as returned by ``pd.read_csv``.
    """
    clean_frame = pd.read_csv(path)
    # Confirmation message is only reached when the read succeeded.
    print("Dataset bersih berhasil dimuat.")
    return clean_frame


df = load_clean_data()

Dataset bersih berhasil dimuat.


In [4]:
# Separate features from the target, then split into train and test sets
y = df["Outcome"]
X = df.drop(columns="Outcome")

# Hold out 20% for testing; stratify on the target so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# Feature scaling
# Fit the scaler on the training split only, then apply the same fitted
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the models for reuse outside this notebook.
scaler_file = os.path.join(MODELS_PATH, "scaler.joblib")
joblib.dump(scaler, scaler_file)

['../models\\scaler.joblib']

In [6]:
# Model training functions
def train_decision_tree(X_train, y_train):
    """Fit a decision tree classifier with default hyperparameters.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier (seeded with ``random_state=42``).
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return DecisionTreeClassifier(random_state=42).fit(X_train, y_train)


def train_knn(X_train, y_train):
    """Fit a k-nearest-neighbours classifier with default hyperparameters.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    KNeighborsClassifier
        The fitted classifier.
    """
    # fit() returns the estimator itself, so it can be returned directly.
    return KNeighborsClassifier().fit(X_train, y_train)


def train_xgboost(X_train, y_train):
    """Fit an XGBoost classifier evaluated with log-loss.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    XGBClassifier
        The fitted classifier (seeded with ``random_state=42``).
    """
    booster_settings = {"eval_metric": "logloss", "random_state": 42}
    # fit() returns the estimator itself, so it can be returned directly.
    return XGBClassifier(**booster_settings).fit(X_train, y_train)

In [7]:
# Hyperparameter tuning (grid search)
def tune_models(X_train, y_train):
    """Grid-search each model family and return the best estimator of each.

    Three classifier families are tuned independently with 5-fold
    cross-validated ``GridSearchCV`` scored by F1:

    - Decision Tree (``max_depth``)
    - KNN (``n_neighbors``)
    - XGBoost (``n_estimators``, ``max_depth``, ``learning_rate``)

    No comparison is made across families: the result holds one winner per
    family, and choosing between them is left to the caller.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train : array-like
        Training labels, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        Maps ``"Decision Tree"``, ``"KNN"`` and ``"XGBoost"`` (in that order)
        to the ``best_estimator_`` found by the corresponding grid search.
    """
    models = {}
    print("\n=== Hyperparameter Tuning Running... ===")

    # (display name, base estimator, parameter grid) for every family to tune.
    # Every estimator that accepts a seed gets random_state=42, matching the
    # stand-alone train_* helpers, so repeated runs are comparable.
    search_space = [
        (
            "Decision Tree",
            DecisionTreeClassifier(random_state=42),
            {"max_depth": [3, 5, 7, None]},
        ),
        (
            "KNN",
            KNeighborsClassifier(),
            {"n_neighbors": [3, 5, 7, 9]},
        ),
        (
            "XGBoost",
            XGBClassifier(eval_metric="logloss", random_state=42),
            {
                "n_estimators": [100, 200],
                "max_depth": [3, 5],
                "learning_rate": [0.05, 0.1],
            },
        ),
    ]

    # One identical search per family; only the estimator and grid differ.
    for name, estimator, param_grid in search_space:
        search = GridSearchCV(estimator, param_grid, scoring="f1", cv=5)
        search.fit(X_train, y_train)
        models[name] = search.best_estimator_

    print("\nBest Models Selected.")
    return models

In [8]:
# Tune the models and save each winner to the models folder.
# Training uses the standardized features so the fitted models agree with the
# scaler persisted alongside them, and so KNN's distance computation is not
# dominated by large-valued columns. Inputs must therefore be transformed with
# the saved scaler.joblib before calling predict on any of these models.
models = tune_models(X_train_scaled, y_train)

for name, model in models.items():
    # Spaces in the display name become underscores in the file name,
    # e.g. "Decision Tree" -> "Decision_Tree.joblib".
    file_path = os.path.join(MODELS_PATH, f"{name.replace(' ', '_')}.joblib")
    joblib.dump(model, file_path)

print("\nSemua model berhasil disimpan di folder 'models'.")


=== Hyperparameter Tuning Running... ===

Best Models Selected.

Semua model berhasil disimpan di folder 'models'.
