In [21]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [22]:
# --- Project-relative locations used throughout the notebook ---
RAW_DATA_PATH = "../data/raw/diabetes.csv"  # raw input dataset
PROCESSED_PATH = "../data/processed"        # preprocessed dataset output folder
MODELS_PATH = "../models"                   # fitted model / scaler / imputer artifacts; adjust to the project layout

# Create the output folders up front; exist_ok keeps the cell safe to re-run.
for _output_dir in (PROCESSED_PATH, MODELS_PATH):
    os.makedirs(_output_dir, exist_ok=True)

In [23]:
# Load the raw dataset
def load_data(path=RAW_DATA_PATH):
    """
    Read the CSV file at `path` into a pandas DataFrame and return it.

    A confirmation message is printed after the file has been read.
    """
    raw_frame = pd.read_csv(path)
    print("Dataset berhasil dimuat dari folder raw.")
    return raw_frame

df_raw = load_data()

Dataset berhasil dimuat dari folder raw.


In [24]:
def preprocess_data(df, test_size=0.2, random_state=42, models_path=None):
    """
    Split, impute and standardize the dataset, fitting every transformer on
    the training split only.

    Steps:
      1. Zeros in the columns listed in `zero_cols` are replaced by NaN
         (a zero in those columns is treated as a missing value).
      2. The data is split into train/test sets, stratified on "Outcome".
      3. A median `SimpleImputer` and a `StandardScaler` are fitted on the
         training features and applied to the train, test and full feature sets.
      4. The fitted scaler and imputer are saved with joblib as
         "scaler.joblib" and "imputer.joblib" inside `models_path`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the `zero_cols` columns and an "Outcome" column.
        It is not modified; a copy is processed.
    test_size : float, default 0.2
        Forwarded to `train_test_split`.
    random_state : int, default 42
        Forwarded to `train_test_split`.
    models_path : str or None, default None
        Directory for the saved scaler/imputer. When None, the notebook-level
        `MODELS_PATH` is used. The directory is created if it does not exist.

    Returns
    -------
    tuple
        (X_train_df, X_test_df, y_train, y_test, X_df, scaler) where the three
        feature frames are scaled DataFrames that keep the row index of the
        rows they were built from, so X_train_df / X_test_df stay aligned with
        y_train / y_test, and X_df stays aligned with `df`.

    Raises
    ------
    KeyError
        If any required column is missing from `df`.
    """
    # Columns in which a value of 0 is not allowed (treated as missing)
    zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

    # Fail early with a clear message instead of a KeyError deep inside pandas
    missing_cols = [col for col in zero_cols + ["Outcome"] if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Missing required column(s): {missing_cols}")

    df_proc = df.copy()
    df_proc[zero_cols] = df_proc[zero_cols].replace(0, np.nan)

    # Separate features and target
    X = df_proc.drop("Outcome", axis=1)
    y = df_proc["Outcome"]

    # Split first (stratified) so the transformers never see the test rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    # Imputer is fitted on TRAIN only
    imputer = SimpleImputer(strategy="median")
    X_train_imp = imputer.fit_transform(X_train)
    X_test_imp  = imputer.transform(X_test)

    # StandardScaler is fitted on TRAIN only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imp)
    X_test_scaled  = scaler.transform(X_test_imp)

    # Convert back to DataFrames. The transformers return plain arrays, so the
    # original row index must be restored explicitly; otherwise the frames get
    # a fresh 0..n-1 index that no longer matches y_train / y_test.
    X_train_df = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
    X_test_df  = pd.DataFrame(X_test_scaled,  columns=X.columns, index=X_test.index)

    # Full dataset transformed with the train-fitted imputer/scaler (for saving)
    X_full_imp = imputer.transform(X)
    X_full_scaled = scaler.transform(X_full_imp)
    X_df = pd.DataFrame(X_full_scaled, columns=X.columns, index=X.index)

    # Save the fitted scaler & imputer
    if models_path is None:
        models_path = MODELS_PATH
    os.makedirs(models_path, exist_ok=True)
    joblib.dump(scaler, os.path.join(models_path, "scaler.joblib"))
    joblib.dump(imputer, os.path.join(models_path, "imputer.joblib"))

    print("Preprocessing selesai tanpa data leakage.")
    return X_train_df, X_test_df, y_train, y_test, X_df, scaler

# Run preprocessing
X_train, X_test, y_train, y_test, X_df, scaler = preprocess_data(df_raw)

Preprocessing selesai tanpa data leakage.


In [25]:
# Save the cleaned dataset
# Attach the target column to the scaled features; the target is given a fresh
# 0..n-1 index before pandas aligns it with the feature frame.
df_clean = X_df.assign(Outcome=df_raw["Outcome"].reset_index(drop=True))

output_file = os.path.join(PROCESSED_PATH, "diabetes_clean.csv")
df_clean.to_csv(output_file, index=False)

print("Dataset sudah dibersihkan dan disimpan di folder processed.")

Dataset sudah dibersihkan dan disimpan di folder processed.
