In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [7]:
# Raw dataset location, given relative to the notebook's working directory
RAW_DATA_PATH = "../data/raw/diabetes.csv"

# Output folder for the preprocessed dataset; created here if it does not exist yet
PROCESSED_PATH = "../data/processed"
os.makedirs(PROCESSED_PATH, exist_ok=True)

In [8]:
# Load Dataset
def load_data(path=RAW_DATA_PATH):
    """Read the diabetes dataset CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to RAW_DATA_PATH.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    raw_frame = pd.read_csv(path)
    # The confirmation is only printed once the file has been read successfully.
    print("Dataset berhasil dimuat dari folder raw.")
    return raw_frame

# Load the raw dataset once; later cells derive new frames from df_raw.
df_raw = load_data(path=RAW_DATA_PATH)

Dataset berhasil dimuat dari folder raw.


In [9]:
def preprocess_data(df, test_size=0.2, random_state=42):
    """
    Clean, split, impute and standardise the diabetes dataset.

    Steps:
    - Zeros in the medical measurement columns are replaced with NaN
      (treated as missing values).
    - Rows are split into stratified train/test subsets *before* any
      statistics are learned.
    - Median imputation and standard scaling are fitted on the training
      rows only, then applied to the test rows and to the full feature
      table, so no test-set information leaks into the learned medians,
      means or standard deviations.

    Args:
        df: DataFrame holding the feature columns plus an "Outcome" target
            column. It is copied first and never modified.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split`` (default 42).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X_full_df, scaler)``:
        - X_train, X_test: imputed and scaled feature DataFrames.
        - y_train, y_test: matching "Outcome" Series.
        - X_full_df: every row, transformed with the train-fitted imputer
          and scaler, in the input row order with a fresh 0..n-1 index.
        - scaler: the StandardScaler fitted on the training rows.
    """
    df_proc = df.copy()

    # Medical columns where 0 is not a valid measurement -> treat as missing
    zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
    df_proc[zero_cols] = df_proc[zero_cols].replace(0, np.nan)

    # Use positional row labels for both features and target so the X and y
    # outputs share the same index regardless of the index on the input frame.
    df_proc = df_proc.reset_index(drop=True)

    # Separate features and target
    X = df_proc.drop("Outcome", axis=1)
    y = df_proc["Outcome"]

    # Stratified train/test split, done before fitting any transformer
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Learn medians and scaling statistics from the training rows only
    imputer = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    scaler.fit(imputer.fit_transform(X_train_raw))

    def _transform(frame):
        """Apply the train-fitted imputer and scaler, keeping row labels."""
        scaled = scaler.transform(imputer.transform(frame))
        # Back to a DataFrame so it stays compatible with SHAP / other analysis
        return pd.DataFrame(scaled, columns=X.columns, index=frame.index)

    X_train = _transform(X_train_raw)
    X_test = _transform(X_test_raw)
    X_df = _transform(X)

    return X_train, X_test, y_train, y_test, X_df, scaler

# Run the preprocessing pipeline on the raw dataset
(X_train, X_test, y_train, y_test, X_full_df, scaler) = preprocess_data(df_raw)

In [10]:
# Save the preprocessed dataset: processed features plus the original target
# column. The target's index is reset so it lines up with X_full_df row by row.
df_clean = X_full_df.assign(Outcome=df_raw["Outcome"].reset_index(drop=True))

output_file = os.path.join(PROCESSED_PATH, "diabetes_clean.csv")
df_clean.to_csv(output_file, index=False)

print("Dataset sudah dibersihkan dan disimpan di folder processed.")

Dataset sudah dibersihkan dan disimpan di folder processed.
