In [None]:
# ================================
# 1. IMPORT LIBRARY
# ================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# ================================
# 2. LOAD DATASET
# ================================
# Read the dataset from heart.csv in the current working directory.
df = pd.read_csv("heart.csv")

# Show the heading, the column index and two trailing blank lines in a single
# call so the available column names can be checked by eye.
print("Kolom yang tersedia:", df.columns, "\n", sep="\n")


# ================================
# 3. AUTOMATICALLY DETECT THE TARGET COLUMN
# ================================
# Label-column names this script recognises.
possible_targets = ['target', 'HeartDisease', 'output', 'heart_disease']

# Take the first dataset column (in column order) whose name is one of the
# candidates above; None means no candidate is present.
target_col = next(
    (name for name in df.columns if name in possible_targets),
    None,
)

# Stop right away when the dataset has no recognisable target column.
if target_col is None:
    raise ValueError("❌ Tidak ditemukan kolom target. Cek nama kolom dataset kamu!")

print("Kolom target terdeteksi:", target_col, "\n")


# ================================
# 4. SEPARATE FEATURES AND TARGET
# ================================
# y is the detected label column; X is every remaining column of df.
y = df[target_col]
X = df.drop(target_col, axis=1)


# ================================
# 5. IDENTIFY NUMERIC & CATEGORICAL FEATURES
# ================================
# Numeric features are all columns with a numeric dtype, not only int64 and
# float64, so other widths such as int32 or float32 are picked up as well.
numeric_features = X.select_dtypes(include='number').columns

# Every remaining column (object, category, bool, string, ...) is treated as
# categorical.  Taking the complement guarantees that each feature lands in
# exactly one of the two groups; a column listed in neither group would be
# discarded by ColumnTransformer, whose `remainder` option defaults to 'drop'.
categorical_features = X.select_dtypes(exclude='number').columns

print("Fitur numerik:", list(numeric_features))
print("Fitur kategorik:", list(categorical_features), "\n")


# ================================
# 6. PREPROCESSING PIPELINE
# ================================
# Numeric columns: fill missing values with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored at
# transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]

numeric_transformer = Pipeline(numeric_steps)
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns to its own pipeline.  The names 'num' and 'cat'
# (and the step name 'onehot') are looked up again when the encoded column
# names are recovered, so they must stay as they are.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)


# ================================
# 7. SPLIT DATA INTO TRAIN & TEST
# ================================
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
# NOTE(review): the split is not stratified on y — consider `stratify=y` if
# class balance between the two parts matters.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ================================
# 8. FIT-TRANSFORM THE TRAIN DATA + TRANSFORM THE TEST DATA
# ================================
# The preprocessor is fitted on the training rows only; the test rows are
# transformed with those already-fitted parameters.
X_train_clean = preprocessor.fit_transform(X_train)
X_test_clean = preprocessor.transform(X_test)

print("Preprocessing selesai!")
# Report the shape of each transformed matrix.
for shape_label, transformed in (
    ("Shape X_train_clean:", X_train_clean),
    ("Shape X_test_clean:", X_test_clean),
):
    print(shape_label, transformed.shape)


Kolom yang tersedia:
Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')


Kolom target terdeteksi: HeartDisease 

Fitur numerik: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']
Fitur kategorik: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'] 

Preprocessing selesai!
Shape X_train_clean: (734, 20)
Shape X_test_clean: (184, 20)


In [None]:
# ============================================
# 9. SAVE THE PREPROCESSED FEATURES TO CSV FILES
# ============================================
import numpy as np
from scipy import sparse

# Recover the column names produced by the fitted OneHotEncoder.  When there
# are no categorical columns the 'cat' pipeline had nothing to fit, so asking
# it for feature names would fail; fall back to an empty array in that case.
if len(categorical_features) > 0:
    encoded_cols = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
else:
    encoded_cols = np.array([], dtype=object)

# ColumnTransformer emits its outputs in transformer order ('num', then
# 'cat'), so the final column names are numeric names followed by the
# one-hot encoded names.
all_columns = list(numeric_features) + list(encoded_cols)

# ColumnTransformer may return a scipy sparse matrix when the one-hot block
# makes the result sparse enough; densify into temporaries so the DataFrame
# constructor receives a regular 2-D array.  X_train_clean / X_test_clean
# themselves are left untouched for any later use.
train_values = X_train_clean.toarray() if sparse.issparse(X_train_clean) else X_train_clean
test_values = X_test_clean.toarray() if sparse.issparse(X_test_clean) else X_test_clean

# Wrap the transformed matrices in DataFrames with readable column names.
X_train_df = pd.DataFrame(train_values, columns=all_columns)
X_test_df = pd.DataFrame(test_values, columns=all_columns)

# Write both feature tables to CSV without the row index.
# NOTE(review): y_train / y_test are not written here — confirm the labels
# are saved elsewhere if these files are meant to be used for training.
X_train_df.to_csv("X_train_clean.csv", index=False)
X_test_df.to_csv("X_test_clean.csv", index=False)

print("File CSV berhasil dibuat:")
print("• X_train_clean.csv")
print("• X_test_clean.csv")


File CSV berhasil dibuat:
• X_train_clean.csv
• X_test_clean.csv
