In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

print("--- Debugging XGBoost Early Stopping ---")

# Purpose of this cell: probe which early-stopping call style the installed
# XGBoost accepts, using one simple hold-out split and fixed parameters.
#
# NOTE(review): recent XGBoost releases document `early_stopping_rounds` and
# `callbacks` as estimator constructor arguments rather than fit() keywords;
# a TypeError in steps 4/5 would be consistent with that - confirm against
# the installed version.

# Make sure X and y exist; otherwise rebuild them from train_final / y_train
# and keep only the numeric feature columns.
if not ('X' in locals() and 'y' in locals()):
    print("Variabel X atau y belum didefinisikan. Jalankan ulang persiapan data.")
    X = train_final.copy()
    y = y_train.copy()
    non_numeric_cols = X.select_dtypes(exclude=np.number).columns
    if len(non_numeric_cols) > 0:
        X = X.drop(columns=non_numeric_cols)

# 1. One plain train/validation split (no K-Fold in this debug cell).
X_train_dbg, X_val_dbg, y_train_dbg, y_val_dbg = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Ukuran Train Debug: {X_train_dbg.shape}, Test Debug: {X_val_dbg.shape}")

# 2. Fixed XGBoost parameters (example values, not tuned here).
xgb_params_dbg = {
    # Task and metric
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # Example learning rate and tree depth
    'eta': 0.05,
    'max_depth': 6,
    # Row / column subsampling
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # Split constraints and regularisation
    'min_child_weight': 10,
    'gamma': 0.1,
    'lambda': 1,
    'alpha': 0.1,
    # Runtime settings
    'seed': 42,
    'nthread': -1,
    'tree_method': 'hist',
}
n_estimators_dbg = 1000  # upper bound on the number of trees
early_stopping_rounds_dbg = 50

# 3. Model used for the first probe.
model_dbg = xgb.XGBRegressor(n_estimators=n_estimators_dbg, **xgb_params_dbg)

# 4. Probe A: early_stopping_rounds passed directly to fit().
print("\nMencoba fit dengan early_stopping_rounds langsung...")
try:
    model_dbg.fit(
        X_train_dbg,
        y_train_dbg,
        eval_set=[(X_val_dbg, y_val_dbg)],
        early_stopping_rounds=early_stopping_rounds_dbg,
        verbose=True,  # show XGBoost's per-round evaluation output
    )
except TypeError as e:
    # An unexpected-keyword error from fit() lands here.
    print(f"Fit dengan early_stopping_rounds langsung GAGAL: {e}")
except Exception as e:
    print(f"Error lain saat fit (langsung): {e}")
else:
    print("Fit dengan early_stopping_rounds langsung BERHASIL.")


# 5. Probe B: a fresh model, early stopping supplied as a fit() callback.
print("\nMencoba fit dengan callbacks...")
from xgboost.callback import EarlyStopping
es_callback_dbg = EarlyStopping(
    rounds=early_stopping_rounds_dbg,
    save_best=False,  # kept off to keep the debug run simple
)
model_dbg_cb = xgb.XGBRegressor(n_estimators=n_estimators_dbg, **xgb_params_dbg)
try:
    model_dbg_cb.fit(
        X_train_dbg,
        y_train_dbg,
        eval_set=[(X_val_dbg, y_val_dbg)],
        callbacks=[es_callback_dbg],
        verbose=True,
    )
except TypeError as e:
    print(f"Fit dengan callbacks GAGAL: {e}")
except Exception as e:
    print(f"Error lain saat fit (callbacks): {e}")
else:
    print("Fit dengan callbacks BERHASIL.")

# 6. Probe C: no early stopping at all, to confirm the base model trains.
print("\nMencoba fit TANPA early stopping...")
# Fewer trees than the probes above, since nothing stops training early.
model_dbg_no_es = xgb.XGBRegressor(n_estimators=100, **xgb_params_dbg)
try:
    model_dbg_no_es.fit(X_train_dbg, y_train_dbg, verbose=False)
    print("Fit tanpa early stopping BERHASIL.")
    # Score the hold-out split by hand; a failure in prediction or scoring
    # is reported by the same handler as a failed fit.
    preds_no_es = model_dbg_no_es.predict(X_val_dbg)
    rmse_no_es = np.sqrt(mean_squared_error(y_val_dbg, preds_no_es))
    print(f"RMSE tanpa early stopping: {rmse_no_es:.6f}")
except Exception as e:
    print(f"Fit tanpa early stopping GAGAL: {e}")

In [None]:
# Instal jika belum: pip install xgboost
import xgboost as xgb
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import time
from xgboost.callback import EarlyStopping # Import callback XGBoost

print("\n--- Fase 4.3: Hyperparameter Tuning XGBoost (Pendekatan Callback) ---")

# --- Make sure the X / y data are available and clean ---
# If an earlier cell did not define them, rebuild them from train_final /
# y_train and drop every non-numeric feature column.
if not ('X' in locals() and 'y' in locals()):
    print("Variabel X atau y belum didefinisikan. Jalankan ulang persiapan data.")
    X = train_final.copy()
    y = y_train.copy()
    non_numeric_cols = X.select_dtypes(exclude=np.number).columns
    if len(non_numeric_cols) > 0:
        X = X.drop(columns=non_numeric_cols)

# Fungsi Objective untuk XGBoost (dengan perbaikan callback)
def objective_xgb(trial):
    """Optuna objective: mean K-fold validation RMSE of an XGBoost regressor.

    Samples one hyperparameter set from ``trial``, then trains an
    ``xgb.XGBRegressor`` on each of the ``N_SPLITS`` shuffled folds of the
    module-level ``X`` / ``y`` with early stopping on the held-out fold.
    Each fold's RMSE is reported to Optuna so the trial can be pruned.

    Early stopping is configured on the estimator constructor, not on
    ``fit()``: the ``fit(early_stopping_rounds=...)`` keyword was deprecated
    and later removed from XGBoost's scikit-learn wrapper, where it raises
    ``TypeError``. The constructor argument needs XGBoost >= 1.6.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate per-fold scores.

    Returns:
        The mean of the per-fold validation RMSE values.

    Raises:
        optuna.TrialPruned: If Optuna decides to prune after a fold.
    """
    xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'seed': RANDOM_SEED,
        'nthread': -1,
        'tree_method': 'hist'
    }
    # Upper bound on boosting rounds; early stopping may end training sooner.
    n_estimators_xgb = trial.suggest_int('n_estimators', 500, 3000, step=100)
    early_stopping_rounds_xgb = 50

    # Same seed for every trial, so fold i holds the same rows across trials.
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    fold_scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]

        # Early stopping belongs on the estimator itself (see docstring).
        model = xgb.XGBRegressor(
            **xgb_params,
            n_estimators=n_estimators_xgb,
            early_stopping_rounds=early_stopping_rounds_xgb,
        )

        # Only the held-out fold is passed as eval_set: it is the set early
        # stopping monitors, and evaluating the training fold every round
        # would only add cost.
        model.fit(
            X_train,
            y_train_fold,
            eval_set=[(X_val, y_val)],
            verbose=False,
        )

        val_preds = model.predict(X_val)
        fold_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        fold_scores.append(fold_rmse)

        # Report this fold's score as the intermediate value for step `fold`
        # and let Optuna abandon clearly unpromising trials early.
        trial.report(fold_rmse, fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(fold_scores)

# Optuna study for XGBoost, persisted in a local SQLite file.
N_TRIALS_XGB = 50  # number of trials (same as LGBM, or adjust)
STUDY_NAME_XGB = 'xgb_hydrostatic_tuning'

# Reuse the stored study when it exists; load_study signals a missing study
# with KeyError, in which case a new minimising study is created.
try:
    study_xgb = optuna.load_study(
        study_name=STUDY_NAME_XGB,
        storage='sqlite:///optuna_studies.db',
    )
except KeyError:
    print(f"Membuat studi Optuna XGBoost baru: '{STUDY_NAME_XGB}'")
    study_xgb = optuna.create_study(
        storage='sqlite:///optuna_studies.db',
        study_name=STUDY_NAME_XGB,
        direction='minimize',
        load_if_exists=True,
    )
else:
    print(f"Memuat studi Optuna XGBoost yang sudah ada: '{STUDY_NAME_XGB}'")

# Run the XGBoost optimisation; stops after N_TRIALS_XGB trials or once the
# 600-second (10-minute) timeout is reached.
print(f"Memulai optimasi XGBoost dengan {N_TRIALS_XGB} trials...")
study_xgb.optimize(objective_xgb, timeout=600, n_trials=N_TRIALS_XGB)

# Show the best XGBoost result.
print("\n--- Hasil Tuning Optuna XGBoost ---")
# len(study.trials) counts every trial recorded in the study.
print(f"Jumlah trials selesai: {len(study_xgb.trials)}")
best_trial_xgb = study_xgb.best_trial
print("Trial XGBoost terbaik:")
print(f"  Value (Min RMSE): {best_trial_xgb.value:.6f}")
print("  Params: ")
for param_name, param_value in best_trial_xgb.params.items():
    print(f"    {param_name}: {param_value}")

# Store the best XGBoost parameters: fixed base settings that were not
# tuned come first, then the tuned values from the best trial are layered
# on top (tuned values win on any shared key).
best_xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': RANDOM_SEED,
    'nthread': -1,
    'tree_method': 'hist',
    **best_trial_xgb.params,
}

print("\nParameter XGBoost terbaik ditemukan.")

# Compare with LGBM. `best_trial` is not defined in this cell; it is expected
# to be the best trial of the earlier LGBM study (the original note records
# 0.000471 for it).
best_lgbm_rmse = best_trial.value
best_xgb_rmse = best_trial_xgb.value

print("\nPerbandingan Skor CV Terbaik:")
print(f"LGBM: {best_lgbm_rmse:.6f}")
print(f"XGBoost: {best_xgb_rmse:.6f}")

# XGBoost counts as competitive when its CV RMSE is below the LGBM score
# plus a 0.00001 tolerance, i.e. better than or very close to LGBM.
proceed_to_ensemble = False
xgb_is_competitive = best_xgb_rmse < best_lgbm_rmse + 0.00001
if not xgb_is_competitive:
    print("\nXGBoost tidak sekompetitif LGBM pada CV. Ensembling mungkin tidak membantu.")
else:
    print("\nXGBoost kompetitif! Melanjutkan ke pembuatan prediksi untuk ensembling.")
    proceed_to_ensemble = True