In [None]:
# -*- coding: utf-8 -*-
"""
Kode Lengkap - Fase 4.3 (Retry): Tuning XGBoost dengan Early Stopping Langsung

Asumsi:
- Library (xgboost 3.0.0+, optuna, pandas, numpy, scikit-learn) terinstal.
- Kernel sudah di-restart.
- Variabel X, y, X_test, N_SPLITS, RANDOM_SEED,
  submission_id_col, submission_target_col, test_ids sudah didefinisikan.
- Variabel best_trial (dari LGBM) ada untuk perbandingan (opsional).
"""

import xgboost as xgb
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import time
import warnings

warnings.filterwarnings('ignore', category=UserWarning) # Silence UserWarning-category warnings (e.g. some emitted by Optuna/XGBoost)

print("\n--- Fase 4.3 (Retry): Hyperparameter Tuning XGBoost ---")
print(f"Menggunakan XGBoost versi: {xgb.__version__}") # Echo the installed version so the run log records it

# --- Make sure the X / y training data is ready ---
# Two cases are handled:
#   1. X or y is missing from the namespace -> try to rebuild X, y and X_test from
#      train_final / y_train / test_final (assumed to come from an earlier cell).
#   2. X exists but still has non-numeric columns -> drop them from X, and drop
#      the same columns from X_test where present.
# NOTE(review): only X and y are checked; X_test is assumed to exist whenever X does.
if 'X' not in locals() or 'y' not in locals():
    print("ERROR: Variabel X atau y belum didefinisikan. Harap jalankan sel persiapan data sebelumnya.")
    # Reload the variables from the prepared frames
    try:
        X = train_final.copy()
        y = y_train.copy()
        X_test = test_final.copy()
        # Keep only numeric features; mirror the drop on the test frame for the
        # columns that actually exist there.
        non_numeric_cols = X.select_dtypes(exclude=np.number).columns
        if not non_numeric_cols.empty:
            X = X.drop(columns=non_numeric_cols)
            cols_to_drop_in_test = [col for col in non_numeric_cols if col in X_test.columns]
            if cols_to_drop_in_test: X_test = X_test.drop(columns=cols_to_drop_in_test)
        print("Data X dan y siap.")
    except NameError:
        # One of the source frames is not defined either: nothing to train on.
        print("Gagal memuat ulang X/y. Hentikan.")
        exit()
elif X.select_dtypes(exclude=np.number).shape[1] > 0:
    # X is defined but still carries non-numeric columns: clean it in place.
    print("ERROR: Masih ada kolom non-numerik di X. Membersihkan...")
    non_numeric_cols = X.select_dtypes(exclude=np.number).columns
    X = X.drop(columns=non_numeric_cols)
    cols_to_drop_in_test = [col for col in non_numeric_cols if col in X_test.columns]
    if cols_to_drop_in_test: X_test = X_test.drop(columns=cols_to_drop_in_test)
    print("Kolom non-numerik dihapus.")


# --- Fungsi Objective untuk Optuna (Menggunakan early_stopping_rounds langsung) ---
def objective_xgb_direct(trial):
    """Optuna objective: mean K-fold validation RMSE of an XGBoost regressor.

    Samples a hyperparameter set from ``trial``, trains one
    ``xgb.XGBRegressor`` per fold with early stopping on that fold's
    validation split, and returns the average of the per-fold RMSE values.

    Reads the module-level names ``X``, ``y``, ``N_SPLITS`` and
    ``RANDOM_SEED``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters and to
            report per-fold intermediate values for pruning.

    Returns:
        The mean validation RMSE across all folds.

    Raises:
        optuna.TrialPruned: When the pruner decides to stop after a fold.
        Exception: Any error raised by ``fit`` is printed and re-raised.
    """
    # Hyperparameter search space
    xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'seed': RANDOM_SEED,
        'nthread': -1,
        'tree_method': 'hist'
    }
    n_estimators_xgb = trial.suggest_int('n_estimators', 500, 3000, step=100)
    early_stopping_rounds_xgb = 50

    # K-fold setup
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    fold_scores = []

    # Cross-validation loop
    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]

        # early_stopping_rounds must be given to the constructor: it was
        # deprecated as a fit() argument in XGBoost 1.6 and later removed, so
        # passing it to fit() raises TypeError on the XGBoost 3.x this
        # notebook targets.
        model = xgb.XGBRegressor(
            **xgb_params,
            n_estimators=n_estimators_xgb,
            early_stopping_rounds=early_stopping_rounds_xgb,
        )

        try:
            model.fit(X_train, y_train_fold,
                      eval_set=[(X_val, y_val)],  # Monitored for early stopping
                      verbose=False)              # Silence the per-round training log
        except Exception as e:
            print(f"Error tidak terduga saat fit di fold {fold+1}: {e}")
            raise

        # NOTE: the same validation split drives early stopping and scoring,
        # so the fold RMSE is slightly optimistic.
        val_preds = model.predict(X_val)
        fold_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        fold_scores.append(fold_rmse)

        # Report the fold score so the Optuna pruner can stop weak trials early
        trial.report(fold_rmse, fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Average RMSE over all folds is the value Optuna minimizes
    return np.mean(fold_scores)

# --- Optuna study configuration and execution ---
N_TRIALS_XGB = 50  # Total trial budget for the study
# Fresh study name so no state from earlier attempts is picked up
STUDY_NAME_XGB = 'xgb_hydrostatic_tuning_direct_v4'
DB_FILENAME = 'optuna_studies.db'

# Load the study if it already exists in the SQLite storage, otherwise create it.
# optuna.load_study raises KeyError when the study name is unknown.
try:
    study_xgb = optuna.load_study(study_name=STUDY_NAME_XGB, storage=f'sqlite:///{DB_FILENAME}')
except KeyError:
    print(f"Membuat studi Optuna XGBoost baru '{STUDY_NAME_XGB}' di {DB_FILENAME}")
    study_xgb = optuna.create_study(direction='minimize',
                                    study_name=STUDY_NAME_XGB,
                                    storage=f'sqlite:///{DB_FILENAME}',
                                    load_if_exists=True)
    N_TRIALS_TO_RUN = N_TRIALS_XGB
else:
    print(f"Memuat studi Optuna XGBoost '{STUDY_NAME_XGB}' dari {DB_FILENAME}")
    # Only trials that were actually evaluated (COMPLETE or PRUNED) count against
    # the budget. FAIL / RUNNING / WAITING trials left behind by a crashed or
    # interrupted run must not consume it.
    remaining_trials = N_TRIALS_XGB - len(study_xgb.get_trials(
        deepcopy=False,
        states=(optuna.trial.TrialState.COMPLETE, optuna.trial.TrialState.PRUNED),
    ))
    if remaining_trials <= 0:
        N_TRIALS_TO_RUN = 0
        print("Jumlah trials sudah tercapai.")
    else:
        N_TRIALS_TO_RUN = remaining_trials
        print(f"Akan menjalankan {remaining_trials} trial tambahan.")

# Run the optimization
if N_TRIALS_TO_RUN > 0:
    print(f"\nMemulai optimasi XGBoost dengan {N_TRIALS_TO_RUN} trials...")
    start_opt_time = time.time()
    # Stops at n_trials or after 600 seconds, whichever comes first
    study_xgb.optimize(objective_xgb_direct, n_trials=N_TRIALS_TO_RUN, timeout=600)
    end_opt_time = time.time()
    print(f"Optimasi selesai dalam {end_opt_time - start_opt_time:.2f} detik.")
else:
    print("\nTidak ada trial baru yang dijalankan.")

# --- Report the best result ---
# Outputs for later cells: best_xgb_params (dict or None) and proceed_to_ensemble (bool).
print("\n--- Hasil Tuning Optuna XGBoost ---")
if len(study_xgb.trials) > 0:
    completed_trials = [t for t in study_xgb.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if completed_trials:
        print(f"Jumlah trials selesai total: {len(study_xgb.trials)}")
        print(f"Jumlah trials COMPLETE: {len(completed_trials)}")
        best_trial_xgb = study_xgb.best_trial
        print("Trial XGBoost terbaik:")
        print(f"  Value (Min RMSE): {best_trial_xgb.value:.6f}")
        print("  Params: ")
        for key, value in best_trial_xgb.params.items():
            print(f"    {key}: {value}")

        # Fixed settings plus the tuned values (the tuned set includes 'n_estimators')
        best_xgb_params = {
            'objective': 'reg:squarederror', 'eval_metric': 'rmse',
            'seed': RANDOM_SEED, 'nthread': -1, 'tree_method': 'hist'
        }
        best_xgb_params.update(best_trial_xgb.params)
        print("\nParameter XGBoost terbaik ditemukan ('best_xgb_params').")

        # --- Comparison with LGBM ---
        # best_trial is the optional LGBM result from an earlier cell; when it is
        # missing or unusable the comparison is skipped and the ensemble proceeds.
        try:
            best_lgbm_rmse = best_trial.value
            best_xgb_rmse = best_trial_xgb.value
            print("\nPerbandingan Skor CV Terbaik:")
            print(f"LGBM: {best_lgbm_rmse:.6f}")
            print(f"XGBoost: {best_xgb_rmse:.6f}")
            # XGBoost is "competitive" when it is at most 1e-5 worse than LGBM
            proceed_to_ensemble = (best_xgb_rmse <= best_lgbm_rmse + 0.00001)
            if proceed_to_ensemble:
                print("\nXGBoost kompetitif atau lebih baik!")
            else:
                print("\nXGBoost tidak lebih baik dari LGBM.")
        except NameError:
            print("\nVariabel 'best_trial' (LGBM) tidak ditemukan untuk perbandingan.")
            proceed_to_ensemble = True  # Assume we continue when only XGBoost was tuned
        except Exception as e:
            # Deliberate best-effort: a failed comparison must not block the ensemble
            print(f"Error saat membandingkan skor: {e}")
            proceed_to_ensemble = True
    else:
        print("\nTidak ada trial Optuna yang berhasil diselesaikan (COMPLETE).")
        best_xgb_params = None
        proceed_to_ensemble = False
else:
    print("\nTidak ada trial Optuna yang pernah dijalankan untuk studi ini.")
    best_xgb_params = None
    proceed_to_ensemble = False

# --- Kode Selanjutnya (Ensemble/Submission) ---
# (Tambahkan kode ensemble atau submission di sini jika proceed_to_ensemble=True dan best_xgb_params ada)
# ...

In [None]:
# Phase 4.4: train LGBM and XGBoost on identical K-fold splits, collect
# out-of-fold (OOF) and fold-averaged test predictions, and write an ensemble
# submission only when the simple average beats both single models on OOF RMSE.
# NOTE(review): `lgb` and `best_lgbm_params` are not defined in this cell; they
# are assumed to come from the earlier LGBM tuning cell.
if proceed_to_ensemble:
    print("\n--- Fase 4.4: Menghasilkan Prediksi OOF & Test untuk Ensemble ---")

    # Work on copies so the tuned parameter dicts stay untouched
    final_lgbm_params = best_lgbm_params.copy()  # From the earlier LGBM tuning
    final_xgb_params = best_xgb_params.copy()    # From the XGBoost tuning above

    # Prediction buffers: one OOF slot per training row, one test slot per test row
    oof_lgbm = np.zeros(X.shape[0])
    oof_xgb = np.zeros(X.shape[0])
    test_lgbm = np.zeros(X_test.shape[0]) if not X_test.empty else np.array([])
    test_xgb = np.zeros(X_test.shape[0]) if not X_test.empty else np.array([])
    fold_scores_lgbm_ens = []
    fold_scores_xgb_ens = []

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    early_stopping_rounds_lgbm = 50
    early_stopping_rounds_xgb = 50

    print(f"Menjalankan {N_SPLITS}-Fold CV untuk kedua model...")
    start_time_ens = time.time()

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        print(f"--- Fold {fold+1}/{N_SPLITS} ---")
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]

        # --- Train LGBM ---
        model_lgbm = lgb.LGBMRegressor(**final_lgbm_params)
        model_lgbm.fit(X_train, y_train_fold,
                       eval_set=[(X_val, y_val)],
                       eval_metric='rmse',
                       callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds_lgbm, verbose=False)])
        oof_lgbm[val_index] = model_lgbm.predict(X_val)
        fold_scores_lgbm_ens.append(np.sqrt(mean_squared_error(y_val, oof_lgbm[val_index])))
        if not X_test.empty:
            # Each fold contributes 1/N_SPLITS, so the sum is the fold average
            test_lgbm += model_lgbm.predict(X_test) / N_SPLITS
        print(f"  LGBM Fold RMSE: {fold_scores_lgbm_ens[-1]:.6f}")

        # --- Train XGBoost ---
        # n_estimators is already inside final_xgb_params. early_stopping_rounds
        # goes to the constructor: it was deprecated as a fit() argument in
        # XGBoost 1.6 and later removed, so fit(early_stopping_rounds=...)
        # raises TypeError on XGBoost 3.x.
        model_xgb = xgb.XGBRegressor(**final_xgb_params,
                                     early_stopping_rounds=early_stopping_rounds_xgb)
        model_xgb.fit(X_train, y_train_fold,
                      eval_set=[(X_val, y_val)],
                      verbose=False)
        oof_xgb[val_index] = model_xgb.predict(X_val)
        fold_scores_xgb_ens.append(np.sqrt(mean_squared_error(y_val, oof_xgb[val_index])))
        if not X_test.empty:
            test_xgb += model_xgb.predict(X_test) / N_SPLITS
        print(f"  XGBoost Fold RMSE: {fold_scores_xgb_ens[-1]:.6f}")

    total_time_ens = time.time() - start_time_ens
    print(f"\nPelatihan ensemble selesai dalam {total_time_ens:.2f} detik.")

    # --- Evaluate the ensemble (simple average) ---
    print("\n--- Evaluasi Ensemble (Simple Average) ---")
    oof_ensemble = (oof_lgbm + oof_xgb) / 2
    rmse_oof_lgbm = np.sqrt(mean_squared_error(y, oof_lgbm))
    rmse_oof_xgb = np.sqrt(mean_squared_error(y, oof_xgb))
    rmse_oof_ensemble = np.sqrt(mean_squared_error(y, oof_ensemble))

    print(f"RMSE OOF LGBM:     {rmse_oof_lgbm:.6f}")
    print(f"RMSE OOF XGBoost:  {rmse_oof_xgb:.6f}")
    print(f"RMSE OOF Ensemble: {rmse_oof_ensemble:.6f}")

    # Submit the ensemble only when it strictly beats both single models on OOF RMSE
    if rmse_oof_ensemble < min(rmse_oof_lgbm, rmse_oof_xgb):
        print("\nEnsemble memberikan peningkatan pada skor OOF! Membuat submission ensemble.")
        # Test-set ensemble prediction: same simple average as for OOF.
        # NOTE: if the target cannot be negative, clip test_ensemble at 0 here.
        test_ensemble = (test_lgbm + test_xgb) / 2

        # Build the submission frame
        submission_ensemble_df = pd.DataFrame({
            submission_id_col: test_ids,
            submission_target_col: test_ensemble
        })

        # Save it to CSV
        submission_ensemble_filename = 'submission_ensemble_lgbm_xgb.csv'
        submission_ensemble_df.to_csv(submission_ensemble_filename, index=False)
        print(f"\nFile submission ensemble '{submission_ensemble_filename}' telah dibuat.")
        print(submission_ensemble_df.head())

    else:
        print("\nEnsemble tidak memberikan peningkatan signifikan pada skor OOF.")
        print("Sebaiknya gunakan model tunggal terbaik (LGBM atau XGBoost) untuk submission.")
        # A single-model submission can be produced from test_lgbm or test_xgb instead.