# In [None]:
# -*- coding: utf-8 -*-
"""
Kode Lengkap: Prediksi Hidrostatik - Solusi XGBoost Tuned (GPU Enabled)

Menggabungkan semua fase:
1. Pemuatan Data & Koreksi Awal
2. Preprocessing & Feature Engineering
3. Tuning XGBoost dengan Optuna (Menggunakan GPU)
4. Pelatihan Model XGBoost Final (GPU)
5. Pembuatan File Submission
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from sklearn.model_selection import KFold, train_test_split # train_test_split hanya untuk debug jika perlu
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import optuna
import time
import os
import warnings

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# --- 1. Basic configuration ---
print("--- Konfigurasi ---")

# Column names
TARGET_VARIABLE = 'hydrostatic_pressure'
ID_COLUMN = 'measurement_id'

# Input files, the Optuna database file and the output submission file
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'
SUBMISSION_FILE = 'sample_submission.csv'
DB_FILENAME = 'optuna_studies.db'
FINAL_SUBMISSION_FILENAME_XGB = 'submission_xgb_gpu_tuned.csv'

# Cross-validation / tuning settings
N_SPLITS = 5            # number of CV folds used while tuning
RANDOM_SEED = 42        # seed for reproducibility
N_TRIALS_XGB_GPU = 50   # number of Optuna trials (adjust as needed)
OPTUNA_TIMEOUT = 600    # Optuna time limit in seconds (optional, e.g. 10 minutes)

# Echo the effective configuration, one setting per line
for _config_line in (
    f"Target: {TARGET_VARIABLE}",
    f"ID Column: {ID_COLUMN}",
    f"CV Folds: {N_SPLITS}",
    f"Optuna Trials: {N_TRIALS_XGB_GPU}",
    f"Random Seed: {RANDOM_SEED}",
    f"XGBoost Version: {xgb.__version__}",
):
    print(_config_line)

# --- 2. Data loading & initial corrections ---
# Reads the train/test/sample-submission CSV files, remembers the test IDs for
# the submission, and drops columns that exist only in the test file.
print("\n--- Memuat Data ---")
try:
    train_df_orig = pd.read_csv(TRAIN_FILE)
    test_df_orig = pd.read_csv(TEST_FILE)
    sample_submission_df = pd.read_csv(SUBMISSION_FILE)
    print(f"Train: {train_df_orig.shape}, Test: {test_df_orig.shape}, Sample Sub: {sample_submission_df.shape}")
except FileNotFoundError as e:
    print(f"Error: File tidak ditemukan - {e}")
    # `exit()` is a helper meant for the interactive interpreter and would
    # terminate with status 0; raise SystemExit with a non-zero status so the
    # failure is visible to whatever launched the script.
    raise SystemExit(1) from e

# Keep the test IDs for the submission file
test_ids = test_df_orig[ID_COLUMN].copy()

# Drop columns that exist only in the test set
cols_only_in_test = set(test_df_orig.columns) - set(train_df_orig.columns)
if cols_only_in_test:
    print(f"Menghapus kolom hanya di test: {cols_only_in_test}")
    test_df_orig = test_df_orig.drop(columns=list(cols_only_in_test))

# --- 3. Preprocessing & Feature Engineering ---
print("\n--- Preprocessing & Feature Engineering ---")

# 3.1 Split off the target and stack train/test features into a single frame
y_train_full = train_df_orig[TARGET_VARIABLE].copy()
train_ids = train_df_orig[ID_COLUMN].copy()
train_features = train_df_orig.drop(columns=[TARGET_VARIABLE, ID_COLUMN])
test_features = test_df_orig.drop(columns=[ID_COLUMN])
ntrain = len(train_features)  # row count used later to split the stacked frame again
all_features = pd.concat([train_features, test_features], ignore_index=True)
print(f"Data gabungan awal: {all_features.shape}")

# 3.2 Convert object columns that hold decimal-comma numbers to numeric
print("Membersihkan tipe data object (koma desimal)...")
time_col = 'depth_reading_time'
# Every object column except the timestamp column, which is parsed separately
object_cols = [
    name
    for name in all_features.select_dtypes(include='object').columns
    if name != time_col
]
converted_cols = []
for col in object_cols:
    try:
        if all_features[col].isnull().all():
            continue  # nothing to convert in an all-missing column
        without_spaces = all_features[col].str.replace(' ', '', regex=False)
        all_features[col] = without_spaces.str.replace(',', '.', regex=False)
        all_features[col] = pd.to_numeric(all_features[col])
    except Exception as e:
        print(f"Warning: Gagal konversi kolom '{col}': {e}")
    else:
        converted_cols.append(col)
print(f"Kolom objek dikonversi ke numerik: {len(converted_cols)}")

# 3.3 Derive calendar and cyclical features from the timestamp column
print("Memparsing fitur waktu...")
try:
    all_features[time_col] = pd.to_datetime(all_features[time_col])
    stamp = all_features[time_col].dt
    all_features['time_year'] = stamp.year
    all_features['time_month'] = stamp.month
    all_features['time_day'] = stamp.day
    all_features['time_dayofweek'] = stamp.dayofweek
    all_features['time_dayofyear'] = stamp.dayofyear
    all_features['time_hour'] = stamp.hour
    # Sine/cosine encoding with period 24 for the hour and 12 for the month
    hour_angle = 2 * np.pi * all_features['time_hour'] / 24.0
    all_features['time_hour_sin'] = np.sin(hour_angle)
    all_features['time_hour_cos'] = np.cos(hour_angle)
    month_angle = 2 * np.pi * all_features['time_month'] / 12.0
    all_features['time_month_sin'] = np.sin(month_angle)
    all_features['time_month_cos'] = np.cos(month_angle)
    all_features = all_features.drop(columns=[time_col])
    print("Fitur waktu berhasil diproses.")
except Exception as e:
    print(f"Warning: Gagal parsing waktu '{time_col}': {e}")

# 3.4 Drop the columns listed as having >50% missing values
print("Menghapus kolom dengan >50% missing...")
cols_to_drop_high_missing = [
    'aragonite_saturation_state',
    'pH',
    'dissolved_inorganic_carbon (µmol kg-1)',
    'partial_pressure_CO2 (µatm)',
    'water_temperature_50m',
]
# Only drop the listed columns that are actually present
existing_cols_to_drop = [
    name for name in cols_to_drop_high_missing if name in all_features.columns
]
if existing_cols_to_drop:
    all_features = all_features.drop(columns=existing_cols_to_drop)
    print(f"Kolom dihapus: {existing_cols_to_drop}")

# 3.5 Drop whatever non-numeric columns are left
non_numeric_cols = all_features.select_dtypes(exclude=np.number).columns
if len(non_numeric_cols) > 0:
    print(f"Menghapus sisa kolom non-numerik: {non_numeric_cols.tolist()}")
    all_features = all_features.drop(columns=non_numeric_cols)

print(f"Bentuk data fitur akhir: {all_features.shape}")

# 3.6 Split the stacked frame back into train and test matrices
X = all_features.iloc[:ntrain].copy()
X_test = all_features.iloc[ntrain:].copy()
y = y_train_full.copy()
print(f"Ukuran final - X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}")

# --- 4. Tuning Hyperparameter XGBoost dengan Optuna (GPU) ---

# 4.1 Fungsi Objective
def objective_xgb_gpu(trial):
    """Optuna objective: mean K-fold validation RMSE of a GPU XGBoost regressor.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``xgb.XGBRegressor`` on each of the ``N_SPLITS`` shuffled folds of the
    module-level ``X``/``y`` with early stopping on the held-out fold, and
    reports every fold's RMSE to Optuna so the study's pruner can stop an
    unpromising trial early.

    Args:
        trial: The ``optuna.trial.Trial`` that supplies the suggestions.

    Returns:
        The mean validation RMSE over all folds, or ``float('inf')`` when
        XGBoost itself fails to train with the sampled parameters.

    Raises:
        optuna.TrialPruned: When the pruner asks to stop after a fold.
    """
    xgb_params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse',
        'device': 'cuda',  # train on the GPU
        'eta': trial.suggest_float('eta', 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'seed': RANDOM_SEED, 'nthread': -1, 'tree_method': 'hist'
    }
    n_estimators_xgb = trial.suggest_int('n_estimators', 500, 3000, step=100)
    early_stopping_rounds_xgb = 50

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
    fold_scores = []
    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val = y.iloc[train_index], y.iloc[val_index]
        # `early_stopping_rounds` is an estimator parameter: XGBoost 2.x (which
        # `device='cuda'` requires) no longer accepts it as a `fit()` argument.
        # Passing it to `fit()` raised TypeError, which the former blanket
        # `except Exception` turned into an `inf` score for every trial.
        model = xgb.XGBRegressor(**xgb_params,
                                 n_estimators=n_estimators_xgb,
                                 early_stopping_rounds=early_stopping_rounds_xgb)
        try:
            model.fit(X_train, y_train_fold,
                      eval_set=[(X_val, y_val)],
                      verbose=False)
        except xgb.core.XGBoostError as e:
            # Only failures raised by XGBoost itself are penalised; programming
            # errors propagate so they are not mistaken for bad hyperparameters.
            print(f"Error saat fit di fold {fold+1}: {e}. Mengembalikan nilai penalti.")
            return float('inf')  # penalty so Optuna does not pick these params
        val_preds = model.predict(X_val)
        fold_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        fold_scores.append(fold_rmse)
        # Report the fold score as an intermediate value for the pruner
        trial.report(fold_rmse, fold)
        if trial.should_prune():
            raise optuna.TrialPruned()
    return float(np.mean(fold_scores))

# 4.2 Set up & run the Optuna study
STUDY_NAME_XGB_GPU = 'xgb_gpu_hydrostatic_tuning_final'  # study name


def _finite_completed_trials(study):
    """Return the COMPLETE trials of ``study`` whose objective value is finite."""
    return [
        t for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE
        and t.value is not None and np.isfinite(t.value)
    ]


try:
    study_xgb_gpu = optuna.load_study(study_name=STUDY_NAME_XGB_GPU, storage=f'sqlite:///{DB_FILENAME}')
    print(f"Memuat studi Optuna XGBoost GPU '{STUDY_NAME_XGB_GPU}'")
    # Only trials that produced usable information count against the budget:
    # pruned trials and completed trials with a finite score. Failed trials and
    # trials that returned the `inf` penalty are run again.
    pruned_count = sum(t.state == optuna.trial.TrialState.PRUNED for t in study_xgb_gpu.trials)
    useful_trials = len(_finite_completed_trials(study_xgb_gpu)) + pruned_count
    remaining_trials = max(0, N_TRIALS_XGB_GPU - useful_trials)
    if remaining_trials == 0:
        print("Jumlah trials sudah tercapai.")
    else:
        print(f"Akan menjalankan {remaining_trials} trial tambahan.")
    N_TRIALS_TO_RUN = remaining_trials
except KeyError:
    # load_study raises KeyError when the study does not exist in the storage
    print(f"Membuat studi Optuna XGBoost GPU baru '{STUDY_NAME_XGB_GPU}'")
    # A new study gets a seeded sampler so a fresh run is reproducible. A
    # resumed study (above) keeps the default sampler so it does not replay the
    # same seeded start-up suggestions.
    study_xgb_gpu = optuna.create_study(direction='minimize', study_name=STUDY_NAME_XGB_GPU,
                                        storage=f'sqlite:///{DB_FILENAME}', load_if_exists=True,
                                        sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
    N_TRIALS_TO_RUN = N_TRIALS_XGB_GPU

if N_TRIALS_TO_RUN > 0:
    print(f"\nMemulai optimasi XGBoost GPU dengan {N_TRIALS_TO_RUN} trials...")
    start_opt_time = time.time()
    # Stops at whichever comes first: the trial budget or the timeout
    study_xgb_gpu.optimize(objective_xgb_gpu, n_trials=N_TRIALS_TO_RUN, timeout=OPTUNA_TIMEOUT)
    end_opt_time = time.time()
    print(f"Optimasi GPU selesai dalam {end_opt_time - start_opt_time:.2f} detik.")

# 4.3 Show the XGBoost tuning results
print("\n--- Hasil Tuning Optuna XGBoost (GPU) ---")
best_xgb_gpu_params = None
best_xgb_gpu_rmse = float('inf')
if study_xgb_gpu.trials:
    # A trial that returned the `inf` penalty is COMPLETE but carries no usable
    # parameters, so only finite scores qualify as a tuning result.
    completed_trials = _finite_completed_trials(study_xgb_gpu)
    if completed_trials:
        # With at least one finite score, the minimising best trial is finite
        best_trial_xgb_gpu = study_xgb_gpu.best_trial
        best_xgb_gpu_rmse = best_trial_xgb_gpu.value
        print(f"Trial XGBoost GPU terbaik:")
        print(f"  Value (Min RMSE): {best_xgb_gpu_rmse:.6f}")
        print(f"  Params: ")
        for key, value in best_trial_xgb_gpu.params.items():
            print(f"    {key}: {value}")
        # Store the best parameters on top of the fixed base parameters
        best_xgb_gpu_params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse',
                               'seed': RANDOM_SEED, 'nthread': -1, 'tree_method': 'hist',
                               'device': 'cuda'}  # important: keep the final model on the GPU
        best_xgb_gpu_params.update(best_trial_xgb_gpu.params)
        print("\nParameter XGBoost GPU terbaik disimpan.")
    else:
        print("\nTidak ada trial Optuna GPU yang berhasil COMPLETE.")
else:
    print("\nTidak ada trial Optuna GPU yang dijalankan.")


# --- 5. Final Model Training & Submission ---

# Choose the final model: the tuned XGBoost (GPU) configuration from above, or
# a LightGBM configuration tuned earlier and stored in the same Optuna DB.
# Whatever happens, `final_model`, `final_model_params` and `model_type` are
# always defined afterwards (`final_model` is None when nothing is usable).
final_model = None
final_model_params = None
model_type = "None"

# This script never imports LightGBM, so the LightGBM branch is only usable
# when an earlier notebook cell already bound the module to the name `lgb`.
lgb_module = globals().get('lgb')

best_lgbm_rmse = float('inf')
lgbm_study_params = None
try:
    # Change the study name here if the LightGBM study was saved under another one
    study_lgbm = optuna.load_study(study_name='lgbm_hydrostatic_tuning', storage=f'sqlite:///{DB_FILENAME}')
    best_lgbm_trial = study_lgbm.best_trial
except KeyError:
    # load_study raises KeyError when the study is not in the storage
    print("\nStudi LGBM tidak ditemukan di DB. Menggunakan hasil XGBoost (GPU) jika ada.")
except ValueError:
    # best_trial raises ValueError when the study has no completed trial
    print("\nStudi LGBM belum memiliki trial yang selesai. Menggunakan hasil XGBoost (GPU) jika ada.")
else:
    best_lgbm_rmse = best_lgbm_trial.value
    lgbm_study_params = dict(best_lgbm_trial.params)
    print(f"\nSkor CV LGBM Terbaik (dari DB): {best_lgbm_rmse:.6f}")

if lgbm_study_params is not None and lgb_module is None:
    print("\nModul LightGBM (lgb) tidak tersedia. Menggunakan hasil XGBoost (GPU) jika ada.")

lgbm_usable = lgbm_study_params is not None and lgb_module is not None
xgb_usable = best_xgb_gpu_params is not None

if xgb_usable and (not lgbm_usable or best_xgb_gpu_rmse < best_lgbm_rmse):
    if lgbm_usable:
        print("\nXGBoost (GPU) Tuned lebih baik. Menggunakan XGBoost untuk model final.")
    final_model_params = best_xgb_gpu_params
    final_model = xgb.XGBRegressor(**final_model_params)
    model_type = "XGBoost_GPU"
elif lgbm_usable:
    print("\nLGBM Tuned lebih baik atau XGBoost gagal. Menggunakan LGBM untuk model final.")
    # Prefer parameters left in memory by an earlier tuning cell; otherwise
    # rebuild them from the best trial of the stored study.
    best_lgbm_params = globals().get('best_lgbm_params')
    if best_lgbm_params is None:
        # NOTE(review): the original literal ended in a `...` placeholder (a
        # syntax error); add any further base parameters the LightGBM tuning
        # run used here.
        best_lgbm_params = {'objective': 'regression_l2', 'metric': 'rmse'}
        best_lgbm_params.update(lgbm_study_params)
        print("Parameter LGBM dimuat ulang dari studi.")
    final_model_params = best_lgbm_params
    final_model = lgb_module.LGBMRegressor(**final_model_params)
    model_type = "LightGBM"
else:
    print("ERROR: Tidak ada parameter model terbaik yang bisa digunakan!")

# Train the final model (when one was selected) and write the submission file
if final_model is None:
    print("\nTidak dapat melatih model final karena tidak ada parameter terbaik yang valid.")
else:
    print(f"\n--- Melatih Model Final ({model_type}) pada Seluruh Data Training ---")
    start_time_final_train = time.time()
    # Fit on the complete X and y; no early stopping for the final training
    final_model.fit(X, y)
    end_time_final_train = time.time()
    print(f"Pelatihan model final selesai dalam {end_time_final_train - start_time_final_train:.2f} detik.")

    # Predict on the test data
    print("\nMembuat prediksi pada data test...")
    if X_test.empty:
        print("\nData test (X_test) kosong, submission tidak dibuat.")
    else:
        final_test_predictions = final_model.predict(X_test)

        # Assemble the submission frame: one prediction per saved test ID
        print("Membuat file submission...")
        submission_df = pd.DataFrame({
            ID_COLUMN: test_ids,
            TARGET_VARIABLE: final_test_predictions,
        })

        # Write it to CSV without the index column
        submission_df.to_csv(FINAL_SUBMISSION_FILENAME_XGB, index=False)
        print(f"\nFile submission '{FINAL_SUBMISSION_FILENAME_XGB}' telah berhasil dibuat.")
        print("Contoh isi file submission:")
        print(submission_df.head())

print("\n--- Proses Selesai ---")