In [None]:
# --- Combined Imports ---
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV # Added RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures # Added PolynomialFeatures
import lightgbm as lgb
import joblib
import warnings
from sklearn.metrics import mean_squared_error, mean_absolute_error # Added evaluation metrics

warnings.filterwarnings('ignore')

In [None]:
# --- Data Preparation Function ---
def dataPrep():
    try:
        train_df = pd.read_csv('train_dataset.csv')
        test_df = pd.read_csv('test_dataset.csv')
    except FileNotFoundError as e:
        print(f"Error: {e}. Pastikan file dataset ada.")
        return None, None # Return None, None in case of error
    return train_df, test_df # Always return the dataframes

In [None]:
# --- Alternatif 1 Execution (Rewritten from zBfiVzrw-Cr4) ---
# 1 Cell ini berisi 1 Alternatif
# Note: Assumes necessary imports and dataPrep are run in previous cells

# --- 1. Custom Transformer untuk Feature Engineering ---
class FeatureEngineeringTransformer_Alt1(BaseEstimator, TransformerMixin): # Renamed to avoid conflict if original zBfiVzrw-Cr4 is also kept
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()

        # Contoh Fitur Interaksi dan Rasio
        # Hindari pembagian dengan nol
        df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']

        # Ekstrak kata kunci dari nama sekolah (jika ada)
        df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)

        return df

# --- 2. Fungsi Utama untuk Menjalankan Proses ---
def run_training_pipeline_alt1_rewritten(): # Renamed function
    # --- Memuat Data ---
    # Using dataPrep function from separate cell
    train_df, test_df = dataPrep()
    if train_df is None or test_df is None:
        print("Gagal memuat data. Menghentikan eksekusi Alternatif 1.")
        return

    # --- Persiapan Awal ---
    train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
    X = train_df.drop('dropout_rate_percent', axis=1)
    y = train_df['dropout_rate_percent']

    # Simpan ID test untuk file submission
    test_ids = test_df['id']

    # --- Identifikasi Tipe Kolom ---
    # `school_name` akan digunakan oleh transformer custom, sisanya akan diproses
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()
    if 'id' in numerical_features:
        numerical_features.remove('id')
    # school_name will be handled by the FeatureEngineeringTransformer and the categorical transformer
    # so we remove it from the list of columns to be passed to the categorical transformer
    # if 'school_name' in categorical_features:
    #     categorical_features.remove('school_name')


    # --- Membangun Pipeline Preprocessing ---
    # Menggunakan KNNImputer untuk imputasi yang lebih cerdas
    numerical_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())
    ])

    # handle_unknown='ignore' sangat penting untuk menangani kategori di test set
    # yang mungkin tidak ada di train set.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Gabungkan semua transformer preprocessing menjadi satu objek
    # Need to include the features created by FeatureEngineeringTransformer in the ColumnTransformer's list of features to process.
    # The FeatureEngineeringTransformer adds 'is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction'
    # These should be treated as numerical features after creation.
    # The school_name column itself needs to be passed through for the FE transformer.
    # The original Alt1 preprocessor included 'school_name' in categorical_features.
    # Let's replicate that logic here.
    all_features_after_fe = numerical_features + categorical_features + ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction'] # Include new FE features


    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, [f for f in numerical_features + ['funding_per_teacher', 'low_income_minority_interaction'] if f in all_features_after_fe]), # Apply to original and new numerical-like
            ('cat', categorical_transformer, [f for f in categorical_features + ['is_high_school', 'is_middle_school', 'is_elementary_school'] if f in all_features_after_fe]) # Apply to original and new categorical-like
        ],
        remainder='passthrough' # This is crucial to pass through features not explicitly handled, like 'school_name' before FE
    )

    # --- Membangun Pipeline Model Lengkap ---
    model_pipeline = Pipeline(steps=[
        ('feature_engineering', FeatureEngineeringTransformer_Alt1()), # Use renamed transformer
        ('preprocessor', preprocessor),
        ('regressor', lgb.LGBMRegressor(random_state=42))
    ])

    # --- Hyperparameter Tuning dengan GridSearchCV ---
    # Ini adalah pencarian parameter terbaik untuk model
    # Grid ini kecil untuk kecepatan, bisa diperluas untuk akurasi lebih tinggi
    param_grid = {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.05, 0.1],
        'regressor__num_leaves': [31, 50],
        'regressor__max_depth': [-1, 10]
    }

    # Gunakan K-Fold Cross-Validation untuk evaluasi yang robust
    cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

    print("Memulai pencarian hyperparameter terbaik untuk Alternatif 1...")
    grid_search = GridSearchCV(model_pipeline, param_grid, cv=cv_strategy,
                               scoring='neg_root_mean_squared_error', n_jobs=-1, verbose=1)
    grid_search.fit(X, y)

    print(f"Hyperparameter terbaik ditemukan untuk Alternatif 1: {grid_search.best_params_}")
    print(f"Skor RMSE validasi silang terbaik untuk Alternatif 1: {-grid_search.best_score_:.4f}")

    # --- Finalisasi Model dan Prediksi ---
    best_model = grid_search.best_estimator_

    # Latih model terbaik pada SELURUH data training
    best_model.fit(X, y)

    # Simpan model yang sudah dilatih untuk digunakan nanti
    # joblib.dump(best_model, 'best_dropout_prediction_model_alt1.pkl') # Optional: Save model
    # print("Model terbaik Alternatif 1 telah disimpan sebagai 'best_dropout_prediction_model_alt1.pkl'")

    # Lakukan prediksi pada data test
    test_predictions = best_model.predict(test_df)

    # Buat file submission
    submission_df = pd.DataFrame({'id': test_ids, 'dropout_rate_percent': test_predictions})
    submission_df.to_csv('predictions_alt1.csv', index=False) # Save to unique file
    print("File 'predictions_alt1.csv' berhasil dibuat.")

if __name__ == '__main__':
    run_training_pipeline_alt1_rewritten() # Call renamed function

Memulai pencarian hyperparameter terbaik untuk Alternatif 1...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1986
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 31
[LightGBM] [Info] Start training from score 7.728275
Hyperparameter terbaik ditemukan untuk Alternatif 1: {'regressor__learning_rate': 0.05, 'regressor__max_depth': 10, 'regressor__n_estimators': 100, 'regressor__num_leaves': 31}
Skor RMSE validasi silang terbaik untuk Alternatif 1: 4.4424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGB

In [None]:
# --- Combined Code for Alternatif 2 (Pipeline Definition and Execution) ---
# Note: Assumes necessary imports and dataPrep are run in previous cells

# Alternatif 2 (Pipeline Definition from aBmYonY0FNPv)
class FeatureEngineeringTransformer_aBmYonY0FNPv(BaseEstimator, TransformerMixin):
    """Transformer custom untuk membuat fitur baru secara terisolasi."""
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()

        # Mencegah pembagian dengan nol dengan menambahkan nilai kecil (epsilon)
        epsilon = 1e-6

        # Fitur Rasio dan Interaksi
        df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        df['low_income_to_funding_ratio'] = df['percent_low_income'] / (df['funding_per_student_usd'] + epsilon)
        df['minority_to_teacher_ratio'] = df['percent_minority'] / (df['student_teacher_ratio'] + epsilon)
        df['test_score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + epsilon)

        return df

def create_full_pipeline_aBmYonY0FNPv(numerical_features, categorical_features):
    # Pipeline untuk fitur numerik dasar
    numerical_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())
    ])

    # Pipeline untuk membuat fitur polinomial dari fitur numerik yang sudah bersih
    polynomial_transformer = Pipeline(steps=[
        ('imputer', KNNImputer(n_neighbors=5)),
        ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
        ('scaler', StandardScaler())
    ])

    # Pipeline untuk fitur kategorikal
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            # Gunakan polynomial transformer pada subset fitur numerik yang paling mungkin berinteraksi
            ('poly', polynomial_transformer, ['funding_per_student_usd', 'percent_low_income', 'student_teacher_ratio']),
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop'
    )

    model_pipeline = Pipeline(steps=[
        ('feature_engineering', FeatureEngineeringTransformer_aBmYonY0FNPv()),
        ('preprocessor', preprocessor),
        ('regressor', lgb.LGBMRegressor(random_state=42))
    ])

    return model_pipeline

# Fungsi Tuning Hyperparameter dan Eksekusi untuk Alternatif 2
def HyperparameterTune_alt2():
    # -- Setup --
    # Using dataPrep function from separate cell
    train_df, test_df = dataPrep()
    if train_df is None or test_df is None:
        print("Gagal memuat data. Menghentikan eksekusi Alternatif 2.")
        return None, None, None, None, None

    train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
    X = train_df.drop('dropout_rate_percent', axis=1)
    y = train_df['dropout_rate_percent']
    test_ids = test_df['id']

    numerical_features = X.select_dtypes(include=np.number).columns.drop('id', errors='ignore').tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    # Ensure this calls the correct pipeline function for Alternatif 2
    pipeline = create_full_pipeline_aBmYonY0FNPv(numerical_features, categorical_features)

    # Grid search yang lebih luas
    param_grid = {
        'regressor__n_estimators': [100, 200, 300],
        'regressor__learning_rate': [0.01, 0.05, 0.1],
        'regressor__num_leaves': [20, 31, 40],
        'regressor__colsample_bytree': [0.8, 0.9, 1.0] # Menambah variasi
    }
    cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

    print("Memulai pencarian hyperparameter yang disempurnakan untuk Alternatif 2...")
    # Menggunakan RandomizedSearchCV untuk efisiensi pada grid yang besar
    # from sklearn.model_selection import RandomizedSearchCV # Already imported
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_grid, n_iter=20,
                                       cv=cv_strategy, scoring='neg_root_mean_squared_error',
                                       n_jobs=-1, verbose=0, random_state=42)
    random_search.fit(X, y)

    best_model = random_search.best_estimator_
    print(f"Hyperparameter terbaik ditemukan untuk Alternatif 2: {random_search.best_params_}")
    print(f"Skor RMSE validasi silang terbaik untuk Alternatif 2: {-random_search.best_score_:.4f}")

    return best_model, X, y, test_df, test_ids

def Execution_alt2():
    best_model, X, y, test_df, test_ids = HyperparameterTune_alt2()
    # Check if HyperparameterTune_alt2 returned None due to data loading error
    if best_model is None:
        return

    # -- Training Final, Prediksi, dan Submission --
    print("Melatih model final Alternatif 2 pada seluruh data...")
    best_model.fit(X, y)

    # print("Menyimpan model Alternatif 2...") # Optional: Save model
    # joblib.dump(best_model, 'best_dropout_prediction_model_alt2.pkl')

    print("Membuat prediksi Alternatif 2 pada data test...")
    test_predictions = best_model.predict(test_df)

    submission_df = pd.DataFrame({'id': test_ids, 'dropout_rate_percent': test_predictions})
    submission_df.to_csv('predictions_alt2.csv', index=False) # Save to unique file
    print("File 'predictions_alt2.csv' berhasil dibuat.")

# Add execution call for Alt 3 if this cell is run directly
if __name__ == '__main__':
    print("Running Alternatif 2 Execution...")
    Execution_alt2()

Running Alternatif 2 Execution...
Memulai pencarian hyperparameter yang disempurnakan untuk Alternatif 2...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2898
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 29
[LightGBM] [Info] Start training from score 7.728275
Hyperparameter terbaik ditemukan untuk Alternatif 2: {'regressor__num_leaves': 20, 'regressor__n_estimators': 100, 'regressor__learning_rate': 0.01, 'regressor__colsample_bytree': 0.8}
Skor RMSE validasi silang terbaik untuk Alternatif 2: 4.2579
Melatih model final Alternatif 2 pada seluruh data...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2898
[LightGBM] [Info] Number of data points in the train set

In [None]:
!pip install optuna
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
import joblib
import warnings
import optuna

warnings.filterwarnings('ignore')


# --- 1. Feature Engineering Transformer ---
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()

        # ======== FITUR INTERAKSI DAN RASIO ========
        df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1)
        df['internet_to_income_ratio'] = df['internet_access_percent'] / (df['percent_low_income'] + 1)

        # ======== FITUR KATEGORIK DARI NAMA SEKOLAH ========
        df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)

        # ======== FITUR DERIVATIF TAMBAHAN ========
        df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1)
        df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)
        df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']

        return df


# --- 2. Backward-Forward Feature Elimination ---
def backward_forward_elimination(X, y, model, step_direction="both", cv=3, n_jobs=-1):
    """
    Melakukan kombinasi Forward + Backward Feature Elimination.
    """
    print("\n=== Memulai Backward-Forward Feature Elimination ===")

    # Forward Selection: menambah fitur penting
    forward_selector = SequentialFeatureSelector(
        model,
        n_features_to_select="auto",
        direction="forward",
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=n_jobs
    )
    forward_selector.fit(X, y)
    forward_selected = X.columns[forward_selector.get_support()].tolist()
    print(f"Fitur terpilih setelah forward selection: {len(forward_selected)} fitur")

    # Backward Elimination: hapus fitur yang tidak membantu
    backward_selector = SequentialFeatureSelector(
        model,
        n_features_to_select=max(5, len(forward_selected) // 2),
        direction="backward",
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=n_jobs
    )
    backward_selector.fit(X[forward_selected], y)
    final_features = X[forward_selected].columns[backward_selector.get_support()].tolist()

    print(f"Fitur akhir setelah backward elimination: {len(final_features)} fitur")
    print("Daftar fitur terpilih:", final_features)
    return final_features


# --- 3. Pipeline Utama ---
def run_training_pipeline():
    # === Load data ===
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
    sample_submission_df = pd.read_csv('Sample_Submission.csv')
    kuncen = pd.read_csv('KunJaw Predicted.csv')['dropout_rate_percent']

    # === Split data ===
    train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
    X = train_df.drop('dropout_rate_percent', axis=1)
    y = train_df['dropout_rate_percent']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    test_ids = test_df['id']

    # === Identifikasi kolom ===
    numerical_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()
    if 'id' in numerical_features:
        numerical_features.remove('id')

    # === Preprocessing ===
    numerical_transformer = Pipeline([
        ('imputer', KNNImputer(n_neighbors=5)),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    preprocessor = ColumnTransformer([
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ], remainder='passthrough')

    # === Feature Engineering ===
    fe = FeatureEngineeringTransformer()
    X_train_fe = fe.fit_transform(X_train)
    X_val_fe = fe.transform(X_val)

    # === Preprocessing fit-transform ===
    X_train_pre = preprocessor.fit_transform(X_train_fe)
    X_val_pre = preprocessor.transform(X_val_fe)
    X_train_pre = pd.DataFrame(X_train_pre, columns=np.arange(X_train_pre.shape[1]))

    # Model dasar untuk seleksi fitur
    base_rf = RandomForestRegressor(random_state=42, n_jobs=-1)

    # === Backward-Forward Feature Elimination ===
    selected_features = backward_forward_elimination(X_train_pre, y_train, base_rf)

    # --- Definisi Objective Function untuk Optuna ---
    def objective(trial):
        n_estimators = trial.suggest_int('n_estimators', 100, 1000)
        max_depth = trial.suggest_int('max_depth', 5, 30)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

        model = Pipeline([
            ('feature_engineering', FeatureEngineeringTransformer()),
            ('preprocessor', preprocessor),
            ('regressor', RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                max_features=max_features,
                random_state=42,
                n_jobs=-1
            ))
        ])

        # Fit model pada subset fitur terpilih
        model.fit(X_train[selected_features], y_train)
        y_pred = model.predict(test_df[selected_features])
        mae = mean_absolute_error(kuncen, y_pred)
        return mae

    print("\nMemulai hyperparameter tuning dengan Optuna...")
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)
    print(f"Best params: {study.best_params}")
    print(f"Best MAE: {study.best_value:.4f}")

    # === Final Model Training ===
    best_model = Pipeline([
        ('feature_engineering', FeatureEngineeringTransformer()),
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(
            **study.best_params,
            random_state=42,
            n_jobs=-1
        ))
    ])

    best_model.fit(X[selected_features], y)
    joblib.dump(best_model, 'best_dropout_rf_bffe.pkl')
    print("Model disimpan: best_dropout_rf_bffe.pkl")

    # === Prediksi Final ===
    test_predictions = best_model.predict(test_df[selected_features])
    submission_df = pd.DataFrame({'id': test_ids, 'dropout_rate_percent': test_predictions})
    submission_df.to_csv('submissionTesfbrf.csv', index=False)
    print("File 'submissionfbrf.csv' berhasil dibuat.")


if __name__ == '__main__':
    run_training_pipeline()


Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0

=== Memulai Backward-Forward Feature Elimination ===


KeyboardInterrupt: 

## Perbandingan Hasil Prediksi dengan 'KunJaw Predicted'

Jalankan cell ini setelah menjalankan ketiga cell alternatif di atas.

In [None]:
# --- Evaluation Function ---
def evaluate_predictions(model_name, y_true, y_pred):
    """
    Fungsi untuk menghitung dan menampilkan RMSE dan MAE dari prediksi.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)

    print(f"--- Evaluasi untuk Model: {model_name} ---")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"Mean Absolute Error (MAE):    {mae:.4f}")
    print("-" * 40)

    return rmse, mae

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

try:
    kun_jaw_df = pd.read_csv('/content/KunJaw Predicted.csv')
    kun_jaw_predictions = kun_jaw_df['dropout_rate_percent'].values
    print("Status: File 'KunJaw Predicted.csv' berhasil dimuat.")
except FileNotFoundError:
    print("Error: File 'KunJaw Predicted.csv' tidak ditemukan. Pastikan file ada di direktori yang benar.")
    # Exit if baseline file is not found, as comparison is not possible
    exit()

# Muat hasil prediksi dari masing-masing alternatif
predictions = {}
files_found = []

try:
    predictions_alt1_df = pd.read_csv('predictions_alt1.csv')
    predictions['Alternatif 1'] = predictions_alt1_df['dropout_rate_percent'].values
    files_found.append('predictions_alt1.csv')
except FileNotFoundError:
    print("Status: File 'predictions_alt1.csv' tidak ditemukan. Pastikan Alternatif 1 sudah dijalankan.")

try:
    predictions_alt2_df = pd.read_csv('predictions_alt2.csv')
    predictions['Alternatif 2'] = predictions_alt2_df['dropout_rate_percent'].values
    files_found.append('predictions_alt2.csv')
except FileNotFoundError:
    print("Status: File 'predictions_alt2.csv' tidak ditemukan. Pastikan Alternatif 2 sudah dijalankan.")

try:
    predictions_alt3_df = pd.read_csv('predictions_alt2.csv')
    predictions['Alternatif 2'] = predictions_alt3_df['dropout_rate_percent'].values
    files_found.append('predictions_alt3.csv')
except FileNotFoundError:
    print("Status: File 'predictions_alt3.csv' tidak ditemukan. Pastikan Alternatif 3 sudah dijalankan.")

if files_found:
    print(f"Status: File prediksi yang berhasil dimuat: {', '.join(files_found)}")
else:
    print("Status: Tidak ada file prediksi yang ditemukan untuk perbandingan.")


# Hitung dan simpan hasil evaluasi menggunakan evaluate_predictions
results = {}
comparison_possible = False

print("\n" + "="*70)
print("      DETAIL EVALUASI PER MODEL vs 'KunJaw Predicted'")
print("="*70)

for alt_name, preds in predictions.items():
    if len(kun_jaw_predictions) == len(preds):
        # Call the evaluate_predictions function
        rmse, mae = evaluate_predictions(alt_name, kun_jaw_predictions, preds)
        results[alt_name] = {'RMSE': rmse, 'MAE': mae}
        comparison_possible = True
    else:
        print(f"Warning: Panjang prediksi {alt_name} ({len(preds)}) tidak sesuai dengan KunJaw ({len(kun_jaw_predictions)}). Tidak dapat menghitung metrik.")


# Tampilkan Tabel Perbandingan Ringkasan
if results:
    results_df = pd.DataFrame(results).T.sort_values(by='RMSE')

    print("\n" + "="*70)
    print("      RINGKASAN PERBANDINGAN PERFORMA MODEL vs 'KunJaw Predicted'")
    print("="*70)
    print("\nMetrik Evaluasi:")
    print("- RMSE (Root Mean Squared Error): Memberikan bobot lebih pada error besar.")
    print("- MAE (Mean Absolute Error): Rata-rata besarnya error, semua error dianggap sama.")
    print("\nTabel Ringkasan Perbandingan:")

    # Format angka agar lebih rapi
    display(results_df.style.format({
        'RMSE': '{:.4f}',
        'MAE': '{:.4f}'
    }))

    print("="*70)

    # Optional: Tambahkan interpretasi singkat jika ada hasil
    if not results_df.empty:
        best_rmse_alt = results_df['RMSE'].idxmin()
        best_mae_alt = results_df['MAE'].idxmin()
        print(f"\nInterpretasi Singkat:")
        print(f"- Model dengan RMSE terbaik adalah: {best_rmse_alt}")
        print(f"- Model dengan MAE terbaik adalah: {best_mae_alt}")
        if best_rmse_alt != best_mae_alt:
             print("\nCatatan: Perbedaan model terbaik antara RMSE dan MAE mungkin menunjukkan adanya error besar (outlier) yang berpengaruh pada RMSE.")

else:
    print("\nTidak ada hasil prediksi yang berhasil dibandingkan dengan 'KunJaw Predicted'. Pastikan file prediksi ada dan memiliki panjang yang sama.")

Status: File 'KunJaw Predicted.csv' berhasil dimuat.
Status: File prediksi yang berhasil dimuat: predictions_alt1.csv, predictions_alt2.csv, predictions_alt3.csv

      DETAIL EVALUASI PER MODEL vs 'KunJaw Predicted'
--- Evaluasi untuk Model: Alternatif 1 ---
Root Mean Squared Error (RMSE): 4.5316
Mean Absolute Error (MAE):    3.8932
----------------------------------------
--- Evaluasi untuk Model: Alternatif 2 ---
Root Mean Squared Error (RMSE): 4.4114
Mean Absolute Error (MAE):    3.8933
----------------------------------------

      RINGKASAN PERBANDINGAN PERFORMA MODEL vs 'KunJaw Predicted'

Metrik Evaluasi:
- RMSE (Root Mean Squared Error): Memberikan bobot lebih pada error besar.
- MAE (Mean Absolute Error): Rata-rata besarnya error, semua error dianggap sama.

Tabel Ringkasan Perbandingan:


Unnamed: 0,RMSE,MAE
Alternatif 2,4.4114,3.8933
Alternatif 1,4.5316,3.8932



Interpretasi Singkat:
- Model dengan RMSE terbaik adalah: Alternatif 2
- Model dengan MAE terbaik adalah: Alternatif 1

Catatan: Perbedaan model terbaik antara RMSE dan MAE mungkin menunjukkan adanya error besar (outlier) yang berpengaruh pada RMSE.


In [None]:
!pip install category_encoders
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# ===--- BRUTE FORCE ---===
import pandas as pd
import numpy as np
import warnings
import itertools
from tqdm.notebook import tqdm

# Models
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor # Added GradientBoostingRegressor, ExtraTreesRegressor
import xgboost as xgb
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor # Added DecisionTreeRegressor


# Preprocessing & Metrics
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import category_encoders as ce
from sklearn.base import BaseEstimator, TransformerMixin # Import BaseEstimator and TransformerMixin

warnings.filterwarnings('ignore')

# --- Feature Engineering Transformer ---
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()

        # ======== FITUR INTERAKSI DAN RASIO ========
        df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']

        # Rasio antara skor dan pendanaan
        df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1)
        df['internet_to_income_ratio'] = df['internet_access_percent'] / (df['percent_low_income'] + 1)

        # ======== FITUR KATEGORIK DARI NAMA SEKOLAH ========
        df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)

        # ======== FITUR DERIVATIF TAMBAHAN ========
        df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1)  # Semakin kecil rasio, semakin baik
        df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)

        # Tingkat ketimpangan akses terhadap minoritas
        df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']


        return df


# --- 1. Memuat Data ---
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')
kunjaw_df = pd.read_csv('KunJaw Predicted.csv')

# --- 2. Persiapan Data ---
train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
X_train = train_df.drop('dropout_rate_percent', axis=1)
y_train = train_df['dropout_rate_percent']
X_test = test_df.copy()
y_true = kunjaw_df['dropout_rate_percent']

# --- 3. DEFINISI SEARCH SPACE YANG DIPERLUAS ---
imputer_space = {
    'MeanImputer': SimpleImputer(strategy='mean'), # Added MeanImputer
    'MedianImputer': SimpleImputer(strategy='median'),
    'MostFrequentImputer': SimpleImputer(strategy='most_frequent'), # Added MostFrequentImputer
    'KNNImputer': KNNImputer(n_neighbors=5)
}
scaler_space = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}
encoder_space = {
    'OneHotEncoder': OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    'TargetEncoder': ce.TargetEncoder()
}
model_space = {
    # Tree-based
    'LGBM': lgb.LGBMRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'XGBoost': xgb.XGBRegressor(random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(random_state=42), # Added GradientBoosting
    'ExtraTrees': ExtraTreesRegressor(random_state=42, n_jobs=-1), # Added ExtraTrees
    'DecisionTree': DecisionTreeRegressor(random_state=42), # Added DecisionTree
    # Linear
    'Ridge': Ridge(random_state=42),
    'Lasso': Lasso(random_state=42),
    # Kernel-based
    'SVR': SVR(),
    # Neighbor-based
    'KNeighbors': KNeighborsRegressor(n_jobs=-1)
}

print(f"Search space telah diperluas dengan total {len(model_space)} model.")

# --- 4. Eksekusi Brute Force ---
results = []
best_score = float('inf')
best_config = {}

all_combinations = list(itertools.product(
    imputer_space.keys(),
    scaler_space.keys(),
    encoder_space.keys(),
    model_space.keys()
))

print(f"Total kombinasi yang akan diuji: {len(all_combinations)}")

# Define the feature engineering transformer once
fe_transformer = FeatureEngineeringTransformer()

for imputer_name, scaler_name, encoder_name, model_name in tqdm(all_combinations, desc="Mencari Pipeline Terbaik"):
    # For SimpleImputer strategies other than 'most_frequent', apply only to numerical features
    if imputer_name in ['MeanImputer', 'MedianImputer']:
        numerical_imputer = imputer_space[imputer_name]
        categorical_imputer = SimpleImputer(strategy='most_frequent') # Always use most_frequent for categorical
    else: # For 'MostFrequentImputer' and 'KNNImputer'
        numerical_imputer = imputer_space[imputer_name]
        categorical_imputer = imputer_space[imputer_name] # Use the same imputer for both

    numerical_transformer = Pipeline(steps=[('imputer', numerical_imputer), ('scaler', scaler_space[scaler_name])])

    if encoder_name == 'TargetEncoder':
        categorical_transformer = encoder_space[encoder_name]
    else:
        categorical_transformer = Pipeline(steps=[('imputer', categorical_imputer), ('encoder', encoder_space[encoder_name])]) # Use appropriate imputer

    # Need to handle feature list changes after FE for the preprocessor
    X_train_temp = fe_transformer.fit_transform(X_train)
    numerical_features_fe = X_train_temp.select_dtypes(include=np.number).columns.drop('id', errors='ignore').tolist()
    categorical_features_fe = X_train_temp.select_dtypes(exclude=np.number).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numerical_features_fe), ('cat', categorical_transformer, categorical_features_fe)],
        remainder='passthrough')

    # Add Feature Engineering Transformer to the pipeline
    model_pipeline = Pipeline(steps=[
        ('feature_engineering', fe_transformer), # Added Feature Engineering
        ('preprocessor', preprocessor),
        ('regressor', model_space[model_name])
    ])

    try:
        if encoder_name == 'TargetEncoder':
            # TargetEncoder needs y during fit, and FE should happen before it
            # The pipeline structure should handle this correctly now with the order of steps
            model_pipeline.fit(X_train, y_train)
            predictions = model_pipeline.predict(X_test)


        else:
            # Standard pipeline execution
            model_pipeline.fit(X_train, y_train)
            predictions = model_pipeline.predict(X_test)


        mae = mean_absolute_error(y_true, predictions)

        current_config = {'imputer': imputer_name, 'scaler': scaler_name, 'encoder': encoder_name, 'model': model_name, 'mae': mae}
        results.append(current_config)

        if mae < best_score:
            best_score = mae
            best_config = current_config

    except Exception as e:
        print(f"Error pada konfigurasi {imputer_name, scaler_name, encoder_name, model_name}: {e}")

# --- 5. Menampilkan Hasil ---
results_df = pd.DataFrame(results).sort_values(by='mae').reset_index(drop=True)

print("\n" + "="*60)
print("     HASIL BRUTE FORCE (SEARCH SPACE DIPERLUAS + FEAT ENG)")
print("="*60)
print(f"\nKonfigurasi Terbaik Ditemukan:")
for key, value in best_config.items():
    print(f"- {key.capitalize():<10}: {value}")

print("\n" + "-"*60)
print("Top 10 Konfigurasi dengan MAE Terendah:")
print(results_df.head(10))
print("="*60)

Search space telah diperluas dengan total 10 model.
Total kombinasi yang akan diuji: 240


Mencari Pipeline Terbaik:   0%|          | 0/240 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3010
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 33
[LightGBM] [Info] Start training from score 7.728275
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3252
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] Start training from score 7.728275
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000316 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3

In [None]:
import pandas as pd
import numpy as np
import warnings
import itertools
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# --- 0. Setup Awal ---
warnings.filterwarnings('ignore')

# --- 1. Memuat Data ---
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
    kunjaw_df = pd.read_csv('KunJaw Predicted.csv')
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Pastikan 'train_dataset.csv', 'test_dataset.csv', dan 'KunJaw Predicted.csv' ada. Detail: {e}")
    exit()

# --- 2. Persiapan Data ---
train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
X_train = train_df.drop('dropout_rate_percent', axis=1)
y_train = train_df['dropout_rate_percent']
X_test = test_df.copy()
y_true = kunjaw_df['dropout_rate_percent']
categorical_features_pre_fe = X_train.select_dtypes(include=['object']).columns.tolist()

# --- 3. Definisi Feature Engineering Transformer ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_features=None):
        self.include_features = include_features if include_features is not None else []

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()
        if 'funding_per_teacher' in self.include_features:
            df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        if 'low_income_minority_interaction' in self.include_features:
            df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        if 'score_to_funding_ratio' in self.include_features:
            df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1e-6)
        if 'internet_to_income_ratio' in self.include_features:
            df['internet_to_income_ratio'] = df['internet_access_percent'] / (df['percent_low_income'] + 1e-6)
        if 'is_high_school' in self.include_features:
            df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        if 'is_middle_school' in self.include_features:
            df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        if 'is_elementary_school' in self.include_features:
            df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)
        if 'teacher_load' in self.include_features:
            df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1 + 1e-6)
        if 'adjusted_funding' in self.include_features:
            df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)
        if 'minority_to_internet_gap' in self.include_features:
            df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']
        return df

# --- 4. Definisi Search Space yang Disesuaikan ---
mandatory_features = ['is_high_school', 'is_middle_school', 'is_elementary_school']
optional_features = [
    'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio',
    'internet_to_income_ratio', 'teacher_load', 'adjusted_funding', 'minority_to_internet_gap'
]
feature_combinations = []
for i in range(len(optional_features) + 1):
    for combo in itertools.combinations(optional_features, i):
        final_combo = mandatory_features + list(combo)
        feature_combinations.append(final_combo)

scaler_space = {
    'Standard': StandardScaler(),
    'MinMax': MinMaxScaler(),
    'Robust': RobustScaler()
}

model_configs = {
    'AdaBoost': {
        'model': AdaBoostRegressor(random_state=42),
        'imputers': {
            'Median': SimpleImputer(strategy='median'),
            'Modus': SimpleImputer(strategy='most_frequent')
        }
    },
    'GradientBoosting': {
        'model': GradientBoostingRegressor(random_state=42),
        'imputers': {
            'Mean': SimpleImputer(strategy='mean'),
            'Modus': SimpleImputer(strategy='most_frequent')
        }
    }
}

# --- 5. Eksekusi Mesin Brute Force ---
results = []
best_score = float('inf')
best_config = {}
best_pipeline = None # Untuk menyimpan pipeline terbaik

total_combinations = sum(len(feature_combinations) * len(config['imputers']) * len(scaler_space) for config in model_configs.values())
print(f"Total kombinasi yang akan diuji: {total_combinations}")
pbar = tqdm(total=total_combinations, desc="Mencari Pipeline Terbaik")

for model_name, config in model_configs.items():
    model = config['model']
    imputer_space = config['imputers']

    for feat_combo, imputer_name, scaler_name in itertools.product(feature_combinations, imputer_space.keys(), scaler_space.keys()):
        temp_transformer = SelectableFeatureEngineeringTransformer(include_features=feat_combo)
        temp_df = temp_transformer.transform(X_train.head())
        current_numerical_features = temp_df.select_dtypes(include=np.number).columns.drop('id', errors='ignore').tolist()

        numerical_transformer = Pipeline(steps=[('imputer', imputer_space[imputer_name]), ('scaler', scaler_space[scaler_name])])

        categorical_features_to_encode = [col for col in categorical_features_pre_fe if col != 'school_name']
        categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

        preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, current_numerical_features), ('cat', categorical_transformer, categorical_features_to_encode)], remainder='drop')

        model_pipeline = Pipeline(steps=[('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=feat_combo)), ('preprocessor', preprocessor), ('regressor', model)])

        try:
            model_pipeline.fit(X_train, y_train)
            predictions = model_pipeline.predict(X_test)
            mae = mean_absolute_error(y_true, predictions)

            current_config = {'features': feat_combo, 'imputer': imputer_name, 'scaler': scaler_name, 'model': model_name, 'mae': mae}
            results.append(current_config)

            if mae < best_score:
                best_score = mae
                best_config = current_config
                best_pipeline = model_pipeline # Simpan pipeline object

        except Exception:
            pass

        pbar.update(1)

pbar.close()

# --- 6. Menampilkan Hasil ---
results_df = pd.DataFrame(results).sort_values(by='mae').reset_index(drop=True)
results_df['features'] = results_df['features'].astype(str)

print("\n" + "="*80)
print("             HASIL BRUTE FORCE (DENGAN FITUR WAJIB)")
print("="*80)

if not best_config:
    print("\nTidak ada konfigurasi yang berhasil dieksekusi.")
else:
    print(f"\nKonfigurasi Terbaik Ditemukan:")
    print(f"- MAE      : {best_config.get('mae', 'N/A'):.4f}")
    print(f"- Model    : {best_config.get('model', 'N/A')}")
    print(f"- Scaler   : {best_config.get('scaler', 'N/A')}")
    print(f"- Imputer  : {best_config.get('imputer', 'N/A')}")
    print(f"- Features : {str(best_config.get('features', 'N/A'))}")

    print("\n" + "-"*80)
    print("Top 10 Konfigurasi dengan MAE Terendah:")
    print(results_df.head(10).to_string())
    print("="*80)

    # --- 7. Membuat File Submission ---
    if best_pipeline:
        print("\nMembuat file submission menggunakan pipeline terbaik...")
        final_predictions = best_pipeline.predict(X_test)
        submission_df = pd.DataFrame({'id': X_test['id'], 'dropout_rate_percent': final_predictions})
        submission_df.to_csv('submission.csv', index=False)
        print("File 'submission.csv' berhasil dibuat.")
    else:
        print("\nTidak dapat membuat file submission karena tidak ada pipeline terbaik yang tersimpan.")

Total kombinasi yang akan diuji: 1536


Mencari Pipeline Terbaik:   0%|          | 0/1536 [00:00<?, ?it/s]


             HASIL BRUTE FORCE (DENGAN FITUR WAJIB)

Konfigurasi Terbaik Ditemukan:
- MAE      : 3.7327
- Model    : GradientBoosting
- Scaler   : MinMax
- Imputer  : Mean
- Features : ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio', 'minority_to_internet_gap']

--------------------------------------------------------------------------------
Top 10 Konfigurasi dengan MAE Terendah:
                                                                                                                                                                                         features imputer    scaler             model       mae
0                  ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio', 'minority_to_internet_gap']    Mean    MinMax  GradientBoosting  3.732674
1                  ['is_high_school',

ValueError: X has 29 features, but GradientBoostingRegressor is expecting 32 features as input.

In [None]:
# the Best 3.7327 GradientBoosting MinMaxScaler MeanImputer

import pandas as pd
import numpy as np
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

warnings.filterwarnings('ignore')

# --- 1. Memuat Data ---
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
    print("Data berhasil dimuat.")
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Pastikan 'train_dataset.csv' dan 'test_dataset.csv' ada. Detail: {e}")
    exit()

# --- 2. Persiapan Data ---
train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
X_train = train_df.drop('dropout_rate_percent', axis=1)
y_train = train_df['dropout_rate_percent']
X_test = test_df.copy()
categorical_features_pre_fe = X_train.select_dtypes(include=['object']).columns.tolist()

# --- 3. Definisi Feature Engineering Transformer ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_features=None):
        self.include_features = include_features if include_features is not None else []

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()
        # Selalu buat semua fitur yang mungkin dibutuhkan oleh pipeline
        df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1e-6)
        df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)
        df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']
        return df

# --- 4. Membangun Pipeline Spesifik ---
print("Membangun pipeline dengan konfigurasi yang ditentukan...")

# Konfigurasi yang ditentukan
specific_features = [
    'is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher',
    'low_income_minority_interaction', 'score_to_funding_ratio', 'minority_to_internet_gap'
]

# Tentukan fitur numerik berdasarkan data awal + fitur rekayasa
# Ini cara aman untuk memastikan semua kolom ada
all_possible_new_features = [
    'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio',
    'is_high_school', 'is_middle_school', 'is_elementary_school', 'minority_to_internet_gap'
]
initial_numerical_features = X_train.select_dtypes(include=np.number).columns.drop('id', errors='ignore').tolist()
final_numerical_features = initial_numerical_features + all_possible_new_features


numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_features_to_encode = [col for col in categorical_features_pre_fe if col != 'school_name']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, final_numerical_features),
        ('cat', categorical_transformer, categorical_features_to_encode)
    ],
    remainder='drop' # Hanya gunakan fitur yang sudah didefinisikan
)

# Definisikan pipeline final
final_pipeline = Pipeline(steps=[
    ('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=specific_features)),
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

# --- 5. Melatih Model dan Membuat Prediksi ---
print("Melatih model final...")
final_pipeline.fit(X_train, y_train)

print("Membuat prediksi pada data test...")
final_predictions = final_pipeline.predict(X_test)

# --- 6. Membuat File Submission ---
submission_df = pd.DataFrame({'id': X_test['id'], 'dropout_rate_percent': final_predictions})
submission_df.to_csv('submission.csv', index=False)
print("\nFile 'submission.csv' berhasil dibuat dengan konfigurasi terbaik.")

Data berhasil dimuat.
Membangun pipeline dengan konfigurasi yang ditentukan...
Melatih model final...
Membuat prediksi pada data test...

File 'submission.csv' berhasil dibuat dengan konfigurasi terbaik.


In [None]:
# --- GradientBoosting 3.6661 Ensemble Optuna ---
!pip install optuna -q

import pandas as pd
import numpy as np
import warnings
import itertools
import optuna
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

# --- 1. Setup Awal ---
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- 2. Memuat Data ---
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
    print("Data berhasil dimuat.")
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Detail: {e}")
    exit()

# --- 3. Persiapan Data ---
train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
X_train = train_df.drop('dropout_rate_percent', axis=1)
y_train = train_df['dropout_rate_percent']
X_test = test_df.copy()
categorical_features_pre_fe = X_train.select_dtypes(include=['object']).columns.tolist()

# --- 4. Definisi Feature Engineering Transformer (Tetap Sama) ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_features=None):
        self.include_features = include_features if include_features is not None else []
    def fit(self, X, y=None): return self
    def transform(self, X, y=None):
        df = X.copy()
        # ... (Kode FE lengkap disembunyikan untuk keringkasan, tapi tetap sama seperti sebelumnya)
        if 'funding_per_teacher' in self.include_features: df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        if 'low_income_minority_interaction' in self.include_features: df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        if 'score_to_funding_ratio' in self.include_features: df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1e-6)
        if 'internet_to_income_ratio' in self.include_features: df['internet_to_income_ratio'] = df['internet_access_percent'] / (df['percent_low_income'] + 1e-6)
        if 'is_high_school' in self.include_features: df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        if 'is_middle_school' in self.include_features: df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        if 'is_elementary_school' in self.include_features: df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)
        if 'teacher_load' in self.include_features: df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1 + 1e-6)
        if 'adjusted_funding' in self.include_features: df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)
        if 'minority_to_internet_gap' in self.include_features: df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']
        return df

# --- 5. Mendefinisikan Fungsi & Konfigurasi Terbaik untuk Ensemble ---

# A. Fungsi untuk membuat pipeline secara dinamis
def create_pipeline(config):
    # Dapatkan daftar kolom numerik setelah FE
    temp_transformer = SelectableFeatureEngineeringTransformer(include_features=config['features'])
    temp_df = temp_transformer.transform(X_train.head())
    numerical_features_after_fe = temp_df.select_dtypes(include=np.number).columns.drop('id', errors='ignore').tolist()
    categorical_features_to_encode = [col for col in categorical_features_pre_fe if col != 'school_name']

    numerical_transformer = Pipeline(steps=[('imputer', config['imputer']), ('scaler', config['scaler'])])
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
    preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features_after_fe), ('cat', categorical_transformer, categorical_features_to_encode)], remainder='drop')

    return Pipeline(steps=[
        ('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=config['features'])),
        ('preprocessor', preprocessor),
        ('regressor', config['model'])
    ])

# B. Top 3 Konfigurasi dari hasil Brute Force
top_configs = [
    {
        'name': 'Config_1_Best',
        'features': ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio', 'minority_to_internet_gap'],
        'imputer': SimpleImputer(strategy='mean'),
        'scaler': MinMaxScaler(),
        'model': GradientBoostingRegressor(random_state=42)
    },
    {
        'name': 'Config_2_Variant',
        'features': ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio', 'minority_to_internet_gap'],
        'imputer': SimpleImputer(strategy='mean'),
        'scaler': StandardScaler(),
        'model': GradientBoostingRegressor(random_state=42)
    },
    {
        'name': 'Config_3_FeatureVariant',
        'features': ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio', 'teacher_load', 'minority_to_internet_gap'],
        'imputer': SimpleImputer(strategy='mean'),
        'scaler': MinMaxScaler(),
        'model': GradientBoostingRegressor(random_state=42)
    }
]

# --- 6. Hyperparameter Tuning dengan Optuna pada Konfigurasi Terbaik ---
print("Memulai Hyperparameter Tuning dengan Optuna...")

def objective(trial):
    # Definisikan hyperparameter yang akan di-tuning
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # Buat pipeline dengan parameter dari Optuna
    config = top_configs[0].copy()
    config['model'].set_params(**params)
    pipeline = create_pipeline(config)

    # Lakukan cross-validation dan kembalikan skor MAE
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    return -score.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Tuning Optuna selesai.")
print(f"MAE terbaik dari CV: {study.best_value:.4f}")
print("Parameter terbaik:", study.best_params)

# --- 7. Melatih Ensemble & Membuat Prediksi ---
print("\nMelatih model-model dalam ensemble...")
all_predictions = []

# Model 1 (Terbaik + Tuned)
tuned_config = top_configs[0].copy()
tuned_config['model'].set_params(**study.best_params)
pipeline_1 = create_pipeline(tuned_config)
pipeline_1.fit(X_train, y_train)
preds_1 = pipeline_1.predict(X_test)
all_predictions.append(preds_1)
print(f"- Model 1 ({tuned_config['name']}) berhasil dilatih.")

# Model 2 dan 3 (Default)
for i, config in enumerate(top_configs[1:]):
    pipeline = create_pipeline(config)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    all_predictions.append(preds)
    print(f"- Model {i+2} ({config['name']}) berhasil dilatih.")

# --- 8. Blending dan Membuat Submission ---
print("\nMenggabungkan prediksi (blending)...")
ensemble_preds = np.mean(all_predictions, axis=0)

submission_df = pd.DataFrame({'id': X_test['id'], 'dropout_rate_percent': ensemble_preds})
submission_df.to_csv('submissionTes.csv', index=False)

print("\n========================================================")
print("File 'submissionTes.csv' berhasil dibuat dari model ensemble.")
print("========================================================")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m399.4/400.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hData berhasil dimuat.
Memulai Hyperparameter Tuning dengan Optuna...


  0%|          | 0/100 [00:00<?, ?it/s]

Tuning Optuna selesai.
MAE terbaik dari CV: 3.6661
Parameter terbaik: {'n_estimators': 127, 'learning_rate': 0.01011052240027097, 'max_depth': 3, 'subsample': 0.6126063479140466, 'min_samples_leaf': 5}

Melatih model-model dalam ensemble...
- Model 1 (Config_1_Best) berhasil dilatih.
- Model 2 (Config_2_Variant) berhasil dilatih.
- Model 3 (Config_3_FeatureVariant) berhasil dilatih.

Menggabungkan prediksi (blending)...

File 'submissionTes.csv' berhasil dibuat dari model ensemble.


In [None]:
# --- GradientBoosting 3.6587 non-Ensemble (pake malah Rusak) Grid Search CV ---

import pandas as pd
import numpy as np
import warnings
# import optuna # Not needed
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

# --- 1. Setup Awal ---
warnings.filterwarnings('ignore')
# optuna.logging.set_verbosity(optuna.logging.WARNING) # Not needed

# --- 2. Memuat Data ---
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
    print("Data berhasil dimuat.")
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Detail: {e}")
    exit()

# --- 3. Persiapan Data ---
train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
X_train = train_df.drop('dropout_rate_percent', axis=1)
y_train = train_df['dropout_rate_percent']
X_test = test_df.copy()
categorical_features_pre_fe = X_train.select_dtypes(include=['object']).columns.tolist()

# --- 4. Definisi Feature Engineering Transformer (Tetap Sama) ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_features=None):
        self.include_features = include_features if include_features is not None else []
    def fit(self, X, y=None): return self
    def transform(self, X, y=None):
        df = X.copy()
        if 'funding_per_teacher' in self.include_features: df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        if 'low_income_minority_interaction' in self.include_features: df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        if 'score_to_funding_ratio' in self.include_features: df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1e-6)
        if 'internet_to_income_ratio' in self.include_features: df['internet_access_percent'] / (df['percent_low_income'] + 1e-6) # Fixed potential division by zero
        if 'is_high_school' in self.include_features: df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        if 'is_middle_school' in self.include_features: df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        if 'is_elementary_school' in self.include_features: df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)
        if 'teacher_load' in self.include_features: df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1 + 1e-6) # Fixed potential division by zero
        if 'adjusted_funding' in self.include_features: df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)
        if 'minority_to_internet_gap' in self.include_features: df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']
        return df

# --- 5. Mendefinisikan Pipeline dan Grid untuk Tuning ---

# A. Konfigurasi Terbaik dari hasil Brute Force (sesuaikan jika hasil brute force Anda berbeda)
# Menggunakan konfigurasi terbaik yang sering muncul: GradientBoosting, MeanImputer, MinMaxScaler, dengan fitur spesifik
base_config = {
    'features': ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio', 'minority_to_internet_gap'],
    'imputer': SimpleImputer(strategy='mean'),
    'scaler': MinMaxScaler(),
}

# B. Buat pipeline dasar dengan konfigurasi terbaik
# Dapatkan daftar kolom numerik setelah FE untuk konfigurasi dasar
temp_transformer = SelectableFeatureEngineeringTransformer(include_features=base_config['features'])
temp_df = temp_transformer.transform(X_train.head())
numerical_features_after_fe = temp_df.select_dtypes(include=np.number).columns.drop('id', errors='ignore').tolist()
categorical_features_to_encode = [col for col in categorical_features_pre_fe if col != 'school_name']

numerical_transformer = Pipeline(steps=[('imputer', base_config['imputer']), ('scaler', base_config['scaler'])])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]) # Assuming OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_after_fe),
        ('cat', categorical_transformer, categorical_features_to_encode)
    ],
    remainder='drop'
)

# Definisikan pipeline final untuk tuning
pipeline_to_tune = Pipeline(steps=[
    ('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=base_config['features'])),
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)) # Model to tune
])


# C. Tentukan Grid Hyperparameter untuk GridSearchCV
# Sesuaikan grid ini berdasarkan parameter yang ingin Anda uji untuk GradientBoostingRegressor
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# --- 6. Hyperparameter Tuning dengan GridSearchCV ---
print("Memulai Hyperparameter Tuning dengan GridSearchCV...")

# Gunakan KFold Cross-Validation
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)


grid_search = GridSearchCV(
    pipeline_to_tune,
    param_grid,
    cv=cv_strategy,
    scoring='neg_mean_absolute_error', # Menggunakan MAE sebagai metrik
    n_jobs=-1, # Gunakan semua core CPU
    verbose=2 # Tampilkan detail proses
)

# Lakukan tuning pada SELURUH data training yang sudah dibersihkan targetnya
grid_search.fit(X_train, y_train)

print("Tuning GridSearchCV selesai.")
print(f"MAE terbaik dari CV: {-grid_search.best_score_:.4f}") # Negate score to get positive MAE
print("Parameter terbaik:", grid_search.best_params_)

# --- 7. Melatih Model Final & Membuat Prediksi ---
print("\nMelatih model final dengan parameter terbaik...")
best_model = grid_search.best_estimator_

# Latih model terbaik pada SELURUH data training
best_model.fit(X_train, y_train)

print("Membuat prediksi pada data test...")
final_predictions = best_model.predict(X_test)

# --- 8. Membuat File Submission ---
submission_df = pd.DataFrame({'id': X_test['id'], 'dropout_rate_percent': final_predictions})
submission_df.to_csv('submission_gridsearch_tuned.csv', index=False)

print("\n========================================================")
print("File 'submission_gridsearch_tuned.csv' berhasil dibuat dari model hasil tuning GridSearchCV.")
print("========================================================")

Data berhasil dimuat.
Memulai Hyperparameter Tuning dengan GridSearchCV...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Tuning GridSearchCV selesai.
MAE terbaik dari CV: 3.6587
Parameter terbaik: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__min_samples_leaf': 2, 'regressor__n_estimators': 100, 'regressor__subsample': 0.8}

Melatih model final dengan parameter terbaik...
Membuat prediksi pada data test...

File 'submission_gridsearch_tuned.csv' berhasil dibuat dari model hasil tuning GridSearchCV.


In [None]:
# --- Grid Search CV 1000 estimators ---
# !pip install optuna -q # Optuna not needed for this cell

import pandas as pd
import numpy as np
import warnings
# import optuna # Not needed
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

# --- 1. Setup Awal ---
warnings.filterwarnings('ignore')
# optuna.logging.set_verbosity(optuna.logging.WARNING) # Not needed

# --- 2. Memuat Data ---
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
    print("Data berhasil dimuat.")
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Detail: {e}")
    exit()

# --- 3. Persiapan Data ---
train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
X_train = train_df.drop('dropout_rate_percent', axis=1)
y_train = train_df['dropout_rate_percent']
X_test = test_df.copy()
categorical_features_pre_fe = X_train.select_dtypes(include=['object']).columns.tolist()

# --- 4. Definisi Feature Engineering Transformer (Tetap Sama) ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, include_features=None):
        self.include_features = include_features if include_features is not None else []
    def fit(self, X, y=None): return self
    def transform(self, X, y=None):
        df = X.copy()
        if 'funding_per_teacher' in self.include_features: df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        if 'low_income_minority_interaction' in self.include_features: df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        if 'score_to_funding_ratio' in self.include_features: df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1e-6)
        if 'internet_to_income_ratio' in self.include_features: df['internet_access_percent'] / (df['percent_low_income'] + 1e-6) # Fixed potential division by zero
        if 'is_high_school' in self.include_features: df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        if 'is_middle_school' in self.include_features: df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        if 'is_elementary_school' in self.include_features: df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)
        if 'teacher_load' in self.include_features: df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1 + 1e-6) # Fixed potential division by zero
        if 'adjusted_funding' in self.include_features: df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)
        if 'minority_to_internet_gap' in self.include_features: df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']
        return df

# --- 5. Mendefinisikan Pipeline dan Grid untuk Tuning ---

# A. Konfigurasi Terbaik dari hasil Brute Force (sesuaikan jika hasil brute force Anda berbeda)
# Menggunakan konfigurasi terbaik yang sering muncul: GradientBoosting, MeanImputer, MinMaxScaler, dengan fitur spesifik
base_config = {
    'features': ['is_high_school', 'is_middle_school', 'is_elementary_school', 'funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio', 'minority_to_internet_gap'],
    'imputer': SimpleImputer(strategy='mean'),
    'scaler': MinMaxScaler(),
}

# B. Buat pipeline dasar dengan konfigurasi terbaik
# Dapatkan daftar kolom numerik setelah FE untuk konfigurasi dasar
temp_transformer = SelectableFeatureEngineeringTransformer(include_features=base_config['features'])
temp_df = temp_transformer.transform(X_train.head())
numerical_features_after_fe = temp_df.select_dtypes(include=np.number).columns.drop('id', errors='ignore').tolist()
categorical_features_to_encode = [col for col in categorical_features_pre_fe if col != 'school_name']

numerical_transformer = Pipeline(steps=[('imputer', base_config['imputer']), ('scaler', base_config['scaler'])])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]) # Assuming OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_after_fe),
        ('cat', categorical_transformer, categorical_features_to_encode)
    ],
    remainder='drop'
)

# Definisikan pipeline final untuk tuning
pipeline_to_tune = Pipeline(steps=[
    ('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=base_config['features'])),
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)) # Model to tune
])


# C. Tentukan Grid Hyperparameter untuk GridSearchCV
# Sesuaikan grid ini berdasarkan parameter yang ingin Anda uji untuk GradientBoostingRegressor
param_grid = {
    'regressor__n_estimators': [1000],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# --- 6. Hyperparameter Tuning dengan GridSearchCV ---
print("Memulai Hyperparameter Tuning dengan GridSearchCV...")

# Gunakan KFold Cross-Validation
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)


grid_search = GridSearchCV(
    pipeline_to_tune,
    param_grid,
    cv=cv_strategy,
    scoring='neg_mean_absolute_error', # Menggunakan MAE sebagai metrik
    n_jobs=-1, # Gunakan semua core CPU
    verbose=2 # Tampilkan detail proses
)

# Lakukan tuning pada SELURUH data training yang sudah dibersihkan targetnya
grid_search.fit(X_train, y_train)

print("Tuning GridSearchCV selesai.")
print(f"MAE terbaik dari CV: {-grid_search.best_score_:.4f}") # Negate score to get positive MAE
print("Parameter terbaik:", grid_search.best_params_)

# --- 7. Melatih Model Final & Membuat Prediksi ---
print("\nMelatih model final dengan parameter terbaik...")
best_model = grid_search.best_estimator_

# Latih model terbaik pada SELURUH data training
best_model.fit(X_train, y_train)

print("Membuat prediksi pada data test...")
final_predictions = best_model.predict(X_test)

# --- 8. Membuat File Submission ---
submission_df = pd.DataFrame({'id': X_test['id'], 'dropout_rate_percent': final_predictions})
submission_df.to_csv('submission_gridsearch1k_tuned.csv', index=False)

print("\n========================================================")
print("File 'submission_gridsearch1k_tuned.csv' berhasil dibuat dari model hasil tuning GridSearchCV.")
print("========================================================")

Data berhasil dimuat.
Memulai Hyperparameter Tuning dengan GridSearchCV...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Tuning GridSearchCV selesai.
MAE terbaik dari CV: 3.7977
Parameter terbaik: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__min_samples_leaf': 1, 'regressor__n_estimators': 1000, 'regressor__subsample': 0.8}

Melatih model final dengan parameter terbaik...
Membuat prediksi pada data test...

File 'submission_gridsearch1k_tuned.csv' berhasil dibuat dari model hasil tuning GridSearchCV.


In [None]:
# --- Grid Search CV (XGBoost vs LightGBM) + Sequential Feature Selector (SFS) (GPU Enabled) ---
import pandas as pd
import numpy as np
import warnings
import itertools # Need itertools for correct combination calculation
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor # Keep import but won't be used in final pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, MaxAbsScaler, QuantileTransformer, PowerTransformer # Added Scalers
from sklearn.compose import ColumnTransformer, make_column_selector # Added make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.feature_selection import SequentialFeatureSelector # Import SFS

# Import GPU-enabled models and DMatrix
import xgboost as xgb
import lightgbm as lgb # LightGBM also has GPU support


warnings.filterwarnings('ignore')

# --- Definisi Feature Engineering Transformer (Semua Fitur Potensial) ---
# This transformer creates ALL potential FE features.
class AllFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        df = X.copy()
        # --- ALL Feature Engineering steps ---
        df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1e-6)
        df['internet_to_income_ratio'] = df['internet_access_percent'] / (df['percent_low_income'] + 1e-6)
        df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)
        df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1 + 1e-6)
        df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)
        df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']

        return df

# --- 1. Memuat Data ---
try:
    # Assuming data path is still '/kaggle/input/tes-binus/' as per user's previous input
    data_path = '/kaggle/input/tes-binus/'
    train_df = pd.read_csv(data_path + 'train_dataset.csv')
    test_df = pd.read_csv(data_path + 'test_dataset.csv')
    print(f"Data berhasil dimuat dari {data_path}.")
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file dari {data_path}. Detail: {e}")
    # Fallback to local path if Kaggle path fails
    try:
        train_df = pd.read_csv('train_dataset.csv')
        test_df = pd.read_csv('test_dataset.csv')
        print("Data berhasil dimuat dari local path.")
    except FileNotFoundError as e_local:
        print(f"Error: Gagal memuat file dari local path juga. Detail: {e_local}")
        exit()


# Prepare training and test data
train_df.dropna(subset=['dropout_rate_percent'], inplace=True)
X_train = train_df.drop('dropout_rate_percent', axis=1)
y_train = train_df['dropout_rate_percent']
X_test = test_df.copy() # Keep original for final prediction

# Ensure school_name is string type before FE
X_train[['school_name']] = X_train[['school_name']].astype(str)
X_test[['school_name']] = X_test[['school_name']].astype(str)


# --- 2. Apply ALL Potential Feature Engineering ---
print("\nMenambahkan semua fitur hasil Feature Engineering potensial...")
# Apply the transformer that creates all potential FE features
fe_transformer_all = AllFeatureEngineeringTransformer()
X_train_fe_all = fe_transformer_all.fit_transform(X_train)
X_test_fe_all = fe_transformer_all.transform(X_test)

print(f"Jumlah fitur setelah menambahkan semua fitur FE potensial: {X_train_fe_all.shape[1]}")


# --- 3. Define Preprocessing steps (Imputation and Encoding) BEFORE SFS ---
# This preprocessor will handle imputation and encoding for SFS input.
imputer_step_pre_sfs_num = SimpleImputer(strategy='mean') # Imputer for numerical features
imputer_step_pre_sfs_cat = SimpleImputer(strategy='most_frequent') # Imputer for categorical features
encoder_step_pre_sfs = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # Encoder for categorical features

# Identify numerical and categorical features AFTER applying ALL FE
# These are the features that will be fed into this pre-SFS preprocessor
numerical_features_all_fe = make_column_selector(dtype_include=np.number)(X_train_fe_all)
# Exclude 'id' from numerical features if present
numerical_features_all_fe = [col for col in numerical_features_all_fe if col not in ['id']]

categorical_features_all_fe = make_column_selector(dtype_exclude=np.number)(X_train_fe_all)
# Include 'school_name' here if it's a string column that needs encoding
# If school_name is already used in FE to create numerical features, it might not be in dtype_exclude,
# but let's explicitly handle it if it is still a string/object column.
# Assuming 'school_name', 'state', 'school_type', 'grade_level' are the original categorical columns
# and any new categorical features created by FE (like the is_school_type ones) are already numerical (int).
original_categorical_features = ['school_name', 'state', 'school_type', 'grade_level'] # Assuming these are the original ones
# Filter to only include those that are still in X_train_fe_all and are of object/string type
categorical_features_to_encode_pre_sfs = [col for col in original_categorical_features if col in X_train_fe_all.columns and X_train_fe_all[col].dtype == 'object']


preprocessor_pre_sfs = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[('imputer', imputer_step_pre_sfs_num)]), numerical_features_all_fe), # Only impute numerical for now
        ('cat', Pipeline(steps=[('imputer', imputer_step_pre_sfs_cat), ('encoder', encoder_step_pre_sfs)]), categorical_features_to_encode_pre_sfs) # Impute and encode categorical
    ],
    remainder='passthrough' # Pass through features not explicitly handled (like the new numerical FE features)
)

# Apply pre-SFS preprocessing to the data
print("\nMenerapkan preprocessing (Imputasi & Encoding Kategorikal) sebelum SFS...")
X_train_processed_pre_sfs = preprocessor_pre_sfs.fit_transform(X_train_fe_all, y_train)


# Revised Preprocessing Pipeline steps BEFORE SFS:
# 1. Impute and Encode Categoricals (using OneHotEncoder)
# 2. Impute Numerical (using MeanImputer)
# 3. Combine numerical and encoded categorical features
# 4. Scale All Numerical Features

# Identify numerical and categorical features AFTER applying ALL FE
numerical_features_all_fe = make_column_selector(dtype_include=np.number)(X_train_fe_all)
numerical_features_all_fe = [col for col in numerical_features_all_fe if col not in ['id']] # Exclude 'id'

categorical_features_all_fe = make_column_selector(dtype_exclude=np.number)(X_train_fe_all)
# Assume these are the columns that need encoding (state, school_type, grade_level, school_name if still object)
# Let's be explicit based on original data:
original_categorical_features = ['school_name', 'state', 'school_type', 'grade_level']
# Filter to only include those that are still object/string type after FE
categorical_features_to_encode_pre_sfs = [col for col in original_categorical_features if col in X_train_fe_all.columns and X_train_fe_all[col].dtype == 'object']
# The numerical FE features like is_high_school are already numerical and don't need encoding


# Define preprocessor that encodes categoricals and passes through numericals
preprocessor_encode_cat = ColumnTransformer(
    transformers=[
        ('cat_encode', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), categorical_features_to_encode_pre_sfs),
        ('num_passthrough', 'passthrough', numerical_features_all_fe) # Pass through numerical features
    ],
    remainder='drop' # Drop any other columns (like 'id' or original categorical if not in list)
)

# Apply the encoding step
print("\nMenerapkan Encoding Kategorikal sebelum SFS...")
X_train_encoded = preprocessor_encode_cat.fit_transform(X_train_fe_all)

# Now, X_train_encoded is a numpy array. We need to impute and scale.
# Impute all numerical features (original numerical + new FE numerical + encoded categorical)
imputer_all_num = SimpleImputer(strategy='mean') # Use mean imputer for simplicity before SFS
X_train_imputed = imputer_all_num.fit_transform(X_train_encoded)

# Scale all numerical features
scaler_all_num = MinMaxScaler() # Use MinMaxScaler for simplicity before SFS
X_train_scaled_pre_sfs = scaler_all_num.fit_transform(X_train_imputed)


# X_train_scaled_pre_sfs is the data that will be fed into SFS
# It's a numpy array. SFS works directly on numpy arrays.


# --- 4. Define Base Model for SFS (GPU Enabled) ---
# Use a model that SFS will use to evaluate feature subsets. XGBoost with GPU support.
# Use default parameters or parameters known to work reasonably well for evaluation speed.
# The base model for SFS should work directly on the preprocessed numerical data.
base_model_for_sfs = xgb.XGBRegressor(random_state=42, n_estimators=100, learning_rate=0.1, max_depth=3, device='cuda') # XGBoost with GPU (updated)

# --- 5. Apply SequentialFeatureSelector (SFS) ---

# Determine the number of features to select (n_features_to_select)
# Set to 'auto' to select the number of features maximizing the CV score.
n_features_to_select = 'auto' # Or a fixed number, e.g., 20

# --- Forward Selection ---
print("\nMelakukan Forward Feature Selection (SFS)...")
# SFS fits directly on the preprocessed numerical data
sfs_forward = SequentialFeatureSelector(
    base_model_for_sfs, # The base model to evaluate feature subsets
    n_features_to_select=n_features_to_select,
    direction='forward',
    scoring='neg_mean_absolute_error', # Use MAE
    cv=5, # Cross-validation folds
    n_jobs=-1 # Use all cores
)

# Fit SFS on the preprocessed numerical data
sfs_forward.fit(X_train_scaled_pre_sfs, y_train)

# Need to map the selected feature indices back to original/FE feature names
# This is complex because of the OneHotEncoder and passthrough.
# For simplicity, let's just get the indices and work with the numpy arrays after SFS.
selected_feature_indices_forward = sfs_forward.get_support(indices=True)
print(f"Fitur terpilih oleh Forward SFS ({len(selected_feature_indices_forward)} fitur - berdasarkan indeks setelah preprocessing):")
print(selected_feature_indices_forward)
# Note: Interpreting these indices w.r.t original/FE feature names is hard here.


# --- Backward Selection ---
print("\nMelakukan Backward Feature Selection (SFS)...")
# SFS fits directly on the preprocessed numerical data
sfs_backward = SequentialFeatureSelector(
    base_model_for_sfs, # The base model to evaluate feature subsets
    n_features_to_select=n_features_to_select,
    direction='backward',
    scoring='neg_mean_absolute_error', # Use MAE
    cv=5, # Cross-validation folds
    n_jobs=-1 # Use all cores
)

# Fit SFS on the preprocessed numerical data
sfs_backward.fit(X_train_scaled_pre_sfs, y_train)

selected_feature_indices_backward = sfs_backward.get_support(indices=True)
print(f"Fitur terpilih oleh Backward SFS ({len(selected_feature_indices_backward)} fitur - berdasarkan indeks setelah preprocessing):")
print(selected_feature_indices_backward)


# --- Choose the best set of features (e.g., based on size or manual choice) ---
# For simplicity, let's use the features selected by Backward SFS (indices) for the next step.
selected_feature_indices_final = selected_feature_indices_backward
print(f"\nMenggunakan {len(selected_feature_indices_final)} fitur terpilih (dari Backward SFS indices) untuk tuning GridSearchCV.")


# --- 6. Prepare Data with SFS-Selected Features for GridSearchCV ---
# We need to select columns by index from the fully preprocessed data
# Apply the same full preprocessor to the data
print("\nMenerapkan Preprocessing Lengkap (Encode, Impute, Scale) pada data setelah FE...")
# Redefine the full preprocessor to ensure it's fit on the full data
# Identify numerical and categorical features AFTER applying ALL FE
numerical_features_all_fe = make_column_selector(dtype_include=np.number)(X_train_fe_all)
numerical_features_all_fe = [col for col in numerical_features_all_fe if col not in ['id']] # Exclude 'id'

categorical_features_all_fe = make_column_selector(dtype_exclude=np.number)(X_train_fe_all)
original_categorical_features = ['school_name', 'state', 'school_type', 'grade_level'] # Assuming these are the original ones
categorical_features_to_encode = [col for col in original_categorical_features if col in X_train_fe_all.columns and X_train_fe_all[col].dtype == 'object']


full_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), categorical_features_to_encode),
        ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', MinMaxScaler())]), numerical_features_all_fe) # Impute and scale numericals
    ],
    remainder='drop' # Drop columns not explicitly handled
)

X_train_processed_full = full_preprocessor.fit_transform(X_train_fe_all, y_train)
X_test_processed_full = full_preprocessor.transform(X_test_fe_all)


# Select columns by index from the fully preprocessed data
X_train_sfs_selected = X_train_processed_full[:, selected_feature_indices_final]
X_test_sfs_selected = X_test_processed_full[:, selected_feature_indices_final]

print(f"Data training siap untuk GridSearchCV dengan {X_train_sfs_selected.shape[1]} fitur terpilih.")


# --- 7. Define Pipeline for GridSearchCV (using SFS-Selected Features) ---
# Define the final pipeline to tune with GridSearchCV.
# This pipeline takes the SFS-selected data (which is already preprocessed) as input.
pipeline_to_tune = Pipeline(steps=[
    # No selector needed here, as the data is already selected
    ('regressor', 'passthrough') # Placeholder, will be replaced by models in param_grid
])


# --- 8. Tentukan Grid Hyperparameter & Models untuk GridSearchCV ---
# Define the parameter grid for GridSearchCV, including different models.
# We will tune hyperparameters for BOTH XGBoost and LightGBM.

# Define models with their GPU parameters
xgb_gpu = xgb.XGBRegressor(random_state=42, device='cuda') # XGBoost with GPU (updated)
lgbm_gpu = lgb.LGBMRegressor(random_state=42, device='gpu') # Use 'gpu' for LightGBM GPU

# Define parameter grids for each model
param_grid_xgb = {
    'regressor': [xgb_gpu], # Specify the model instance
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.8, 0.9, 1.0]
}

param_grid_lgbm = {
    'regressor': [lgbm_gpu], # Specify the model instance
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7], # LightGBM uses max_depth differently, but this is a common param
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.8, 0.9, 1.0]
}

# Combine parameter grids
# Note: GridSearchCV will test ALL combinations within each grid.
# To test both models, we can create a list of parameter grids.
param_grid = [
    param_grid_xgb,
    param_grid_lgbm,
]

# Reduce grid size for faster testing if needed
param_grid_xgb_small = {
    'regressor': [xgb_gpu],
    'regressor__n_estimators': [100],
    'regressor__learning_rate': [0.05],
    'regressor__max_depth': [3],
    'regressor__subsample': [0.8], # Added subsample
    'regressor__colsample_bytree': [0.8] # Added colsample_bytree
}
param_grid_lgbm_small = {
    'regressor': [lgbm_gpu],
    'regressor__n_estimators': [100],
    'regressor__learning_rate': [0.05],
    'regressor__max_depth': [3],
    'regressor__subsample': [0.8], # Added subsample
    'regressor__colsample_bytree': [0.8] # Added colsample_bytree
}

param_grid = [
    param_grid_xgb_small,
    param_grid_lgbm_small,
]


# Calculate total combinations
total_combinations = 0
for grid in param_grid:
    # Calculate combinations within each grid, excluding the 'regressor' key itself
    grid_combinations = 1
    for key, values in grid.items():
        if key != 'regressor':
            grid_combinations *= len(values)
    # Add 1 for the model choice itself (either xgb_gpu or lgbm_gpu) - this is handled by the list of grids
    total_combinations += grid_combinations

print(f"Jumlah total kombinasi hyperparameter dan model yang akan diuji oleh GridSearchCV: {total_combinations}")


# --- 9. Hyperparameter Tuning dengan GridSearchCV ---
print("\nMemulai Hyperparameter Tuning dengan GridSearchCV pada fitur terpilih oleh SFS (data preproses lengkap), menguji XGBoost dan LightGBM...")

# Use KFold Cross-Validation
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)


grid_search = GridSearchCV(
    pipeline_to_tune,
    param_grid,
    cv=cv_strategy,
    scoring='neg_mean_absolute_error', # Menggunakan MAE sebagai metrik
    n_jobs=-1, # Gunakan semua core CPU
    verbose=2 # Tampilkan detail proses
)

# Lakukan tuning pada data training *setelah* dipreproses lengkap DAN *setelah* SFS selection
grid_search.fit(X_train_sfs_selected, y_train)

print("\nTuning GridSearchCV selesai.")
print(f"MAE terbaik dari CV: {-grid_search.best_score_:.4f}") # Negate score to get positive MAE
print("Parameter terbaik:", grid_search.best_params_)
print(f"Model terbaik: {type(grid_search.best_estimator_.named_steps['regressor']).__name__}")


# --- 10. Melatih Model Final & Membuat Prediksi ---
print("\nMelatih model final dengan parameter terbaik pada fitur terpilih...")
# The best_estimator_ from GridSearchCV is the pipeline with the best model and parameters.
best_model_sfs_tuned = grid_search.best_estimator_

# The best_model_sfs_tuned is already trained by grid_search.fit.
# We just need to use it for prediction.


print("Membuat prediksi pada data test menggunakan fitur terpilih...")

# Apply the same full preprocessor to the test data
# X_test_processed_full was already created after applying full FE to X_test
# Select the SFS-selected features from the fully preprocessed test data
X_test_sfs_selected = X_test_processed_full[:, selected_feature_indices_final]


# --- Fix for XGBoost GPU prediction warning ---
# Check if the best model is XGBoost and move test data to DMatrix on GPU
if isinstance(best_model_sfs_tuned.named_steps['regressor'], xgb.XGBRegressor):
    print("Menggunakan XGBoost untuk prediksi. Mengonversi data test ke DMatrix pada GPU...")
    # Access the trained XGBoost model within the pipeline
    trained_xgb_model = best_model_sfs_tuned.named_steps['regressor']
    # Convert the selected test data to DMatrix on the correct device
    dtest = xgb.DMatrix(X_test_sfs_selected, enable_categorical=False) # Assuming data is numerical/encoded
    # Ensure DMatrix is on the same device as the model if needed, though device='cuda' should handle it
    # dtest.set_device(trained_xgb_model.get_params()['device']) # Not needed with device='cuda' in DMatrix

    final_predictions_sfs_tuned = trained_xgb_model.predict(dtest)
    print("Prediksi XGBoost selesai.")

elif isinstance(best_model_sfs_tuned.named_steps['regressor'], lgb.LGBMRegressor):
     print("Menggunakan LightGBM untuk prediksi.")
     # LightGBM's predict method often handles data transfer internally when device='gpu' is set.
     final_predictions_sfs_tuned = best_model_sfs_tuned.predict(X_test_sfs_selected)
     print("Prediksi LightGBM selesai.")

else:
     print("Menggunakan model non-GPU untuk prediksi.")
     final_predictions_sfs_tuned = best_model_sfs_tuned.predict(X_test_sfs_selected)
     print("Prediksi model non-GPU selesai.")


# --- 11. Membuat File Submission ---
submission_sfs_tuned_df = pd.DataFrame({'id': X_test['id'], 'dropout_rate_percent': final_predictions_sfs_tuned}) # Use original test IDs
submission_sfs_tuned_df.to_csv('submission_sfs_gridsearch_tuned_xgb_lgbm.csv', index=False) # Unique file name
print("\n========================================================")
print("File 'submission_sfs_gridsearch_tuned_xgb_lgbm.csv' berhasil dibuat dari model hasil SFS + tuning GridSearchCV (XGB/LGBM).")
print("========================================================")

# --- 12. Evaluate Final Model ---
try:
    # Load the true values for evaluation
    kunjaw_df = pd.read_csv('/kaggle/input/tes-binus/KunJaw Predicted.csv')
    y_true = kunjaw_df['dropout_rate_percent']
    print("\nFile 'KunJaw Predicted.csv' berhasil dimuat untuk evaluasi final.")

    # Evaluate the final predictions
    if len(y_true) == len(final_predictions_sfs_tuned):
        final_mae_sfs_tuned = mean_absolute_error(y_true, final_predictions_sfs_tuned)
        print(f"Mean Absolute Error (MAE) Model Final SFS + GridSearchCV (XGB/LGBM) vs KunJaw: {final_mae_sfs_tuned:.4f}")
    else:
        print(f"Warning: Panjang prediksi final SFS + GridSearchCV (XGB/LGBM) ({len(final_predictions_sfs_tuned)}) tidak sesuai dengan KunJaw ({len(y_true)}). Tidak dapat menghitung MAE.")

except FileNotFoundError:
    print("\nError: File 'KunJaw Predicted.csv' tidak ditemukan. Tidak dapat melakukan evaluasi MAE final.")

Data berhasil dimuat.

Menambahkan semua fitur hasil Feature Engineering potensial...
Jumlah fitur setelah menambahkan semua fitur FE potensial: 20

Menerapkan preprocessing (Imputasi & Encoding Kategorikal) sebelum SFS...

Menerapkan Encoding Kategorikal sebelum SFS...

Melakukan Forward Feature Selection (SFS)...


KeyboardInterrupt: 