In [None]:
# GradientBoosting 3.7327  MinMaxScaler MeanImputer

import pandas as pd
import numpy as np
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Suppress every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# --- 1. Load the data ---
# Both CSV files are looked up relative to the current working directory.
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Pastikan 'train_dataset.csv' dan 'test_dataset.csv' ada. Detail: {e}")
    exit()
else:
    print("Data berhasil dimuat.")

# --- 2. Prepare the data ---
# Rows without a target value cannot be used for supervised training.
train_df = train_df.dropna(subset=['dropout_rate_percent'])

# Split features from target: pop() removes the target column from the copy
# and hands it back as a Series.
X_train = train_df.copy()
y_train = X_train.pop('dropout_rate_percent')
X_test = test_df.copy()

# Object-dtype columns, recorded before feature engineering adds anything.
categorical_features_pre_fe = list(X_train.select_dtypes(include='object').columns)

# --- 3. Definisi Feature Engineering Transformer ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends engineered school features.

    ``include_features`` is stored on the instance, but this version of
    ``transform`` does not consult it: every engineered column is always
    created so that downstream column selection can rely on all of them
    being present.
    """

    def __init__(self, include_features=None):
        # A missing selection is normalised to an empty list.
        if include_features is None:
            include_features = []
        self.include_features = include_features

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the seven engineered columns added.

        ``X`` must be a DataFrame containing the raw columns referenced
        below; the input frame itself is left unmodified.
        """
        def name_mentions(keyword):
            # Case-insensitive match on the school name; missing names -> 0.
            return X['school_name'].str.contains(keyword, case=False, na=False).astype(int)

        # assign() works on a copy and adds the columns in keyword order.
        return X.assign(
            funding_per_teacher=X['funding_per_student_usd'] * X['student_teacher_ratio'],
            low_income_minority_interaction=X['percent_low_income'] * X['percent_minority'],
            # The 1e-6 offset guards against division by zero funding.
            score_to_funding_ratio=X['avg_test_score_percent'] / (X['funding_per_student_usd'] + 1e-6),
            is_high_school=name_mentions('High'),
            is_middle_school=name_mentions('Middle'),
            is_elementary_school=name_mentions('Elementary'),
            minority_to_internet_gap=X['percent_minority'] - X['internet_access_percent'],
        )

# --- 4. Membangun Pipeline Spesifik ---
print("Membangun pipeline dengan konfigurasi yang ditentukan...")

# Feature names handed to the transformer as ``include_features``.
specific_features = [
    'is_high_school',
    'is_middle_school',
    'is_elementary_school',
    'funding_per_teacher',
    'low_income_minority_interaction',
    'score_to_funding_ratio',
    'minority_to_internet_gap',
]

# Engineered columns appended to the numeric feature list below; the order
# here fixes their column order in the preprocessed matrix.
all_possible_new_features = [
    'funding_per_teacher',
    'low_income_minority_interaction',
    'score_to_funding_ratio',
    'is_high_school',
    'is_middle_school',
    'is_elementary_school',
    'minority_to_internet_gap',
]

# Numeric columns of the raw training frame, minus 'id' when it exists.
initial_numerical_features = list(
    X_train.select_dtypes(include=np.number).columns.drop('id', errors='ignore')
)
final_numerical_features = [*initial_numerical_features, *all_possible_new_features]

# 'school_name' is excluded from one-hot encoding.
categorical_features_to_encode = [
    name for name in categorical_features_pre_fe if name != 'school_name'
]

# Numeric branch: mean imputation followed by min-max scaling.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: mode imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns not listed in either branch are dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, final_numerical_features),
        ('cat', categorical_transformer, categorical_features_to_encode),
    ],
    remainder='drop',
)

# Full pipeline: feature engineering -> preprocessing -> gradient boosting.
final_pipeline = Pipeline([
    ('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=specific_features)),
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

# --- 5. Melatih Model dan Membuat Prediksi ---
print("Melatih model final...")
final_pipeline.fit(X_train, y_train)

print("Membuat prediksi pada data test...")
final_predictions = final_pipeline.predict(X_test)

# --- 6. Write the submission file ---
# Pair each test-row id with its prediction and save without the index.
submission_df = X_test[['id']].assign(dropout_rate_percent=final_predictions)
submission_df.to_csv('submission.csv', index=False)
print("\nFile 'submission.csv' berhasil dibuat dengan konfigurasi terbaik.")

In [None]:
# --- GradientBoosting 3.6661 Ensemble Optuna ---
!pip install optuna -q

import itertools
import warnings

import numpy as np
import optuna
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from tqdm.notebook import tqdm

# --- 1. Setup Awal ---
# Silence Python warnings and restrict Optuna's logger to WARNING and above.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- 2. Load the data ---
# Both CSV files are looked up relative to the current working directory.
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Detail: {e}")
    exit()
else:
    print("Data berhasil dimuat.")

# --- 3. Prepare the data ---
# Rows without a target value cannot be used for supervised training.
train_df = train_df.dropna(subset=['dropout_rate_percent'])

# Split features from target: pop() removes the target column from the copy
# and hands it back as a Series.
X_train = train_df.copy()
y_train = X_train.pop('dropout_rate_percent')
X_test = test_df.copy()

# Object-dtype columns, recorded before feature engineering adds anything.
categorical_features_pre_fe = list(X_train.select_dtypes(include='object').columns)

# --- 4. Definisi Feature Engineering Transformer (Tetap Sama) ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds only the requested engineered features.

    ``include_features`` names the engineered columns to create. Names that
    are not listed are skipped, and names this class does not know about
    are ignored.
    """

    def __init__(self, include_features=None):
        # A missing selection is normalised to an empty list.
        self.include_features = [] if include_features is None else include_features

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected engineered columns added.

        Columns are created in the fixed order of the table below,
        independent of the order of ``include_features``.
        """
        def name_mentions(keyword):
            # Case-insensitive match on the school name; missing names -> 0.
            return lambda d: d['school_name'].str.contains(keyword, case=False, na=False).astype(int)

        # (column name, builder) pairs; the 1e-6 offsets guard against
        # division by zero.
        builders = (
            ('funding_per_teacher',
             lambda d: d['funding_per_student_usd'] * d['student_teacher_ratio']),
            ('low_income_minority_interaction',
             lambda d: d['percent_low_income'] * d['percent_minority']),
            ('score_to_funding_ratio',
             lambda d: d['avg_test_score_percent'] / (d['funding_per_student_usd'] + 1e-6)),
            ('internet_to_income_ratio',
             lambda d: d['internet_access_percent'] / (d['percent_low_income'] + 1e-6)),
            ('is_high_school', name_mentions('High')),
            ('is_middle_school', name_mentions('Middle')),
            ('is_elementary_school', name_mentions('Elementary')),
            ('teacher_load',
             lambda d: 1 / (d['student_teacher_ratio'] + 1 + 1e-6)),
            ('adjusted_funding',
             lambda d: d['funding_per_student_usd'] * (d['internet_access_percent'] / 100)),
            ('minority_to_internet_gap',
             lambda d: d['percent_minority'] - d['internet_access_percent']),
        )

        out = X.copy()
        for column, build in builders:
            if column in self.include_features:
                out[column] = build(out)
        return out

# --- 5. Mendefinisikan Fungsi & Konfigurasi Terbaik untuk Ensemble ---

# A. Fungsi untuk membuat pipeline secara dinamis
def create_pipeline(config):
    """Build an unfitted feature-engineering -> preprocessing -> regressor pipeline.

    Args:
        config: Mapping with the keys ``'features'`` (names handed to the
            feature-engineering transformer), ``'imputer'`` and ``'scaler'``
            (steps of the numeric branch) and ``'model'`` (final regressor).
            The estimator objects are inserted as-is, not copied.

    Returns:
        A ``Pipeline`` with the steps ``feature_engineering``,
        ``preprocessor`` and ``regressor``.

    Reads the module-level ``X_train`` and ``categorical_features_pre_fe``.
    """
    # Run the feature engineering on a handful of rows purely to discover
    # which numeric columns exist afterwards ('id' is excluded if present).
    probe = SelectableFeatureEngineeringTransformer(include_features=config['features'])
    preview = probe.transform(X_train.head())
    numeric_cols = list(
        preview.select_dtypes(include=np.number).columns.drop('id', errors='ignore')
    )
    # 'school_name' is excluded from one-hot encoding.
    encoded_cols = [name for name in categorical_features_pre_fe if name != 'school_name']

    numeric_branch = Pipeline([
        ('imputer', config['imputer']),
        ('scaler', config['scaler']),
    ])
    categorical_branch = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    # Columns not claimed by either branch are dropped.
    column_stage = ColumnTransformer(
        [
            ('num', numeric_branch, numeric_cols),
            ('cat', categorical_branch, encoded_cols),
        ],
        remainder='drop',
    )

    return Pipeline([
        ('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=config['features'])),
        ('preprocessor', column_stage),
        ('regressor', config['model']),
    ])

# B. Top 3 Konfigurasi dari hasil Brute Force
def _ensemble_config(name, features, scaler):
    """Assemble one ensemble-member config: mean imputation plus a default GBR."""
    return {
        'name': name,
        'features': features,
        'imputer': SimpleImputer(strategy='mean'),
        'scaler': scaler,
        'model': GradientBoostingRegressor(random_state=42),
    }


# Feature groups shared by all three configurations.
_school_level_flags = ['is_high_school', 'is_middle_school', 'is_elementary_school']
_interaction_features = ['funding_per_teacher', 'low_income_minority_interaction', 'score_to_funding_ratio']

# Three ensemble members: 1 and 2 differ only in the scaler, while 3 adds
# 'teacher_load'. Each '+' builds a fresh list, so no config shares a
# feature list with another.
top_configs = [
    _ensemble_config(
        'Config_1_Best',
        _school_level_flags + _interaction_features + ['minority_to_internet_gap'],
        MinMaxScaler(),
    ),
    _ensemble_config(
        'Config_2_Variant',
        _school_level_flags + _interaction_features + ['minority_to_internet_gap'],
        StandardScaler(),
    ),
    _ensemble_config(
        'Config_3_FeatureVariant',
        _school_level_flags + _interaction_features + ['teacher_load', 'minority_to_internet_gap'],
        MinMaxScaler(),
    ),
]

# --- 6. Hyperparameter Tuning dengan Optuna pada Konfigurasi Terbaik ---
print("Memulai Hyperparameter Tuning dengan Optuna...")


def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAE of the first config.

    Args:
        trial: The Optuna trial used to sample the gradient-boosting
            hyperparameters.

    Returns:
        The mean absolute error averaged over the folds (lower is better).
    """
    # Hyperparameters to tune and their search ranges.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # dict.copy() is shallow, so calling set_params() on the copied config
    # would mutate the estimator stored in the module-level ``top_configs``.
    # Give each trial its own clone of the base model instead.
    base = top_configs[0]
    config = {**base, 'model': clone(base['model']).set_params(**params)}
    pipeline = create_pipeline(config)

    # The scorer returns negated MAE; flip the sign to get a value to minimise.
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    return -scores.mean()


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

print("Tuning Optuna selesai.")
print(f"MAE terbaik dari CV: {study.best_value:.4f}")
print("Parameter terbaik:", study.best_params)

# --- 7. Melatih Ensemble & Membuat Prediksi ---
print("\nMelatih model-model dalam ensemble...")
all_predictions = []

# Model 1: first configuration with the Optuna-tuned hyperparameters.
# The model is cloned before set_params() so the estimator stored in the
# module-level ``top_configs`` is not modified (dict.copy() would only be a
# shallow copy and share the same estimator object).
tuned_config = {
    **top_configs[0],
    'model': clone(top_configs[0]['model']).set_params(**study.best_params),
}
pipeline_1 = create_pipeline(tuned_config)
pipeline_1.fit(X_train, y_train)
preds_1 = pipeline_1.predict(X_test)
all_predictions.append(preds_1)
print(f"- Model 1 ({tuned_config['name']}) berhasil dilatih.")

# Models 2 and 3: remaining configurations with their default hyperparameters.
for model_number, config in enumerate(top_configs[1:], start=2):
    pipeline = create_pipeline(config)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    all_predictions.append(preds)
    print(f"- Model {model_number} ({config['name']}) berhasil dilatih.")

# --- 8. Blending dan Membuat Submission ---
print("\nMenggabungkan prediksi (blending)...")
# Unweighted average of the per-model predictions, row by row.
ensemble_preds = np.asarray(all_predictions).mean(axis=0)

# Pair each test-row id with its blended prediction and save without the index.
submission_df = X_test[['id']].assign(dropout_rate_percent=ensemble_preds)
submission_df.to_csv('submissionTes.csv', index=False)

print("\n========================================================")
print("File 'submissionTes.csv' berhasil dibuat dari model ensemble.")
print("========================================================")

In [None]:
# --- GradientBoosting 3.6587, non-ensemble (ensembling made the score worse), GridSearchCV ---

import pandas as pd
import numpy as np
import warnings
# import optuna # Not needed
from tqdm.notebook import tqdm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

# --- 1. Setup Awal ---
# Suppress every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# --- 2. Load the data ---
# Both CSV files are looked up relative to the current working directory.
try:
    train_df = pd.read_csv('train_dataset.csv')
    test_df = pd.read_csv('test_dataset.csv')
except FileNotFoundError as e:
    print(f"Error: Gagal memuat file. Detail: {e}")
    exit()
else:
    print("Data berhasil dimuat.")

# --- 3. Prepare the data ---
# Rows without a target value cannot be used for supervised training.
train_df = train_df.dropna(subset=['dropout_rate_percent'])

# Split features from target: pop() removes the target column from the copy
# and hands it back as a Series.
X_train = train_df.copy()
y_train = X_train.pop('dropout_rate_percent')
X_test = test_df.copy()

# Object-dtype columns, recorded before feature engineering adds anything.
categorical_features_pre_fe = list(X_train.select_dtypes(include='object').columns)

# --- 4. Definisi Feature Engineering Transformer (Tetap Sama) ---
class SelectableFeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds only the requested engineered features.

    ``include_features`` names the engineered columns to create. Names that
    are not listed are skipped, and names this class does not know about
    are ignored.
    """

    def __init__(self, include_features=None):
        # A missing selection is normalised to an empty list.
        self.include_features = include_features if include_features is not None else []

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected engineered columns added.

        ``X`` must be a DataFrame containing the raw columns each selected
        feature reads; the input frame itself is left unmodified.
        """
        df = X.copy()
        wanted = self.include_features

        if 'funding_per_teacher' in wanted:
            df['funding_per_teacher'] = df['funding_per_student_usd'] * df['student_teacher_ratio']
        if 'low_income_minority_interaction' in wanted:
            df['low_income_minority_interaction'] = df['percent_low_income'] * df['percent_minority']
        if 'score_to_funding_ratio' in wanted:
            # The 1e-6 offset guards against division by zero funding.
            df['score_to_funding_ratio'] = df['avg_test_score_percent'] / (df['funding_per_student_usd'] + 1e-6)
        if 'internet_to_income_ratio' in wanted:
            # Bug fix: the ratio used to be computed and then discarded, so
            # this column was never created even when requested.
            df['internet_to_income_ratio'] = df['internet_access_percent'] / (df['percent_low_income'] + 1e-6)
        if 'is_high_school' in wanted:
            df['is_high_school'] = df['school_name'].str.contains('High', case=False, na=False).astype(int)
        if 'is_middle_school' in wanted:
            df['is_middle_school'] = df['school_name'].str.contains('Middle', case=False, na=False).astype(int)
        if 'is_elementary_school' in wanted:
            df['is_elementary_school'] = df['school_name'].str.contains('Elementary', case=False, na=False).astype(int)
        if 'teacher_load' in wanted:
            # The +1 and 1e-6 keep the denominator away from zero.
            df['teacher_load'] = 1 / (df['student_teacher_ratio'] + 1 + 1e-6)
        if 'adjusted_funding' in wanted:
            df['adjusted_funding'] = df['funding_per_student_usd'] * (df['internet_access_percent'] / 100)
        if 'minority_to_internet_gap' in wanted:
            df['minority_to_internet_gap'] = df['percent_minority'] - df['internet_access_percent']
        return df

# --- 5. Mendefinisikan Pipeline dan Grid untuk Tuning ---

# A. Konfigurasi Terbaik dari hasil Brute Force (sesuaikan jika hasil brute force Anda berbeda)
# Menggunakan konfigurasi terbaik yang sering muncul: GradientBoosting, MeanImputer, MinMaxScaler, dengan fitur spesifik
# Base configuration to tune: engineered feature selection, mean imputation
# and min-max scaling for the numeric branch.
base_config = dict(
    features=[
        'is_high_school',
        'is_middle_school',
        'is_elementary_school',
        'funding_per_teacher',
        'low_income_minority_interaction',
        'score_to_funding_ratio',
        'minority_to_internet_gap',
    ],
    imputer=SimpleImputer(strategy='mean'),
    scaler=MinMaxScaler(),
)

# Run the feature engineering on a handful of rows purely to discover which
# numeric columns exist afterwards ('id' is excluded if present).
temp_transformer = SelectableFeatureEngineeringTransformer(include_features=base_config['features'])
temp_df = temp_transformer.transform(X_train.head())
numerical_features_after_fe = list(
    temp_df.select_dtypes(include=np.number).columns.drop('id', errors='ignore')
)
# 'school_name' is excluded from one-hot encoding.
categorical_features_to_encode = [
    name for name in categorical_features_pre_fe if name != 'school_name'
]

# Numeric branch uses the imputer and scaler objects from the base config.
numerical_transformer = Pipeline([
    ('imputer', base_config['imputer']),
    ('scaler', base_config['scaler']),
])
# Categorical branch: mode imputation followed by dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns not claimed by either branch are dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features_after_fe),
        ('cat', categorical_transformer, categorical_features_to_encode),
    ],
    remainder='drop',
)

# Pipeline whose 'regressor' step is tuned by the grid search.
pipeline_to_tune = Pipeline([
    ('feature_engineering', SelectableFeatureEngineeringTransformer(include_features=base_config['features'])),
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])


# C. Tentukan Grid Hyperparameter untuk GridSearchCV
# Sesuaikan grid ini berdasarkan parameter yang ingin Anda uji untuk GradientBoostingRegressor
# Candidate values per hyperparameter; the 'regressor__' prefix routes each
# one to the pipeline step named 'regressor'.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# --- 6. Hyperparameter tuning with GridSearchCV ---
print("Memulai Hyperparameter Tuning dengan GridSearchCV...")

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by negated MAE, run on all CPU cores with
# per-fit progress output.
grid_search = GridSearchCV(
    estimator=pipeline_to_tune,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
)

# Search over the whole training set (rows with a missing target were
# already removed).
grid_search.fit(X_train, y_train)

print("Tuning GridSearchCV selesai.")
# best_score_ is negated MAE, so flip the sign for display.
print(f"MAE terbaik dari CV: {-grid_search.best_score_:.4f}")
print("Parameter terbaik:", grid_search.best_params_)

# --- 7. Melatih Model Final & Membuat Prediksi ---
print("\nMelatih model final dengan parameter terbaik...")
# The grid search is constructed without overriding ``refit``, which defaults
# to True: best_estimator_ has therefore already been refit on the full
# training set with the best parameters. Fitting it again would only repeat
# the same training pass, so it is used directly.
best_model = grid_search.best_estimator_

print("Membuat prediksi pada data test...")
final_predictions = best_model.predict(X_test)

# --- 8. Write the submission file ---
# Pair each test-row id with its prediction and save without the index.
submission_df = pd.DataFrame({'id': X_test['id'], 'dropout_rate_percent': final_predictions})
submission_df.to_csv('submission_gridsearch_tuned.csv', index=False)

print("\n========================================================")
print("File 'submission_gridsearch_tuned.csv' berhasil dibuat dari model hasil tuning GridSearchCV.")
print("========================================================")