In [3]:
# !pip install optuna

Defaulting to user installation because normal site-packages is not writeable


In [4]:
# --- NOTEBOOK 08: HYPERPARAMETER TUNING ---

import pandas as pd
import numpy as np
import optuna  
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import logging
import gc

# Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class Config:
    INPUT_PATH = 'train_full_merged.parquet'
    SEED = 42
    N_FOLDS = 5
    # We reduce rows for Tuning to speed it up (Optional but recommended)
    # If you have a powerful machine, set this to False
    SUBSAMPLE_FOR_TUNING = True 
    SUBSAMPLE_SIZE = 50000 

logger.info("Notebook 08 Initialized.")

2026-01-20 17:12:17,024 - INFO - Notebook 08 Initialized.


In [5]:
def objective(trial, X, y):
    # 1. Define the Search Space
    # Optuna will pick a value from these ranges
    param_grid = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_estimators': 2000,
        'learning_rate': 0.05, # Fixed for tuning speed (we lower it for final run)
        'verbosity': -1,
        'random_state': Config.SEED,
        'n_jobs': 4,
        
        # TUNING PARAMETERS
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    }
    
    # 2. Cross Validation (3-Fold is enough for tuning, 5 is better but slower)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=Config.SEED)
    scores = []
    
    for train_idx, val_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        # Train with Early Stopping
        model = lgb.LGBMClassifier(**param_grid)
        callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False)]
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=callbacks
        )
        
        preds = model.predict_proba(X_val)[:, 1]
        scores.append(roc_auc_score(y_val, preds))
    
    # Return Mean AUC
    return np.mean(scores)

def run_tuning():
    logger.info("Loading Data...")
    df = pd.read_parquet(Config.INPUT_PATH)
    
    X = df.drop(columns=['TARGET', 'SK_ID_CURR'])
    y = df['TARGET']
    
    # Subsample for Speed?
    if Config.SUBSAMPLE_FOR_TUNING:
        from sklearn.model_selection import train_test_split
        X, _, y, _ = train_test_split(X, y, train_size=Config.SUBSAMPLE_SIZE, stratify=y, random_state=Config.SEED)
        logger.info(f"Tuning on subsample: {X.shape}")
        
    logger.info("Starting Optuna Study...")
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X, y), n_trials=30) # Run 30 trials
    
    logger.info("--- BEST PARAMETERS ---")
    logger.info(study.best_params)
    logger.info(f"Best CV Score: {study.best_value}")
    
    return study.best_params

# Execute Tuning
# best_params = run_tuning()

In [6]:
best_params = run_tuning()
best_params

2026-01-20 17:13:01,054 - INFO - Loading Data...
2026-01-20 17:13:02,553 - INFO - Tuning on subsample: (50000, 315)
2026-01-20 17:13:02,555 - INFO - Starting Optuna Study...
[32m[I 2026-01-20 17:13:02,557][0m A new study created in memory with name: no-name-4d4d49b8-0c44-4882-b597-a1ab8c2aa1ea[0m
[32m[I 2026-01-20 17:13:16,612][0m Trial 0 finished with value: 0.7679702815118082 and parameters: {'num_leaves': 43, 'max_depth': 4, 'min_child_samples': 44, 'reg_alpha': 0.202658648966017, 'reg_lambda': 1.8372838411425592, 'colsample_bytree': 0.9387612810596043, 'subsample': 0.9978715785481631}. Best is trial 0 with value: 0.7679702815118082.[0m
[32m[I 2026-01-20 17:13:41,164][0m Trial 1 finished with value: 0.7657342479740986 and parameters: {'num_leaves': 93, 'max_depth': 12, 'min_child_samples': 48, 'reg_alpha': 9.927225192291306, 'reg_lambda': 3.935138298315582, 'colsample_bytree': 0.9494356154163142, 'subsample': 0.5449153092554752}. Best is trial 0 with value: 0.767970281511808

{'num_leaves': 26,
 'max_depth': 5,
 'min_child_samples': 21,
 'reg_alpha': 8.069414717200658,
 'reg_lambda': 0.5482549315171409,
 'colsample_bytree': 0.5392681172683854,
 'subsample': 0.6151966387267918}

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    Encodes categorical features with the mean of the target variable.
    Smoothing is applied to prevent overfitting on rare categories.
    """
    def __init__(self, cols=None, smoothing=10):
        self.cols = cols
        self.smoothing = smoothing
        self.maps = {}
        self.global_mean = 0
        
    def fit(self, X, y):
        self.global_mean = y.mean()
        
        for col in self.cols:
            # Calculate stats
            stats = y.groupby(X[col]).agg(['count', 'mean'])
            counts = stats['count']
            means = stats['mean']
            
            # Smooth the mean
            # (count * mean + smoothing * global_mean) / (count + smoothing)
            smooth = (counts * means + self.smoothing * self.global_mean) / (counts + self.smoothing)
            self.maps[col] = smooth
            
        return self

    def transform(self, X):
        X_out = X.copy()
        for col in self.cols:
            # Map values, fill unknown with global mean
            X_out[col] = X_out[col].map(self.maps[col]).fillna(self.global_mean)
        return X_out

In [8]:
def run_final_model(best_params):
    logger.info("--- STARTING FINAL TRAINING ---")
    
    df = pd.read_parquet(Config.INPUT_PATH)
    X = df.drop(columns=['TARGET', 'SK_ID_CURR'])
    y = df['TARGET']
    
    # Identify Categoricals for Target Encoding
    # (We usually pick high-cardinality ones like Occupation, Organization)
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.SEED)
    scores = []
    
    # Update params with lower Learning Rate for final run
    final_params = best_params.copy()
    final_params['learning_rate'] = 0.01  # Super slow learning for max precision
    final_params['n_estimators'] = 10000  # Allow it to grow
    final_params['objective'] = 'binary'
    final_params['metric'] = 'auc'
    
    for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        # 1. APPLY TARGET ENCODING (Inside the loop!)
        encoder = TargetEncoder(cols=cat_cols)
        X_train_enc = encoder.fit_transform(X_train, y_train)
        X_val_enc = encoder.transform(X_val)
        
        # 2. TRAIN
        model = lgb.LGBMClassifier(**final_params)
        
        callbacks = [
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=1000)
        ]
        
        model.fit(
            X_train_enc, y_train,
            eval_set=[(X_val_enc, y_val)],
            eval_names=['train', 'valid'],
            eval_metric='auc',
            callbacks=callbacks
        )
        
        score = roc_auc_score(y_val, model.predict_proba(X_val_enc)[:, 1])
        scores.append(score)
        logger.info(f"Fold {fold+1} AUC: {score:.4f}")
        
    logger.info(f"Final Mean AUC: {np.mean(scores):.4f}")

# Usage:
# 1. Run Optuna to get `best_params`
# 2. Pass `best_params` to `run_final_model`

In [9]:
run_final_model(best_params)

2026-01-20 17:21:49,281 - INFO - --- STARTING FINAL TRAINING ---


Training until validation scores don't improve for 200 rounds
[1000]	train's auc: 0.773663
[2000]	train's auc: 0.780707
[3000]	train's auc: 0.783469
[4000]	train's auc: 0.784599
[5000]	train's auc: 0.78505
Early stopping, best iteration is:
[5260]	train's auc: 0.785167


2026-01-20 17:25:56,677 - INFO - Fold 1 AUC: 0.7852


Training until validation scores don't improve for 200 rounds
[1000]	train's auc: 0.785529
[2000]	train's auc: 0.792017
[3000]	train's auc: 0.794191
[4000]	train's auc: 0.795014
Early stopping, best iteration is:
[4179]	train's auc: 0.795064


2026-01-20 17:29:35,425 - INFO - Fold 2 AUC: 0.7951


Training until validation scores don't improve for 200 rounds
[1000]	train's auc: 0.774153
[2000]	train's auc: 0.781024
[3000]	train's auc: 0.783107
[4000]	train's auc: 0.784057
[5000]	train's auc: 0.784612
Early stopping, best iteration is:
[5360]	train's auc: 0.784802


2026-01-20 17:34:11,740 - INFO - Fold 3 AUC: 0.7848


Training until validation scores don't improve for 200 rounds
[1000]	train's auc: 0.784065
[2000]	train's auc: 0.788487
[3000]	train's auc: 0.790298
[4000]	train's auc: 0.790773
Early stopping, best iteration is:
[4116]	train's auc: 0.790805


2026-01-20 17:37:43,443 - INFO - Fold 4 AUC: 0.7908


Training until validation scores don't improve for 200 rounds
[1000]	train's auc: 0.775551
[2000]	train's auc: 0.782108
[3000]	train's auc: 0.784159
[4000]	train's auc: 0.785356
Early stopping, best iteration is:
[4593]	train's auc: 0.785726


2026-01-20 17:41:56,483 - INFO - Fold 5 AUC: 0.7857
2026-01-20 17:41:56,485 - INFO - Final Mean AUC: 0.7883
