In [5]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
from sklearn.datasets import fetch_california_housing
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error as mse
from IPython.display import display
from sklearn.preprocessing import LabelEncoder
import optuna
import lightgbm as lgbm

In [2]:
BASE_DIR = Path("/kaggle/input/playground-series-s3e1/")
test_df = pd.read_csv(BASE_DIR / "test.csv")

# feature engineered, conactenated dataframe from prev notebook
OTHER_DIR = Path("/kaggle/input/playground-s03e01-finetuned-xgb/")
df = pd.read_csv(OTHER_DIR / "playground-s03e01-ultimate.csv")

In [3]:
train = df.iloc[:-len(test_df),:]
test = df.iloc[-len(test_df):,:].drop('MedHouseVal', axis=1).reset_index(drop=True)

X = train.drop('MedHouseVal', axis=1)
y = train['MedHouseVal']

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [17]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    param_grid = {
#         "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=1337)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgbm.LGBMRegressor(objective="regression", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="rmse",
            early_stopping_rounds=100,
            callbacks=[
                LightGBMPruningCallback(trial, "rmse")
            ],  # Add a pruning callback
        )
        y_preds = model.predict(X_test)
        cv_scores[idx] = mse(y_test, y_preds, squared=False)
    
    rmse = np.mean(cv_scores)
    print(f"AVG CV RMSE: \t {rmse}")

    return rmse

In [None]:
study = optuna.create_study(direction="minimize", study_name="LGBM Regressor Tuning")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=100)

In [19]:
study.best_value

0.5259197909920328

In [20]:
study.best_params

{'n_estimators': 10000,
 'learning_rate': 0.2828181500089648,
 'num_leaves': 2620,
 'max_depth': 11,
 'min_data_in_leaf': 800,
 'lambda_l1': 5,
 'lambda_l2': 55,
 'min_gain_to_split': 0.18447231486025706,
 'bagging_fraction': 0.6000000000000001,
 'bagging_freq': 1,
 'feature_fraction': 0.8}

In [22]:
model = lgbm.LGBMRegressor(objective="regression", **study.best_params)
model.fit(X, y)

LGBMRegressor(bagging_fraction=0.6000000000000001, bagging_freq=1,
              feature_fraction=0.8, lambda_l1=5, lambda_l2=55,
              learning_rate=0.2828181500089648, max_depth=11,
              min_data_in_leaf=800, min_gain_to_split=0.18447231486025706,
              n_estimators=10000, num_leaves=2620, objective='regression')

In [23]:
y_pred_test = model.predict(test)
y_pred_test

array([0.69124806, 0.97993522, 3.90814505, ..., 1.0742553 , 3.53414592,
       3.67722713])

In [24]:
submission_df = pd.DataFrame(data={'id': test_df.id, 'MedHouseVal': y_pred_test})
submission_df.to_csv("submission.csv", index=False)