In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans, AffinityPropagation
from sklearn.mixture import GaussianMixture

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# List every file under the Kaggle input mount. os.walk yields nothing when the
# directory is absent, so this prints nothing when run outside Kaggle.

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Single place to change where the competition CSV files are read from.
DATA_DIR = os.path.join('..', 'data')

test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
train_extra_df = pd.read_csv(os.path.join(DATA_DIR, 'training_extra.csv'))
# Append the supplementary training rows to the main training set and renumber the index.
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)

# Name of the target column (the label prepare_data assigns to the last training column).
target = 'price'

def prepare_data(df: pd.DataFrame, is_train: bool = True):
    """
    Rename the raw columns, fill missing values and derive the model features.

    The input frame is left untouched; every change is made on a new frame.

    Args:
        df (pd.DataFrame): Raw train or test frame. Columns are renamed purely
            by position, so they must arrive in the order of ``columns`` below
            (followed by the price column for training data).
        is_train (bool): True when ``df`` carries the target as its last
            column (default is True).

    Returns:
        pd.DataFrame: New frame without ``id``, with the added columns
        ``size_int``, ``weight_capacity_int`` and ``weight_capacity_size``,
        with ``compartments`` and all object columns cast to pandas
        categoricals, and with ``laptop_compartment`` / ``is_waterproof``
        replaced by integer category codes (-1 for missing values).

    Raises:
        ValueError: If the number of columns in ``df`` does not match the
            expected schema.
    """

    # Define the column names
    columns = [
        'id', 'brand', 'material', 'size', 'compartments',
        'laptop_compartment', 'is_waterproof', 'style', 'color',
        'weight_capacity'
    ]

    if is_train:
        columns.append('price')

    # set_axis returns a new frame, so the caller's column labels are not
    # overwritten (assigning to df.columns would rename them in place).
    df = df.set_axis(columns, axis=1).drop(columns='id')

    # Ordinal encoding of size; missing or unknown sizes become 0
    size_mapping = {"Small": 1, "Medium": 2, "Large": 3}
    df["size_int"] = df["size"].map(size_mapping).fillna(0).astype(int)

    # Handle weight capacity: missing -> 0, truncated integer part, and
    # an interaction term with the ordinal size
    df['weight_capacity'] = df['weight_capacity'].fillna(0)
    df['weight_capacity_int'] = df['weight_capacity'].astype(int)
    df['weight_capacity_size'] = df['weight_capacity'] * df['size_int']

    # Convert categorical columns (compartments is numeric in the raw data but
    # is treated as a category here)
    df['compartments'] = df['compartments'].astype('category')
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    df[cat_cols] = df[cat_cols].astype('category')

    # Replace the two flag columns with their integer category codes.
    # The explicit category cast keeps .cat available whatever dtype the raw
    # column had; cat.codes already encodes missing values as -1.
    # NOTE(review): codes follow the sorted categories of *this* frame, so train
    # and test only agree if both contain the same set of values - confirm.
    for flag_col in ('laptop_compartment', 'is_waterproof'):
        df[flag_col] = df[flag_col].astype('category').cat.codes.astype(int)

    return df

# Run the shared preprocessing on the training and test frames
train_df = prepare_data(df=train_df, is_train=True)
test_df = prepare_data(df=test_df, is_train=False)

# Separate the target column from the predictor columns
y = train_df[target]
X = train_df.drop(columns=[target])

In [8]:
# Locate the categorical columns on X, the frame whose column positions are
# later handed to CatBoost. train_df still contains the target column, so
# positions taken from it would be shifted by one for every categorical column
# that came after the target.
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
cat_feature_indexes = [X.columns.get_loc(col) for col in cat_features]

In [11]:
from datetime import datetime

from optuna.integration import CatBoostPruningCallback
import optuna

from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure categorical features are strings.
# astype() with a column -> dtype mapping returns new frames, so nothing is
# written into the objects returned by train_test_split and the result no longer
# depends on whether pandas can store the strings in the existing categorical
# columns (the positional iloc assignment relied on a deprecated dtype fallback).
cat_columns = X.columns[cat_feature_indexes]
str_dtypes = {col: str for col in cat_columns}
X_train = X_train.astype(str_dtypes)
X_val = X_val.astype(str_dtypes)


def objective(trial):
    """
    Optuna objective: fit one CatBoostRegressor and return its validation RMSE.

    Reads X_train, y_train, X_val, y_val and cat_feature_indexes from module
    scope.

    Args:
        trial (optuna.trial.Trial): Trial used to sample the hyperparameters
            and to report intermediate RMSE values for pruning.

    Returns:
        float: RMSE of the fitted model on (X_val, y_val).

    Raises:
        optuna.TrialPruned: If the pruning callback stopped training early.
    """
    params = {
        # Fixed settings
        "random_state": 42,
        "verbose": 0,
        "eval_metric": "RMSE",

        # Search space
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.5, log=True),
        "iterations": trial.suggest_int("iterations", 100, 5000),
        "depth": trial.suggest_int("depth", 4, 16),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "random_strength": trial.suggest_float("random_strength", 1, 50),

        # Additional CatBoost parameters with conditionals
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
    }

    # Conditional parameters: each is only sampled for the bootstrap type it is paired with here
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    pruning_callback = CatBoostPruningCallback(trial, "RMSE")
    model = CatBoostRegressor(**params)
    model.fit(
        X_train,
        y_train,
        cat_features=cat_feature_indexes,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
        use_best_model=True
    )

    # The callback can only stop CatBoost's training loop; it cannot raise from
    # inside it. check_pruned() raises optuna.TrialPruned here so that a stopped
    # trial is stored as pruned instead of being scored as a completed one.
    pruning_callback.check_pruned()

    preds = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, preds)
    return rmse


# Create (or resume) the study in the local SQLite file. The study name only has
# minute resolution, so load_if_exists avoids a duplicate-name error when the
# cell is re-run within the same minute. The seeded sampler makes the sequence
# of suggested hyperparameters repeatable, matching the seeded split and model.
study = optuna.create_study(
    storage="sqlite:///db.sqlite3",
    study_name="catboost_" + datetime.now().strftime("%Y-%m-%d_%H-%M"),
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
    load_if_exists=True,
)
study.optimize(objective, n_trials=1000)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)

# Best sampled hyperparameters plus the fixed settings used inside objective(),
# built as a new dict so the trial's own params mapping is not modified.
best_params = {
    **study.best_trial.params,
    "random_state": 42,
    "verbose": 0,
    "eval_metric": "RMSE",
}

# Disabled refit of the best configuration, kept for reference. It uses
# root_mean_squared_error, the same metric function as objective().
# best_model = CatBoostRegressor(**best_params)
# best_model.fit(
#     X_train, y_train,
#     cat_features=cat_feature_indexes,
#     eval_set=[(X_val, y_val)],
#     early_stopping_rounds=50,
#     use_best_model=True
# )

# y_pred = best_model.predict(X_val)
# val_rmse = root_mean_squared_error(y_val, y_pred)
# print("Validation RMSE:", val_rmse)

[I 2025-02-08 07:52:37,724] A new study created in RDB with name: catboost_2025-02-08_07-52
[I 2025-02-08 08:00:23,513] Trial 0 finished with value: 38.91186497402076 and parameters: {'learning_rate': 0.0001583315382744083, 'iterations': 3322, 'depth': 12, 'l2_leaf_reg': 8.662989555854423, 'random_strength': 47.9247624285914, 'bootstrap_type': 'Bayesian', 'boosting_type': 'Plain', 'colsample_bylevel': 0.08209247142454183, 'bagging_temperature': 4.183899114276568}. Best is trial 0 with value: 38.91186497402076.
[I 2025-02-08 08:01:09,893] Trial 1 finished with value: 38.904133320020364 and parameters: {'learning_rate': 0.03150732601352319, 'iterations': 4470, 'depth': 11, 'l2_leaf_reg': 1.3827966120794049, 'random_strength': 31.043895761388654, 'bootstrap_type': 'MVS', 'boosting_type': 'Plain', 'colsample_bylevel': 0.0101724321741923}. Best is trial 1 with value: 38.904133320020364.
[I 2025-02-08 08:06:17,996] Trial 2 finished with value: 38.90809146131426 and parameters: {'learning_rat