In [14]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
print("LightGBM version:  {}".format(lgb.__version__))
from sklearn.metrics import mean_squared_error, mean_squared_log_error
import optuna
print("Optuna version:  {}".format(optuna.__version__))
import xgboost as xgb
print("XGBoost version:  {}".format(xgb.__version__))
import catboost as cb
from catboost import CatBoostRegressor, Pool

# The package is imported under the alias `cb`; the bare name `catboost` is never bound,
# so referencing it raises NameError on a fresh kernel.
print("Catboost version:  {}".format(cb.__version__))

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

LightGBM version:  3.3.2
Optuna version:  3.4.0
XGBoost version:  2.0.1
Catboost version:  1.2.2
/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


See Version 2 for tuning of LightGBM.
Version 3 is for tuning XGBoost.
Version 4 is for XGBoost with categorical features.
Version 5 is for CatBoost with categorical features.

In [2]:
# Load the competition training set from the read-only Kaggle input directory.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/store-sales-time-series-forecasting/train.csv',
)

In [3]:
# Parse the 'YYYY-MM-DD' date strings into real timestamps.
train['date'] = pd.to_datetime(train['date'], format='%Y-%m-%d')
date_parts = train['date'].dt
# Split the date into separate columns (year, month, day) and add the
# additional date-derived columns, in that order.
for part in ('year', 'month', 'day', 'dayofweek', 'dayofyear'):
    train[part] = getattr(date_parts, part)
# The week number is taken from isocalendar() rather than a direct .dt attribute.
train['weekofyear'] = date_parts.isocalendar().week

In [4]:
# Store the week number as a plain int8 column.
train['weekofyear'] = train['weekofyear'].astype('int8')

In [5]:
# Preview the first five rows, including the new date-derived columns.
train.head(n=5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,year,month,day,dayofweek,dayofyear,weekofyear
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2013,1,1,1,1,1
1,1,2013-01-01,1,BABY CARE,0.0,0,2013,1,1,1,1,1
2,2,2013-01-01,1,BEAUTY,0.0,0,2013,1,1,1,1,1
3,3,2013-01-01,1,BEVERAGES,0.0,0,2013,1,1,1,1,1
4,4,2013-01-01,1,BOOKS,0.0,0,2013,1,1,1,1,1


In [10]:
# not used for catboost
#label_encoder = LabelEncoder()
#train.family = label_encoder.fit_transform(train.family)

# CatBoost handles categorical features itself, so name the categorical columns here
# (they are passed to the model as cat_features) instead of label-encoding them.
categorical_columns = ['store_nbr', 'family']
# Cast every categorical column to string in a single assignment.
train[categorical_columns] = train[categorical_columns].astype('str')

In [11]:
# Time-based hold-out: rows dated before 2017-08-01 are used for fitting,
# rows from 2017-08-01 onwards for validation.
columns_to_drop = ['sales','id','date']
train_part = train.loc[train['date'] < '2017-08-01']
val_part = train.loc[train['date'] >= '2017-08-01']
# Targets are taken before the target/id/date columns are removed from the features.
y_train, y_val = train_part['sales'], val_part['sales']
X_train = train_part.drop(columns=columns_to_drop)
X_val = val_part.drop(columns=columns_to_drop)

In [12]:
# Inspect the column dtypes of the full frame; after the string conversion above,
# store_nbr and family are expected to show up as object.
train.dtypes

id                      int64
date           datetime64[ns]
store_nbr              object
family                 object
sales                 float64
onpromotion             int64
year                    int32
month                   int32
day                     int32
dayofweek               int32
dayofyear               int32
weekofyear               int8
dtype: object

In [13]:
# Count missing values per column.
train.isna().sum(axis=0)

id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
year           0
month          0
day            0
dayofweek      0
dayofyear      0
weekofyear     0
dtype: int64

In [None]:
# Use a log transformation of the target column and then calculate RMSE.
# This is the same as using RMSLE on the untransformed values.

df = pd.DataFrame({'yhat': [5000, 2500, 500], 'y': [10000, 5000, 1000]})
# np.sqrt(...) replaces the `squared=False` keyword, which scikit-learn deprecated in 1.4
# and scheduled for removal; taking the root explicitly works on every version.
df['rmse'] = df.apply(lambda row: np.sqrt(mean_squared_error([row['y']], [row['yhat']])), axis=1)
df['rsmle'] = df.apply(lambda row: np.sqrt(mean_squared_log_error([row['y']], [row['yhat']])), axis=1)
df['lnyhat'] = np.log1p(df.yhat)
df['lny'] = np.log1p(df.y)
df['transformback'] = np.expm1(df.lny)  # expm1 undoes log1p
# The RMSE of the log columns equals the RMSLE of the original columns.
df['ln_rmse'] = df.apply(lambda row: np.sqrt(mean_squared_error([row['lny']], [row['lnyhat']])), axis=1)
df

Please read: https://forecastegy.com/posts/xgboost-hyperparameter-tuning-with-optuna/

https://forecastegy.com/posts/catboost-hyperparameter-tuning-guide-with-optuna/

In [22]:
def objective(trial):
    """Optuna objective: fit a CatBoost regressor and score it on the validation split.

    The model is trained on log1p(y_train) and its raw predictions are compared
    with log1p(y_val), so the returned RMSE is on the log1p scale.

    Args:
        trial: Optuna trial used to sample learning_rate, depth, subsample,
            colsample_bylevel and min_data_in_leaf.

    Returns:
        Validation RMSE on the log1p scale (lower is better).
    """
    params = {
        # Fixed, small number of boosting rounds; only the parameters below are searched.
        "iterations": 100,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "depth": trial.suggest_int("depth", 5, 15),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = cb.CatBoostRegressor(**params, cat_features=categorical_columns, silent=True)
    model.fit(X_train, np.log1p(y_train))
    predictions = model.predict(X_val)
    # Take the root explicitly instead of passing `squared=False`, which scikit-learn
    # deprecated in 1.4; np.sqrt of the MSE gives the same value on every version.
    rmse = np.sqrt(mean_squared_error(np.log1p(y_val), predictions))
    return rmse

In [23]:
# Seed the sampler so the sequence of sampled hyperparameters is reproducible on a re-run.
# TPESampler is the sampler Optuna uses by default, so only the seed changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[I 2023-11-23 08:18:20,542] A new study created in memory with name: no-name-8e3cb564-0bd0-4e77-a7f0-579d16e6fe74
[I 2023-11-23 08:19:27,305] Trial 0 finished with value: 1.0323276120778184 and parameters: {'learning_rate': 0.016527729962154927, 'depth': 10, 'subsample': 0.9641307057161292, 'colsample_bylevel': 0.4817036063307967, 'min_data_in_leaf': 48}. Best is trial 0 with value: 1.0323276120778184.
[I 2023-11-23 08:20:24,662] Trial 1 finished with value: 0.7222041601991667 and parameters: {'learning_rate': 0.18342487162003707, 'depth': 12, 'subsample': 0.32058995747961705, 'colsample_bylevel': 0.7491764722906279, 'min_data_in_leaf': 15}. Best is trial 1 with value: 0.7222041601991667.
[I 2023-11-23 08:21:20,872] Trial 2 finished with value: 0.9194722074125394 and parameters: {'learning_rate': 0.024745965687658758, 'depth': 10, 'subsample': 0.8641707807673937, 'colsample_bylevel': 0.3888245244096772, 'min_data_in_leaf': 55}. Best is trial 1 with value: 0.7222041601991667.
[I 2023-11

In [24]:
# Report the best trial's sampled parameters and its objective value.
print(f'Best hyperparameters: {study.best_params}')
print(f'Best RMSE: {study.best_value}')

Best hyperparameters: {'learning_rate': 0.29844759163525564, 'depth': 15, 'subsample': 0.44116710254766295, 'colsample_bylevel': 0.8200575737708942, 'min_data_in_leaf': 38}
Best RMSE: 0.715374956568049


Depth is again at the maximum of its search range. Run the search again when there is time; first try training with these hyperparameters.

In [None]:
# Just to compare here again, the earlier search space:
# params = {
#     "iterations": 100,
#     "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
#     "depth": trial.suggest_int("depth", 1, 10),
#     "subsample": trial.suggest_float("subsample", 0.05, 1.0),
#     "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
#     "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
# }

# {'learning_rate': 0.09862076321051468, 'depth': 10, 'subsample': 0.5527953446670311,
#  'colsample_bylevel': 0.7475725939379173, 'min_data_in_leaf': 58}

# -> adjust the learning rate and depth ranges