In [1]:
import catboost as cat
import xgboost as xgb
import lightgbm as lgb
import glob
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import cross_val_score
import optuna


In [2]:
TRAIN_MODE = True
LOCAL_NOTEBOOK= True  #Local or in Kaggle
if LOCAL_NOTEBOOK:

    DEVICE = 'gpu'

    save_models_path = r'..\data\kaggle_playground\calories_competition\models'
    kaggle_path = glob.glob(r'..\data\kaggle_playground\calories_competition\*.csv')

    csv_files = {
        path.split("\\")[-1][:-4]: path
        for path in kaggle_path
    }

    df_test = pd.read_csv(csv_files['test'])
    df_train= pd.read_csv(csv_files['train']).drop(columns=['id'])
    df_train['Calories'] = df_train['Calories'].astype(int)
    df_subsample = pd.read_csv(csv_files['sample_submission'])

else:
    # Kaggle read csvs

    DEVICE = 'cpu'
    df_test = pd.read_csv(r'/kaggle/input/playground-series-s5e5/test.csv')
    df_train = pd.read_csv(r'/kaggle/input/playground-series-s5e5/train.csv')
    df_subsample = pd.read_csv(r'/kaggle/input/playground-series-s5e5/train.csv')
    save_models_path = r'/kaggle/working/models'

In [3]:
def categorical_to(col: pd.Series):
    decode = {}
    encode = {}
    
    categorical_data = col.sort_index().unique()

    for item in enumerate(categorical_data):

        item_enc = {item[1]: item[0]}
        item_dec = {item[0]: item[1]}
        encode.update(item_enc)
        decode.update(item_dec)
    
    return encode,decode
        
enc_sex, dec_sex = categorical_to(df_train['Sex'])

df_train['Sex'] = df_train['Sex'].map(enc_sex)
df_test['Sex'] = df_test['Sex'].map(enc_sex)

features = list(df_train.drop(columns=['Calories']).columns)
features_scaler = list(df_train.drop(columns=['Sex','Calories','Age']).columns)
target = 'Calories'

In [4]:


X = df_train[features].copy()
y = df_train[target].copy()

scaler = StandardScaler()
X[features_scaler] = scaler.fit_transform(X[features_scaler])
df_test[features_scaler] = scaler.transform(df_test[features_scaler])

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, shuffle=True)


In [None]:
def objective(trial):
    booster = trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"])

    params = {
        "verbosity": 0,
        "objective": "reg:squarederror",  
        "eval_metric": "rmsle",  
        "booster": booster,
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "n_estimators": trial.suggest_int("n_estimators", 700, 900),
        "device":"gpu"
    }

    if booster in ["gbtree", "dart"]:
        params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)
        params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.5, 1.0)

    try:
        model = xgb.XGBRegressor(**params)
        score = cross_val_score(model, X_train, y_train, cv=3, scoring="neg_root_mean_squared_error").mean()
        return score
    except Exception as e:
        print("Failed with params:", params)
        raise e


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=4)

[I 2025-05-07 15:41:20,620] A new study created in memory with name: no-name-75b27503-5fc0-48ae-a021-8c63437932d2
[I 2025-05-07 15:49:04,877] Trial 0 finished with value: -3.7182881639684235 and parameters: {'booster': 'dart', 'lambda': 0.0001862691402022658, 'alpha': 1.1175827499974151e-07, 'learning_rate': 0.23009108748345408, 'n_estimators': 900, 'max_depth': 6, 'subsample': 0.7746703833134805, 'colsample_bytree': 0.5793668594207906}. Best is trial 0 with value: -3.7182881639684235.
[I 2025-05-07 15:49:19,683] Trial 1 finished with value: -3.6675830784255328 and parameters: {'booster': 'gbtree', 'lambda': 0.19023495851242614, 'alpha': 9.378756721550792e-05, 'learning_rate': 0.07395891534391195, 'n_estimators': 813, 'max_depth': 8, 'subsample': 0.5027248181268938, 'colsample_bytree': 0.5343832872687366}. Best is trial 1 with value: -3.6675830784255328.
[I 2025-05-07 15:49:45,301] Trial 2 finished with value: -11.102693126591006 and parameters: {'booster': 'gblinear', 'lambda': 7.4668

In [52]:
study.best_params

{'booster': 'gbtree',
 'lambda': 0.19023495851242614,
 'alpha': 9.378756721550792e-05,
 'learning_rate': 0.07395891534391195,
 'n_estimators': 813,
 'max_depth': 8,
 'subsample': 0.5027248181268938,
 'colsample_bytree': 0.5343832872687366}

### GradBoosting Models

In [5]:
boosters = ["gbtree", "gblinear"]

models_name = ['xgb']


if TRAIN_MODE:
#GridSearchCV()

    if 'xgb' in models_name:
        model_xgb = xgb.XGBRegressor(
            booster= 'gbtree',
            reg_lambda= 0.19023495851242614,
            alpha = 9.378756721550792e-05,
            learning_rate = 0.07395891534391195,
            n_estimators= 5000,
            max_depth = 8,
            subsample= 0.5027248181268938,
            colsample_bytree= 0.5343832872687366,
            device = DEVICE
            , early_stopping_rounds=100
        )


        model_xgb.fit(X_train ,y_train ,eval_set=[(X_val,y_val)] ,verbose=False)

    if 'cat' in models_name:
        eval_data = cat.Pool(X_val ,y_val)
        model_cat = cat.CatBoostRegressor(iterations= 10000 
                                    ,task_type = DEVICE.upper()	
                                    ,learning_rate=0.001
                                    ,depth=5
                                    ,l2_leaf_reg= 2
                                    ,verbose=0 
                                    ,loss_function='RMSE')
        
        model_cat.fit(X_train, y_train, eval_set=[eval_data], early_stopping_rounds=100)
        

    if 'lgbm' in models_name:
        model_lgbm = lgb.LGBMRegressor(n_estimators = 10000
                                ,learning_rate = 0.001
                                ,objective='regression'
                                ,max_depth=5
                                ,early_stopping_rounds=50
                                ,metric='rmse'
                                ,verbose=-1
                                ,device='gpu')
        
        model_lgbm.fit(X_train, y_train , eval_set=[(X_val, y_val)])


In [6]:

if 'xgb' in models_name:

    xgb_pred = model_xgb.predict(X_val)
    xgb_pred = np.where(xgb_pred < 0, 0, xgb_pred)
    xgb_loss = root_mean_squared_log_error(y_val,xgb_pred)
    print(f"Xgboost: {xgb_loss}")

if 'cat' in models_name:

    cat_pred = model_cat.predict(X_val)
    cat_pred = np.where(cat_pred < 0, 0, cat_pred)
    cat_loss = root_mean_squared_log_error(y_val,cat_pred)
    print(f"Catboost: {cat_loss}")

if 'lgbm' in models_name:

    lgbm_pred = model_lgbm.predict(X_val)
    lgbm_pred = np.where(lgbm_pred < 0, 0, lgbm_pred)
    lgbm_loss = root_mean_squared_log_error(y_val,lgbm_pred)
    print(f"LGBM: {lgbm_loss}")




Xgboost: 0.06598023579961027


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [8]:
model_name = 'xgb'
joblib.dump(model_xgb,f'{save_models_path}\\{model_name}_{4}.model')

['..\\data\\kaggle_playground\\calories_competition\\models\\xgb_4.model']

In [137]:
xgb_sub_pred= model_xgb.predict(df_test[features])
y_pred_xgb = model_xgb.predict(X_val)

print(f"Validation RMSLE: {round(root_mean_squared_log_error(y_val, y_pred_xgb),3)}")

Validation RMSLE: 0.07


In [None]:
if LOCAL_NOTEBOOK == False and TRAIN_MODE == True:
    submission = {
        'id':df_test['id'] ,
        'Calories': xgb_sub_pred
    }

    df_sub = pd.DataFrame(submission)
    df_sub.to_csv('submission')