In [None]:
import catboost as cat
import xgboost as xgb
import lightgbm as lgb
import glob
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import GridSearchCV


In [111]:
TRAIN_MODE = True
LOCAL_NOTEBOOK= True #Local or in Kaggle
if LOCAL_NOTEBOOK:

    DEVICE = 'gpu'

    models_path = r'..\data\kaggle_playground\calories_competition\models'
    kaggle_path = glob.glob(r'..\data\kaggle_playground\calories_competition\*.csv')

    csv_files = {
        path.split("\\")[-1][:-4]: path
        for path in kaggle_path
    }

    df_test = pd.read_csv(csv_files['test'])
    df_train= pd.read_csv(csv_files['train']).drop(columns=['id'])
    df_subsample = pd.read_csv(csv_files['sample_submission'])

else:
    # Kaggle read csvs

    DEVICE = 'cpu'
    df_test = pd.read_csv(r'/kaggle/input/playground-series-s5e5/test.csv')
    df_train = pd.read_csv(r'/kaggle/input/playground-series-s5e5/train.csv')
    df_subsample = pd.read_csv(r'/kaggle/input/playground-series-s5e5/train.csv')
    models_path = r'/kaggle/working/models'

In [112]:
def categorical_to(col: pd.Series):
    decode = {}
    encode = {}
    
    categorical_data = col.sort_index().unique()

    for item in enumerate(categorical_data):

        item_enc = {item[1]: item[0]}
        item_dec = {item[0]: item[1]}
        encode.update(item_enc)
        decode.update(item_dec)
    
    return encode,decode
        
enc_sex, dec_sex = categorical_to(df_train['Sex'])

df_train['Sex'] = df_train['Sex'].map(enc_sex)
df_test['Sex'] = df_test['Sex'].map(enc_sex)

features = list(df_train.drop(columns=['Calories']).columns)
features_scaler = list(df_train.drop(columns=['Sex','Calories','Age']).columns)
target = 'Calories'

In [113]:


X = df_train[features]
y = df_train[target]

scaler = StandardScaler()
X[features_scaler] = scaler.fit_transform(X[features_scaler])
df_test[features_scaler] = scaler.transform(df_test[features_scaler])

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, shuffle=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[features_scaler] = scaler.fit_transform(X[features_scaler])


In [114]:
df_test

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,0,45,0.179525,0.418684,-1.008011,-0.897793,-0.302938
1,750001,0,26,1.972969,1.562956,0.548113,0.583714,0.594643
2,750002,1,29,1.037259,0.704752,0.069306,0.689536,0.466417
3,750003,1,39,-0.210354,-0.153452,0.548113,1.218646,0.722869
4,750004,1,30,-0.132378,-0.582554,0.069306,-0.157039,0.594643
...,...,...,...,...,...,...,...,...
249995,999995,1,56,-1.224040,-0.940138,-1.127713,-1.109436,-0.815841
249996,999996,0,32,2.128921,1.849024,-1.486818,-1.215258,-2.098099
249997,999997,1,31,-0.834161,-0.797104,-0.170098,0.266248,0.081740
249998,999998,1,62,-1.302016,-1.011655,1.146622,1.112824,0.851095


### GradBoosting Models

In [116]:
boosters = ["gbtree", "gblinear"]

models_name = ['xgb','cat']


if TRAIN_MODE:
#GridSearchCV()

    if 'xgb' in models_name:
        model_xgb = xgb.XGBRegressor(
            booster = "gbtree"
            ,device = DEVICE
            ,verbosity = 0
            ,learning_rate = 0.01
            ,n_estimators = 10000
            ,eval_metric="rmse"
            ,early_stopping_rounds=100
        )


        model_xgb.fit(X_train ,y_train ,eval_set=[(X_val,y_val)] ,verbose=False)

    if 'cat' in models_name:
        eval_data = cat.Pool(X_val ,y_val)
        model_cat = cat.CatBoostRegressor(iterations= 10000 
                                    ,task_type = DEVICE.upper()	
                                    ,learning_rate=0.001
                                    ,depth=5
                                    ,l2_leaf_reg= 2
                                    ,verbose=0 
                                    ,loss_function='RMSE')
        
        model_cat.fit(X_train, y_train, eval_set=[eval_data], early_stopping_rounds=100)
        

    if 'lgbm' in models_name:
        model_lgbm = lgb.LGBMRegressor(n_estimators = 10000
                                ,learning_rate = 0.001
                                ,objective='regression'
                                ,max_depth=5
                                ,early_stopping_rounds=50
                                ,metric='rmse'
                                ,verbose=-1
                                ,device='gpu')
        
        model_lgbm.fit(X_train, y_train , eval_set=[(X_val, y_val)])


In [118]:

if 'xgb' in models_name:

    xgb_pred = model_xgb.predict(X_val)
    xgb_pred = np.where(xgb_pred < 0, 0, xgb_pred)
    xgb_loss = root_mean_squared_log_error(y_val,xgb_pred)
    print(f"Xgboost: {xgb_loss}")

if 'cat' in models_name:

    cat_pred = model_cat.predict(X_val)
    cat_pred = np.where(cat_pred < 0, 0, cat_pred)
    cat_loss = root_mean_squared_log_error(y_val,cat_pred)
    print(f"Catboost: {cat_loss}")

if 'lgbm' in models_name:

    lgbm_pred = model_lgbm.predict(X_val)
    lgbm_pred = np.where(lgbm_pred < 0, 0, lgbm_pred)
    lgbm_loss = root_mean_squared_log_error(y_val,lgbm_pred)
    print(f"LGBM: {lgbm_loss}")




Xgboost: 0.06210014702972625
Catboost: 0.08789775795341305


In [104]:
model_name = 'xgb'
joblib.dump(model_xgb,f'{models_path}\\{model_name}_{3}.model')

['..\\data\\kaggle_playground\\calories_competition\\models\\xgb_3.model']

In [137]:
xgb_sub_pred= model_xgb.predict(df_test[features])
y_pred_xgb = model_xgb.predict(X_val)

print(f"Validation RMSLE: {round(root_mean_squared_log_error(y_val, y_pred_xgb),3)}")

Validation RMSLE: 0.07


In [None]:
if LOCAL_NOTEBOOK == False and TRAIN_MODE == True:
    submission = {
        'id':df_test['id'] ,
        'Calories': xgb_sub_pred
    }

    df_sub = pd.DataFrame(submission)
    df_sub.to_csv('submission')