In [121]:
import catboost as cat
import xgboost as xgb
import lightgbm as lgb
import glob
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import GridSearchCV

models_path = r'..\data\kaggle_playground\calories_competition\models'
datasets_path = glob.glob(r'..\data\*.csv')
kaggle_path = glob.glob(r'..\data\kaggle_playground\calories_competition\*.csv')

csv_files = {
    path.split("\\")[-1][:-4]: path
    for path in kaggle_path
}
csv_files

{'sample_submission': '..\\data\\kaggle_playground\\calories_competition\\sample_submission.csv',
 'test': '..\\data\\kaggle_playground\\calories_competition\\test.csv',
 'train': '..\\data\\kaggle_playground\\calories_competition\\train.csv'}

In [122]:
df_test = pd.read_csv(csv_files['test'])
df_train= pd.read_csv(csv_files['train']).drop(columns=['id'])
df_subsample = pd.read_csv(csv_files['sample_submission'])

In [None]:
def categorical_to(col: pd.Series):
    decode = {}
    encode = {}
    
    categorical_data = col.sort_index().unique()

    for item in enumerate(categorical_data):

        item_enc = {item[1]: item[0]}
        item_dec = {item[0]: item[1]}
        encode.update(item_enc)
        decode.update(item_dec)
    
    return encode,decode
        
enc_sex, dec_sex = categorical_to(df_train['Sex'])

df_train['Sex'] = df_train['Sex'].map(enc_sex)
df_test['Sex'] = df_test['Sex'].map(enc_sex)

features = list(df_train.drop(columns=['Calories']).columns)
target = 'Calories'

In [127]:
X = df_train[features]
y = df_train[target]

X_train, X_val, y_train, y_val = train_test_split(X,y, shuffle=True)


In [128]:
boosters = ["gbtree", "gblinear"]



model_xgb = xgb.XGBRegressor(
    booster = "gbtree"
    ,device ="cuda"
    ,verbosity = 0
    ,learning_rate = 0.01
    ,n_estimators = 1000
    ,eval_metric="rmsle"
    ,early_stopping_rounds = 100
)


model_xgb.fit(X_train
              ,y_train
              ,eval_set=[(X_val,y_val)]
              ,verbose=False)




In [129]:
model_name = 'xgb'
joblib.dump(model_xgb,f'{models_path}\\{model_name}_{1}.model')

['..\\data\\kaggle_playground\\calories_competition\\models\\xgb_1.model']

In [None]:
xgb_sub_pred= model_xgb.predict(df_test[features])
y_pred_xgb = model_xgb.predict(X_val)

print(f"Validation RMSLE: {round(root_mean_squared_log_error(y_val, y_pred_xgb),3)}")

Validadation RMSLE: 0.069


In [None]:
submission = {
    'id':df_test['id'] ,
    'Calories': xgb_sub_pred
}

df_sub = pd.DataFrame(submission)
df_sub.to_csv('submi')

',id,Calories\r\n0,750000,27.184998\r\n1,750001,106.83332\r\n2,750002,88.13893\r\n3,750003,128.21811\r\n4,750004,75.45897\r\n5,750005,21.776108\r\n6,750006,48.772945\r\n7,750007,7.0850143\r\n8,750008,10.135475\r\n9,750009,203.8956\r\n10,750010,78.84044\r\n11,750011,218.30841\r\n12,750012,70.64007\r\n13,750013,249.90002\r\n14,750014,87.49661\r\n15,750015,138.60442\r\n16,750016,67.87247\r\n17,750017,166.3949\r\n18,750018,125.240005\r\n19,750019,188.07996\r\n20,750020,85.42453\r\n21,750021,62.269127\r\n22,750022,76.7455\r\n23,750023,6.664383\r\n24,750024,35.814156\r\n25,750025,151.79709\r\n26,750026,74.81627\r\n27,750027,155.28305\r\n28,750028,85.90381\r\n29,750029,17.10231\r\n30,750030,86.67727\r\n31,750031,22.175774\r\n32,750032,32.22689\r\n33,750033,11.920807\r\n34,750034,96.13547\r\n35,750035,104.62777\r\n36,750036,50.680355\r\n37,750037,43.691822\r\n38,750038,154.08096\r\n39,750039,133.69287\r\n40,750040,102.323685\r\n41,750041,37.32842\r\n42,750042,83.75445\r\n43,750043,88.63677\r\n