In [1]:


import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from joblib import parallel_backend
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Directory holding the processed train / validation CSV splits.
DATA_DIR = Path("../../../data/processed")
train = pd.read_csv(DATA_DIR / 'train_fe_scaled.csv')
val = pd.read_csv(DATA_DIR / 'val_fe_scaled.csv')

# Every column except the row identifier and the target is used as a feature.
# The feature list is derived from the training frame and reused for validation,
# so both splits are indexed with exactly the same columns.
TARGET = 'Calories'
FEATURES = [col for col in train.columns if col not in ['id', TARGET]]

X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]


In [2]:
def report_results(model_name, grid, X_val, y_val):
    """Score the best estimator of a fitted search on the validation data.

    Parameters
    ----------
    model_name
        Label stored under the ``"model"`` key of the summary.
    grid
        Fitted search object; its ``best_estimator_`` and ``best_params_``
        attributes are read.
    X_val, y_val
        Validation inputs and the matching true targets.

    Returns
    -------
    tuple
        ``(summary, estimator)`` where ``summary`` is a dict with the keys
        ``"model"``, ``"best_params"``, ``"MAE"``, ``"RMSE"`` and ``"R2"``,
        and ``estimator`` is ``grid.best_estimator_``.
    """
    winner = grid.best_estimator_
    y_hat = winner.predict(X_val)
    summary = {
        "model": model_name,
        "best_params": grid.best_params_,
        "MAE": mean_absolute_error(y_val, y_hat),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_val, y_hat)),
        "R2": r2_score(y_val, y_hat),
    }
    return summary, winner

# Shared CSV that accumulates one row per reported model run.
RESULTS_CSV = "../../../results/baseline_results.csv"

def append_result_to_csv(result_dict, csv_path=RESULTS_CSV):
    """Append one result row to a CSV file, creating the file if needed.

    The existing file (if any) is read in full, the new row is concatenated
    to it and the whole table is written back without the index column.

    Parameters
    ----------
    result_dict : dict
        Mapping of column name to value; becomes a single new row.
    csv_path : str or os.PathLike
        Destination CSV. Missing parent directories are created.
    """
    df_new = pd.DataFrame([result_dict])

    # to_csv does not create directories, so make sure the target folder
    # exists before writing (no-op for a bare filename).
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df_existing = None
    if os.path.exists(csv_path):
        try:
            df_existing = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A zero-byte / header-less file holds no history; start fresh
            # instead of failing the whole cell.
            df_existing = None

    if df_existing is None:
        df_all = df_new
    else:
        df_all = pd.concat([df_existing, df_new], ignore_index=True)
    df_all.to_csv(csv_path, index=False)

In [3]:
# Validation summaries of the models fitted in this notebook, keyed by name.
results = {}

# Discrete search space: 2 x 2 x 2 = 8 combinations, of which the randomized
# search below samples n_iter=5.
param_dist_xgb = {
    "n_estimators": [50, 100],
    "learning_rate": [0.01, 0.1],
    "max_depth": [3, 6]
}
xgb_grid = RandomizedSearchCV(XGBRegressor(random_state=42, n_jobs=-1),
                              param_distributions=param_dist_xgb,
                              n_iter=5, cv=3,
                              scoring="neg_mean_squared_error", n_jobs=-1, random_state=42,
                              verbose=3)

# NOTE(review): the search is constructed with an explicit n_jobs=-1, which
# presumably takes precedence over the n_jobs=1 passed to parallel_backend,
# so this context likely only switches joblib to the threading backend.
# Together with the estimator's own n_jobs=-1 this may oversubscribe CPU
# cores -- confirm and keep a single level of parallelism.
with parallel_backend('threading', n_jobs=1):
    xgb_grid.fit(X_train, y_train)

res, best = report_results("XGBoost", xgb_grid, X_val, y_val)
results["XGBoost"] = res

# joblib.dump does not create directories; make sure the models folder exists
# so a fresh checkout does not fail after the (slow) search has finished.
xgb_model_path = "../../../results/models/XGBoost.joblib"
os.makedirs(os.path.dirname(xgb_model_path), exist_ok=True)
joblib.dump(best, xgb_model_path)
append_result_to_csv(res)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=-1542.760 total time=   2.7s
[CV 2/3] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=-1548.606 total time=   2.7s
[CV 3/3] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=-19.280 total time=   5.1s
[CV 1/3] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=-19.311 total time=   5.1s
[CV 2/3] END learning_rate=0.1, max_depth=3, n_estimators=100;, score=-18.304 total time=   5.2s
[CV 2/3] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-655.567 total time=   5.5s
[CV 3/3] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-650.948 total time=   5.6s
[CV 1/3] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-652.176 total time=   5.9s
[CV 3/3] END learning_rate=0.01, max_depth=3, n_estimators=50;, score=-1540.756 total time=   3.5s
[CV 2/3] END learning_rate=0.01, max_depth=6, n_estimat

In [4]:
# `results` maps model name -> summary dict; transposing turns each model
# into one row with the summary keys as columns.
results_df = pd.DataFrame(results).transpose()
print(results_df)
# The index (model name) is kept in the written CSV.
results_df.to_csv("../../../results/models/baseline_results_xgb.csv")


           model                                        best_params       MAE  \
XGBoost  XGBoost  {'n_estimators': 100, 'max_depth': 6, 'learnin...  2.270453   

             RMSE        R2  
XGBoost  3.713193  0.996442  
