In [1]:


import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from joblib import parallel_backend
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Relative location of the processed CSV files.
DATA_DIR = Path("../../../data/processed")
# Name of the column the models are trained to predict.
TARGET = 'Calories'

# Load the training and validation tables.
train = pd.read_csv(DATA_DIR.joinpath('train_fe_scaled.csv'))
val = pd.read_csv(DATA_DIR.joinpath('val_fe_scaled.csv'))

# Model inputs: every training column except 'id' and the target.
FEATURES = [col for col in train.columns if col not in ('id', TARGET)]

# Split each table into its feature matrix and target vector.
X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]


In [2]:
def report_results(model_name, grid, X_val, y_val):
    """Score a fitted search's best estimator on validation data.

    Parameters
    ----------
    model_name
        Label stored under the "model" key of the returned summary.
    grid
        Fitted search object; only its ``best_estimator_`` and
        ``best_params_`` attributes are read.
    X_val, y_val
        Validation inputs and the corresponding true target values.

    Returns
    -------
    tuple
        ``(summary, estimator)``: ``summary`` is a dict with the keys
        "model", "best_params", "MAE", "RMSE" and "R2"; ``estimator`` is
        ``grid.best_estimator_``.
    """
    estimator = grid.best_estimator_
    y_hat = estimator.predict(X_val)

    metrics = {
        "MAE": mean_absolute_error(y_val, y_hat),
        # RMSE is derived as the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_val, y_hat)),
        "R2": r2_score(y_val, y_hat),
    }
    summary = {"model": model_name, "best_params": grid.best_params_, **metrics}
    return summary, estimator

RESULTS_CSV = "../../../results/baseline_results.csv"

def append_result_to_csv(result_dict, csv_path=RESULTS_CSV):
    """Append one result record as a new row of a CSV file.

    The file is read in full (when it exists), the new row is added at the
    end, and the whole table is written back without the index column.

    Parameters
    ----------
    result_dict : dict
        Mapping of column name to value for the single row to add.
    csv_path : str or os.PathLike, optional
        Destination file. Defaults to ``RESULTS_CSV``. Missing parent
        directories are created.

    Returns
    -------
    None
    """
    df_new = pd.DataFrame([result_dict])

    # Create the destination folder up front so the first run in a fresh
    # checkout does not fail inside ``to_csv`` with a missing-directory error.
    parent_dir = os.path.dirname(os.fspath(csv_path))
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df_existing = None
    if os.path.exists(csv_path):
        try:
            df_existing = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # The file exists but holds no columns (e.g. zero bytes):
            # there is nothing to keep, so start the table from the new row.
            df_existing = None

    if df_existing is None:
        df_all = df_new
    else:
        df_all = pd.concat([df_existing, df_new], ignore_index=True)
    df_all.to_csv(csv_path, index=False)

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Per-model validation summaries collected by this notebook.
results = {}

# Destination of the fitted model. Its folder is created BEFORE the search
# starts, so a missing directory cannot make joblib.dump fail only after the
# multi-minute fit has already completed.
RF_MODEL_PATH = "../../../results/models/RandomForest.joblib"
os.makedirs(os.path.dirname(RF_MODEL_PATH), exist_ok=True)

# Candidate hyper-parameters: 2 x 2 = 4 combinations.
param_dist_rf = {
    "n_estimators": [50, 100],
    "max_depth": [None, 10]
}
# n_iter=4 matches the number of combinations above, so the search evaluates
# each of them. The forest itself is parallelised (n_jobs=-1) while the search
# runs its fits one at a time (n_jobs=1).
rf_grid = RandomizedSearchCV(RandomForestRegressor(random_state=42, n_jobs=-1),
                             param_distributions=param_dist_rf,
                             n_iter=4, cv=3,
                             scoring="neg_mean_squared_error", n_jobs=1, random_state=42,
                             verbose=3)
rf_grid.fit(X_train, y_train)

# Evaluate the best estimator on the validation split and record the summary.
res, best = report_results("RandomForest", rf_grid, X_val, y_val)
results["RandomForest"] = res

# Persist the fitted model and add its metrics to the shared results file.
# NOTE: append_result_to_csv adds a row on every execution of this cell.
joblib.dump(best, RF_MODEL_PATH)
append_result_to_csv(res)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END .max_depth=None, n_estimators=50;, score=-15.227 total time=  56.5s
[CV 2/3] END .max_depth=None, n_estimators=50;, score=-14.126 total time= 1.5min
[CV 3/3] END .max_depth=None, n_estimators=50;, score=-14.788 total time= 1.6min
[CV 1/3] END max_depth=None, n_estimators=100;, score=-15.113 total time= 3.1min
[CV 2/3] END max_depth=None, n_estimators=100;, score=-14.014 total time= 3.1min
[CV 3/3] END max_depth=None, n_estimators=100;, score=-14.624 total time= 2.4min
[CV 1/3] END ...max_depth=10, n_estimators=50;, score=-21.195 total time=  19.5s
[CV 2/3] END ...max_depth=10, n_estimators=50;, score=-20.092 total time=  22.4s
[CV 3/3] END ...max_depth=10, n_estimators=50;, score=-20.810 total time=  23.5s
[CV 1/3] END ..max_depth=10, n_estimators=100;, score=-21.123 total time=  44.3s
[CV 2/3] END ..max_depth=10, n_estimators=100;, score=-20.075 total time=  46.5s
[CV 3/3] END ..max_depth=10, n_estimators=100;, s

In [4]:
# `results` maps model name -> summary dict, which pandas lays out as one
# column per model; transposing yields one row per model instead.
results_df = pd.DataFrame(results).transpose()
print(results_df)

# Save the table; the index (model names) is written as the first column.
results_df.to_csv("../../../results/models/baseline_results_rf.csv")


                     model                               best_params  \
RandomForest  RandomForest  {'n_estimators': 100, 'max_depth': None}   

                   MAE      RMSE        R2  
RandomForest  2.267412  3.782081  0.996309  
