In [1]:


from pathlib import Path

import joblib
import numpy as np
import os
import pandas as pd
from joblib import parallel_backend
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Location of the pre-processed, feature-engineered and scaled splits.
DATA_DIR = Path("../../../data/processed")
# Name of the regression target column.
TARGET = 'Calories'

train = pd.read_csv(DATA_DIR / 'train_fe_scaled.csv')
val = pd.read_csv(DATA_DIR / 'val_fe_scaled.csv')

# Every training column is a feature except the row identifier and the target.
FEATURES = [col for col in train.columns if col != 'id' and col != TARGET]

X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]


In [2]:
def report_results(model_name, grid, X_val, y_val):
    """Score the best estimator of a fitted search on the validation split.

    Parameters
    ----------
    model_name
        Label stored under the ``"model"`` key of the summary.
    grid
        Fitted search object; its ``best_estimator_`` and ``best_params_``
        attributes are read.
    X_val, y_val
        Validation features and targets, passed to ``predict`` and to the
        metric functions respectively.

    Returns
    -------
    tuple
        ``(summary, estimator)`` where ``summary`` is a dict with the keys
        ``"model"``, ``"best_params"``, ``"MAE"``, ``"RMSE"`` and ``"R2"``,
        and ``estimator`` is ``grid.best_estimator_``.
    """
    estimator = grid.best_estimator_
    y_pred = estimator.predict(X_val)

    summary = {
        "model": model_name,
        "best_params": grid.best_params_,
        "MAE": mean_absolute_error(y_val, y_pred),
        # RMSE is derived as the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_val, y_pred)),
        "R2": r2_score(y_val, y_pred),
    }
    return summary, estimator

# Default CSV file that accumulates one row per recorded result.
RESULTS_CSV = "../../../results/baseline_results.csv"


def append_result_to_csv(result_dict, csv_path=RESULTS_CSV):
    """Append one result record as a new row of a CSV file.

    The whole file is re-read, the new row is concatenated to it and the
    combined table is written back, so columns are aligned by name.

    Parameters
    ----------
    result_dict : dict
        Mapping of column name to value for the new row.
    csv_path : str or path-like, optional
        Destination CSV file. Defaults to ``RESULTS_CSV``. Missing parent
        directories are created.

    Returns
    -------
    None
    """
    df_new = pd.DataFrame([result_dict])

    # Writing into a directory that does not exist yet would raise, so make
    # sure the parent directory is there (a bare filename has no parent).
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # A zero-byte file has no header for read_csv to parse; treat it the same
    # as a missing file instead of failing with EmptyDataError.
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        df_existing = pd.read_csv(csv_path)
        df_all = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        df_all = df_new
    df_all.to_csv(csv_path, index=False)

In [3]:
# Per-model validation summaries, keyed by model name.
results = {}

# Regularisation strengths to try for Ridge.
search_space_ridge = {"alpha": [0.01, 0.1, 1.0, 10.0]}
ridge_grid = GridSearchCV(Ridge(random_state=42), search_space_ridge, cv=3,
                          scoring="neg_mean_squared_error", n_jobs=-1,
                          verbose=3)

# Create the model output directory before fitting: joblib.dump below does not
# create missing directories, and failing only after the search has finished
# would throw away the fitted model.
os.makedirs("../../../results/models", exist_ok=True)

# NOTE(review): GridSearchCV is given n_jobs=-1 explicitly while the backend
# context asks for n_jobs=3; the explicit estimator value presumably takes
# precedence, leaving the 3 unused - confirm which one is intended.
with parallel_backend('threading', n_jobs=3):
    ridge_grid.fit(X_train, y_train)

res, best = report_results("Ridge", ridge_grid, X_val, y_val)
results["Ridge"] = res
joblib.dump(best, "../../../results/models/Ridge.joblib")
# Each execution of this cell appends one more row to the shared results CSV.
append_result_to_csv(res)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 2/3] END ......................alpha=0.01;, score=-60.736 total time=   0.4s[CV 3/3] END ......................alpha=0.01;, score=-60.117 total time=   0.4s
[CV 3/3] END .......................alpha=0.1;, score=-60.117 total time=   0.4s

[CV 1/3] END .......................alpha=1.0;, score=-61.314 total time=   0.5s
[CV 1/3] END .......................alpha=0.1;, score=-61.314 total time=   0.5s
[CV 1/3] END ......................alpha=0.01;, score=-61.314 total time=   0.5s
[CV 2/3] END .......................alpha=1.0;, score=-60.737 total time=   0.5s
[CV 2/3] END .......................alpha=0.1;, score=-60.736 total time=   0.5s
[CV 1/3] END ......................alpha=10.0;, score=-61.316 total time=   0.2s
[CV 2/3] END ......................alpha=10.0;, score=-60.745 total time=   0.2s
[CV 3/3] END .......................alpha=1.0;, score=-60.118 total time=   0.3s
[CV 3/3] END ......................alpha=10.0;, s

In [4]:
# `results` maps model name -> summary dict, so the raw frame has one column
# per model; transpose it to get one row per model.
results_df = pd.DataFrame(results).transpose()
print(results_df)
# Persist the table next to the saved model (the index holds the model names).
results_df.to_csv("../../../results/models/baseline_results_ridge.csv")


       model      best_params       MAE      RMSE        R2
Ridge  Ridge  {'alpha': 0.01}  5.539119  7.763404  0.984449
