In [7]:


from pathlib import Path

import joblib
import numpy as np
import os
import pandas as pd
from joblib import parallel_backend
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Directory holding the training and validation CSV files (relative path).
DATA_DIR = Path("../../../data/processed")
train = pd.read_csv(DATA_DIR.joinpath('train_fe_scaled.csv'))
val = pd.read_csv(DATA_DIR.joinpath('val_fe_scaled.csv'))

# Column the models are fitted to predict.
TARGET = 'Calories'
# Every column of the training frame except 'id' and the target is a model input.
FEATURES = [col for col in train.columns if col != 'id' and col != TARGET]

# Split each frame into its feature matrix and target vector.
X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]


In [8]:
def report_results(model_name, grid, X_val, y_val):
    """Score the best estimator of a fitted search on the validation data.

    Parameters
    ----------
    model_name : label stored under the "model" key of the summary.
    grid : fitted search object exposing ``best_estimator_`` and
        ``best_params_``.
    X_val : validation features passed to the estimator's ``predict``.
    y_val : true validation targets the predictions are compared with.

    Returns
    -------
    tuple
        ``(summary, estimator)`` where ``summary`` is a dict with the keys
        "model", "best_params", "MAE", "RMSE" and "R2", and ``estimator`` is
        ``grid.best_estimator_``.
    """
    winner = grid.best_estimator_
    y_hat = winner.predict(X_val)

    # RMSE is derived from the MSE rather than computed separately.
    squared_error = mean_squared_error(y_val, y_hat)

    summary = dict(
        model=model_name,
        best_params=grid.best_params_,
        MAE=mean_absolute_error(y_val, y_hat),
        RMSE=np.sqrt(squared_error),
        R2=r2_score(y_val, y_hat),
    )
    return summary, winner

# Default CSV that accumulates one row per evaluated model.
RESULTS_CSV = "../../../results/baseline_results.csv"


def append_result_to_csv(result_dict, csv_path=RESULTS_CSV):
    """Append one result row to the CSV at ``csv_path``, creating it if needed.

    Parameters
    ----------
    result_dict : dict
        Mapping of column name to value; it becomes a single new row.
    csv_path : str or os.PathLike, optional
        Destination file. Defaults to ``RESULTS_CSV``.

    Notes
    -----
    The file is read in full, the new row is concatenated, and the whole
    table is written back without the index. Missing parent directories are
    created first, and a zero-byte file is treated as if it did not exist,
    so neither situation raises.
    """
    df_new = pd.DataFrame([result_dict])

    # to_csv refuses to write into a directory that does not exist yet.
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # pd.read_csv raises EmptyDataError on a zero-byte file, so only read
    # back files that actually contain something.
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        df_existing = pd.read_csv(csv_path)
        df_all = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        df_all = df_new
    df_all.to_csv(csv_path, index=False)

In [9]:
# Per-model result summaries collected in this notebook, keyed by model name.
results = {}

# Create the model output directory before fitting: joblib.dump does not
# create missing folders, and discovering that only after the grid search
# has finished would throw away the fitted estimator.
os.makedirs("../../../results/models", exist_ok=True)

# Candidate regularisation strengths, one decade apart.
search_space_lasso = {"alpha": [0.001, 0.01, 0.1, 1.0, 10.0]}
lasso_grid = GridSearchCV(
    Lasso(random_state=42, max_iter=10000),
    search_space_lasso,
    cv=3,
    scoring="neg_mean_squared_error",  # higher is better, hence the negated MSE
    n_jobs=1,
    verbose=3,
)
lasso_grid.fit(X_train, y_train)

# Evaluate the best candidate on the validation split, then persist both
# the fitted estimator and its metrics.
res, best = report_results("Lasso", lasso_grid, X_val, y_val)
results["Lasso"] = res
joblib.dump(best, "../../../results/models/Lasso.joblib")
append_result_to_csv(res)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END .....................alpha=0.001;, score=-61.314 total time= 1.1min
[CV 2/3] END .....................alpha=0.001;, score=-60.738 total time= 1.2min
[CV 3/3] END .....................alpha=0.001;, score=-60.121 total time= 1.2min
[CV 1/3] END ......................alpha=0.01;, score=-61.564 total time=  18.9s
[CV 2/3] END ......................alpha=0.01;, score=-60.990 total time=  26.4s
[CV 3/3] END ......................alpha=0.01;, score=-60.412 total time=  21.3s
[CV 1/3] END .......................alpha=0.1;, score=-73.327 total time=  11.9s
[CV 2/3] END .......................alpha=0.1;, score=-73.045 total time=  12.3s
[CV 3/3] END .......................alpha=0.1;, score=-72.574 total time=  12.4s
[CV 1/3] END .......................alpha=1.0;, score=-91.953 total time=   1.6s
[CV 2/3] END .......................alpha=1.0;, score=-92.218 total time=   1.6s
[CV 3/3] END .......................alpha=1.0;, s

In [10]:
# `results` maps model name -> metrics dict, so the frame is built with one
# column per model and then flipped to get one row per model.
results_df = pd.DataFrame(results).transpose()
print(results_df)
# The index (model name) is kept in the exported file.
results_df.to_csv("../../../results/models/baseline_results_lasso.csv")


       model       best_params       MAE      RMSE        R2
Lasso  Lasso  {'alpha': 0.001}  5.537306  7.763374  0.984449
