In [1]:


import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from joblib import parallel_backend
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Directory holding the train/validation CSVs, relative to this notebook.
DATA_DIR = Path("../../../data/processed")
train = pd.read_csv(DATA_DIR.joinpath("train_fe_scaled.csv"))
val = pd.read_csv(DATA_DIR.joinpath("val_fe_scaled.csv"))

# Column to predict; every other column except the row id is a model input.
TARGET = "Calories"
FEATURES = [col for col in train.columns if col not in ("id", TARGET)]

# The feature list is derived from the training frame and reused for the
# validation frame, so both matrices share the same columns in the same order.
X_train = train[FEATURES]
y_train = train[TARGET]
X_val = val[FEATURES]
y_val = val[TARGET]


In [2]:
def report_results(model_name, grid, X_val, y_val):
    """Score the best estimator of a fitted search on held-out data.

    Parameters
    ----------
    model_name
        Label stored under the ``"model"`` key of the returned dict.
    grid
        Fitted search object; only ``best_estimator_`` and ``best_params_``
        are read from it.
    X_val, y_val
        Validation inputs and targets; ``X_val`` is passed to ``predict`` and
        ``y_val`` is compared against the predictions.

    Returns
    -------
    tuple
        ``(metrics, estimator)`` where ``metrics`` is a dict with the keys
        ``"model"``, ``"best_params"``, ``"MAE"``, ``"RMSE"`` and ``"R2"``,
        and ``estimator`` is ``grid.best_estimator_``.
    """
    winner = grid.best_estimator_
    y_hat = winner.predict(X_val)

    summary = {
        "model": model_name,
        "best_params": grid.best_params_,
        "MAE": mean_absolute_error(y_val, y_hat),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_val, y_hat)),
        "R2": r2_score(y_val, y_hat),
    }
    return summary, winner

# Shared results file that every baseline run appends one row to.
RESULTS_CSV = "../../../results/baseline_results.csv"


def append_result_to_csv(result_dict, csv_path=RESULTS_CSV):
    """Append one result row to a CSV file, creating the file if needed.

    The existing file (if any) is read, the new row is concatenated to it and
    the whole table is written back, so rows whose keys differ from the
    existing header are still aligned by column name.

    Parameters
    ----------
    result_dict : dict
        Mapping of column name to value; becomes a single row.
    csv_path : str or os.PathLike
        Destination file. Missing parent directories are created. An existing
        zero-byte file is treated as if it did not exist instead of making
        ``pd.read_csv`` raise ``EmptyDataError``.

    Returns
    -------
    None

    Notes
    -----
    Calling this twice with the same dict writes two identical rows; the
    function does not de-duplicate.
    """
    df_new = pd.DataFrame([result_dict])

    # Ensure the target directory exists so the write cannot fail on a
    # fresh checkout where the results folder has not been created yet.
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Only read back a file that actually has content.
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        df_existing = pd.read_csv(csv_path)
        df_all = pd.concat([df_existing, df_new], ignore_index=True)
    else:
        df_all = df_new
    df_all.to_csv(csv_path, index=False)

In [3]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

# Create the model output directory *before* the search starts: the fit takes
# several minutes, and discovering a missing folder only at joblib.dump time
# would throw the fitted model away.
KNN_MODEL_PATH = "../../../results/models/KNeighbors.joblib"
os.makedirs(os.path.dirname(KNN_MODEL_PATH), exist_ok=True)

results = {}

# 3 x 2 = 6 combinations in total; with n_iter=5 the search evaluates five of
# them (the log reports "5 candidates").
param_dist_knn = {
    "n_neighbors": [3, 5, 7],
    "weights": ["uniform", "distance"]
}
# n_jobs=1 is set explicitly on the search itself.
# NOTE(review): presumably the threading context below is meant to parallelise
# work inside the estimator rather than the CV loop — confirm this is intended.
knn_grid = RandomizedSearchCV(KNeighborsRegressor(), param_distributions=param_dist_knn,
                              n_iter=5, cv=3, scoring="neg_mean_squared_error", n_jobs=1, random_state=42,
                              verbose=3)
with parallel_backend('threading', n_jobs=3):
    knn_grid.fit(X_train, y_train)

# Evaluate the best estimator on the validation split and persist everything.
res, best = report_results("KNeighbors", knn_grid, X_val, y_val)
results["KNeighbors"] = res
joblib.dump(best, KNN_MODEL_PATH)
# Re-running this cell appends another row to the shared results CSV.
append_result_to_csv(res)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END ..n_neighbors=3, weights=uniform;, score=-24.228 total time=  23.4s
[CV 2/3] END ..n_neighbors=3, weights=uniform;, score=-23.203 total time=  29.0s
[CV 3/3] END ..n_neighbors=3, weights=uniform;, score=-23.359 total time=  29.0s
[CV 1/3] END .n_neighbors=3, weights=distance;, score=-23.987 total time=  28.9s
[CV 2/3] END .n_neighbors=3, weights=distance;, score=-22.917 total time=  22.3s
[CV 3/3] END .n_neighbors=3, weights=distance;, score=-23.085 total time=  32.6s
[CV 1/3] END .n_neighbors=7, weights=distance;, score=-19.667 total time=  24.7s
[CV 2/3] END .n_neighbors=7, weights=distance;, score=-18.596 total time=  38.0s
[CV 3/3] END .n_neighbors=7, weights=distance;, score=-18.982 total time=  38.5s
[CV 1/3] END ..n_neighbors=5, weights=uniform;, score=-21.157 total time=  36.8s
[CV 2/3] END ..n_neighbors=5, weights=uniform;, score=-20.168 total time=  36.2s
[CV 3/3] END ..n_neighbors=5, weights=uniform;, s

In [4]:
# `results` maps model name -> metrics dict; transposing puts one model per
# row and one metric per column.
results_df = pd.DataFrame(results).transpose()
print(results_df)
# The index (model name) is written as the first CSV column.
results_df.to_csv(path_or_buf="../../../results/models/baseline_results_knn.csv")


                 model                                best_params       MAE  \
KNeighbors  KNeighbors  {'weights': 'distance', 'n_neighbors': 7}  2.710705   

                RMSE        R2  
KNeighbors  4.279837  0.995274  
