### 1. Imports

In [1]:
from dotenv import load_dotenv
import os

# Load environment variables from a .env file, if one is found.
load_dotenv()

# The rest of the notebook opens "config.yaml" relative to the working directory,
# so move up one level from the notebook folder to reach it.
# The guard keeps the cell idempotent: once config.yaml is visible from the
# current directory, re-running the cell no longer climbs another level.
# NOTE(review): assumes config.yaml lives exactly one level above the notebook — confirm.
if not os.path.exists("config.yaml"):
    os.chdir("..")

In [4]:
%pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Cellule 1: imports
import pandas as pd
import yaml
from pathlib import Path
from src.models.modeling  import train_models_for_zone



In [8]:
def load_config(config_path: str = "config.yaml") -> dict:
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_path: Path to the YAML file. A relative path is resolved
            against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load``. When the file contains no
        YAML document (``safe_load`` yields ``None``), an empty dict is
        returned so the result always supports key lookup.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (which is not UTF-8 by default on Windows).
    with open(config_path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    return {} if cfg is None else cfg

# Cell 2: configuration
cfg = load_config("config.yaml")

# Processed-data location is read from the config file.
processed_dir = Path(cfg["paths"]["data"]["processed"])

# Output folders are fixed relative paths; create each one right after defining it.
models_dir = Path("models")
models_dir.mkdir(parents=True, exist_ok=True)

reports_dir = Path("outputs/reports")
reports_dir.mkdir(parents=True, exist_ok=True)

# The zones to process are the values stored under the `geo_zones` config entry.
zones = [*cfg["geo_zones"].values()]


In [None]:
# Cell 3 – Training loop
all_metrics = []
for zone in zones:
    print(f"→ Training {zone}")
    zone_results = train_models_for_zone(zone, processed_dir, models_dir)

    # Flatten the two-level result mapping into a single row for this zone.
    # Each leaf is expected to expose "MAE", "RMSE" and "MAPE" entries; the
    # column name is "<outer key>_<inner key>_<metric>".
    # NOTE(review): outer/inner keys are presumably horizon and model name — confirm.
    row = {"zone": zone}
    row.update(
        (f"{outer_key}_{inner_key}_{metric}", scores[metric])
        for outer_key, by_inner in zone_results.items()
        for inner_key, scores in by_inner.items()
        for metric in ("MAE", "RMSE", "MAPE")
    )
    all_metrics.append(row)

# Cell 4 – Save the metrics
# One row per zone, written without the positional index column.
metrics_frame = pd.DataFrame(all_metrics)
metrics_frame.to_csv(reports_dir / "all_zones_baseline_metrics.csv", index=False)

→ Processing zone: Peninsule_Iberique
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000769 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4594
[LightGBM] [Info] Number of data points in the train set: 3653, number of used features: 20
[LightGBM] [Info] Start training from score 4946181.019710


found 0 physical cores < 1
  File "c:\Users\NANKOULI\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


TypeError: train_baseline() takes 5 positional arguments but 6 were given

In [None]:
# Cell 4: save the metrics
# Writes the collected per-zone rows to "baseline_metrics.csv" in the reports folder.
baseline_csv = reports_dir.joinpath("baseline_metrics.csv")
pd.DataFrame(data=all_metrics).to_csv(baseline_csv, index=False)


In [None]:
# Cell 5: quick visualisation
# For every zone, plot the observed demand against the saved RandomForest
# model's predictions over the last 20 % of the rows.
import joblib  # required by joblib.load below
import matplotlib.pyplot as plt

for zone in zones:
    df = pd.read_parquet(processed_dir/f"{zone}_processed_daily.parquet")

    # Index of the first row of the evaluation window (computed once, used for
    # both the target and the features so the two slices stay aligned).
    # NOTE(review): assumes the same 80/20 positional split was used at training time — confirm.
    split_idx = int(len(df)*0.8)
    test = df["demand"].iloc[split_idx:]

    # joblib.load unpickles arbitrary objects: only load model files produced by this project.
    model_rf = joblib.load(models_dir/f"{zone}_rf.pkl")
    pred = model_rf.predict(df.drop(columns="demand").iloc[split_idx:])

    plt.figure(figsize=(10,3))
    plt.plot(test.index, test, label="Réel")
    plt.plot(test.index, pred, label="RF prédit")
    plt.title(f"{zone} — RandomForest baseline")
    plt.legend()
    plt.show()


Training LinearRegression...
  -> MAE=638903.9, RMSE=1112.8, MAPE=50.2%
Training Ridge...
  -> MAE=638215.4, RMSE=1112.5, MAPE=50.2%
Training RandomForest...
  -> MAE=1101546.2, RMSE=1356.5, MAPE=53.7%
