<a href="https://colab.research.google.com/github/Mr-houngbo/respire/blob/main/prediction/predict_iqa_esmt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd

In [1]:
!pip install -q gdown
import gdown

# ID des Datasets et noms des outputs
file_ids = ["1BZazndHP417b_nUGeSxmB-wdaYDb6Ngk"]
outputs = ["iqa-164928.csv"]


for i,file_id in enumerate(file_ids):
  url = f"https://drive.google.com/uc?id={file_id}"
  output = outputs[i]
  gdown.download(url, output, quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1BZazndHP417b_nUGeSxmB-wdaYDb6Ngk
To: /content/iqa-164928.csv
100%|██████████| 427/427 [00:00<00:00, 620kB/s]


In [2]:
!pip install -q gdown
import gdown

lien = "https://drive.google.com/file/d//view?usp=sharing"


# ID du Dataset
file_id = "1b4sKColUCmG2LhQO54tRA2_okXiTvR8I"
url = f"https://drive.google.com/uc?id={file_id}"
output = "164928.csv"
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1b4sKColUCmG2LhQO54tRA2_okXiTvR8I
To: /content/164928.csv
100%|██████████| 3.62k/3.62k [00:00<00:00, 3.84MB/s]


'164928.csv'

In [6]:
df_iqa_164928 = pd.read_csv("iqa-164928.csv")
esmt = pd.read_csv("164928.csv")

In [7]:
# Harmoniser : enlever timezone
esmt['UTC Date/Time'] = pd.to_datetime(esmt['UTC Date/Time']).dt.tz_localize(None)
df_iqa_164928['date'] = pd.to_datetime(df_iqa_164928['date'])

# Merge
merged = pd.merge(
    esmt,
    df_iqa_164928,
    left_on='UTC Date/Time',
    right_on='date',
    how='inner'
).drop(columns=['date'])

print(merged.info())
print(merged.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Location ID                 18 non-null     int64         
 1   Location Name               18 non-null     object        
 2   Location Group              0 non-null      float64       
 3   Location Type               18 non-null     object        
 4   Sensor ID                   18 non-null     object        
 5   Place Open                  18 non-null     bool          
 6   Local Date/Time             18 non-null     object        
 7   UTC Date/Time               18 non-null     datetime64[ns]
 8   # of aggregated records     18 non-null     int64         
 9   PM2.5 (μg/m³) raw           18 non-null     float64       
 10  PM2.5 (μg/m³) corrected     18 non-null     float64       
 11  0.3μm particle count        18 non-null     int64         
 

In [8]:
# Supprimer colonnes inutiles
cols_a_supprimer = [
    'Location ID', 'Location Name', 'Location Group', 'Location Type',
    'Sensor ID', 'Place Open', 'UTC Date/Time','# of aggregated records'] + [col for col in merged.columns if "raw" in col.lower()]  # remove raw

merged.drop(columns=cols_a_supprimer, inplace=True, errors='ignore')

In [9]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Local Date/Time             18 non-null     object 
 1   PM2.5 (μg/m³) corrected     18 non-null     float64
 2   0.3μm particle count        18 non-null     int64  
 3   CO2 (ppm) corrected         18 non-null     int64  
 4   Temperature (°C) corrected  18 non-null     float64
 5   Heat Index (°C)             18 non-null     float64
 6   Humidity (%) corrected      18 non-null     int64  
 7   TVOC (ppb)                  18 non-null     int64  
 8   TVOC index                  18 non-null     int64  
 9   NOX index                   18 non-null     int64  
 10  PM1 (μg/m³)                 18 non-null     float64
 11  PM10 (μg/m³)                18 non-null     float64
 12  iqa                         18 non-null     float64
dtypes: float64(6), int64(6), object(1)
me

# Predictionnnn

In [11]:
# Prédiction IQA J+1 avec XGBoost en utilisant TOUTES les features disponibles
# Hypothèse pour la prévision J+1→J+5 : les variables exogènes (PM, CO2, etc.)
# suivent une persistance (valeur du dernier jour connue). On entraîne sur des lags.

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.base import clone
import joblib

# ========= 1) Préparation =========
# df_raw : DataFrame que tu viens d’afficher (13 colonnes)
df_raw = merged.copy()  # mets ici ton DataFrame (celui avec 'Local Date/Time' et toutes les mesures)

# Passer la date en datetime et trier
df_raw['Local Date/Time'] = pd.to_datetime(df_raw['Local Date/Time'])
df_raw = df_raw.sort_values('Local Date/Time').reset_index(drop=True)

# (Optionnel) forcer un pas journalier et combler s'il manque des jours
df_raw = df_raw.set_index('Local Date/Time').asfreq('D')
# Interpolation simple pour combler d’éventuels trous
for c in df_raw.columns:
    df_raw[c] = df_raw[c].interpolate()

df_raw = df_raw.reset_index().rename(columns={'Local Date/Time': 'date'})

# ========= 2) Lags =========
# On crée des lags pour TOUTES les features exogènes + plusieurs lags pour la cible
target_col = 'iqa'
exog_cols = [c for c in df_raw.columns if c not in ['date', target_col]]

# Paramètres de lags
n_lags_target = 1   # lags pour iqa
n_lags_exog   = 1   # lags pour les exogènes

def make_lags(df_in: pd.DataFrame, target: str, exog: list, n_t: int, n_x: int) -> pd.DataFrame:
    df_out = df_in.copy()
    # lags de la cible
    for k in range(1, n_t + 1):
        df_out[f'{target}_lag_{k}'] = df_out[target].shift(k)
    # lags des exogènes
    for col in exog:
        for k in range(1, n_x + 1):
            df_out[f'{col}_lag_{k}'] = df_out[col].shift(k)
    return df_out

df_lags = make_lags(df_raw, target_col, exog_cols, n_lags_target, n_lags_exog).dropna().reset_index(drop=True)

# ========= 3) Train / Test =========
# On prédit iqa (courant) à partir des lags (donc J est prédit par J-1, J-2, ...)
features = [c for c in df_lags.columns if c not in ['date', target_col]]
X = df_lags[features]
y = df_lags[target_col]

train_size = int(len(df_lags) * 0.8)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test,  y_test  = X.iloc[train_size:], y.iloc[train_size:]

# ========= 4) CV + Entraînement =========
n_splits = max(2, min(5, len(X_train) - 1))
tscv = TimeSeriesSplit(n_splits=n_splits)

base_xgb = XGBRegressor(
    n_estimators=800,
    learning_rate=0.04,
    max_depth=3,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    eval_metric="rmse",  # compatible v2+ (défini dans le constructeur)
)

# CV manuelle (pas d'early_stopping pour compat toutes versions)
cv_rmses = []
for tr_idx, val_idx in tscv.split(X_train):
    est = clone(base_xgb)
    est.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx], verbose=False)
    y_val_hat = est.predict(X_train.iloc[val_idx])
    rmse = float(np.sqrt(mean_squared_error(y_train.iloc[val_idx], y_val_hat)))
    cv_rmses.append(rmse)

cv_rmses = np.array(cv_rmses)
print("📊 CV RMSE per fold :", np.round(cv_rmses, 4))
print("📊 CV RMSE mean     :", np.round(cv_rmses.mean(), 4))

# Entraînement final sur tout le train
final_xgb = clone(base_xgb)
final_xgb.fit(X_train, y_train, verbose=False)

# ========= 5) Évaluation =========
y_pred = final_xgb.predict(X_test)
rmse_test = float(np.sqrt(mean_squared_error(y_test, y_pred)))
mae_test  = float(mean_absolute_error(y_test, y_pred))
r2_test   = float(r2_score(y_test, y_pred))
print("🔎 Évaluation sur test :")
print(f"   RMSE = {rmse_test:.4f}")
print(f"   MAE  = {mae_test:.4f}")
print(f"   R2   = {r2_test:.4f}")

# ========= 6) Sauvegarde =========
joblib.dump(final_xgb, "xgb_iqa_all_features.pkl")
print("✅ Modèle sauvegardé : xgb_iqa_all_features.pkl")

# ========= 7) Prévision J+1 =========
# On part de la dernière ligne connue et de ses lags déjà construits
last_row = X.iloc[[-1]]  # DataFrame 1xN
pred_j1 = float(final_xgb.predict(last_row)[0])
print(f"📅 Prédiction J+1 : {pred_j1:.2f}")

# ========= 8) Prévisions J+1 → J+5 (auto-régression) =========
# Règle de persistance pour les exogènes : leurs lags avancent en gardant la dernière valeur connue.
# Les lags de iqa se mettent à jour avec les prédictions successives.

n_days = 5
multi_preds = []

# On travaille sur une copie des features de la dernière ligne
step_feats = last_row.copy()

# Colonnes de lags cible et exogènes
iqa_lag_cols = [f'{target_col}_lag_{k}' for k in range(1, n_lags_target + 1)]
exog_lag_cols = []
for col in exog_cols:
    for k in range(1, n_lags_exog + 1):
        exog_lag_cols.append(f'{col}_lag_{k}')

for _ in range(n_days):
    # prédire
    y_hat = float(final_xgb.predict(step_feats)[0])
    multi_preds.append(y_hat)

    # 1) MAJ lags iqa : shift à droite et mettre lag_1 = y_hat
    iqa_vals = step_feats[iqa_lag_cols].to_numpy().ravel()
    iqa_vals = np.roll(iqa_vals, 1)
    iqa_vals[0] = y_hat
    step_feats[iqa_lag_cols] = iqa_vals

    # 2) MAJ lags exogènes : persistance (on répète la dernière valeur)
    if len(exog_lag_cols) > 0:
        exog_vals = step_feats[exog_lag_cols].to_numpy().ravel()
        exog_vals = np.roll(exog_vals, 1)
        # pour persistance, on remet exog_lag_1 = exog_lag_1 actuel (avant shift) → on récupère l'ancienne valeur
        # Ici, plus simple : on garde la valeur après shift (équivaut à persistance de la dernière connue)
        step_feats[exog_lag_cols] = exog_vals

print("📅 Prédictions J+1 à J+5 :", [round(p, 2) for p in multi_preds])


📊 CV RMSE per fold : [9.8528 3.171  4.2602 3.4989 1.3899]
📊 CV RMSE mean     : 4.4346
🔎 Évaluation sur test :
   RMSE = 1.4155
   MAE  = 0.9035
   R2   = 0.8629
✅ Modèle sauvegardé : xgb_iqa_all_features.pkl
📅 Prédiction J+1 : 130.04
📅 Prédictions J+1 à J+5 : [130.04, 129.66, 130.02, 129.55, 128.94]
