# Etape 2.1 : Nettoyage des donnees meteo

## Charger `meteo_raw.csv` avec Pandas

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
import os
warnings.filterwarnings('ignore')

# Configuration affichage
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Chemins
DATA_DIR = "../data_ecf"
OUTPUT_DIR = "../output"
CSV_DIR = "../output/"

# Charger les donnees meteo
df_weather_raw = pd.read_csv(f"{DATA_DIR}/meteo_raw.csv")

print(f"Shape: {df_weather_raw.shape}")
print(f"\nColonnes: {df_weather_raw.columns.tolist()}")
df_weather_raw.head(10)

Shape: (252612, 7)

Colonnes: ['commune', 'timestamp', 'temperature_c', 'humidite_pct', 'rayonnement_solaire_wm2', 'vitesse_vent_kmh', 'precipitation_mm']


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm
0,Saint-Etienne,09/15/2024 15:00:00,17.1,143.3,244.9,14.3,0.0
1,Bordeaux,21/07/2023 15:00,19.6,50.6,414.9,3.2,0.0
2,Montpellier,2023-09-18 20:00:00,18.3,65.7,218.4,13.6,0.0
3,Le Havre,01/03/2024 22:00:00,3.7,94.9,6.8,18.6,11.6
4,Lille,29/10/2024 20:00,14.0,42.9,781.8,4.0,0.0
5,Bordeaux,22/12/2023 13:00,4.4,36.9,796.4,6.1,0.0
6,Marseille,09/15/2023 21:00:00,22.5,86.8,5.8,32.6,0.0
7,Toulouse,30/05/2023 00:00,8.3,66.3,26.4,31.4,7.2
8,Bordeaux,2024-10-05T09:00:00,11.5,69.8,71.4,34.4,0.0
9,Toulon,2024-09-28T21:00:00,19.2,79.0,13.1,31.8,0.0


In [3]:
# Audit : essai : avant nettoyage
audit_before = pd.DataFrame({
    "non_null_count": df_weather_raw.notna().sum(),
    "null_count": df_weather_raw.isna().sum(),
    "completeness_%": (1 - df_weather_raw.isna().mean()) * 100
})

audit_before


Unnamed: 0,non_null_count,null_count,completeness_%
commune,252612,0,100.0
timestamp,252612,0,100.0
temperature_c,251383,1229,99.513483
humidite_pct,252612,0,100.0
rayonnement_solaire_wm2,252612,0,100.0
vitesse_vent_kmh,252612,0,100.0
precipitation_mm,252612,0,100.0


## Standardiser les formats de dates

In [4]:
ts_raw = df_weather_raw["timestamp"].astype(str).str.strip()

# 1) tentative format FR (jour/mois)
ts_fr = pd.to_datetime(ts_raw, errors="coerce", dayfirst=True)

# 2) fallback (US / ISO / autres)
ts_any = pd.to_datetime(ts_raw, errors="coerce", dayfirst=False)

df_weather_raw["timestamp"] = ts_fr.fillna(ts_any)

print("Timestamps invalides:", df_weather_raw["timestamp"].isna().sum())

# suppression ligne sans timestamp
df_weather_raw = df_weather_raw.dropna(subset=["timestamp"])
df_weather_raw

Timestamps invalides: 189116


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm
0,Saint-Etienne,2024-09-15 15:00:00,17.1,143.3,244.9,14.3,0.0
3,Le Havre,2024-01-03 22:00:00,3.7,94.9,6.8,18.6,11.6
6,Marseille,2023-09-15 21:00:00,22.5,86.8,5.8,32.6,0.0
18,Rennes,2024-06-05 08:00:00,26.7,40.2,603.6,35.9,14.4
21,Toulouse,2023-09-24 14:00:00,14.0,93.0,146.9,2.7,0.0
...,...,...,...,...,...,...,...
252597,Lille,2023-08-16 09:00:00,23.9,84.2,345.3,16.3,0.0
252600,Nantes,2024-07-25 12:00:00,19.7,90.2,197.7,20.3,6.8
252604,Marseille,2023-03-27 21:00:00,136,87.0,7.6,10.7,0.0
252608,Montpellier,2024-03-13 03:00:00,14.1,75.0,12.9,19.2,12.8


## Convertir les colonnes numériques en gérant les erreurs

In [5]:
print("Conversion des colonnes numériques")
num_cols = [
    "temperature_c",
    "humidite_pct",
    "rayonnement_solaire_wm2",
    "vitesse_vent_kmh",
    "precipitation_mm"
]

pattern = r"^-?\d+([.,]\d+)?$"  # nombre entier ou décimal (virgule ou point)

for c in num_cols:
    df_weather_raw[c] = (
        df_weather_raw[c]
        .astype(str)
        .str.strip()
        # tout ce qui n'est PAS un nombre valide → NaN
        .where(df_weather_raw[c].astype(str).str.match(pattern), np.nan)
        # harmonisation virgule → point
        .str.replace(",", ".", regex=False)
        .astype(float)
    )

print("  Conversions effectuees.")
df_weather_raw

Conversion des colonnes numériques
  Conversions effectuees.


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm
0,Saint-Etienne,2024-09-15 15:00:00,17.1,143.3,244.9,14.3,0.0
3,Le Havre,2024-01-03 22:00:00,3.7,94.9,6.8,18.6,11.6
6,Marseille,2023-09-15 21:00:00,22.5,86.8,5.8,32.6,0.0
18,Rennes,2024-06-05 08:00:00,26.7,40.2,603.6,35.9,14.4
21,Toulouse,2023-09-24 14:00:00,14.0,93.0,146.9,2.7,0.0
...,...,...,...,...,...,...,...
252597,Lille,2023-08-16 09:00:00,23.9,84.2,345.3,16.3,0.0
252600,Nantes,2024-07-25 12:00:00,19.7,90.2,197.7,20.3,6.8
252604,Marseille,2023-03-27 21:00:00,13.6,87.0,7.6,10.7,0.0
252608,Montpellier,2024-03-13 03:00:00,14.1,75.0,12.9,19.2,12.8


## Corriger les valeurs aberrantes :

In [6]:
# Temperatures hors [-40, 50] -> NaN puis interpolation
df_weather_raw.loc[(df_weather_raw["temperature_c"] < -40) | (df_weather_raw["temperature_c"] > 50), "temperature_c"] = np.nan

# Humidite hors [0, 100] -> clipping
df_weather_raw["humidite_pct"] = df_weather_raw["humidite_pct"].clip(0, 100)

# Rayonnement solaire negatif -> 0
df_weather_raw.loc[df_weather_raw["rayonnement_solaire_wm2"] < 0, "rayonnement_solaire_wm2"] = 0
df_weather_raw

Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm
0,Saint-Etienne,2024-09-15 15:00:00,17.1,100.0,244.9,14.3,0.0
3,Le Havre,2024-01-03 22:00:00,3.7,94.9,6.8,18.6,11.6
6,Marseille,2023-09-15 21:00:00,22.5,86.8,5.8,32.6,0.0
18,Rennes,2024-06-05 08:00:00,26.7,40.2,603.6,35.9,14.4
21,Toulouse,2023-09-24 14:00:00,14.0,93.0,146.9,2.7,0.0
...,...,...,...,...,...,...,...
252597,Lille,2023-08-16 09:00:00,23.9,84.2,345.3,16.3,0.0
252600,Nantes,2024-07-25 12:00:00,19.7,90.2,197.7,20.3,6.8
252604,Marseille,2023-03-27 21:00:00,13.6,87.0,7.6,10.7,0.0
252608,Montpellier,2024-03-13 03:00:00,14.1,75.0,12.9,19.2,12.8


## Traiter les valeurs manquantes :

In [7]:
# interpolation par commune 
df_weather_raw = df_weather_raw.sort_values(["commune", "timestamp"])

# Interpolation lineaire pour temperature et humidite
df_weather_raw["temperature_c"] = df_weather_raw.groupby("commune")["temperature_c"].transform(lambda s: s.interpolate(method="linear"))
df_weather_raw["humidite_pct"] = df_weather_raw.groupby("commune")["humidite_pct"].transform(lambda s: s.interpolate(method="linear"))

# Forward fill pour precipitation
df_weather_raw["precipitation_mm"] = df_weather_raw.groupby("commune")["precipitation_mm"].ffill()

# Forward fill pour precipitation
before_na = (df_weather_raw['precipitation_mm'].isna() | (df_weather_raw['precipitation_mm'] == '')).sum()
df_weather_raw['precipitation_mm'] = df_weather_raw['precipitation_mm'].replace('', np.nan)
df_weather_raw['precipitation_mm'] = df_weather_raw.groupby('commune')['precipitation_mm'].transform(
    lambda x: x.ffill().bfill()
)
after_na = df_weather_raw['precipitation_mm'].isna().sum()
print(f"  precipitation: {before_na} -> {after_na} NaN (forward filled)")
df_weather_raw


  precipitation: 0 -> 0 NaN (forward filled)


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm
233980,Bordeaux,2023-01-01 01:00:00,2.7,39.7,6.1,21.5,5.3
5904,Bordeaux,2023-01-01 03:00:00,10.3,64.2,24.3,0.6,14.7
23544,Bordeaux,2023-01-01 04:00:00,0.9,36.4,20.7,35.6,0.0
52861,Bordeaux,2023-01-01 14:00:00,6.9,61.8,691.1,36.8,0.0
156468,Bordeaux,2023-01-01 21:00:00,10.1,55.9,3.6,6.4,0.0
...,...,...,...,...,...,...,...
86427,Toulouse,2024-12-31 13:00:00,-0.6,40.1,28.7,33.9,0.0
153080,Toulouse,2024-12-31 15:00:00,-0.4,61.0,252.2,18.8,0.0
126592,Toulouse,2024-12-31 17:00:00,6.5,70.5,512.6,19.6,0.0
109239,Toulouse,2024-12-31 21:00:00,8.3,53.3,23.1,14.6,0.0


## Ajouter des colonnes temporelles (jour, mois, saison, jour de semaine)

In [8]:
print("\n[5/5] Ajout des colonnes temporelles...")
df_weather_raw["jour"] = df_weather_raw["timestamp"].dt.day
df_weather_raw["mois"] = df_weather_raw["timestamp"].dt.month
df_weather_raw["jour_semaine"] = df_weather_raw["timestamp"].dt.dayofweek

df_weather_raw["saison"] = df_weather_raw["mois"].map({
    12: 'Hiver', 1: 'Hiver', 2: 'Hiver',
    3: 'Printemps', 4: 'Printemps', 5: 'Printemps',
    6: 'Ete', 7: 'Ete', 8: 'Ete',
    9: 'Automne', 10: 'Automne', 11: 'Automne'
})

print("  Colonnes ajoutees: jour, mois, saison, jour de semaine")


df_weather_raw



[5/5] Ajout des colonnes temporelles...
  Colonnes ajoutees: jour, mois, saison, jour de semaine


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm,jour,mois,jour_semaine,saison
233980,Bordeaux,2023-01-01 01:00:00,2.7,39.7,6.1,21.5,5.3,1,1,6,Hiver
5904,Bordeaux,2023-01-01 03:00:00,10.3,64.2,24.3,0.6,14.7,1,1,6,Hiver
23544,Bordeaux,2023-01-01 04:00:00,0.9,36.4,20.7,35.6,0.0,1,1,6,Hiver
52861,Bordeaux,2023-01-01 14:00:00,6.9,61.8,691.1,36.8,0.0,1,1,6,Hiver
156468,Bordeaux,2023-01-01 21:00:00,10.1,55.9,3.6,6.4,0.0,1,1,6,Hiver
...,...,...,...,...,...,...,...,...,...,...,...
86427,Toulouse,2024-12-31 13:00:00,-0.6,40.1,28.7,33.9,0.0,31,12,1,Hiver
153080,Toulouse,2024-12-31 15:00:00,-0.4,61.0,252.2,18.8,0.0,31,12,1,Hiver
126592,Toulouse,2024-12-31 17:00:00,6.5,70.5,512.6,19.6,0.0,31,12,1,Hiver
109239,Toulouse,2024-12-31 21:00:00,8.3,53.3,23.1,14.6,0.0,31,12,1,Hiver


In [9]:
# Audit : essai : après nettoyage
audit_after = pd.DataFrame({
    "non_null": df_weather_raw.notna().sum(),
    "null": df_weather_raw.isna().sum(),
    "completeness_%": (1 - df_weather_raw.isna().mean()) * 100
}).sort_values("completeness_%")

audit_after

Unnamed: 0,non_null,null,completeness_%
commune,63496,0,100.0
timestamp,63496,0,100.0
temperature_c,63496,0,100.0
humidite_pct,63496,0,100.0
rayonnement_solaire_wm2,63496,0,100.0
vitesse_vent_kmh,63496,0,100.0
precipitation_mm,63496,0,100.0
jour,63496,0,100.0
mois,63496,0,100.0
jour_semaine,63496,0,100.0


## Export output/meteo_clean.csv

In [10]:
import os

OUTPUT_DIR = "../output"
OUT_PATH = os.path.join(OUTPUT_DIR, "meteo_clean.csv")

os.makedirs(OUTPUT_DIR, exist_ok=True)

df_weather_raw.to_csv(OUT_PATH, index=False)

print("Export :", OUT_PATH)
print("Shape final :", df_weather_raw.shape)


Export : ../output\meteo_clean.csv
Shape final : (63496, 11)


## Rapport avant/apres nettoyage (completude par colonne)

In [11]:
audit_compare = audit_before.join(audit_after, lsuffix="_before", rsuffix="_after")
audit_compare["delta_completeness_%"] = (
    audit_compare["completeness_%_after"] - audit_compare["completeness_%_before"]
)

# tri par amélioration
audit_compare = audit_compare.sort_values("delta_completeness_%", ascending=False)

audit_compare

Unnamed: 0,non_null_count,null_count,completeness_%_before,non_null,null,completeness_%_after,delta_completeness_%
temperature_c,251383,1229,99.513483,63496,0,100.0,0.486517
commune,252612,0,100.0,63496,0,100.0,0.0
timestamp,252612,0,100.0,63496,0,100.0,0.0
humidite_pct,252612,0,100.0,63496,0,100.0,0.0
rayonnement_solaire_wm2,252612,0,100.0,63496,0,100.0,0.0
vitesse_vent_kmh,252612,0,100.0,63496,0,100.0,0.0
precipitation_mm,252612,0,100.0,63496,0,100.0,0.0
