# Étape 3 - Nettoyage avancé avec Pandas

**Objectif** : préparer les données météo et les fusionner avec les données de pollution.

---
---

## Imports

In [166]:
import sys
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np

---

## Chemins des données

In [167]:
# Project data folder: "../data" relative to the notebook's working directory,
# resolved to an absolute path.
DATA_DIR = Path.cwd().joinpath("..", "data").resolve()
# Derived locations are kept as plain strings.
OUTPUT_DIR = str(DATA_DIR / "output" / "pollution_meteo_clean")
PARQUET_DIR = str(Path(OUTPUT_DIR) / "air_quality_clean")
WEATHER_PATH = str(DATA_DIR / "weather_raw.csv")

---

## 3.1 Chargement du fichier weather_raw.csv avec Pandas (les données météo brutes)

In [168]:
## Load the raw weather CSV into a DataFrame
df_weather_raw = pd.read_csv(WEATHER_PATH)

## Print basic facts: shape and column names
print(f"- Shape: {df_weather_raw.shape}")
print(f"- Colonnes: {df_weather_raw.columns.tolist()}")
print()
# Column dtypes and non-null counts (info() prints directly to stdout)
print("- Infos sur les types : ")
df_weather_raw.info()
print()
## Preview: the last expression of the cell is rendered as a table
print("- Appercu des donnees : ")
df_weather_raw.head(10)

- Shape: (42172, 7)
- Colonnes: ['city', 'timestamp', 'temperature_c', 'humidity_pct', 'wind_speed_kmh', 'precipitation_mm', 'weather_condition']

- Infos sur les types : 
<class 'pandas.DataFrame'>
RangeIndex: 42172 entries, 0 to 42171
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   city               42172 non-null  str    
 1   timestamp          42172 non-null  str    
 2   temperature_c      41724 non-null  str    
 3   humidity_pct       41760 non-null  float64
 4   wind_speed_kmh     42172 non-null  float64
 5   precipitation_mm   42172 non-null  float64
 6   weather_condition  41310 non-null  str    
dtypes: float64(3), str(4)
memory usage: 3.7 MB

- Appercu des donnees : 


Unnamed: 0,city,timestamp,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition
0,Grenoble,2024-03-12 09:00:00,14.8,74.6,24.8,2.4,pluvieux
1,Strasbourg,05/04/2024 21:00,10.8,78.4,5.7,0.0,neigeux
2,Marseille,28/01/2024 23:00,13.3,60.4,45.7,3.5,pluvieux
3,Bordeaux,29/01/2024 12:00,1.3,84.1,43.3,0.0,brumeux
4,Marseille,2024-01-15 00:00:00,13.3,87.4,21.7,6.0,orageux
5,Lille,2024-05-14 14:00:00,15.7,45.4,0.0,0.0,brumeux
6,Paris,2024-02-12 22:00:00,1.5,93.8,29.6,0.0,brumeux
7,Nantes,04/17/2024 14:00:00,19.2,51.2,49.4,0.0,pluvieux
8,Bordeaux,04/25/2024 09:00:00,199.0,43.6,22.9,0.7,pluvieux
9,Toulouse,01/06/2024 13:00,35.7,61.7,29.2,6.2,pluvieux


---

## 3.2 Identification et traitement des valeurs manquantes (diagnostic de la qualité des données)
Cette section établit le diagnostic ; les traitements suivants sont appliqués à l'étape 3.3 :
- Interpolation linéaire pour la température et l'humidité.
- Forward fill pour les conditions météo.

In [169]:
def quality_report(df, name="DataFrame"):
    """Print a short header and return a per-column data-quality summary.

    For every column the report lists its dtype, the number of missing
    entries, the completeness percentage and the number of distinct values.
    In text columns (``object`` dtype or a pandas string dtype) empty strings
    are counted as missing in addition to NaN/None.

    Args:
        df: DataFrame to inspect.
        name: Label printed in the report header.

    Returns:
        pandas.DataFrame with one row per column of ``df`` and the columns
        ``Colonne``, ``Type``, ``Manquants``, ``Completude %`` and ``Uniques``.
        ``Completude %`` is NaN when ``df`` has no rows.
    """
    print(f"RAPPORT QUALITE - {name}")
    print(f"- Lignes: {len(df):,}")
    print(f"- Colonnes: {len(df.columns)}")

    total = len(df)
    report = []
    for col in df.columns:
        series = df[col]
        dtype = series.dtype

        missing = int(series.isna().sum())
        # Text may be stored as ``object`` or as a dedicated string dtype;
        # checking only for 'object' would skip the empty-string count for
        # the latter.
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            missing += int((series == '').sum())

        # No rows: completeness is undefined rather than a division by zero.
        completude = (1 - missing / total) * 100 if total else float('nan')

        report.append({
            'Colonne': col,
            'Type': str(dtype),
            'Manquants': missing,
            'Completude %': round(completude, 2),
            'Uniques': series.nunique()
        })

    return pd.DataFrame(report)

# Quality snapshot of the raw weather data, taken before any cleaning step.
quality_report(df_weather_raw, "Weather Raw")

RAPPORT QUALITE - Weather Raw
- Lignes: 42,172
- Colonnes: 7


Unnamed: 0,Colonne,Type,Manquants,Completude %,Uniques
0,city,str,0,100.0,10
1,timestamp,str,0,100.0,16406
2,temperature_c,str,448,98.94,816
3,humidity_pct,float64,412,99.02,836
4,wind_speed_kmh,float64,0,100.0,501
5,precipitation_mm,float64,0,100.0,101
6,weather_condition,str,862,97.96,6


In [170]:
## Inspect temperature_c for entries that do not look like a number
print("- Valeurs uniques non numeriques dans temperature_c:")
# Accepted shape: optional minus sign, digits, optional '.' or ',' separator,
# optional decimals. Anything that is not matched (including values the
# matcher sees as missing, via na=False) is flagged.
temperature_text = df_weather_raw['temperature_c'].astype(str)
looks_numeric = temperature_text.str.match(r'^-?[0-9]+[.,]?[0-9]*$', na=False)
temp_non_numeric = df_weather_raw.loc[~looks_numeric, 'temperature_c'].unique()
print(temp_non_numeric)

- Valeurs uniques non numeriques dans temperature_c:
<ArrowStringArray>
[nan]
Length: 1, dtype: str


In [171]:
## Temperature outliers: diagnostic on a scratch copy, the raw frame is untouched
df_temp = df_weather_raw.copy()
# Decimal commas become dots; whatever still fails to parse turns into NaN.
temperature_text = df_temp['temperature_c'].astype(str).str.replace(',', '.')
df_temp['temperature_c'] = pd.to_numeric(temperature_text, errors='coerce')
temperature = df_temp['temperature_c']

print("- Distribution temperature:")
print(temperature.describe())

# Count readings outside the [-40, 50] range used later by the cleaning step.
n_too_cold = (temperature < -40).sum()
n_too_hot = (temperature > 50).sum()
print("\n- Nombres des valeurs aberrantes de temperature :")
print(f"    - Temperatures < -40: {n_too_cold}")
print(f"    - Temperatures > 50: {n_too_hot}")

- Distribution temperature:
count    41724.000000
mean        13.995688
std         11.963235
min        -80.000000
25%          7.900000
50%         13.500000
75%         20.100000
max        100.000000
Name: temperature_c, dtype: float64

- Nombres des valeurs aberrantes de temperature :
    - Temperatures < -40: 165
    - Temperatures > 50: 290


In [172]:
## Humidity diagnostic: count readings outside [0, 100]
df_temp['humidity_pct'] = pd.to_numeric(df_temp['humidity_pct'], errors='coerce')
humidity = df_temp['humidity_pct']

print("\n- Distribution humidite:")
print(humidity.describe())

n_above_100 = (humidity > 100).sum()
n_below_0 = (humidity < 0).sum()
print(f"\n- Humidite > 100%: {n_above_100}")
print(f"- Humidite < 0%: {n_below_0}")


- Distribution humidite:
count    41760.000000
mean        68.171889
std         16.974696
min         40.000000
25%         54.000000
50%         67.900000
75%         81.700000
max        150.000000
Name: humidity_pct, dtype: float64

- Humidite > 100%: 454
- Humidite < 0%: 0


---

## 3.3 Nettoyage des données météo

In [173]:
def parse_timestamp(ts, formats=None):
    """Parse a timestamp written in one of several known formats.

    Args:
        ts: Value to parse. Missing values (None, NaN, NaT) yield ``pd.NaT``;
            ``datetime`` objects are returned unchanged; anything else is
            converted to ``str`` and stripped of surrounding whitespace
            before parsing.
        formats: Optional iterable of ``strptime`` format strings, tried in
            order. Defaults to the four formats handled by this notebook.

    Returns:
        A ``datetime.datetime`` for the first format that matches, or
        ``pd.NaT`` when the value is missing or matches no format.
    """
    if pd.isna(ts):
        return pd.NaT

    # Already a datetime (pd.Timestamp included): a round trip through str()
    # would fail for values carrying microseconds.
    if isinstance(ts, datetime):
        return ts

    if formats is None:
        # The day-first and month-first slash formats are only told apart by
        # the presence of seconds, so their order here does not matter.
        formats = (
            "%Y-%m-%d %H:%M:%S",
            "%d/%m/%Y %H:%M",
            "%m/%d/%Y %H:%M:%S",
            "%Y-%m-%dT%H:%M:%S",
        )

    # Stray whitespace would otherwise make a valid timestamp fall through
    # to NaT.
    text = str(ts).strip()
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue

    return pd.NaT

# Work on a copy so the raw frame stays available for the final report.
df_weather = df_weather_raw.copy()

## Step 1: parse the multi-format timestamps row by row
print("[1/5] Parsing des timestamps ...")
df_weather['timestamp'] = df_weather['timestamp'].apply(parse_timestamp)
# parse_timestamp returns NaT for values it cannot parse, so NaT == invalid.
invalid_ts = df_weather['timestamp'].isna().sum()
print(f"    - Timestamps invalides: {invalid_ts}")

## Drop the rows that have no valid timestamp
df_weather = df_weather.dropna(subset=['timestamp'])

[1/5] Parsing des timestamps ...
    - Timestamps invalides: 0


In [174]:
## Step 2: coerce the measurement columns to numeric dtypes
print("\n[2/5] Conversion des colonnes numeriques ...")

## Temperature: swap the decimal comma for a dot before converting
temperature_text = df_weather['temperature_c'].astype(str).str.replace(',', '.')
# errors='coerce': a value that cannot be converted becomes NaN
df_weather['temperature_c'] = pd.to_numeric(temperature_text, errors='coerce')

## Remaining measurement columns get the same coercion
for numeric_col in ('humidity_pct', 'wind_speed_kmh', 'precipitation_mm'):
    df_weather[numeric_col] = pd.to_numeric(df_weather[numeric_col], errors='coerce')

print("  Conversions effectuees.")


[2/5] Conversion des colonnes numeriques ...
  Conversions effectuees.


In [175]:
## Step 3: neutralise out-of-range readings
print("\n[3/5] Correction des valeurs aberrantes ...")

## Temperatures outside [-40, 50] are replaced by NaN
temperature = df_weather['temperature_c']
temp_is_aberrant = (temperature < -40) | (temperature > 50)
temp_outliers = temp_is_aberrant.sum()
df_weather.loc[temp_is_aberrant, 'temperature_c'] = np.nan
print(f"  - Temperatures aberrantes -> NaN: {temp_outliers}")

# Humidity outside [0, 100] is clipped to the nearest bound; the count is
# taken before clipping.
humidity = df_weather['humidity_pct']
humidity_outliers = ((humidity < 0) | (humidity > 100)).sum()
df_weather['humidity_pct'] = humidity.clip(0, 100)
print(f"  - Humidite clippee [0, 100]: {humidity_outliers}")


[3/5] Correction des valeurs aberrantes ...
  - Temperatures aberrantes -> NaN: 455
  - Humidite clippee [0, 100]: 454


In [176]:
## Step 4: fill the missing values
print("[4/5] Traitement des valeurs manquantes ...")

## Sort by city then timestamp so each city's rows are in time order
## before interpolating
df_weather = df_weather.sort_values(['city', 'timestamp'])

## Linear interpolation within each city for the four numeric columns.
## limit_direction='both' also fills NaN runs at the start and end of a group.
for col in ['temperature_c', 'humidity_pct', 'wind_speed_kmh', 'precipitation_mm']:
    before_na = df_weather[col].isna().sum()
    df_weather[col] = df_weather.groupby('city')[col].transform(
        lambda x: x.interpolate(method='linear', limit_direction='both')
    )
    after_na = df_weather[col].isna().sum()
    print(f"  - {col}: {before_na} -> {after_na} NaN (interpoles: {before_na - after_na})")

## weather_condition: empty strings are treated as missing, then each city is
## forward-filled and finally back-filled (the back-fill covers leading gaps)
before_na = (df_weather['weather_condition'].isna() | (df_weather['weather_condition'] == '')).sum()
df_weather['weather_condition'] = df_weather['weather_condition'].replace('', np.nan)
df_weather['weather_condition'] = df_weather.groupby('city')['weather_condition'].transform(
    lambda x: x.ffill().bfill()
)
after_na = df_weather['weather_condition'].isna().sum()
print(f"  - weather_condition: {before_na} -> {after_na} NaN (forward filled)")

[4/5] Traitement des valeurs manquantes ...
  - temperature_c: 903 -> 0 NaN (interpoles: 903)
  - humidity_pct: 412 -> 0 NaN (interpoles: 412)
  - wind_speed_kmh: 0 -> 0 NaN (interpoles: 0)
  - precipitation_mm: 0 -> 0 NaN (interpoles: 0)
  - weather_condition: 862 -> 0 NaN (forward filled)


In [177]:
## Step 5: derive calendar columns from the parsed timestamp
print("[5/5] Ajout des colonnes temporelles ...")

timestamp_parts = df_weather['timestamp'].dt
df_weather['date'] = timestamp_parts.date
df_weather['hour'] = timestamp_parts.hour
df_weather['day_of_week'] = timestamp_parts.dayofweek
df_weather['month'] = timestamp_parts.month

# Season lookup, written season -> months and inverted to month -> season.
months_by_season = {
    'Hiver': (12, 1, 2),
    'Printemps': (3, 4, 5),
    'Ete': (6, 7, 8),
    'Automne': (9, 10, 11),
}
season_by_month = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
df_weather['season'] = df_weather['month'].map(season_by_month)

print("  - Colonnes ajoutees: date, hour, day_of_week, month, season")

print("  - Appercu des données meteo nettoyées : ")
df_weather.head(10)

[5/5] Ajout des colonnes temporelles ...
  - Colonnes ajoutees: date, hour, day_of_week, month, season
  - Appercu des données meteo nettoyées : 


Unnamed: 0,city,timestamp,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition,date,hour,day_of_week,month,season
6149,Bordeaux,2024-01-01 00:00:00,11.1,64.7,1.0,4.1,pluvieux,2024-01-01,0,0,1,Hiver
3877,Bordeaux,2024-01-01 01:00:00,8.75,68.3,44.4,0.0,ensoleille,2024-01-01,1,0,1,Hiver
5918,Bordeaux,2024-01-01 02:00:00,6.4,80.2,28.5,0.0,brumeux,2024-01-01,2,0,1,Hiver
22424,Bordeaux,2024-01-01 03:00:00,6.6,87.0,43.7,0.0,brumeux,2024-01-01,3,0,1,Hiver
16306,Bordeaux,2024-01-01 04:00:00,1.7,45.9,21.5,0.0,pluvieux,2024-01-01,4,0,1,Hiver
6649,Bordeaux,2024-01-01 05:00:00,11.9,53.7,48.8,0.0,brumeux,2024-01-01,5,0,1,Hiver
2711,Bordeaux,2024-01-01 06:00:00,13.6,50.1,47.1,1.9,pluvieux,2024-01-01,6,0,1,Hiver
14930,Bordeaux,2024-01-01 07:00:00,11.7,67.6,10.5,0.0,pluvieux,2024-01-01,7,0,1,Hiver
15777,Bordeaux,2024-01-01 08:00:00,12.9,93.7,44.6,0.0,nuageux,2024-01-01,8,0,1,Hiver
28955,Bordeaux,2024-01-01 09:00:00,13.8,91.6,13.8,5.8,pluvieux,2024-01-01,9,0,1,Hiver


In [178]:
## Quality report on the cleaned weather data
quality_report(df_weather, "Weather Clean")

RAPPORT QUALITE - Weather Clean
- Lignes: 42,172
- Colonnes: 12


Unnamed: 0,Colonne,Type,Manquants,Completude %,Uniques
0,city,str,0,100.0,10
1,timestamp,datetime64[us],0,100.0,4368
2,temperature_c,float64,0,100.0,783
3,humidity_pct,float64,0,100.0,750
4,wind_speed_kmh,float64,0,100.0,501
5,precipitation_mm,float64,0,100.0,101
6,weather_condition,str,0,100.0,6
7,date,object,0,100.0,182
8,hour,int32,0,100.0,24
9,day_of_week,int32,0,100.0,7


---

## 3.4 Chargement des données de pollution nettoyées

In [179]:
## Load the cleaned pollution data: use the Parquet output of step 2 when it
## exists, otherwise rebuild a simplified clean dataset from the raw CSV files.
if os.path.exists(PARQUET_DIR):
    df_pollution = pd.read_parquet(PARQUET_DIR)
    print(f"- Donnees pollution nettoyées chargees (en lisant le parquet) : {len(df_pollution):,} lignes")

else:
    ## Fallback: no Parquet found, load the raw CSV files and clean them here
    print("Parquet non trouve, chargement du CSV brut...")
    df_air_raw = pd.read_csv(os.path.join(DATA_DIR, "air_quality_raw.csv"))
    df_stations = pd.read_csv(os.path.join(DATA_DIR, "stations.csv"))

    ## Simplified cleaning
    df_pollution = df_air_raw.copy()
    # Same multi-format parser as for the weather data; unparseable rows go.
    df_pollution['timestamp'] = df_pollution['timestamp'].apply(parse_timestamp)
    df_pollution = df_pollution.dropna(subset=['timestamp'])
    # Decimal commas -> dots; values that still fail to convert become NaN.
    df_pollution['value'] = pd.to_numeric(
        df_pollution['value'].astype(str).str.replace(',', '.'),
        errors='coerce'
    )
    df_pollution = df_pollution.dropna(subset=['value'])
    # Keep only measurements within [0, 1000].
    df_pollution = df_pollution[(df_pollution['value'] >= 0) & (df_pollution['value'] <= 1000)]
    # One measurement per station, timestamp and pollutant.
    df_pollution = df_pollution.drop_duplicates(subset=['station_id', 'timestamp', 'pollutant'])

    ## Attach the station's city and type (left join keeps every measurement)
    df_pollution = df_pollution.merge(
        df_stations[['station_id', 'city', 'station_type']],
        on='station_id',
        how='left'
    )

    ## Rename 'value' to 'value_mean' for consistency
    df_pollution = df_pollution.rename(columns={'value': 'value_mean'})

    print(f"- Donnees pollution nettoyees: {len(df_pollution):,} lignes")

print("- Appercu de df_pollution :")
df_pollution.head(10)

Parquet non trouve, chargement du CSV brut...
- Donnees pollution nettoyees: 1,176,727 lignes
- Appercu de df_pollution :


Unnamed: 0,station_id,timestamp,pollutant,value_mean,unit,city,station_type
0,ST0040,2024-01-07 05:00:00,O3,79.29,ug/m3,Grenoble,urbaine
1,ST0004,2024-06-09 18:00:00,O3,41.58,ug/m3,Lyon,urbaine
2,ST0027,2024-05-23 11:00:00,PM10,29.2,ug/m3,Nantes,urbaine
3,ST0002,2024-03-18 12:00:00,SO2,7.72,ug/m3,Paris,periurbaine
4,ST0035,2024-06-11 08:00:00,O3,29.87,ug/m3,Grenoble,urbaine
5,ST0023,2024-04-19 10:00:00,O3,30.07,ug/m3,Lille,periurbaine
6,ST0035,2024-03-19 05:00:00,PM2.5,14.12,ug/m3,Grenoble,urbaine
7,ST0001,2024-05-19 13:00:00,PM10,17.09,ug/m3,Paris,urbaine
8,ST0014,2024-03-10 20:00:00,CO,0.29,mg/m3,Toulouse,urbaine
9,ST0004,2024-01-28 16:00:00,NO2,25.69,ug/m3,Lyon,urbaine


In [180]:
# Column dtypes and non-null counts of the pollution data
df_pollution.info()

<class 'pandas.DataFrame'>
RangeIndex: 1176727 entries, 0 to 1176726
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   station_id    1176727 non-null  str           
 1   timestamp     1176727 non-null  datetime64[us]
 2   pollutant     1176727 non-null  str           
 3   value_mean    1176727 non-null  float64       
 4   unit          1176727 non-null  str           
 5   city          1176727 non-null  str           
 6   station_type  1176727 non-null  str           
dtypes: datetime64[us](1), float64(1), str(5)
memory usage: 97.1 MB


---

## 3.5 Fusion pollution/meteo

In [181]:
## Build the hourly join key `datetime_hour` on both datasets

## Guard: warn when the pollution data is empty
if len(df_pollution) == 0:
    print("ATTENTION: Les donnees de pollution sont vides!")
    print("Veuillez d'abord executer le notebook 02 pour generer les donnees Parquet.")
    # The frame has no rows, so a zero-length datetime column fits it.
    df_pollution['datetime_hour'] = pd.to_datetime([])
else:
    ## Pollution side: derive a timestamp floored to the hour from whichever
    ## temporal columns are available
    if 'date' in df_pollution.columns and 'hour' in df_pollution.columns:
        df_pollution['datetime_hour'] = pd.to_datetime(
            df_pollution['date'].astype(str) + ' ' + df_pollution['hour'].astype(str) + ':00:00'
        )
    elif 'timestamp_parsed' in df_pollution.columns:
        df_pollution['datetime_hour'] = df_pollution['timestamp_parsed'].dt.floor('h')
    elif 'timestamp' in df_pollution.columns:
        df_pollution['datetime_hour'] = df_pollution['timestamp'].dt.floor('h')
    else:
        print("- ERREUR: Aucune colonne temporelle trouvee dans df_pollution")
        # The frame is NOT empty in this branch: assigning a zero-length array
        # would raise a length-mismatch ValueError. Fill the key with NaT so
        # the notebook keeps running, as the message above intends.
        df_pollution['datetime_hour'] = pd.NaT

## Weather side: floor the timestamp to the hour
df_weather['datetime_hour'] = df_weather['timestamp'].dt.floor('h')

if len(df_pollution) > 0:
    print(f"- Pollution: {df_pollution['datetime_hour'].min()} -> {df_pollution['datetime_hour'].max()}")
print(f"- Meteo: {df_weather['datetime_hour'].min()} -> {df_weather['datetime_hour'].max()}")

- Pollution: 2024-01-01 00:00:00 -> 2024-06-30 23:00:00
- Meteo: 2024-01-01 00:00:00 -> 2024-06-30 23:00:00


In [182]:
## Collapse the weather data to one row per (city, hour), in case an hour
## holds several readings
hourly_rules = dict(
    temperature_c='mean',        # average of the readings in the hour
    humidity_pct='mean',
    wind_speed_kmh='mean',
    precipitation_mm='sum',      # precipitation is totalled over the hour
    weather_condition='first',   # keep the first value of the group
    season='first',
)
weather_by_city_hour = df_weather.groupby(['city', 'datetime_hour'])
# reset_index() turns the group keys back into ordinary columns.
df_weather_hourly = weather_by_city_hour.agg(hourly_rules).reset_index()

print(f"- Meteo agregee: {len(df_weather_hourly):,} lignes")

- Meteo agregee: 42,172 lignes


In [183]:
## Join pollution and weather on city and hourly timestamp
if len(df_pollution) == 0:
    print("ERREUR: Impossible de fusionner - les donnees de pollution sont vides!")
    print("Veuillez d'abord executer le notebook 02_nettoyage_spark.ipynb")
    df_merged = pd.DataFrame()
else:
    # Left join: every pollution row is kept, weather columns are NaN where
    # no weather row exists for that city and hour.
    # validate='many_to_one' makes pandas raise if (city, datetime_hour) is
    # not unique on the weather side, which would silently duplicate
    # pollution rows.
    df_merged = df_pollution.merge(
        df_weather_hourly,
        on=['city', 'datetime_hour'],
        how='left',
        validate='many_to_one'
    )

    print("\nResultat de la fusion:")
    print(f"  - Lignes pollution: {len(df_pollution):,}")
    print(f"  - Lignes apres fusion: {len(df_merged):,}")
    # temperature_c is used as the marker for "a weather row was matched".
    print(f"  - Lignes avec meteo: {df_merged['temperature_c'].notna().sum():,}")
    print(f"  - Lignes sans meteo: {df_merged['temperature_c'].isna().sum():,}")


Resultat de la fusion:
  - Lignes pollution: 1,176,727
  - Lignes apres fusion: 1,176,727
  - Lignes avec meteo: 1,134,662
  - Lignes sans meteo: 42,065


In [184]:
## Preview of the merged dataset (first 10 rows)
df_merged.head(10)

Unnamed: 0,station_id,timestamp,pollutant,value_mean,unit,city,station_type,datetime_hour,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition,season
0,ST0040,2024-01-07 05:00:00,O3,79.29,ug/m3,Grenoble,urbaine,2024-01-07 05:00:00,9.9,61.3,7.9,0.0,ensoleille,Hiver
1,ST0004,2024-06-09 18:00:00,O3,41.58,ug/m3,Lyon,urbaine,2024-06-09 18:00:00,20.1,82.3,4.5,0.0,nuageux,Ete
2,ST0027,2024-05-23 11:00:00,PM10,29.2,ug/m3,Nantes,urbaine,2024-05-23 11:00:00,,,,,,
3,ST0002,2024-03-18 12:00:00,SO2,7.72,ug/m3,Paris,periurbaine,2024-03-18 12:00:00,,,,,,
4,ST0035,2024-06-11 08:00:00,O3,29.87,ug/m3,Grenoble,urbaine,2024-06-11 08:00:00,29.5,50.9,15.6,0.0,nuageux,Ete
5,ST0023,2024-04-19 10:00:00,O3,30.07,ug/m3,Lille,periurbaine,2024-04-19 10:00:00,11.4,76.1,27.1,7.5,pluvieux,Printemps
6,ST0035,2024-03-19 05:00:00,PM2.5,14.12,ug/m3,Grenoble,urbaine,2024-03-19 05:00:00,18.1,49.6,26.5,0.0,pluvieux,Printemps
7,ST0001,2024-05-19 13:00:00,PM10,17.09,ug/m3,Paris,urbaine,2024-05-19 13:00:00,22.9,76.6,30.1,0.0,nuageux,Printemps
8,ST0014,2024-03-10 20:00:00,CO,0.29,mg/m3,Toulouse,urbaine,2024-03-10 20:00:00,12.35,94.2,14.5,0.0,brumeux,Printemps
9,ST0004,2024-01-28 16:00:00,NO2,25.69,ug/m3,Lyon,urbaine,2024-01-28 16:00:00,11.8,79.4,39.9,0.0,pluvieux,Hiver


---

## 3.6 Rapport de nettoyage et sauvegarde

In [185]:
## Final report: row counts before/after cleaning and per-column completeness
n_raw_rows = len(df_weather_raw)
n_clean_rows = len(df_weather)

print("RAPPORT DE NETTOYAGE - DONNEES METEO")
print(f"Lignes brutes: {n_raw_rows:,}")
print(f"Lignes apres nettoyage: {n_clean_rows:,}")
print(f"Lignes supprimees: {n_raw_rows - n_clean_rows:,}")
print("\nCompletude finale:")
for col in ['temperature_c', 'humidity_pct', 'wind_speed_kmh', 'precipitation_mm', 'weather_condition']:
    # Share of non-missing values, as a percentage
    n_missing = df_weather[col].isna().sum()
    completude = (1 - n_missing / n_clean_rows) * 100
    print(f"  {col}: {completude:.2f}%")

RAPPORT DE NETTOYAGE - DONNEES METEO
Lignes brutes: 42,172
Lignes apres nettoyage: 42,172
Lignes supprimees: 0

Completude finale:
  temperature_c: 100.00%
  humidity_pct: 100.00%
  wind_speed_kmh: 100.00%
  precipitation_mm: 100.00%
  weather_condition: 100.00%


In [186]:
## Save the merged dataset as CSV
# Create the output directory if it does not exist yet
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Build the destination path
output_path = os.path.join(OUTPUT_DIR, "pollution_meteo_clean.csv")
# Write without the index column
df_merged.to_csv(output_path, index=False)
# Report where the file went and how large the dataset is
print(f"- Dataset fusionne sauvegarde: {output_path}")
print(f"- Taille: {len(df_merged):,} lignes x {len(df_merged.columns)} colonnes")

- Dataset fusionne sauvegarde: C:\Users\Administrateur\Desktop\spark-pandas-viz-lab\data\output\pollution_meteo_clean\pollution_meteo_clean.csv
- Taille: 1,176,727 lignes x 14 colonnes


In [187]:
## Also save the merged dataset as Parquet, next to the CSV, for better performance
parquet_path = os.path.join(OUTPUT_DIR, "pollution_meteo_clean.parquet")
df_merged.to_parquet(parquet_path, index=False)
print(f"- Dataset Parquet sauvegarde: {parquet_path}")

- Dataset Parquet sauvegarde: C:\Users\Administrateur\Desktop\spark-pandas-viz-lab\data\output\pollution_meteo_clean\pollution_meteo_clean.parquet


In [188]:
## Summary of the final dataset: column dtypes and non-null counts
print("\nColonnes du dataset final:")
df_merged.info()


Colonnes du dataset final:
<class 'pandas.DataFrame'>
RangeIndex: 1176727 entries, 0 to 1176726
Data columns (total 14 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   station_id         1176727 non-null  str           
 1   timestamp          1176727 non-null  datetime64[us]
 2   pollutant          1176727 non-null  str           
 3   value_mean         1176727 non-null  float64       
 4   unit               1176727 non-null  str           
 5   city               1176727 non-null  str           
 6   station_type       1176727 non-null  str           
 7   datetime_hour      1176727 non-null  datetime64[us]
 8   temperature_c      1134662 non-null  float64       
 9   humidity_pct       1134662 non-null  float64       
 10  wind_speed_kmh     1134662 non-null  float64       
 11  precipitation_mm   1134662 non-null  float64       
 12  weather_condition  1134662 non-null  str           
 13  season    