# Etape 4 - Analyse exploratoire et statistiques

**Objectif** : Comprendre les patterns de pollution

---
---

## Imports

In [21]:
import os
from pathlib import Path
import warnings
import pandas as pd
import numpy as np
from scipy import stats

---

## CFG. generales

In [22]:
## Python: “don’t show any warnings in this session”.
warnings.filterwarnings('ignore') 

## Pandas: None => show all columns, no matter how many.
pd.set_option('display.max_columns', None)

## Pandas: None => don’t wrap, let pandas decide automatically: (Effect: Wide tables display in a single row instead of breaking across lines)
pd.set_option('display.width', None) 

---

## Chemins des données

In [23]:
DATA_DIR = (Path.cwd() / ".." / "data").resolve()
OUTPUT_DIR = os.path.join(DATA_DIR, "output")

---

## Chargement des donnees nettoyees

In [24]:
## Charger le dataset fusionne
data_set_path = os.path.join(OUTPUT_DIR, "pollution_meteo_clean", "pollution_meteo_clean.csv")
df = pd.read_csv(data_set_path, parse_dates=['datetime_hour'])

print(f"Dataset charge: {len(df):,} lignes x {len(df.columns)} colonnes")
df.head()

Dataset charge: 1,176,727 lignes x 14 colonnes


Unnamed: 0,station_id,timestamp,pollutant,value_mean,unit,city,station_type,datetime_hour,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition,season
0,ST0040,2024-01-07 05:00:00,O3,79.29,ug/m3,Grenoble,urbaine,2024-01-07 05:00:00,9.9,61.3,7.9,0.0,ensoleille,Hiver
1,ST0004,2024-06-09 18:00:00,O3,41.58,ug/m3,Lyon,urbaine,2024-06-09 18:00:00,20.1,82.3,4.5,0.0,nuageux,Ete
2,ST0027,2024-05-23 11:00:00,PM10,29.2,ug/m3,Nantes,urbaine,2024-05-23 11:00:00,,,,,,
3,ST0002,2024-03-18 12:00:00,SO2,7.72,ug/m3,Paris,periurbaine,2024-03-18 12:00:00,,,,,,
4,ST0035,2024-06-11 08:00:00,O3,29.87,ug/m3,Grenoble,urbaine,2024-06-11 08:00:00,29.5,50.9,15.6,0.0,nuageux,Ete


In [25]:
## Ajouter des colonnes temporelles si manquantes
if 'date' not in df.columns:
    df['date'] = df['datetime_hour'].dt.date
    print("- Colonne 'date' ajoutée à df")
if 'hour' not in df.columns:
    df['hour'] = df['datetime_hour'].dt.hour
    print("- Colonne 'hour' ajoutée à df")
if 'day_of_week' not in df.columns:
    df['day_of_week'] = df['datetime_hour'].dt.dayofweek
    print("- Colonne 'day_of_week' ajoutée à df")
if 'month' not in df.columns:
    df['month'] = df['datetime_hour'].dt.month
    print("- Colonne 'month' ajoutée à df")
if 'season' not in df.columns:
    df['season'] = df['month'].map({
        12: 'Hiver', 1: 'Hiver', 2: 'Hiver',
        3: 'Printemps', 4: 'Printemps', 5: 'Printemps',
        6: 'Ete', 7: 'Ete', 8: 'Ete',
        9: 'Automne', 10: 'Automne', 11: 'Automne'
    })

    print("- Colonne 'season' ajoutée à df")

## Noms des jours
day_names = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
df['day_name'] = df['day_of_week'].map(lambda x: day_names[x])

## Appercu de la df
df.head()

- Colonne 'date' ajoutée à df
- Colonne 'hour' ajoutée à df
- Colonne 'day_of_week' ajoutée à df
- Colonne 'month' ajoutée à df


Unnamed: 0,station_id,timestamp,pollutant,value_mean,unit,city,station_type,datetime_hour,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm,weather_condition,season,date,hour,day_of_week,month,day_name
0,ST0040,2024-01-07 05:00:00,O3,79.29,ug/m3,Grenoble,urbaine,2024-01-07 05:00:00,9.9,61.3,7.9,0.0,ensoleille,Hiver,2024-01-07,5,6,1,Dimanche
1,ST0004,2024-06-09 18:00:00,O3,41.58,ug/m3,Lyon,urbaine,2024-06-09 18:00:00,20.1,82.3,4.5,0.0,nuageux,Ete,2024-06-09,18,6,6,Dimanche
2,ST0027,2024-05-23 11:00:00,PM10,29.2,ug/m3,Nantes,urbaine,2024-05-23 11:00:00,,,,,,,2024-05-23,11,3,5,Jeudi
3,ST0002,2024-03-18 12:00:00,SO2,7.72,ug/m3,Paris,periurbaine,2024-03-18 12:00:00,,,,,,,2024-03-18,12,0,3,Lundi
4,ST0035,2024-06-11 08:00:00,O3,29.87,ug/m3,Grenoble,urbaine,2024-06-11 08:00:00,29.5,50.9,15.6,0.0,nuageux,Ete,2024-06-11,8,1,6,Mardi


---

## 4.1 Statistiques descriptives par polluant et par ville

In [26]:
## Determiner la colonne de valeur
value_col = 'value_mean' if 'value_mean' in df.columns else 'value'

## Statistiques par polluant
stats_pollutant = df.groupby('pollutant')[value_col].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('min', 'min'),
    ('25%', lambda x: x.quantile(0.25)),
    ('median', 'median'),
    ('75%', lambda x: x.quantile(0.75)),
    ('max', 'max')
]).round(2)

print("- Statistiques par polluant:")
stats_pollutant

- Statistiques par polluant:


Unnamed: 0_level_0,count,mean,std,min,25%,median,75%,max
pollutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CO,196036,0.78,0.4,0.08,0.48,0.71,1.0,2.73
NO2,196174,46.82,24.11,5.05,28.55,42.42,59.76,163.78
O3,196114,78.06,40.27,8.52,47.65,70.51,99.56,272.85
PM10,196131,38.99,20.03,4.22,23.86,35.25,49.7,136.49
PM2.5,196085,23.43,12.08,2.52,14.29,21.19,29.89,81.9
SO2,196187,7.8,4.02,0.84,4.75,7.05,9.96,27.3


In [27]:
## Statistiques par ville
stats_city = df.groupby('city')[value_col].agg([
    ('count', 'count'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('max', 'max')
]).round(2).sort_values('mean', ascending=False)

print("\nStatistiques par ville (tous polluants):")
stats_city


Statistiques par ville (tous polluants):


Unnamed: 0_level_0,count,mean,std,max
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Strasbourg,100188,35.79,36.08,271.48
Nantes,100217,35.73,36.08,272.36
Paris,75151,32.77,32.48,236.28
Lyon,75102,32.73,32.64,236.54
Bordeaux,75019,32.71,32.5,236.1
Lille,125133,32.2,33.98,272.85
Toulouse,125151,32.18,33.96,272.24
Grenoble,150094,31.76,33.19,272.63
Marseille,175343,31.53,32.72,272.74
Rouen,175329,31.52,32.68,272.54


In [28]:
## Tableau croise polluant x ville
pivot_city_pollutant = df.pivot_table(
    values=value_col,
    index='city',
    columns='pollutant',
    aggfunc='mean'
).round(2)

print("\nConcentrations moyennes par ville et polluant:")
pivot_city_pollutant


Concentrations moyennes par ville et polluant:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bordeaux,0.78,46.93,78.13,38.96,23.53,7.84
Grenoble,0.76,45.55,76.05,37.95,22.72,7.57
Lille,0.77,46.11,77.01,38.51,23.11,7.71
Lyon,0.78,47.15,78.27,38.97,23.46,7.85
Marseille,0.75,45.17,75.44,37.69,22.58,7.51
Nantes,0.85,51.11,85.5,42.65,25.73,8.55
Paris,0.79,47.1,78.09,39.16,23.57,7.86
Rouen,0.75,45.25,75.31,37.71,22.62,7.49
Strasbourg,0.86,51.31,85.61,42.64,25.65,8.55
Toulouse,0.77,46.09,76.89,38.42,23.12,7.69


---

## 4.2 Identification des depassements de seuils reglementaires

In [29]:
## Seuils reglementaires
SEUILS_INFO = {
    'PM2.5': 25,
    'PM10': 50,
    'NO2': 200,
    'O3': 180,
    'SO2': 300,
    'CO': 10  # mg/m3
}

SEUILS_ALERTE = {
    'PM2.5': 50,
    'PM10': 80,
    'NO2': 400,
    'O3': 240,
    'SO2': 500,
    'CO': 20
}

## Ajouter les colonnes de depassement
df['seuil_info'] = df.apply(
    lambda row: SEUILS_INFO.get(row['pollutant'], 9999),
    axis=1
)
df['seuil_alerte'] = df.apply(
    lambda row: SEUILS_ALERTE.get(row['pollutant'], 9999),
    axis=1
)

df['depassement_info'] = df[value_col] > df['seuil_info']
df['depassement_alerte'] = df[value_col] > df['seuil_alerte']

In [30]:
## Comptage des depassements par polluant
depassements = df.groupby('pollutant').agg({
    'depassement_info': 'sum',
    'depassement_alerte': 'sum',
    value_col: 'count'
}).rename(columns={value_col: 'total_mesures'})

depassements['pct_info'] = (depassements['depassement_info'] / depassements['total_mesures'] * 100).round(2)
depassements['pct_alerte'] = (depassements['depassement_alerte'] / depassements['total_mesures'] * 100).round(2)

print("Depassements de seuils par polluant:")
depassements

Depassements de seuils par polluant:


Unnamed: 0_level_0,depassement_info,depassement_alerte,total_mesures,pct_info,pct_alerte
pollutant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CO,0,0,196036,0.0,0.0
NO2,0,0,196174,0.0,0.0
O3,3983,396,196114,2.03,0.2
PM10,48142,8680,196131,24.55,4.43
PM2.5,74739,7035,196085,38.12,3.59
SO2,0,0,196187,0.0,0.0


In [31]:
## Jours avec depassement du seuil d'alerte
jours_alerte = df[df['depassement_alerte']].groupby(['date', 'city', 'pollutant'])[value_col].max().reset_index()
jours_alerte = jours_alerte.sort_values(value_col, ascending=False)

print(f"\nNombre de cas de depassement du seuil d'alerte: {len(jours_alerte)}")
print("\nTop 20 des depassements les plus graves:")
jours_alerte.head(20)


Nombre de cas de depassement du seuil d'alerte: 2631

Top 20 des depassements les plus graves:


Unnamed: 0,date,city,pollutant,value_mean
30,2024-01-02,Lille,O3,272.85
865,2024-02-05,Marseille,O3,272.74
1257,2024-02-21,Lille,O3,272.73
619,2024-01-26,Grenoble,O3,272.63
484,2024-01-20,Rouen,O3,272.54
224,2024-01-10,Grenoble,O3,272.52
609,2024-01-25,Rouen,O3,272.5
579,2024-01-24,Nantes,O3,272.36
781,2024-02-01,Toulouse,O3,272.24
1084,2024-02-14,Lille,O3,272.2


---

## 4.3 Matrice de correlation polluants / meteo

In [32]:
## Pivoter les donnees pour avoir un polluant par colonne
df_pivot = df.pivot_table(
    values=value_col,
    index=['datetime_hour', 'city'],
    columns='pollutant',
    aggfunc='mean'
).reset_index()

# Joindre les donnees meteo
meteo_cols = ['temperature_c', 'humidity_pct', 'wind_speed_kmh', 'precipitation_mm']
df_meteo_unique = df[['datetime_hour', 'city'] + meteo_cols].drop_duplicates()

df_corr = df_pivot.merge(df_meteo_unique, on=['datetime_hour', 'city'], how='left')

print(f"Dataset pour correlation: {len(df_corr):,} lignes")
df_corr.head()

Dataset pour correlation: 43,680 lignes


Unnamed: 0,datetime_hour,city,CO,NO2,O3,PM10,PM2.5,SO2,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm
0,2024-01-01,Bordeaux,0.843333,80.49,86.19,64.096667,22.896667,9.41,11.1,64.7,1.0,4.1
1,2024-01-01,Grenoble,0.87,64.433333,102.736667,39.43,31.391667,8.725,10.7,77.4,37.4,0.0
2,2024-01-01,Lille,0.836,61.86,89.762,38.924,31.622,8.458,4.9,52.7,14.1,0.0
3,2024-01-01,Lyon,0.776667,25.175,116.863333,46.98,29.296667,7.96,-1.3,51.9,5.1,0.0
4,2024-01-01,Marseille,1.122857,43.717143,82.798571,43.362857,28.541429,9.144286,,,,


In [33]:
## Colonnes pour la correlation
pollutants = ['PM2.5', 'PM10', 'NO2', 'O3', 'SO2', 'CO']
available_pollutants = [p for p in pollutants if p in df_corr.columns]
available_meteo = [m for m in meteo_cols if m in df_corr.columns]

corr_cols = available_pollutants + available_meteo

## Calculer la matrice de correlation
correlation_matrix = df_corr[corr_cols].corr().round(3)

print("Matrice de correlation:")
correlation_matrix

Matrice de correlation:


Unnamed: 0,PM2.5,PM10,NO2,O3,SO2,CO,temperature_c,humidity_pct,wind_speed_kmh,precipitation_mm
PM2.5,1.0,0.678,0.683,0.679,0.685,0.679,-0.498,-0.001,-0.007,-0.007
PM10,0.678,1.0,0.68,0.68,0.679,0.678,-0.496,0.003,-0.004,-0.007
NO2,0.683,0.68,1.0,0.683,0.686,0.681,-0.5,-0.001,-0.004,-0.007
O3,0.679,0.68,0.683,1.0,0.683,0.68,-0.493,0.003,0.0,0.001
SO2,0.685,0.679,0.686,0.683,1.0,0.68,-0.498,0.002,-0.004,-0.004
CO,0.679,0.678,0.681,0.68,0.68,1.0,-0.493,0.003,-0.004,-0.001
temperature_c,-0.498,-0.496,-0.5,-0.493,-0.498,-0.493,1.0,-0.001,-0.005,0.004
humidity_pct,-0.001,0.003,-0.001,0.003,0.002,0.003,-0.001,1.0,0.0,0.004
wind_speed_kmh,-0.007,-0.004,-0.004,0.0,-0.004,-0.004,-0.005,0.0,1.0,-0.005
precipitation_mm,-0.007,-0.007,-0.007,0.001,-0.004,-0.001,0.004,0.004,-0.005,1.0


In [34]:
## Correlations significatives polluants/meteo
print("\nCorrelations polluants vs meteo:")
for pollutant in available_pollutants:
    print(f"\n{pollutant}:")
    for meteo in available_meteo:
        corr = correlation_matrix.loc[pollutant, meteo]
        strength = "forte" if abs(corr) > 0.5 else "moderee" if abs(corr) > 0.3 else "faible"
        print(f"  vs {meteo}: {corr:+.3f} ({strength})")


Correlations polluants vs meteo:

PM2.5:
  vs temperature_c: -0.498 (moderee)
  vs humidity_pct: -0.001 (faible)
  vs wind_speed_kmh: -0.007 (faible)
  vs precipitation_mm: -0.007 (faible)

PM10:
  vs temperature_c: -0.496 (moderee)
  vs humidity_pct: +0.003 (faible)
  vs wind_speed_kmh: -0.004 (faible)
  vs precipitation_mm: -0.007 (faible)

NO2:
  vs temperature_c: -0.500 (moderee)
  vs humidity_pct: -0.001 (faible)
  vs wind_speed_kmh: -0.004 (faible)
  vs precipitation_mm: -0.007 (faible)

O3:
  vs temperature_c: -0.493 (moderee)
  vs humidity_pct: +0.003 (faible)
  vs wind_speed_kmh: +0.000 (faible)
  vs precipitation_mm: +0.001 (faible)

SO2:
  vs temperature_c: -0.498 (moderee)
  vs humidity_pct: +0.002 (faible)
  vs wind_speed_kmh: -0.004 (faible)
  vs precipitation_mm: -0.004 (faible)

CO:
  vs temperature_c: -0.493 (moderee)
  vs humidity_pct: +0.003 (faible)
  vs wind_speed_kmh: -0.004 (faible)
  vs precipitation_mm: -0.001 (faible)


---

## 4.4 Analyse de la saisonnalite

In [35]:
## Pollution par mois
pollution_par_mois = df.groupby(['month', 'pollutant'])[value_col].mean().unstack()
pollution_par_mois.index = ['Jan', 'Fev', 'Mar', 'Avr', 'Mai', 'Juin', 'Juil', 'Aout', 'Sep', 'Oct', 'Nov', 'Dec'][:len(pollution_par_mois)]

print("Concentration moyenne par mois:")
pollution_par_mois.round(2)

Concentration moyenne par mois:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
Jan,0.99,59.56,99.31,49.62,29.89,9.95
Fev,1.0,59.86,99.52,49.52,29.9,9.92
Mar,0.71,42.59,70.96,35.48,21.26,7.11
Avr,0.71,42.68,71.07,35.49,21.35,7.09
Mai,0.71,42.54,70.98,35.47,21.3,7.1
Juin,0.57,34.0,56.97,28.58,17.02,5.66


In [36]:
## Pollution par jour de la semaine
pollution_par_jour = df.groupby(['day_of_week', 'pollutant'])[value_col].mean().unstack()
pollution_par_jour.index = day_names

print("\nConcentration moyenne par jour de la semaine:")
pollution_par_jour.round(2)


Concentration moyenne par jour de la semaine:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
Lundi,0.79,47.38,78.77,39.23,23.66,7.89
Mardi,0.79,47.16,78.98,39.42,23.66,7.85
Mercredi,0.79,47.04,78.39,39.25,23.58,7.84
Jeudi,0.79,47.29,78.75,39.44,23.58,7.87
Vendredi,0.78,46.62,77.3,38.77,23.28,7.76
Samedi,0.77,45.91,76.96,38.57,23.17,7.71
Dimanche,0.77,46.33,77.31,38.26,23.07,7.67


In [37]:
## Pollution par heure
pollution_par_heure = df.groupby(['hour', 'pollutant'])[value_col].mean().unstack()

print("\nConcentration moyenne par heure:")
pollution_par_heure.round(2)


Concentration moyenne par heure:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.75,45.01,75.2,37.64,22.74,7.5
1,0.75,44.97,75.0,37.72,22.67,7.48
2,0.52,31.8,52.68,26.18,15.83,5.25
3,0.53,31.68,52.68,26.33,15.8,5.29
4,0.53,31.41,52.07,26.42,15.74,5.27
5,0.75,44.97,75.35,37.88,22.62,7.47
6,0.75,45.09,75.07,37.77,22.63,7.52
7,0.98,58.54,98.07,48.8,29.31,9.77
8,0.98,58.33,97.4,48.65,29.41,9.7
9,0.98,58.81,97.99,48.78,29.32,9.73


In [38]:
## Pollution par saison
pollution_par_saison = df.groupby(['season', 'pollutant'])[value_col].mean().unstack()
## Reordonner les saisons
season_order = ['Hiver', 'Printemps', 'Ete', 'Automne']
pollution_par_saison = pollution_par_saison.reindex([s for s in season_order if s in pollution_par_saison.index])

print("\nConcentration moyenne par saison:")
pollution_par_saison.round(2)


Concentration moyenne par saison:


pollutant,CO,NO2,O3,PM10,PM2.5,SO2
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Hiver,0.99,59.71,99.37,49.56,29.89,9.94
Printemps,0.71,42.62,71.0,35.49,21.31,7.1
Ete,0.57,34.0,57.02,28.61,17.03,5.66


---

## 4.5 Top 10 des journees les plus polluees

In [39]:
## Moyenne journaliere par polluant
df_daily = df.groupby(['date', 'pollutant', 'city'])[value_col].mean().reset_index()

## Top 10 pour chaque polluant
for pollutant in available_pollutants:
    top10 = df_daily[df_daily['pollutant'] == pollutant].nlargest(10, value_col)
    print(f"\nTop 10 journees les plus polluees - {pollutant}:")
    print(top10.to_string(index=False))


Top 10 journees les plus polluees - PM2.5:
      date pollutant       city  value_mean
2024-02-11     PM2.5 Strasbourg   36.295495
2024-02-02     PM2.5     Nantes   35.968925
2024-01-28     PM2.5 Strasbourg   35.539247
2024-02-17     PM2.5     Nantes   34.884130
2024-01-24     PM2.5     Nantes   34.820455
2024-01-25     PM2.5     Nantes   34.754624
2024-02-27     PM2.5 Strasbourg   34.738478
2024-02-10     PM2.5 Strasbourg   34.720957
2024-01-08     PM2.5 Strasbourg   34.720638
2024-01-07     PM2.5     Nantes   34.620435

Top 10 journees les plus polluees - PM10:
      date pollutant       city  value_mean
2024-01-06      PM10     Nantes   59.541429
2024-02-16      PM10 Strasbourg   58.502065
2024-01-29      PM10 Strasbourg   58.234615
2024-01-12      PM10 Strasbourg   57.978947
2024-01-15      PM10     Nantes   57.629121
2024-01-23      PM10 Strasbourg   57.337849
2024-02-29      PM10 Strasbourg   57.336957
2024-01-03      PM10     Nantes   57.193444
2024-02-01      PM10 Strasbourg  

In [40]:
## Calculer un indice de pollution global (moyenne normalisee des polluants)
## Normaliser chaque polluant par son seuil d'information
df['ratio_seuil'] = df[value_col] / df['seuil_info']

## Indice journalier global
indice_journalier = df.groupby(['date', 'city'])['ratio_seuil'].mean().reset_index()
indice_journalier = indice_journalier.rename(columns={'ratio_seuil': 'indice_pollution'})

## Top 10 journees globales
top10_global = indice_journalier.nlargest(10, 'indice_pollution')
top10_global['indice_pollution'] = top10_global['indice_pollution'].round(3)

print("\nTop 10 journees les plus polluees (indice global):")
top10_global


Top 10 journees les plus polluees (indice global):


Unnamed: 0,date,city,indice_pollution
198,2024-01-20,Strasbourg,0.609
578,2024-02-27,Strasbourg,0.604
408,2024-02-10,Strasbourg,0.602
475,2024-02-17,Nantes,0.601
78,2024-01-08,Strasbourg,0.599
418,2024-02-11,Strasbourg,0.599
425,2024-02-12,Nantes,0.599
325,2024-02-02,Nantes,0.598
55,2024-01-06,Nantes,0.598
235,2024-01-24,Nantes,0.594


---

## 4.6 Sauvegarde des resultats

In [44]:
## Sauvegarder le tableau des depassements
depassements.to_csv(os.path.join(OUTPUT_DIR, "depassements_seuils.csv"))
print(f"- Tableau des depassements sauvegarde: {os.path.join(OUTPUT_DIR, 'depassements_seuils.csv')}")

## Sauvegarder la matrice de correlation
correlation_matrix.to_csv(os.path.join(OUTPUT_DIR, "matrice_correlation.csv"))
print(f"- Matrice de correlation sauvegardee: {os.path.join(OUTPUT_DIR, 'matrice_correlation.csv')}")

## Sauvegarder le top 10 global
top10_global.to_csv(os.path.join(OUTPUT_DIR, "top10_journees_polluees.csv"), index=False)
print(f"- Top 10 journees sauvegarde: {os.path.join(OUTPUT_DIR, 'top10_journees_polluees.csv')}")

- Tableau des depassements sauvegarde: C:\Users\Administrateur\Desktop\spark-pandas-viz-lab\data\output\depassements_seuils.csv
- Matrice de correlation sauvegardee: C:\Users\Administrateur\Desktop\spark-pandas-viz-lab\data\output\matrice_correlation.csv
- Top 10 journees sauvegarde: C:\Users\Administrateur\Desktop\spark-pandas-viz-lab\data\output\top10_journees_polluees.csv


In [46]:
## Resume
print("RESUME DE L'ANALYSE EXPLORATOIRE")
print(f"- Periode analysee: {df['date'].min()} a {df['date'].max()}")
print(f"- Nombre de mesures: {len(df):,}")
print(f"- Nombre de villes: {df['city'].nunique()}")
print(f"- Nombre de stations: {df['station_id'].nunique()}")
print(f"\n- Depassements du seuil d'information: {df['depassement_info'].sum():,}")
print(f"- Depassements du seuil d'alerte: {df['depassement_alerte'].sum():,}")

RESUME DE L'ANALYSE EXPLORATOIRE
- Periode analysee: 2024-01-01 a 2024-06-30
- Nombre de mesures: 1,176,727
- Nombre de villes: 10
- Nombre de stations: 47

- Depassements du seuil d'information: 126,864
- Depassements du seuil d'alerte: 16,111
