# Étape 3.1 : Statistiques descriptives

**Livrables** :
- Ce notebook `06_statistiques_descriptives.ipynb`
- Tableaux de synthèse exportés en CSV

---
---

## Import

In [1]:
import sys
import os
from pathlib import Path
import psutil
import time
from datetime import datetime
import pandas as pd

---

## (Optionnel) Enregistrement de la date de la dernière exécution de ce notebook

In [2]:
# Print the moment this cell ran, as day/month/year hour:minute:second.
# datetime.now() is the local clock of the machine running the notebook.
print(
    "- Date de la dernière execution de ce notebook : "
    f"{datetime.now().strftime('%d/%m/%Y %H:%M:%S')} (FR)"
)

- Date de la dernière execution de ce notebook : 20/02/2026 20:33:07 (FR)


---

## (Optionnel) Mesure du temps de traitement global pour ce script - enregistrement de l'heure de début + estimation instantanée des ressources machine libres

In [3]:
## Start time (wall clock) of the notebook run; the last cell subtracts it from
## time.time() to get the total duration. The 2-second CPU sampling below is
## therefore included in the measured duration.
start_time_06 = time.time()

## Machine: RAM currently available, converted from bytes with 1024**3
ram_available_06 = psutil.virtual_memory().available / (1024**3)

## Machine: CPU currently available
## psutil.cpu_count() returns None when the count cannot be determined.
## Fall back to 1 so the estimates below never multiply None (the physical
## count already fell back to the logical one).
logical = psutil.cpu_count() or 1
physical = psutil.cpu_count(logical=False) or logical

## Blocking call: CPU usage measured over a 2-second interval
cpu_used = psutil.cpu_percent(interval=2)
cpu_available_pct_06 = 100 - cpu_used

## Rough estimate: the free CPU share applied to each core count
available_logical_06 = logical * cpu_available_pct_06 / 100
available_physical_06 = physical * cpu_available_pct_06 / 100

## Show available resources
print(f"- Current machine RAM available : {ram_available_06:.2f} GB")
print(f"- Current machine CPU available : {cpu_available_pct_06:.2f}%")
print(f"    Approx logical cores free  : {available_logical_06:.1f}")
print(f"    Approx physical cores free : {available_physical_06:.1f}")

- Current machine RAM available : 10.72 GB
- Current machine CPU available : 91.80%
    Approx logical cores free  : 14.7
    Approx physical cores free : 7.3


---

## Chemins des données

In [4]:
# ==============================================================================================================
#                                                   OUTPUTS
# ==============================================================================================================
# Output root: "../output" relative to the current working directory, made absolute.
OUT_DIR = Path.cwd().joinpath("..", "output").resolve()
# Export folder of this notebook (a str, built with os.path.join); created if missing.
OUT_EXPORT_DIR = os.path.join(OUT_DIR, "06_exports")
os.makedirs(OUT_EXPORT_DIR, exist_ok=True)


def _export_path(file_name):
    """Return the path (as a str) of `file_name` inside OUT_EXPORT_DIR."""
    return os.path.join(OUT_EXPORT_DIR, file_name)


OUT_EXP_STATS_CSV = _export_path("stats_par_type_energie.csv")
OUT_EXP_PLUS_ENERGIVORES_CSV = _export_path("top_plus_energivores.csv")
OUT_EXP_MOINS_ENERGIVORES_CSV = _export_path("top_moins_energivores.csv")
OUT_EXP_REP_DPE_CSV = _export_path("repartition_dpe.csv")
OUT_EXP_TREND_MONS_CSV = _export_path("tendance_mensuelle.csv")
OUT_EXP_COMP_DPE_CSV = _export_path("comparaison_dpe.csv")

# ==============================================================================================================
#                                                   INPUTS
# ==============================================================================================================
# NOTE(review): IN_DIR is not referenced by any visible cell; the input CSV below
# is read from OUT_DIR, not IN_DIR — confirm this is intended.
IN_DIR = Path.cwd().joinpath("..", "data").resolve()
IN_CONSO_ENRICHIE_CSV = os.path.join(OUT_DIR, "05_consommations_enrichies.csv")

# ==============================================================================================================
#                                                    OTHERS
# ==============================================================================================================
TMP_DIR = Path.cwd().joinpath("..", "my_tmp").resolve()
# Text file that receives this notebook's timing/resource metrics (written by the last cell)
TMP_FILE_CSV = TMP_DIR.joinpath("tmp_06_resources.txt")

---

## Chargement des données

In [5]:
# Read the enriched consumption table from the CSV path defined in the paths cell.
df_conso = pd.read_csv(IN_CONSO_ENRICHIE_CSV)

# Overview: shape and column names, then the header of the dtype section
print(
    "df_conso :",
    f"    - Shape: {df_conso.shape}",
    f"    - Colonnes: {list(df_conso.columns)}",
    "",
    "    - Infos sur les types : ",
    sep="\n",
)
# info() prints its own report (non-null counts and dtypes) to stdout
df_conso.info()
# Blank line, then the header of the preview; the frame itself is the cell output
print("", "    - Appercu des donnees : ", sep="\n")
df_conso.head()

df_conso :
    - Shape: (991, 31)
    - Colonnes: ['batiment_id', 'heure', 'consommation_moyenne', 'unite', 'date', 'type_energie', 'ts_h', 'nom', 'type', 'commune', 'surface_m2', 'annee_construction', 'classe_energetique', 'nb_occupants_moyen', 'temperature_c', 'humidite_pct', 'rayonnement_solaire_wm2', 'vitesse_vent_kmh', 'precipitation_mm', 'jour', 'mois', 'saison', 'jour_de_semaine', 'date_debut', 'date_fin', 'tarif_unitaire', 'cout_financier', 'conso_par_occupant', 'conso_par_m2', 'IPE', 'ecart_moyenne_categorie']

    - Infos sur les types : 
<class 'pandas.DataFrame'>
RangeIndex: 991 entries, 0 to 990
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   batiment_id              991 non-null    str    
 1   heure                    991 non-null    int64  
 2   consommation_moyenne     991 non-null    float64
 3   unite                    991 non-null    str    
 4   date                  

Unnamed: 0,batiment_id,heure,consommation_moyenne,unite,date,type_energie,ts_h,nom,type,commune,...,saison,jour_de_semaine,date_debut,date_fin,tarif_unitaire,cout_financier,conso_par_occupant,conso_par_m2,IPE,ecart_moyenne_categorie
0,BAT0043,22,280.88,m3,2023-01-01,eau,2023-01-01 22:00:00,Piscine Bordeaux 43,piscine,Bordeaux,...,Hiver,6.0,2023-01-01,2023-12-31,3.5,983.08,1.898,0.123,0.123,-401.254
1,BAT0100,17,4.52,m3,2023-01-01,eau,2023-01-01 17:00:00,Mediatheque Rennes 100,mediatheque,Rennes,...,Hiver,6.0,2023-01-01,2023-12-31,3.5,15.82,0.037,0.005,0.005,-92.701
2,BAT0006,3,5.5,kWh,2023-01-01,gaz,2023-01-01 03:00:00,Mairie Paris 6,mairie,Paris,...,Hiver,6.0,2023-01-01,2023-06-30,0.09,0.495,0.087,0.005,0.005,-43.286
3,BAT0126,10,126.88,kWh,2023-01-01,gaz,2023-01-01 10:00:00,Ecole Le Havre 126,ecole,Le Havre,...,Hiver,6.0,2023-01-01,2023-06-30,0.09,11.419,0.375,0.075,0.075,7.068
4,BAT0035,20,3.97,m3,2023-01-02,eau,2023-01-02 20:00:00,Mairie Toulouse 35,mairie,Toulouse,...,,,2023-01-01,2023-12-31,3.5,13.895,0.046,0.005,0.005,-44.816


---

## Calculer les statistiques par type d'énergie, type de bâtiment et commune

In [6]:
# One row per (type_energie, type, commune) combination with the mean and the sum
# of consommation_moyenne and the sum of cout_financier, rounded to 2 decimals.
# as_index=False keeps the three grouping keys as ordinary columns.
stats = (
    df_conso
    .groupby(["type_energie", "type", "commune"], as_index=False)
    .agg(
        conso_moyenne=("consommation_moyenne", "mean"),
        conso_totale=("consommation_moyenne", "sum"),
        cout_total=("cout_financier", "sum"),
    )
    .round(2)
)

stats.head()

Unnamed: 0,type_energie,type,commune,conso_moyenne,conso_totale,cout_total
0,eau,ecole,Bordeaux,2.74,10.98,40.83
1,eau,ecole,Le Havre,8.39,41.96,149.32
2,eau,ecole,Lyon,7.54,90.53,330.19
3,eau,ecole,Marseille,4.74,23.69,88.52
4,eau,ecole,Montpellier,3.73,11.2,42.0


---

## Identifier les bâtiments les plus/moins énergivores

In [7]:
# Total of consommation_moyenne per building, largest total first.
# NOTE(review): the sum runs over every row whatever its `unite`; the sample rows
# show both m3 and kWh, so a building's total may mix units — confirm this is intended.
conso_bat = (
    df_conso
    .groupby(["batiment_id", "nom", "type", "commune"])["consommation_moyenne"]
    .sum()
    .reset_index(name="conso_totale")
    .sort_values("conso_totale", ascending=False)
)

print("- Les batiments les plus énergivores :")
# Keep the first 10 rows of the ranking for export; display only the first 5.
plus_energivores = conso_bat.iloc[:10]
plus_energivores.iloc[:5]

- Les batiments les plus énergivores :


Unnamed: 0,batiment_id,nom,type,commune,conso_totale
135,BAT0136,Piscine Toulon 136,piscine,Toulon,13133.77
145,BAT0146,Piscine Toulon 146,piscine,Toulon,10054.92
42,BAT0043,Piscine Bordeaux 43,piscine,Bordeaux,9931.62
49,BAT0050,Piscine Lille 50,piscine,Lille,9747.52
4,BAT0005,Piscine Paris 5,piscine,Paris,9632.15


In [8]:
print("- Les batiments les moins énergivores :")
# conso_bat is sorted from largest to smallest total, so its last 10 rows are the
# 10 smallest totals. They keep that descending order: the smallest is the last row.
moins_energivores = conso_bat.iloc[-10:]
# Display only the final 5 rows of that selection.
moins_energivores.iloc[-5:]

- Les batiments les moins énergivores :


Unnamed: 0,batiment_id,nom,type,commune,conso_totale
76,BAT0077,Mairie Montpellier 77,mairie,Montpellier,36.08
39,BAT0040,Mairie Bordeaux 40,mairie,Bordeaux,32.74
119,BAT0120,Piscine Le Havre 120,piscine,Le Havre,32.3
97,BAT0098,Mairie Rennes 98,mairie,Rennes,24.56
100,BAT0101,Mairie Rennes 101,mairie,Rennes,22.26


---

## Calculer la répartition des consommations par classe énergétique DPE

In [9]:
# Sum of consommation_moyenne for each classe_energetique value, as a Series
# indexed by class, ordered from largest to smallest total, rounded to 2 decimals.
repartition_dpe = (
    df_conso
    .groupby("classe_energetique")["consommation_moyenne"]
    .agg("sum")
    .sort_values(ascending=False)
    .round(2)
)

repartition_dpe

classe_energetique
G    78252.41
F    77561.49
E    28170.03
D    24432.62
C    10461.94
B     8592.63
A      578.04
Name: consommation_moyenne, dtype: float64

---

## Analyser l'évolution temporelle (tendances mensuelles, saisonnalité)

In [10]:
# Monthly period (e.g. 2023-01) of each row, derived from the `date` column.
# It is kept in a local Series instead of being written back into df_conso, so the
# loaded `date` and `mois` columns are not retyped or overwritten as a side effect
# of this cell.
month_period = pd.to_datetime(df_conso["date"]).dt.to_period("M").rename("mois")

# Total consumption per month: one row per period, columns `mois` and
# `consommation_moyenne` (the Series name provides the `mois` column name).
trend_mensuel = (
    df_conso.groupby(month_period)["consommation_moyenne"]
    .sum()
    .reset_index()
)

trend_mensuel


Unnamed: 0,mois,consommation_moyenne
0,2023-01,8403.8
1,2023-02,15553.52
2,2023-03,8481.1
3,2023-04,5852.26
4,2023-05,9664.27
5,2023-06,5215.31
6,2023-07,5864.65
7,2023-08,8396.47
8,2023-09,3930.27
9,2023-10,10226.1


In [11]:
# Seasonality example: mean of consommation_moyenne for each `saison` label,
# rounded to 2 decimals. Rows whose `saison` is missing fall outside every group
# (pandas groupby drops missing keys by default).
saisonnalite = (
    df_conso["consommation_moyenne"]
    .groupby(df_conso["saison"])
    .mean()
    .round(2)
)
saisonnalite

saison
Automne      239.32
Ete          143.08
Hiver        320.32
Printemps    202.74
Name: consommation_moyenne, dtype: float64

---

## Comparer la consommation théorique (selon DPE) vs réelle

In [12]:
# Reference value attached to each DPE class letter.
# NOTE(review): presumably an annual kWh/m2 figure per class — confirm the source and unit.
dpe_theorique = dict(A=50, B=90, C=150, D=230, E=330, F=450, G=600)

# Look up the reference value of each row from its class letter
# (a letter absent from the mapping yields a missing value).
df_conso["conso_theorique_dpe"] = df_conso["classe_energetique"].map(dpe_theorique)

# Gap = observed per-m2 value minus the reference value.
# NOTE(review): in the displayed sample conso_par_m2 is around 0.005-0.123 while
# the reference is 50-600, so the gap is almost exactly minus the reference; the
# two quantities look like they are on different scales — confirm before interpreting.
df_conso["ecart_theorique_reel"] = df_conso["conso_par_m2"].sub(
    df_conso["conso_theorique_dpe"]
)

df_conso.head()

Unnamed: 0,batiment_id,heure,consommation_moyenne,unite,date,type_energie,ts_h,nom,type,commune,...,date_debut,date_fin,tarif_unitaire,cout_financier,conso_par_occupant,conso_par_m2,IPE,ecart_moyenne_categorie,conso_theorique_dpe,ecart_theorique_reel
0,BAT0043,22,280.88,m3,2023-01-01,eau,2023-01-01 22:00:00,Piscine Bordeaux 43,piscine,Bordeaux,...,2023-01-01,2023-12-31,3.5,983.08,1.898,0.123,0.123,-401.254,600,-599.877
1,BAT0100,17,4.52,m3,2023-01-01,eau,2023-01-01 17:00:00,Mediatheque Rennes 100,mediatheque,Rennes,...,2023-01-01,2023-12-31,3.5,15.82,0.037,0.005,0.005,-92.701,330,-329.995
2,BAT0006,3,5.5,kWh,2023-01-01,gaz,2023-01-01 03:00:00,Mairie Paris 6,mairie,Paris,...,2023-01-01,2023-06-30,0.09,0.495,0.087,0.005,0.005,-43.286,330,-329.995
3,BAT0126,10,126.88,kWh,2023-01-01,gaz,2023-01-01 10:00:00,Ecole Le Havre 126,ecole,Le Havre,...,2023-01-01,2023-06-30,0.09,11.419,0.375,0.075,0.075,7.068,600,-599.925
4,BAT0035,20,3.97,m3,2023-01-02,eau,2023-01-02 20:00:00,Mairie Toulouse 35,mairie,Toulouse,...,2023-01-01,2023-12-31,3.5,13.895,0.046,0.005,0.005,-44.816,450,-449.995


In [13]:
# Mean per DPE class of the observed per-m2 value and of the reference value,
# rounded to 2 decimals; the index is classe_energetique.
comparaison_dpe = (
    df_conso.groupby("classe_energetique")[["conso_par_m2","conso_theorique_dpe"]]
    .mean()
    .round(2)
)

# Show the table: without this trailing expression the cell produced no output,
# so the comparison could only be inspected by opening the exported CSV.
comparaison_dpe


---

## Tableaux de synthèse exportés en CSV

In [14]:
# Tables written without their index column
stats.to_csv(path_or_buf=OUT_EXP_STATS_CSV, index=False)
plus_energivores.to_csv(path_or_buf=OUT_EXP_PLUS_ENERGIVORES_CSV, index=False)
moins_energivores.to_csv(path_or_buf=OUT_EXP_MOINS_ENERGIVORES_CSV, index=False)
trend_mensuel.to_csv(path_or_buf=OUT_EXP_TREND_MONS_CSV, index=False)

# Tables whose index is classe_energetique: the index is written as the first column
repartition_dpe.to_csv(path_or_buf=OUT_EXP_REP_DPE_CSV, index=True)
comparaison_dpe.to_csv(path_or_buf=OUT_EXP_COMP_DPE_CSV, index=True)

---

## Libérer la mémoire (Optionnel) 

In [15]:
# Unbind the tables built in this notebook so their memory can be reclaimed once
# nothing else references them. Running this cell a second time raises NameError
# because the names no longer exist.
del (
    df_conso,
    comparaison_dpe,
    trend_mensuel,
    repartition_dpe,
    moins_energivores,
    plus_energivores,
    saisonnalite,
    stats,
)

---

## (Optionnel) Enregistrement dans un fichier temporaire du temps d'exécution + ressources pour utilisation ultérieure (dans le script run_pipeline_hybride.py ou autres)

In [16]:
# Elapsed wall-clock seconds since start_time_06 was recorded at the top of the notebook
temps_execution_06 = time.time() - start_time_06
# Report text: a timestamp line followed by key=value lines. The layout is left
# unchanged because the file is meant to be read back by other scripts.
temps_resources = f"""
    Date : {datetime.now().strftime("%d/%m/%Y %H:%M:%S")} (FR)

    temps_exec_sec={temps_execution_06:.2f}
    ram_gb={ram_available_06:.2f}
    cpu_pct={cpu_available_pct_06:.2f}
    logi_cores={available_logical_06:.1f}
    physi_cores={available_physical_06:.1f}
"""

# The temporary folder is never created elsewhere in this notebook; create it
# here so the write below does not fail with FileNotFoundError on a fresh checkout.
TMP_FILE_CSV.parent.mkdir(parents=True, exist_ok=True)

# Write the execution time + resources report to TMP_FILE_CSV (overwrites any
# previous content). The trailing semicolon hides the character count that
# write_text returns, which would otherwise show up as the cell output.
TMP_FILE_CSV.write_text(temps_resources, encoding="utf-8");

137