###  Top summary / Resumen 
##### collection_wrangling_ceee — Purpose & Outputs / Propósito y Resultados
certificados eficiencia edificios / energy certificates per buildings

| Section / Sección              | Content                                                               |  Contenido
|-------------------------------|----------------------------------------------------------------------------------|----------------------------------------------------------------------------------|
| **General Purpose**           | Produce canonical cleaned datasets for analysis and dashboard artifacts         | Generar conjuntos de datos limpios y canónicos para análisis y visualización    |
| **Raw Inputs**                | `data/raw/*.csv` or CKAN package IDs:<br>• `registro_certificados_eficiencia_energetica_2025.csv` | `data/raw/*.csv` o IDs de CKAN:<br>• `registro_certificados_eficiencia_energetica_2025.csv` |
| **Processed Outputs**         | • `data/processed/ceee.csv`<br>• `data/ingest_audit/*.json`                     | • `data/processed/ceee.csv`<br>• `data/ingest_audit/*.json`                     |
| **Expected Schema**           | ~50,000 rows<br>Key columns: postal code, area (m²), energy demands             | ~50,000 filas<br>Columnas clave: código postal, superficie (m²), demandas energéticas |
| **Reproducibility Checklist** | • `config.yml` present at repo root<br>• Run cells top → bottom<br>• Outputs saved to correct folders | • `config.yml` presente en la raíz del repositorio<br>• Ejecutar celdas de arriba hacia abajo<br>• Guardar salidas en carpetas correspondientes |


In [None]:
# Cell 1: Parameters

import pandas as pd, numpy as np
import os, sys, re, glob, yaml
from pathlib import Path

ROOT = Path.cwd()
while ROOT != ROOT.parent and not (ROOT/"config.yml").exists() and not (ROOT/".git").exists():
      ROOT = ROOT.parent
if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT))
cfg = yaml.safe_load((ROOT/"config.yml").read_text()) if (ROOT/"config.yml").exists() else {}
RAW_DIR   = ROOT / cfg.get("data", {}).get("raw_dir", "data/raw")
PROC_DIR  = ROOT / cfg.get("data", {}).get("processed_dir", "data/processed")
AUDIT_DIR = ROOT / cfg.get("data", {}).get("audit_dir", "data/ingest_audit")
ADOPTION_DEFAULT = cfg.get("defaults", {}).get("adoption_rate_default", 0.30)
PRIORITY         = cfg.get("defaults", {}).get("priority_districts", [10,11,12,13,15])
madrid_codes_official = set(cfg.get("defaults", {}).get("madrid_postal_codes_official", []))
print(f"RAW={RAW_DIR}\nPROC={PROC_DIR}\nAUDIT={AUDIT_DIR}")

RAW=c:\_Workspace\2_Work\1_Projects_Active\Datos_Abiertos_Madrid\Low-Carbon-Heating-Roadmap-for-Madrid\data\raw
PROC=c:\_Workspace\2_Work\1_Projects_Active\Datos_Abiertos_Madrid\Low-Carbon-Heating-Roadmap-for-Madrid\data\processed
AUDIT=c:\_Workspace\2_Work\1_Projects_Active\Datos_Abiertos_Madrid\Low-Carbon-Heating-Roadmap-for-Madrid\data\ingest_audit


In [14]:
# Cell 2: Ingestion / Ingesta
from src.loader import load_ceee; from src.cleaning import inspect_dataframe
ceee = load_ceee(save=False)
inspect_dataframe(ceee, name="ceee")

  return _light(_std(pd.read_csv(url, sep=sep, encoding=enc, low_memory=low_memory)))



 ceee.shape → 115596 rows × 86 columns

 head ()


Unnamed: 0,edif_tiporeg,edif_codreg,edif_direc,edif_muni,edif_codpost,edif_prov,edif_zonaclim,edif_año,edif_norma,edif_refcat,...,cal_norenovcalef,cal_norenovrefrig,cal_norenovacs,cal_norenovilu,cal_co2limite,cal_co2global,cal_co2calef,cal_co2refrig,cal_co2acs,cal_co2ilu
0,,65/710484.9/21,CALLE TEMBLEQUE Nº 91,MADRID,28024,MADRID,D3,1968,Anterior,5416319VK3751E,...,F,D,G,,79.6,F,F,C,G,
1,,10/287538.9/23,AVENIDA DE BADAJOZ 10,MADRID,28027,MADRID,D3,1981,NBE-CT-79,4470613VK4747A,...,G,E,G,,79.6,F,F,D,G,
2,,10/286187.9/23,CALLE VERDAGUER Y GARCÍA 3,MADRID,28027,MADRID,D3,1981,NBE-CT-79,4470613VK4747A,...,F,E,G,,79.6,F,F,D,G,
3,,10/287494.9/23,CALLE DERECHOS HUMANOS 14,MADRID,28027,MADRID,D3,1981,NBE-CT-79,4569209VK4746H,...,G,E,G,,99999.99,G,G,D,G,
4,,10/285907.9/23,CALLE SALVADOR DE MADARIAGA 26,MADRID,28027,MADRID,D3,1981,NBE-CT-79,4366610VK4746E,...,G,F,G,,99999.99,G,G,D,G,



 tail ()


Unnamed: 0,edif_tiporeg,edif_codreg,edif_direc,edif_muni,edif_codpost,edif_prov,edif_zonaclim,edif_año,edif_norma,edif_refcat,...,cal_norenovcalef,cal_norenovrefrig,cal_norenovacs,cal_norenovilu,cal_co2limite,cal_co2global,cal_co2calef,cal_co2refrig,cal_co2acs,cal_co2ilu
115591,,10/807545.9/24,CL CARDEÑUELA RIOPICO 9(C) BL:2 ES:6 PL:06 PT:D,MADRID,28050.0,MADRID,D3,2004,NBE-CT-79,3339501VK4383H0175ZE,...,E,B,G,,66.3,E,E,A,F,
115592,,10/807674.9/24,ADOLFO MARSILLACH Nº5 1º B,GETAFE,28907.0,MADRID,D3,2007,NBE-CT-79,7337701VK3673N0077QO,...,E,C,E,,66.3,E,E,B,E,
115593,,10/798330.9/24,"CALLE RIBERA, 61 ARROYOMOLINOS",ARROYOMOLINOS,28939.0,MADRID,D3,2020,CTE 2013,2571603VK2527S0002DJ,...,B,F,F,,19.9,B,A,D,E,
115594,,10/798252.9/24,CL PARQUE AZUL 7 BL7 ES1 PL02 PTA,COLMENAREJO,28270.0,MADRID,D3,1998,Anterior,3812108VK1931S0015HY,...,E,D,G,,66.3,E,E,C,F,
115595,,10/799275.9/24,CL DRACENA 30 ES:1 PL:00 PT:01,MADRID,28016.0,MADRID,D3,1971,Anterior,3496912VK4739E0002AM,...,E,A,D,,45.74,D,D,A,D,



 info ():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115596 entries, 0 to 115595
Data columns (total 86 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   edif_tiporeg         0 non-null       float64
 1   edif_codreg          115596 non-null  object 
 2   edif_direc           115560 non-null  object 
 3   edif_muni            115560 non-null  object 
 4   edif_codpost         115541 non-null  object 
 5   edif_prov            114343 non-null  object 
 6   edif_zonaclim        115560 non-null  object 
 7   edif_año             115557 non-null  object 
 8   edif_norma           115378 non-null  object 
 9   edif_refcat          115560 non-null  object 
 10  edif_tipo            115509 non-null  object 
 11  edif_proced          115558 non-null  object 
 12  edif_fecha           115596 non-null  object 
 13  edif_superf          115560 non-null  object 
 14  edif_compac          115560 non-null  object 
 15  edif_c

Unnamed: 0,edif_tiporeg,edif_codreg,edif_direc,edif_muni,edif_codpost,edif_prov,edif_zonaclim,edif_año,edif_norma,edif_refcat,...,cal_norenovcalef,cal_norenovrefrig,cal_norenovacs,cal_norenovilu,cal_co2limite,cal_co2global,cal_co2calef,cal_co2refrig,cal_co2acs,cal_co2ilu
count,0.0,115596,115560,115560,115541.0,114343,115560,115557.0,115378,115560,...,114153,113709,113370,5749,114314.0,115560,114152,113709,113321,5750
unique,,115596,103287,632,868.0,24,13,291.0,236,101476,...,7,7,7,7,,7,7,7,7,7
top,,10/799275.9/24,CALLE CABO DE GATA 19,MADRID,28850.0,MADRID,D3,1970.0,Anterior,9478701VK3997N0001JX,...,E,D,G,A,,E,E,C,G,A
freq,,1,68,57317,1923.0,114274,113815,4270.0,52869,128,...,55078,32805,75887,2822,,55633,53862,36596,66970,2824
mean,,,,,,,,,,,...,,,,,3688.796319,,,,,
std,,,,,,,,,,,...,,,,,18711.869143,,,,,
min,,,,,,,,,,,...,,,,,0.0,,,,,
25%,,,,,,,,,,,...,,,,,32.4,,,,,
50%,,,,,,,,,,,...,,,,,66.3,,,,,
75%,,,,,,,,,,,...,,,,,66.3,,,,,


In [15]:
# Cell 3: Schema validation / Validación de esquema
# Validate required columns and types using src.cleaning.validate_schema / usa helper
from src.cleaning import validate_schema  
req_building_cols = ["edif_codpost","edif_superf","edif_calef","elec_demcalef", "final_calef", "norenov_calef"]
try:
    validate_schema(ceee, req_building_cols)
    print("ceee schema OK")
except AssertionError as e:
    raise AssertionError(f"Schema error ceee: {e}")
# Drop not required columns  / Soltar columnas innecesarias
ceee = ceee[["edif_codpost","edif_superf","edif_calef","elec_demcalef", "final_calef", "norenov_calef"]]

# Keep only rows where e_postal_code is in official list / Conservar solo filas con código oficial
before = len(ceee)
ceee = ceee[ceee["edif_codpost"].astype(str).isin(madrid_codes_official)].copy()
after = len(ceee)
print(f"filtered ceee: {before} → {after} rows (only madrid municipality postal codes)")

ceee schema OK


filtered ceee: 115596 → 56340 rows (only madrid municipality postal codes)


In [16]:
# Cell 4: Data quality report / Informe de calidad de datos
# Use dq_report to get structured report and store it / usa dq_report y guárdalo
from src.cleaning import dq_report
dq_ceee = dq_report(ceee)
# print compact summary / imprimir resumen compacto
print("rows_in:", dq_ceee["rows_in"], "duplicate_rows:", dq_ceee["duplicate_rows"])
for col, meta in list(dq_ceee["columns"].items())[:8]:
    print(f"{col}: nulls={meta['null_count']} null_pct={meta['null_pct']:.2f}% uniques={meta['unique_nonnull']}")
# keep report in memory for audit / conservar para auditoría
dq_reports = {"df_ceee": dq_ceee}

rows_in: 56340 duplicate_rows: 1860
edif_codpost: nulls=0 null_pct=0.00% uniques=55
edif_superf: nulls=0 null_pct=0.00% uniques=11071
edif_calef: nulls=787 null_pct=1.40% uniques=102
elec_demcalef: nulls=787 null_pct=1.40% uniques=18178
final_calef: nulls=813 null_pct=1.44% uniques=21967
norenov_calef: nulls=787 null_pct=1.40% uniques=25252


In [17]:
# Cell 5: Cleaning — safe rename, convert, drop NaN/negatives, log transform / Limpiar — renombrar seguro, convertir, eliminar NaN/negativos, registrar transformación

# --- 0. Rename columns to canonical names / Renombrar columnas ---
rename_dict = {
    'edif_codpost': 'e_postal_code',           # Postal code / Código postal (5-digit string)
    'edif_superf': 'e_surface_m2',             # Surface area / Superficie (m2)
    'edif_calef': 'e_pct_surface_heated',      # % surface heated / % superficie con calefacción (pct 0-100)
    'elec_demcalef': 'e_heating_demand_kwh_m2_a', # Heating demand (DB-HE) / Demanda calefacción (kWh/m2·a)
    'final_calef': 'final_heating_kwh_m2_a',   # Final heating consumption / Consumo final calefacción (kWh/m2·a)
    'norenov_calef': 'nonren_heating_kwh_m2_a',# Non-renewable heating energy / Energía no renovable calefacción (kWh/m2·a)
}
ceee.rename(columns=rename_dict, inplace=True)

critical_cols = [
    "e_surface_m2",
    "e_pct_surface_heated",
    "e_heating_demand_kwh_m2_a",
    "final_heating_kwh_m2_a",
    "nonren_heating_kwh_m2_a"
]

sentinels = ['99999999,99','99999999.99','99999999','99999999,00',
             'NaN','nan','NULL','-']

# --- 1. Convert to numeric with cleaning ---
for c in critical_cols:
    if c in ceee.columns:
        s = ceee[c].astype(str).replace(sentinels, pd.NA)
        s = s.str.replace(r'\.', '', regex=True).str.replace(',', '.', regex=True)
        s = s.str.replace(r'[^\d\.\-]', '', regex=True)
        ceee[c] = pd.to_numeric(s, errors='coerce')

# --- 2. Drop rows with NaN in any critical column ---
n_before = len(ceee)
ceee = ceee.dropna(subset=critical_cols)
n_after_nan = len(ceee)

# --- 3. Drop rows with negatives or zeros ---
mask_bad = (ceee[critical_cols] <= 0).any(axis=1)
n_bad = mask_bad.sum()
ceee = ceee.loc[~mask_bad].copy()
n_final = len(ceee)

# --- 4. Final check ---
for col in critical_cols:
    n_null = ceee[col].isna().sum()
    n_neg  = (ceee[col] < 0).sum()
    n_zero = (ceee[col] == 0).sum()
    print(f"{col}: nulls={n_null}, negatives={n_neg}, zeros={n_zero}")

# --- 5. Register transformation ---
    transforms = []
    transforms.append({
    "step": "rename_and_clean_ceee",
    "renamed_columns": list(rename_dict.items()),
    "columns_cleaned": critical_cols,
    "sentinels_mapped": sentinels,
    "rows_before": n_before,
    "rows_dropped_nan": n_before - n_after_nan,
    "rows_dropped_negatives_or_zeros": n_bad,
    "rows_final": n_final
})

e_surface_m2: nulls=0, negatives=0, zeros=0
e_pct_surface_heated: nulls=0, negatives=0, zeros=0
e_heating_demand_kwh_m2_a: nulls=0, negatives=0, zeros=0
final_heating_kwh_m2_a: nulls=0, negatives=0, zeros=0
nonren_heating_kwh_m2_a: nulls=0, negatives=0, zeros=0


In [18]:
# Cell 6: Final Check / Revisión Final
ceee.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50638 entries, 0 to 114687
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   e_postal_code              50638 non-null  object 
 1   e_surface_m2               50638 non-null  float64
 2   e_pct_surface_heated       50638 non-null  float64
 3   e_heating_demand_kwh_m2_a  50638 non-null  float64
 4   final_heating_kwh_m2_a     50638 non-null  float64
 5   nonren_heating_kwh_m2_a    50638 non-null  float64
dtypes: float64(5), object(1)
memory usage: 2.7+ MB


In [19]:
# Cell 7: Export and audit / Exportar y auditar
from src.io import save_df, write_audit_log  # persistence helpers / ayudantes de persistencia
# save processed artifact / guardar artefacto procesado
out_path = save_df(ceee, str(PROC_DIR.joinpath("df_ceee.csv")))
print("Saved:", out_path)
# build audit entry and write / construir objeto de auditoría y guardar
audit = {
    "source": "registro_certificados_eficiencia_energetica_2025.csv",
    "rows_in": int(dq_ceee["rows_in"]),   # cast to Python int
    "rows_out": int(len(ceee)),           # ensure Python int
    "transforms": []
}

# normalize transforms too
for t in transforms:
    audit["transforms"].append({
        **t,
        "rows_before": int(t.get("rows_before", 0)),
        "rows_final": int(t.get("rows_final", 0)),
        "rows_dropped_nan": int(t.get("rows_dropped_nan", 0)),
        "rows_dropped_negatives_or_zeros": int(t.get("rows_dropped_negatives_or_zeros", 0))
    })

audit_path = write_audit_log(**audit)
print("Audit saved:", audit_path)

Saved: c:\_Workspace\2_Work\1_Projects_Active\Datos_Abiertos_Madrid\Low-Carbon-Heating-Roadmap-for-Madrid\data\processed\df_ceee.csv
Audit saved: C:\_Workspace\2_Work\1_Projects_Active\Datos_Abiertos_Madrid\Low-Carbon-Heating-Roadmap-for-Madrid\data\ingest_audit\audit_registro_certificados_eficiencia_energetica_2025_20251029_213722.json


In [20]:
# Cell 8: Reproducibility checks / Verificaciones de reproducibilidad

# list artifacts and audits / listar artefactos y auditorías
proc_files = glob.glob(str(PROC_DIR.joinpath("*.csv")))
audit_files = glob.glob(str(AUDIT_DIR.joinpath("*.json")))
print("processed files:", proc_files)
print("audit files:", audit_files)
# basic checks / comprobaciones básicas
assert proc_files, "No processed artifacts found in data/processed/  / No hay artefactos procesados"
assert audit_files, "No audit JSONs found in data/ingest_audit/  / No hay JSONs de auditoría"
# size checks (rows) / comprobación de filas mínima (adjust expected as needed)
min_rows_expected = 10
for p in proc_files:
    df = pd.read_csv(p, nrows=min_rows_expected)
    if df.shape[0] < min_rows_expected:
        raise RuntimeError(f"Artifact {p} has <{min_rows_expected} rows; check processing  / Artifact tiene pocas filas")
print("Reproducibility smoke tests passed / Pruebas de reproducibilidad OK")

processed files: ['c:\\_Workspace\\2_Work\\1_Projects_Active\\Datos_Abiertos_Madrid\\Low-Carbon-Heating-Roadmap-for-Madrid\\data\\processed\\df_ceee.csv', 'c:\\_Workspace\\2_Work\\1_Projects_Active\\Datos_Abiertos_Madrid\\Low-Carbon-Heating-Roadmap-for-Madrid\\data\\processed\\df_gei.csv', 'c:\\_Workspace\\2_Work\\1_Projects_Active\\Datos_Abiertos_Madrid\\Low-Carbon-Heating-Roadmap-for-Madrid\\data\\processed\\df_pst.csv', 'c:\\_Workspace\\2_Work\\1_Projects_Active\\Datos_Abiertos_Madrid\\Low-Carbon-Heating-Roadmap-for-Madrid\\data\\processed\\sql_buildings_train.csv']
audit files: ['c:\\_Workspace\\2_Work\\1_Projects_Active\\Datos_Abiertos_Madrid\\Low-Carbon-Heating-Roadmap-for-Madrid\\data\\ingest_audit\\audit_atm_inventario_gei_20251029_213059.json', 'c:\\_Workspace\\2_Work\\1_Projects_Active\\Datos_Abiertos_Madrid\\Low-Carbon-Heating-Roadmap-for-Madrid\\data\\ingest_audit\\audit_atm_inventario_gei_20251029_213554.json', 'c:\\_Workspace\\2_Work\\1_Projects_Active\\Datos_Abiertos_Mad