In [1]:
# 01_load_datasets.ipynb

# Libraries
import requests
import pandas as pd
import numpy as np

# Base URL of the Comunidad de Madrid CKAN `package_show` action.
# The loading cells for raw data sources 1.) to 4.) query it with an `id` parameter.
ckan_base = "https://datos.comunidad.madrid/api/3/action/package_show"

In [2]:
# 1.) Load and inspect the Greenhouse Gas Emissions Inventory (GEI).
# Flow: query the CKAN package metadata, pick the first CSV resource, read it
# into `df_gei`, normalise the column names and print quick diagnostics.

dataset_id = "atm_inventario_gei"
# Explicit timeout so an unresponsive portal fails fast instead of hanging the kernel.
r = requests.get(ckan_base, params={"id": dataset_id}, timeout=30)
r.raise_for_status()  # raise on an HTTP error status
package = r.json()["result"]  # parse the response body once and reuse it
title = package["title"]
print(f"\nDataset 1: {title}")

# Find the first CSV resource; tolerate resources whose "format" is missing or null.
csv_url = next(
    (res["url"] for res in package["resources"] if (res.get("format") or "").lower() == "csv"),
    None,
)
if not csv_url:
    # Make the fallback visible instead of silently continuing with no data.
    print(f"  - WARNING: no CSV resource found for '{dataset_id}'; using an empty DataFrame")
# NOTE(review): the recorded output shows `inventario_gei_gg_co2_eq` loaded as
# object because its values use a decimal comma (e.g. "0,015104735"). Passing
# decimal="," to read_csv would parse it as float -- confirm nothing downstream
# relies on the string form before changing it.
df_gei = pd.read_csv(csv_url, sep=";", encoding="latin1") if csv_url else pd.DataFrame()

# Standardise column names (lowercase, trimmed, spaces -> underscores) and show diagnostics.
df_gei.columns = [c.lower().strip().replace(" ", "_") for c in df_gei.columns]
print("  - Shape:", df_gei.shape)
print("  - Dtypes:")
print(df_gei.dtypes)
print("\n  - Head(2):")
print(df_gei.head(2))


Dataset 1: Inventario de emisiones de gases de efecto invernadero de la Comunidad de Madrid
  - Shape: (5885, 6)
  - Dtypes:
inventario_gei_año               int64
inventario_gei_sector_crf       object
inventario_gei_categoria_crf    object
inventario_gei_actividad_crf    object
inventario_gei_contaminante     object
inventario_gei_gg_co2_eq        object
dtype: object

  - Head(2):
   inventario_gei_año inventario_gei_sector_crf  \
0                1990             CRF 1 Energía   
1                1990             CRF 1 Energía   

     inventario_gei_categoria_crf  \
0  1.A. Actividades de combustión   
1  1.A. Actividades de combustión   

                        inventario_gei_actividad_crf  \
0  1A1a Producción de energía y calor en plantas ...   
1  1A1a Producción de energía y calor en plantas ...   

  inventario_gei_contaminante inventario_gei_gg_co2_eq  
0                         N2O              0,015104735  
1                         CH4               0,01595958  


In [3]:
# 2.) Load and inspect the PST (suspended particulate matter) emissions dataset.
# Flow: query the CKAN package metadata, pick the first CSV resource, read it
# into `df_pst`, normalise the column names and print quick diagnostics.
dataset_id = "1911600"  # Air pollutant emissions by sector: PST

# Explicit timeout so an unresponsive portal fails fast instead of hanging the kernel.
r = requests.get(ckan_base, params={"id": dataset_id}, timeout=30)
r.raise_for_status()  # raise on an HTTP error status
package = r.json()["result"]  # parse the response body once and reuse it
title = package["title"]
print(f"\nDataset 2: {title}")

# Find the first CSV resource; tolerate resources whose "format" is missing or null.
csv_url = next(
    (res["url"] for res in package["resources"] if (res.get("format") or "").lower() == "csv"),
    None,
)
if not csv_url:
    # Make the fallback visible instead of silently continuing with no data.
    print(f"  - WARNING: no CSV resource found for '{dataset_id}'; using an empty DataFrame")
df_pst = pd.read_csv(csv_url, sep=";", encoding="latin1") if csv_url else pd.DataFrame()

# Standardise column names (lowercase, trimmed, spaces -> underscores) and show diagnostics.
df_pst.columns = [c.lower().strip().replace(" ", "_") for c in df_pst.columns]
print("  - Shape:", df_pst.shape)
print("  - Dtypes:")
print(df_pst.dtypes)
print("\n  - Head(2):")
print(df_pst.head(2))


Dataset 2: Emisión de contaminantes atmosféricos por sectores: Partículas en suspensión (PST)
  - Shape: (264, 7)
  - Dtypes:
año                    int64
concepto              object
tipo_territorio       object
código_territorio    float64
territorio           float64
valor                  int64
estado_dato          float64
dtype: object

  - Head(2):
    año                                           concepto tipo_territorio  \
0  2000  Emisión de partículas en suspensión en otras f...           Otros   
1  2000          Total emisión de partículas en suspensión           Otros   

   código_territorio  territorio  valor  estado_dato  
0                NaN         NaN    830          NaN  
1                NaN         NaN  18926          NaN  


In [5]:
# 3.) Load and inspect the Energy Efficiency Certificates (CEEE) registry.
# Flow: resolve the first CSV resource through CKAN, load it in full, then run
# best-effort type clean-up (numeric coercion, downcasting, date parsing) and
# print diagnostics. A clean-up failure on one column is reported and skipped;
# it never aborts the cell.
dataset_id = "registro_certificados_eficiencia_energetica"
# Explicit timeout so an unresponsive portal fails fast instead of hanging the kernel.
r = requests.get(ckan_base, params={"id": dataset_id}, timeout=30)
r.raise_for_status()
# First CSV resource; tolerate resources whose "format" is missing or null.
csv_url = next(
    (res["url"] for res in r.json()["result"]["resources"] if (res.get("format") or "").lower() == "csv"),
    None,
)
if not csv_url:
    # Make the fallback visible instead of silently continuing with no data.
    print(f"WARNING: no CSV resource found for '{dataset_id}'; using an empty DataFrame")

# Load the full CSV in a single pass (low_memory=False avoids chunked dtype inference).
df_ceee = pd.read_csv(csv_url, sep=";", encoding="latin1", low_memory=False) if csv_url else pd.DataFrame()
df_ceee.columns = [c.lower().strip().replace(" ", "_") for c in df_ceee.columns]

# Coerce object columns that hold more than one Python type to numeric.
# Detection only samples the first 1000 non-null values of each column.
for c in df_ceee.select_dtypes("object"):
    sample = df_ceee[c].dropna().head(1000)
    if len({type(x) for x in sample}) > 1:
        try:
            # Keep digits, sign, separators and exponent markers; decimal comma -> dot.
            cleaned = (
                df_ceee[c]
                .astype(str)
                .str.replace(r"[^\d\-\.,eE]", "", regex=True)
                .str.replace(",", ".", regex=False)
            )
            converted = pd.to_numeric(cleaned, errors="coerce")
        except Exception as exc:
            print(f"WARNING: could not coerce column '{c}' to numeric: {exc}")
        else:
            # Back up the raw values only once the conversion succeeded, so a
            # failure does not leave a stray `<col>_orig` duplicate behind.
            df_ceee[c + "_orig"] = df_ceee[c]
            df_ceee[c] = converted

# Downcast numeric columns to the smallest dtype that holds their values.
for c in df_ceee.select_dtypes(np.number):
    try:
        kind = "float" if pd.api.types.is_float_dtype(df_ceee[c]) else "integer"
        df_ceee[c] = pd.to_numeric(df_ceee[c], downcast=kind)
    except Exception as exc:
        print(f"WARNING: could not downcast column '{c}': {exc}")

# Parse date-like columns, selected by substring match on the column name.
# NOTE(review): "ano" is a broad substring and may match unrelated columns -- confirm.
year_keys = ("año", "ano", "year")
date_keys = ("fecha", "date") + year_keys
for c in df_ceee.columns:
    if any(k in c for k in date_keys):
        try:
            col = df_ceee[c]
            if pd.api.types.is_numeric_dtype(col) and any(k in c for k in year_keys):
                # A numeric year (e.g. 2015) must be parsed with an explicit
                # format; without one, to_datetime reads integers as nanoseconds
                # since the epoch and yields 1970 timestamps.
                years = col.dropna().astype(int).astype(str)
                df_ceee[c] = pd.to_datetime(years, format="%Y", errors="coerce").reindex(col.index)
            else:
                df_ceee[c] = pd.to_datetime(col, errors="coerce", dayfirst=True)
        except Exception as exc:
            print(f"WARNING: could not parse column '{c}' as datetime: {exc}")

# Diagnostics
cols_dtypes = [f"'{c}' ({str(df_ceee[c].dtype)})" for c in df_ceee.columns]
print("Columns with dtypes:\n[" + ", ".join(cols_dtypes) + "]")
print("\nShape:", df_ceee.shape)
print("\nHead(2):\n", df_ceee.head(2))

Columns with dtypes:
['edif_tiporeg' (float32), 'edif_codreg' (object), 'edif_direc' (object), 'edif_muni' (object), 'edif_codpost' (object), 'edif_prov' (object), 'edif_zonaclim' (object), 'edif_año' (datetime64[ns]), 'edif_norma' (object), 'edif_refcat' (object), 'edif_tipo' (object), 'edif_proced' (object), 'edif_fecha' (datetime64[ns]), 'edif_superf' (object), 'edif_compac' (object), 'edif_calef' (float32), 'edif_refrig' (float32), 'edif_acrist' (object), 'edif_acs' (float64), 'calefac_nombre' (object), 'calefac_tipo' (object), 'calefac_potenc' (object), 'calefac_rendim' (object), 'calefac_vector' (object), 'calefac_modo' (object), 'refrig_nombre' (object), 'refrig_tipo' (object), 'refrig_potenc' (object), 'refrig_rendim' (object), 'refrig_vector' (object), 'refrig_modo' (object), 'acs_nombre' (object), 'acs_tipo' (object), 'acs_potencnom' (object), 'acs_rendim' (object), 'acs_vector' (object), 'acs_modo' (object), 'acs_potenctot' (object), 'termica_nombre' (object), 'termica_calef

In [6]:
# 4.) Load and inspect the Final Natural Gas Consumption dataset.
# Flow: query the CKAN package metadata, pick the first CSV resource, read it
# into `df_gas`, normalise the column names and print quick diagnostics.
dataset_id = "950a60f0-498c-48db-84f4-734990d3e253"  # Final gas consumption by sector

# Explicit timeout so an unresponsive portal fails fast instead of hanging the kernel.
r = requests.get(ckan_base, params={"id": dataset_id}, timeout=30)
r.raise_for_status()  # raise on an HTTP error status
package = r.json()["result"]  # parse the response body once and reuse it
title = package["title"]
print(f"\nDataset 4: {title}")

# Find the first CSV resource; tolerate resources whose "format" is missing or null.
csv_url = next(
    (res["url"] for res in package["resources"] if (res.get("format") or "").lower() == "csv"),
    None,
)
if not csv_url:
    # Make the fallback visible instead of silently continuing with no data.
    print(f"  - WARNING: no CSV resource found for '{dataset_id}'; using an empty DataFrame")
df_gas = pd.read_csv(csv_url, sep=";", encoding="latin1") if csv_url else pd.DataFrame()

# Standardise column names (lowercase, trimmed, spaces -> underscores) and show diagnostics.
df_gas.columns = [c.lower().strip().replace(" ", "_") for c in df_gas.columns]
print("  - Shape:", df_gas.shape)
print("  - Dtypes:")
print(df_gas.dtypes)
print("\n  - Head(2):")
print(df_gas.head(2))


Dataset 4: Consumo final de gas natural por sectores
  - Shape: (184, 7)
  - Dtypes:
año                    int64
concepto              object
tipo_territorio       object
código_territorio    float64
territorio           float64
valor                  int64
estado_dato          float64
dtype: object

  - Head(2):
    año                                           concepto tipo_territorio  \
0  2000                 Total consumo final de gas natural           Otros   
1  2000  Consumo final de gas natural en el sector domé...           Otros   

   código_territorio  territorio  valor  estado_dato  
0                NaN         NaN   1205          NaN  
1                NaN         NaN    740          NaN  


In [11]:
# 5.) Load and inspect the real-time Air Quality feed (JSON).
# `requests` and `pd` come from the notebook's import cell, so they are not
# re-imported here.

json_url = "https://ciudadesabiertas.madrid.es/dynamicAPI/API/query/calair_tiemporeal.json?pageSize=5000"

# Fetch and parse the JSON payload; the timeout keeps a stalled endpoint from
# hanging the kernel.
r = requests.get(json_url, timeout=30)
r.raise_for_status()
data = r.json()

# Fail with an explicit message when the payload lacks the expected "records" list.
if not isinstance(data, dict) or "records" not in data:
    raise KeyError("'records' not found in the air-quality API response")

# Flatten the records into a table and standardise the column names.
df_air = pd.json_normalize(data["records"])
df_air.columns = [c.lower().strip().replace(" ", "_") for c in df_air.columns]

# Diagnostics
# NOTE(review): the recorded output lists every column as object, so the values
# presumably arrive as strings and need casting before numeric use -- confirm.
cols_dtypes = [f"'{c}' ({str(df_air[c].dtype)})" for c in df_air.columns]
print("Columns with dtypes:\n[" + ", ".join(cols_dtypes) + "]")
print("\nShape:", df_air.shape)
print("\nHead(2):\n", df_air.head(2))

Columns with dtypes:
['provincia' (object), 'municipio' (object), 'estacion' (object), 'magnitud' (object), 'punto_muestreo' (object), 'ano' (object), 'mes' (object), 'dia' (object), 'h01' (object), 'v01' (object), 'h02' (object), 'v02' (object), 'h03' (object), 'v03' (object), 'h04' (object), 'v04' (object), 'h05' (object), 'v05' (object), 'h06' (object), 'v06' (object), 'h07' (object), 'v07' (object), 'h08' (object), 'v08' (object), 'h09' (object), 'v09' (object), 'h10' (object), 'v10' (object), 'h11' (object), 'v11' (object), 'h12' (object), 'v12' (object), 'h13' (object), 'v13' (object), 'h14' (object), 'v14' (object), 'h15' (object), 'v15' (object), 'h16' (object), 'v16' (object), 'h17' (object), 'v17' (object), 'h18' (object), 'v18' (object), 'h19' (object), 'v19' (object), 'h20' (object), 'v20' (object), 'h21' (object), 'v21' (object), 'h22' (object), 'v22' (object), 'h23' (object), 'v23' (object), 'h24' (object), 'v24' (object)]

Shape: (126, 56)

Head(2):
   provincia municipi