In [1]:
import requests
import os
from datetime import datetime, timedelta
import time
import json
import shutil
import pandas as pd
from glob import glob


### Variables globales

In [2]:

# Diccionario de ciudades de España y su coordenadas
ciudades_espana = [
    {"nombre": "Albacete", "lat": 38.9943, "lon": -1.8585},
    {"nombre": "Alicante", "lat": 38.3452, "lon": -0.4810},
    {"nombre": "Almería", "lat": 36.8340, "lon": -2.4637},
    {"nombre": "Ávila", "lat": 40.6565, "lon": -4.6818},
    {"nombre": "Badajoz", "lat": 38.8794, "lon": -6.9707},
    {"nombre": "Barcelona", "lat": 41.3874, "lon": 2.1686},
    {"nombre": "Bilbao", "lat": 43.2630, "lon": -2.9350},
    {"nombre": "Burgos", "lat": 42.3439, "lon": -3.6969},
    {"nombre": "Cáceres", "lat": 39.4753, "lon": -6.3722},
    {"nombre": "Cádiz", "lat": 36.5271, "lon": -6.2886},
    {"nombre": "Castellón", "lat": 39.9864, "lon": -0.0513},
    {"nombre": "Ciudad Real", "lat": 38.9861, "lon": -3.9270},
    {"nombre": "Córdoba", "lat": 37.8882, "lon": -4.7794},
    {"nombre": "A Coruña", "lat": 43.3623, "lon": -8.4115},
    {"nombre": "Cuenca", "lat": 40.0704, "lon": -2.1374},
    {"nombre": "Girona", "lat": 41.9794, "lon": 2.8214},
    {"nombre": "Granada", "lat": 37.1773, "lon": -3.5986},
    {"nombre": "Guadalajara", "lat": 40.6333, "lon": -3.1669},
    {"nombre": "Huelva", "lat": 37.2614, "lon": -6.9447},
    {"nombre": "Huesca", "lat": 42.1362, "lon": -0.4089},
    {"nombre": "Jaén", "lat": 37.7796, "lon": -3.7849},
    {"nombre": "León", "lat": 42.5987, "lon": -5.5671},
    {"nombre": "Lleida", "lat": 41.6176, "lon": 0.6200},
    {"nombre": "Logroño", "lat": 42.4667, "lon": -2.4500},
    {"nombre": "Lugo", "lat": 43.0097, "lon": -7.5560},
    {"nombre": "Madrid", "lat": 40.4168, "lon": -3.7038},
    {"nombre": "Málaga", "lat": 36.7213, "lon": -4.4217},
    {"nombre": "Murcia", "lat": 37.9834, "lon": -1.1299},
    {"nombre": "Ourense", "lat": 42.3350, "lon": -7.8639},
    {"nombre": "Oviedo", "lat": 43.3619, "lon": -5.8494},
    {"nombre": "Palencia", "lat": 42.0097, "lon": -4.5241},
    {"nombre": "Pontevedra", "lat": 42.4333, "lon": -8.6333},
    {"nombre": "Salamanca", "lat": 40.9688, "lon": -5.6639},
    {"nombre": "San Sebastián", "lat": 43.3128, "lon": -1.9740},
    {"nombre": "Santander", "lat": 43.4623, "lon": -3.8099},
    {"nombre": "Segovia", "lat": 40.9429, "lon": -4.1088},
    {"nombre": "Sevilla", "lat": 37.3828, "lon": -5.9732},
    {"nombre": "Soria", "lat": 41.7667, "lon": -2.4667},
    {"nombre": "Tarragona", "lat": 41.1167, "lon": 1.2500},
    {"nombre": "Teruel", "lat": 40.3456, "lon": -1.1065},
    {"nombre": "Toledo", "lat": 39.8628, "lon": -4.0273},
    {"nombre": "Valencia", "lat": 39.4699, "lon": -0.3763},
    {"nombre": "Valladolid", "lat": 41.6523, "lon": -4.7245},
    {"nombre": "Vitoria", "lat": 42.8467, "lon": -2.6716},
    {"nombre": "Zamora", "lat": 41.5033, "lon": -5.7446},
    {"nombre": "Zaragoza", "lat": 41.6488, "lon": -0.8891}
]

# Parámetros a recoger en la descarga
variables = [
    "temperature_2m", "precipitation", "rain",
    "cloud_cover", "cloud_cover_low", "cloud_cover_mid", "cloud_cover_high",
    "wind_speed_10m", "wind_speed_100m",
    "wind_direction_10m", "wind_direction_100m",
    "wind_gusts_10m"
]


# Directorios que usamos
dir_json = "../data_download"
os.makedirs(dir_json, exist_ok=True)

dir_parquet = "../data_parquet"
os.makedirs(dir_parquet, exist_ok=True)



# Descarga de los datos

In [5]:
start_date = "2020-01-01"
# end_date será un dia menos al de hoy para asegurar que los datos estan completos
#end_date = (datetime.today() - timedelta(days=1)).strftime("%Y-%m-%d")
end_date = "2025-10-21"
timezone = "Europe/Madrid"

for ciudad in ciudades_espana:
    
    # Construimos patrón de búsqueda
    pattern = os.path.join(dir_json, f"{ciudad['nombre'].replace(' ', '_')}.json")
    
    # Verificamos si ya hay un fichero para esa fecha
    ya_descargado = glob(pattern)
    if ya_descargado:
        print(f"Ya descargado: {ciudad['nombre']}, se omite.")
        continue

    try:
        params = {
            "latitude": ciudad["lat"],
            "longitude": ciudad["lon"],
            "start_date": start_date,
            "end_date": end_date,
            "hourly": ",".join(variables),
            "timezone": timezone,
            "format": "json"
        }

        print(f"Descargando {ciudad['nombre']}...")
        response = requests.get("https://archive-api.open-meteo.com/v1/archive", params=params, timeout=20)

        if response.status_code == 200:

            filename = os.path.join(dir_json, f"{ciudad['nombre'].replace(' ', '_')}.json")

            # añado el campo city con el valor de la ciudad en minuscula y sin acentos
            data = response.json()

            # Verificamos si los datos horarios están vacíos
            if not data.get("hourly") or all(len(v) == 0 for v in data["hourly"].values()):
                print(f"Datos vacíos para {ciudad['nombre']}, no se guarda el archivo.")
                continue

            data['city'] = ciudad['nombre'].lower().replace('ñ', 'n').replace(' ', '_')

            # Guardamos el JSON en el fichero
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=4)

            print(f"Guardado: {filename}")
        
        # Comprobamos si saturamos el limite por hora, y entonces esperamos 1h
        elif response.status_code == 429 or "Hourly API request limit exceeded" in response.text:
                print(f"Límite horario alcanzado en {ciudad['nombre']}. Esperando 60 minutos...")
                time.sleep(3600)

        else:
            print(f"Error {response.status_code} en {ciudad['nombre']}: {response.text[:100]}")

    except Exception as e:
        print(f"Excepción en {ciudad['nombre']}: {e}")
    
    # Para no saturar el limite por minuto
    time.sleep(10)

Ya descargado: Albacete, se omite.
Ya descargado: Alicante, se omite.
Ya descargado: Almería, se omite.
Ya descargado: Ávila, se omite.
Ya descargado: Badajoz, se omite.
Ya descargado: Barcelona, se omite.
Ya descargado: Bilbao, se omite.
Ya descargado: Burgos, se omite.
Ya descargado: Cáceres, se omite.
Ya descargado: Cádiz, se omite.
Ya descargado: Castellón, se omite.
Ya descargado: Ciudad Real, se omite.
Ya descargado: Córdoba, se omite.
Ya descargado: A Coruña, se omite.
Ya descargado: Cuenca, se omite.
Ya descargado: Girona, se omite.
Ya descargado: Granada, se omite.
Ya descargado: Guadalajara, se omite.
Ya descargado: Huelva, se omite.
Ya descargado: Huesca, se omite.
Ya descargado: Jaén, se omite.
Ya descargado: León, se omite.
Ya descargado: Lleida, se omite.
Ya descargado: Logroño, se omite.
Ya descargado: Lugo, se omite.
Ya descargado: Madrid, se omite.
Ya descargado: Málaga, se omite.
Ya descargado: Murcia, se omite.
Ya descargado: Ourense, se omite.
Ya descargado: Oviedo,

# Los guardamos en parquet y añadimos columnas year, date, hour

In [6]:
all_dfs = []

for filepath in glob(os.path.join(dir_json, "*.json")):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    hourly = data["hourly"]
    time_list = hourly["time"]

    df_hourly = pd.DataFrame({key: val for key, val in hourly.items()})
    
    df = pd.DataFrame(df_hourly)

    df_long = pd.DataFrame()
    df_long["time"] = pd.to_datetime(hourly["time"])
    for var in hourly:
        if var != "time":
            df_long[var] = hourly[var]

    df_long["year"] = df_long["time"].dt.year
    df_long["date"] = df_long["time"].dt.date
    df_long["hour"] = df_long["time"].dt.time

    for key in data:
        if key not in ["hourly", "hourly_units"]:
            df_long[key] = data[key]

    all_dfs.append(df_long)

final_df = pd.concat(all_dfs, ignore_index=True)

# eliminamos los registros que no traen datos
final_df = final_df.dropna(subset=variables)

# Eliminamos la tabla meteo si ya existe antes de guardar
if os.path.exists(f"{dir_parquet}/meteo"):
    shutil.rmtree(f"{dir_parquet}/meteo")

# guardamos el parquet
final_df.to_parquet(
    f"{dir_parquet}/meteo",
    compression="snappy",
    partition_cols=["year"],
    index=False
)

print(f"Guardado en: {dir_parquet}")

Guardado en: ../data_parquet


In [7]:
final_df

Unnamed: 0,time,temperature_2m,precipitation,rain,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_100m,...,date,hour,latitude,longitude,generationtime_ms,utc_offset_seconds,timezone,timezone_abbreviation,elevation,city
0,2020-01-01 00:00:00,0.1,0.0,0.0,1,1,0,0,9.4,19.5,...,2020-01-01,00:00:00,42.495605,-2.514679,134.548068,3600,Europe/Madrid,GMT+1,386.0,logrono
1,2020-01-01 01:00:00,0.1,0.0,0.0,1,1,0,2,9.4,18.4,...,2020-01-01,01:00:00,42.495605,-2.514679,134.548068,3600,Europe/Madrid,GMT+1,386.0,logrono
2,2020-01-01 02:00:00,0.2,0.0,0.0,0,0,0,0,9.0,19.1,...,2020-01-01,02:00:00,42.495605,-2.514679,134.548068,3600,Europe/Madrid,GMT+1,386.0,logrono
3,2020-01-01 03:00:00,0.3,0.0,0.0,12,0,0,11,9.8,19.6,...,2020-01-01,03:00:00,42.495605,-2.514679,134.548068,3600,Europe/Madrid,GMT+1,386.0,logrono
4,2020-01-01 04:00:00,0.3,0.0,0.0,0,0,0,0,10.1,19.1,...,2020-01-01,04:00:00,42.495605,-2.514679,134.548068,3600,Europe/Madrid,GMT+1,386.0,logrono
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2341579,2025-10-21 19:00:00,21.7,0.0,0.0,17,0,9,9,2.6,9.0,...,2025-10-21,19:00:00,41.652020,-0.910431,57.160020,3600,Europe/Madrid,GMT+1,228.0,zaragoza
2341580,2025-10-21 20:00:00,21.5,0.0,0.0,18,0,0,18,7.2,16.9,...,2025-10-21,20:00:00,41.652020,-0.910431,57.160020,3600,Europe/Madrid,GMT+1,228.0,zaragoza
2341581,2025-10-21 21:00:00,21.3,0.0,0.0,13,0,5,8,10.4,22.4,...,2025-10-21,21:00:00,41.652020,-0.910431,57.160020,3600,Europe/Madrid,GMT+1,228.0,zaragoza
2341582,2025-10-21 22:00:00,21.4,0.0,0.0,14,2,8,4,13.5,26.4,...,2025-10-21,22:00:00,41.652020,-0.910431,57.160020,3600,Europe/Madrid,GMT+1,228.0,zaragoza


In [8]:
df = pd.read_parquet(f"{dir_parquet}/meteo")
df.info(), df.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2341584 entries, 0 to 2341583
Data columns (total 24 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   time                   datetime64[ns]
 1   temperature_2m         float64       
 2   precipitation          float64       
 3   rain                   float64       
 4   cloud_cover            int64         
 5   cloud_cover_low        int64         
 6   cloud_cover_mid        int64         
 7   cloud_cover_high       int64         
 8   wind_speed_10m         float64       
 9   wind_speed_100m        float64       
 10  wind_direction_10m     int64         
 11  wind_direction_100m    int64         
 12  wind_gusts_10m         float64       
 13  date                   object        
 14  hour                   object        
 15  latitude               float64       
 16  longitude              float64       
 17  generationtime_ms      float64       
 18  utc_offset_seconds    

(None,
                  time  temperature_2m  precipitation  rain  cloud_cover  \
 0 2020-01-01 00:00:00             0.1            0.0   0.0            1   
 1 2020-01-01 01:00:00             0.1            0.0   0.0            1   
 2 2020-01-01 02:00:00             0.2            0.0   0.0            0   
 3 2020-01-01 03:00:00             0.3            0.0   0.0           12   
 4 2020-01-01 04:00:00             0.3            0.0   0.0            0   
 
    cloud_cover_low  cloud_cover_mid  cloud_cover_high  wind_speed_10m  \
 0                1                0                 0             9.4   
 1                1                0                 2             9.4   
 2                0                0                 0             9.0   
 3                0                0                11             9.8   
 4                0                0                 0            10.1   
 
    wind_speed_100m  ...      hour   latitude  longitude generationtime_ms  \
 0         

In [9]:
df.describe()

Unnamed: 0,time,temperature_2m,precipitation,rain,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,latitude,longitude,generationtime_ms,utc_offset_seconds,elevation
count,2341584,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0,2341584.0
mean,2022-11-26 11:30:00,15.94823,0.0748996,0.07356452,49.52432,20.29699,18.69922,33.01903,10.28624,16.49493,186.6278,184.7924,23.21872,40.54863,-3.507201,398.8856,3600.0,390.0
min,2020-01-01 00:00:00,-15.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,36.52021,-8.722473,16.96754,3600.0,0.0
25%,2021-06-14 05:45:00,10.2,0.0,0.0,2.0,0.0,0.0,0.0,5.4,8.4,90.0,86.0,13.0,38.98067,-5.640991,330.2583,3600.0,36.0
50%,2022-11-26 11:30:00,15.4,0.0,0.0,46.0,0.0,0.0,3.0,8.8,14.9,201.0,201.0,20.5,40.94903,-3.692749,381.0669,3600.0,313.0
75%,2024-05-09 17:15:00,21.4,0.0,0.0,98.0,25.0,24.0,81.0,13.9,22.7,273.0,271.0,31.0,42.35501,-1.849304,421.0395,3600.0,697.0
max,2025-10-21 23:00:00,46.3,23.7,23.7,100.0,100.0,100.0,100.0,69.6,95.3,360.0,360.0,141.8,43.47979,2.882096,1957.341,3600.0,1133.0
std,,8.084232,0.4131028,0.4096692,42.13783,33.89323,31.33127,41.2424,6.552137,10.32048,103.9985,103.0817,13.07644,2.06962,2.721342,313.7365,0.0,356.2283
