In [1]:
# En este notebook vamos a limpiar los datos de las tablas parquet generadas en ../data_parquet/
# Comprobaremos que no haya horas con datos faltantes o duplicados, y en caso de que los haya, los corregiremos mediante interpolación temporal.
# Finalmente, guardaremos las tablas corregidas en ../data_parquet_clean/

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import shutil


In [2]:
# Load the raw "meteo" parquet dataset and print a summary of its columns and dtypes.
df = pd.read_parquet("../data_parquet/meteo")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2338272 entries, 0 to 2338271
Data columns (total 24 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   time                   datetime64[ns]
 1   temperature_2m         float64       
 2   precipitation          float64       
 3   rain                   float64       
 4   cloud_cover            int64         
 5   cloud_cover_low        int64         
 6   cloud_cover_mid        int64         
 7   cloud_cover_high       int64         
 8   wind_speed_10m         float64       
 9   wind_speed_100m        float64       
 10  wind_direction_10m     int64         
 11  wind_direction_100m    int64         
 12  wind_gusts_10m         float64       
 13  date                   object        
 14  hour                   object        
 15  latitude               float64       
 16  longitude              float64       
 17  generationtime_ms      float64       
 18  utc_offset_seconds    

In [3]:
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,time,temperature_2m,precipitation,rain,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,wind_speed_10m,wind_speed_100m,...,hour,latitude,longitude,generationtime_ms,utc_offset_seconds,timezone,timezone_abbreviation,elevation,city,year
0,2020-01-01 00:00:00,0.2,0.0,0.0,1,1,0,0,10.4,20.5,...,00:00:00,42.495605,-2.514679,222.465634,7200,Europe/Madrid,GMT+2,386.0,logrono,2020
1,2020-01-01 01:00:00,0.1,0.0,0.0,1,1,0,0,9.4,19.5,...,01:00:00,42.495605,-2.514679,222.465634,7200,Europe/Madrid,GMT+2,386.0,logrono,2020
2,2020-01-01 02:00:00,0.1,0.0,0.0,1,1,0,2,9.4,18.4,...,02:00:00,42.495605,-2.514679,222.465634,7200,Europe/Madrid,GMT+2,386.0,logrono,2020
3,2020-01-01 03:00:00,0.2,0.0,0.0,0,0,0,0,9.0,19.1,...,03:00:00,42.495605,-2.514679,222.465634,7200,Europe/Madrid,GMT+2,386.0,logrono,2020
4,2020-01-01 04:00:00,0.3,0.0,0.0,12,0,0,11,9.8,19.6,...,04:00:00,42.495605,-2.514679,222.465634,7200,Europe/Madrid,GMT+2,386.0,logrono,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2338267,2025-10-18 19:00:00,22.1,0.0,0.0,0,0,0,0,5.2,11.7,...,19:00:00,41.652020,-0.910431,129.336119,7200,Europe/Madrid,GMT+2,228.0,zaragoza,2025
2338268,2025-10-18 20:00:00,20.3,0.0,0.0,0,0,0,0,5.5,12.7,...,20:00:00,41.652020,-0.910431,129.336119,7200,Europe/Madrid,GMT+2,228.0,zaragoza,2025
2338269,2025-10-18 21:00:00,19.4,0.0,0.0,0,0,0,0,6.5,15.3,...,21:00:00,41.652020,-0.910431,129.336119,7200,Europe/Madrid,GMT+2,228.0,zaragoza,2025
2338270,2025-10-18 22:00:00,18.0,0.0,0.0,0,0,0,0,4.5,13.7,...,22:00:00,41.652020,-0.910431,129.336119,7200,Europe/Madrid,GMT+2,228.0,zaragoza,2025


In [4]:
# Count missing values per column.
df.isna().sum()

time                     0
temperature_2m           0
precipitation            0
rain                     0
cloud_cover              0
cloud_cover_low          0
cloud_cover_mid          0
cloud_cover_high         0
wind_speed_10m           0
wind_speed_100m          0
wind_direction_10m       0
wind_direction_100m      0
wind_gusts_10m           0
date                     0
hour                     0
latitude                 0
longitude                0
generationtime_ms        0
utc_offset_seconds       0
timezone                 0
timezone_abbreviation    0
elevation                0
city                     0
year                     0
dtype: int64

In [5]:
# lectura datos y función de comprobación de duplicados y faltantes


def comprobar_duplicados_faltantes(list_tabla, path="../data_parquet/"):
    """Report duplicated and missing hourly timestamps for each table.

    For every table name in ``list_tabla`` the parquet dataset located at
    ``path`` + table name is read and two checks are printed:

    * rows whose ``(time, city)`` pair appears more than once;
    * for each city, the hourly timestamps between that city's first and
      last ``time`` that are absent from the data.

    Parameters
    ----------
    list_tabla : iterable of str
        Names of the tables (sub-directories or files of ``path``) to check.
    path : str or path-like, default "../data_parquet/"
        Directory containing the tables. A trailing separator is optional.

    Returns
    -------
    list of pandas.DatetimeIndex
        One entry per (table, city) combination that has gaps, holding the
        missing timestamps. Empty when no gaps were found in any table.
    """
    # Join directory and table name with exactly one separator so `path`
    # works with or without a trailing slash (an empty path stays relative).
    base = str(path)
    separador = "" if not base or base.endswith(("/", "\\")) else "/"

    faltantes_por_hora = []
    for tabla in list_tabla:
        df = pd.read_parquet(f"{base}{separador}{tabla}")

        # keep=False marks every row of a repeated (time, city) pair,
        # not only the second and later occurrences.
        duplicados = df[df.duplicated(subset=['time', 'city'], keep=False)]
        if not duplicados.empty:
            print(f"Duplicados encontrados en {tabla}:")
            print(duplicados)
            print(duplicados.count())
        else:
            print(f"No se encontraron duplicados en {tabla}.")

        # Track gaps per table: testing the accumulated result list would
        # suppress the "no gaps" message for every table that follows one
        # with gaps.
        hay_faltantes_en_tabla = False
        for ciudad in df['city'].unique():
            df_ciudad = df[df['city'] == ciudad]
            # Expected hourly grid between this city's first and last record.
            rango_completo = pd.date_range(
                start=df_ciudad['time'].min(),
                end=df_ciudad['time'].max(),
                freq='1h',
            )
            faltantes = rango_completo.difference(df_ciudad['time'])
            if not faltantes.empty:
                print(f"Faltantes encontrados en {tabla} para la ciudad {ciudad}:")
                print(faltantes)
                faltantes_por_hora.append(faltantes)
                hay_faltantes_en_tabla = True
        if not hay_faltantes_en_tabla:
            print("No se encontraron faltantes.")

    return faltantes_por_hora

list_tabla = ['meteo']

# Validate the raw source tables. The cleaned directory is only written by a
# later cell, so pointing this check at it fails on a fresh workspace (or
# silently inspects the output of a previous run).
comprobar_duplicados_faltantes(list_tabla, path="../data_parquet/")


No se encontraron duplicados en meteo.
No se encontraron faltantes.


[]

In [6]:
# Re-read the raw table so this cell does not rely on `df` left by earlier cells.
df = pd.read_parquet("../data_parquet/meteo")

# Single definition of the destination: the directory that is removed and the
# directory that is written are guaranteed to be the same one.
output_dir = Path("../data_parquet_clean/meteo")
if output_dir.exists():
    # Drop the previous export first; presumably this avoids mixing files
    # from an older run with the new partitions (verify against the writer).
    shutil.rmtree(output_dir)
df.to_parquet(output_dir, partition_cols=['year'], index=False)

In [7]:
# Print the column/dtype summary of the frame written by the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2338272 entries, 0 to 2338271
Data columns (total 24 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   time                   datetime64[ns]
 1   temperature_2m         float64       
 2   precipitation          float64       
 3   rain                   float64       
 4   cloud_cover            int64         
 5   cloud_cover_low        int64         
 6   cloud_cover_mid        int64         
 7   cloud_cover_high       int64         
 8   wind_speed_10m         float64       
 9   wind_speed_100m        float64       
 10  wind_direction_10m     int64         
 11  wind_direction_100m    int64         
 12  wind_gusts_10m         float64       
 13  date                   object        
 14  hour                   object        
 15  latitude               float64       
 16  longitude              float64       
 17  generationtime_ms      float64       
 18  utc_offset_seconds    