In [1]:
# En este notebook vamos a limpiar los datos de las tablas parquet generadas en ../data_parquet/
# Comprobaremos que no haya horas con datos faltantes o duplicados, y en caso de que los haya, los corregiremos mediante interpolación temporal.
# Finalmente, guardaremos las tablas corregidas en ../data_parquet_clean/

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pathlib import Path
import shutil


In [2]:
# Load the raw price table and show rows 7170-7178 by position; in the recorded
# output this window covers 25/10/2020, which carries an extra "24-25" hour slot.
precios_raw_path = "../data_parquet/precios_luz"
df = pd.read_parquet(precios_raw_path)
df.iloc[7170:7179]

Unnamed: 0,Dia,Hora,GEN,NOC,VHC,COFGEN,COFNOC,COFVHC,PMHGEN,PMHNOC,...,TEUCYM,CCVPCB,CCVCYM,EDSRPCB,EDSRCYM,EDCGASPCB,EDCGASCYM,TAHPCB,TAHCYM,year
7170,25/10/2020,19-20,79.62,98.05,98.05,9.4e-05,6.5e-05,6.1e-05,25.54,25.74,...,,,,,,,,,,2020
7171,25/10/2020,20-21,92.93,111.46,111.46,0.000105,8.3e-05,7.9e-05,39.88,40.19,...,,,,,,,,,,2020
7172,25/10/2020,21-22,99.29,117.82,117.82,0.000123,0.000123,0.000149,45.17,45.48,...,,,,,,,,,,2020
7173,25/10/2020,22-23,90.62,109.13,109.13,0.000128,0.000129,0.000157,36.07,36.35,...,,,,,,,,,,2020
7174,25/10/2020,23-24,91.17,43.22,109.72,0.000115,0.000122,0.000151,37.85,36.47,...,,,,,,,,,,2020
7175,25/10/2020,24-25,81.18,33.48,35.88,9.5e-05,0.000116,0.000143,27.21,26.16,...,,,,,,,,,,2020
7176,26/10/2020,00-01,85.63,37.68,40.34,7.8e-05,0.000105,0.000142,29.27,28.1,...,,,,,,,,,,2020
7177,26/10/2020,01-02,81.22,33.36,30.99,6.1e-05,9e-05,0.000146,24.87,23.84,...,,,,,,,,,,2020
7178,26/10/2020,02-03,80.77,32.93,30.57,5.5e-05,8.4e-05,0.000143,24.38,23.37,...,,,,,,,,,,2020


In [3]:
# Build the cleaned price table: turn the "HH-HH" interval label stored in 'Hora'
# into a real timestamp and rewrite the result under ../data_parquet_clean/.
df = pd.read_parquet("../data_parquet/precios_luz")

# First hour of each interval, e.g. "00-01" -> 0. Kept in its own Series so that
# 'Hora' is only overwritten once, with its final (timestamp) meaning.
hora_inicio = df['Hora'].str.split('-').str[0].astype(int)

# Rows to discard:
#  * hour 24 (the "24-25" slot): Dia + 24h would land on 00:00 of the next day and
#    collide with that day's own "00-01" row.
#  * the whole day 21/09/2025.
#    NOTE(review): the reason for excluding this day is not recorded in the notebook.
DIA_EXCLUIDO = '21/09/2025'
filas_validas = (hora_inicio != 24) & (df['Dia'] != DIA_EXCLUIDO)

# Filter once and take an explicit copy so the column assignment below operates on
# an independent frame rather than on a selection of the raw table.
df = df.loc[filas_validas].copy()

# Timestamp = calendar day + start hour (the two Series align on the index).
df['Hora'] = (
    pd.to_datetime(df['Dia'], format='%d/%m/%Y')
    + pd.to_timedelta(hora_inicio.loc[filas_validas], unit='h')
)

# Replace any previous export so stale partitions cannot survive, and write to the
# very same path object that was cleared.
output_dir = Path("../data_parquet_clean/precios_luz")
if output_dir.exists():
    shutil.rmtree(output_dir)
df.to_parquet(output_dir, partition_cols=['year'], index=False)

In [4]:
# Read each table and check its 'Hora' column for duplicated and missing hours.


def comprobar_duplicados_faltantes(list_tabla, path="../data_parquet/"):
    """Report duplicated and missing hourly timestamps for each table.

    Each table is read with ``pd.read_parquet(f"{path}{tabla}")``, so ``path``
    is concatenated as-is and must already end with a path separator.

    Parameters
    ----------
    list_tabla : iterable of str
        Table (directory) names to check.
    path : str, default "../data_parquet/"
        Prefix prepended to every table name.

    Returns
    -------
    dict
        Maps table name -> index of the hourly timestamps between the table's
        minimum and maximum 'Hora' that are absent from the table. Tables
        without gaps are not included. Duplicates are only printed, not returned.
    """
    faltantes_por_hora = {}
    for tabla in list_tabla:
        df = pd.read_parquet(f"{path}{tabla}")

        # Duplicates: every row whose 'Hora' value appears more than once.
        duplicados = df[df.duplicated(subset=['Hora'], keep=False)]
        if not duplicados.empty:
            print(f"Duplicados encontrados en {tabla}:")
            print(duplicados)
            print(duplicados.count())

        # With no valid timestamp (empty table or all-NaT column) there is no
        # range to build; pd.date_range would raise on NaT bounds and abort the
        # check for the remaining tables.
        hora_min = df['Hora'].min()
        hora_max = df['Hora'].max()
        if pd.isna(hora_min) or pd.isna(hora_max):
            print(f"No hay horas válidas en {tabla}; se omite la comprobación de faltantes.")
            continue

        # Missing hours: expected hourly grid minus the timestamps present.
        rango_completo = pd.date_range(start=hora_min, end=hora_max, freq='1h')
        faltantes = rango_completo.difference(df['Hora'])
        if not faltantes.empty:
            faltantes_por_hora[tabla] = faltantes
            print(f"Faltantes encontrados en {tabla}:")
            print(faltantes)

    return faltantes_por_hora

# Validate the cleaned price table; the returned mapping of missing hours is the
# last expression, so it is displayed as the cell output.
list_tabla = ["precios_luz"]

comprobar_duplicados_faltantes(
    list_tabla,
    path="../data_parquet_clean/",
)


Faltantes encontrados en precios_luz:
DatetimeIndex(['2020-03-29 02:00:00', '2021-03-28 02:00:00',
               '2022-03-27 02:00:00', '2023-03-26 02:00:00',
               '2024-03-31 02:00:00', '2025-03-30 02:00:00',
               '2025-09-21 00:00:00', '2025-09-21 01:00:00',
               '2025-09-21 02:00:00', '2025-09-21 03:00:00',
               '2025-09-21 04:00:00', '2025-09-21 05:00:00',
               '2025-09-21 06:00:00', '2025-09-21 07:00:00',
               '2025-09-21 08:00:00', '2025-09-21 09:00:00',
               '2025-09-21 10:00:00', '2025-09-21 11:00:00',
               '2025-09-21 12:00:00', '2025-09-21 13:00:00',
               '2025-09-21 14:00:00', '2025-09-21 15:00:00',
               '2025-09-21 16:00:00', '2025-09-21 17:00:00',
               '2025-09-21 18:00:00', '2025-09-21 19:00:00',
               '2025-09-21 20:00:00', '2025-09-21 21:00:00',
               '2025-09-21 22:00:00', '2025-09-21 23:00:00'],
              dtype='datetime64[ns]', freq=None)

{'precios_luz': DatetimeIndex(['2020-03-29 02:00:00', '2021-03-28 02:00:00',
                '2022-03-27 02:00:00', '2023-03-26 02:00:00',
                '2024-03-31 02:00:00', '2025-03-30 02:00:00',
                '2025-09-21 00:00:00', '2025-09-21 01:00:00',
                '2025-09-21 02:00:00', '2025-09-21 03:00:00',
                '2025-09-21 04:00:00', '2025-09-21 05:00:00',
                '2025-09-21 06:00:00', '2025-09-21 07:00:00',
                '2025-09-21 08:00:00', '2025-09-21 09:00:00',
                '2025-09-21 10:00:00', '2025-09-21 11:00:00',
                '2025-09-21 12:00:00', '2025-09-21 13:00:00',
                '2025-09-21 14:00:00', '2025-09-21 15:00:00',
                '2025-09-21 16:00:00', '2025-09-21 17:00:00',
                '2025-09-21 18:00:00', '2025-09-21 19:00:00',
                '2025-09-21 20:00:00', '2025-09-21 21:00:00',
                '2025-09-21 22:00:00', '2025-09-21 23:00:00'],
               dtype='datetime64[ns]', freq=None)}

In [5]:
# Fill the missing hours: create a row for every absent hourly timestamp and fill
# the numeric columns by time-based interpolation between neighbouring rows.
list_tabla = ['precios_luz']

for tabla in list_tabla:
    df = pd.read_parquet(f"../data_parquet_clean/{tabla}")

    # When 'year' comes back as a categorical, cast it to a plain integer column.
    if 'year' in df.columns and df['year'].dtype == 'category':
        df['year'] = df['year'].astype('int32')

    # Sort by timestamp and reindex onto a gap-free hourly grid; hours that were
    # absent become rows filled with NaN.
    df = df.set_index('Hora').sort_index().asfreq('1h')

    # Interpolate the numeric measurements. 'year' is left out on purpose: it is a
    # calendar label used as the partition key, and the NaN rows inserted above
    # have already turned it into a float column, so interpolating it would write
    # float (potentially fractional) partition values.
    numeric_columns = [
        col
        for col in df.select_dtypes(include=['float64', 'int32', 'int64']).columns
        if col != 'year'
    ]
    df[numeric_columns] = df[numeric_columns].interpolate(method='time')

    # Keep the existing 'year' values, take the year of the timestamp for the
    # newly created rows, and restore the integer dtype.
    if 'year' in df.columns:
        year_from_index = pd.Series(df.index.year, index=df.index)
        df['year'] = df['year'].fillna(year_from_index).astype('int32')

    # 'Dia' holds the date as dd/mm/yyyy; rebuild it from the timestamp so the
    # newly created rows get a value too.
    df['Dia'] = df.index.strftime('%d/%m/%Y')

    df = df.reset_index()

    # Write each table back to its own directory (previously the price-table path
    # was hard-coded here, so any other table in list_tabla would overwrite it).
    output_dir = Path("../data_parquet_clean") / tabla
    if output_dir.exists():
        shutil.rmtree(output_dir)
    df.to_parquet(output_dir, partition_cols=['year'], index=False)

# Verify that no missing hours remain after the fill.
faltantes_por_hora = comprobar_duplicados_faltantes(list_tabla, path="../data_parquet_clean/")
faltantes_por_hora



{}

In [6]:
# Summarise the columns, non-null counts and dtypes of `df`, i.e. the frame left
# over from the last iteration of the fill loop in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50904 entries, 0 to 50903
Data columns (total 60 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Hora       50904 non-null  datetime64[ns]
 1   Dia        50904 non-null  object        
 2   GEN        50904 non-null  float64       
 3   NOC        50904 non-null  float64       
 4   VHC        50904 non-null  float64       
 5   COFGEN     50904 non-null  float64       
 6   COFNOC     50904 non-null  float64       
 7   COFVHC     50904 non-null  float64       
 8   PMHGEN     50904 non-null  float64       
 9   PMHNOC     50904 non-null  float64       
 10  PMHVHC     50904 non-null  float64       
 11  SAHGEN     50904 non-null  float64       
 12  SAHNOC     50904 non-null  float64       
 13  SAHVHC     50904 non-null  float64       
 14  FOMGEN     50904 non-null  float64       
 15  FOMNOC     50904 non-null  float64       
 16  FOMVHC     50904 non-null  float64      