In [4]:
import pandas as pd
import numpy as np
import os
import glob
import h5py
import re

In [2]:
file_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5'
with h5py.File(file_path, 'r') as hdf:
    keys = list(hdf.keys())
    print(keys)

['table_data', 'table_grid', 'table_info']


In [14]:
crossvalidation_path = './1 - Organized data gauge/BRAZIL/CROSSVALIDATION'
crossvalidation_files = glob.glob(os.path.join(crossvalidation_path, "*.h5"))
cv_len = len(crossvalidation_files)
print("file count:", cv_len, "\n\nexample:\n", crossvalidation_files[0],"\n...\n...\n...\n",crossvalidation_files[-1])

file count: 23376 

example:
 ./1 - Organized data gauge/BRAZIL/CROSSVALIDATION\1961_01_01_crossvalidation.h5 
...
...
...
 ./1 - Organized data gauge/BRAZIL/CROSSVALIDATION\2024_12_31_crossvalidation.h5


In [15]:
date_ls = []
for filepath in crossvalidation_files:
    match = re.search(r'(\d{4}_\d{2}_\d{2})', filepath)
    if match:
        date_str = match.group(1)  # -> '1961_01_01'
        # print(date_str)
        date_str = date_str.replace('_', '-')
        date_ls.append(date_str)

df_date_ls = pd.DataFrame(date_ls, columns=['date'])
df_date_ls['date'] = pd.to_datetime(df_date_ls['date'], format='%Y-%m-%d')
# df_date_ls = pd.DataFrame(df_date_ls, columns=['date'])
df_date_ls

Unnamed: 0,date
0,1961-01-01
1,1961-01-02
2,1961-01-03
3,1961-01-04
4,1961-01-05
...,...
23371,2024-12-27
23372,2024-12-28
23373,2024-12-29
23374,2024-12-30


In [16]:
# Generate full date range (1961-2024)
full_date_range = pd.date_range(
    start='1961-01-01',
    end='2024-12-31',
    freq='D'  # Daily frequency
)

# Convert to DataFrame
df_full_dates = pd.DataFrame(full_date_range, columns=['date'])
df_full_dates

Unnamed: 0,date
0,1961-01-01
1,1961-01-02
2,1961-01-03
3,1961-01-04
4,1961-01-05
...,...
23371,2024-12-27
23372,2024-12-28
23373,2024-12-29
23374,2024-12-30


In [17]:
missing_dates = df_full_dates[~df_full_dates['date'].isin(df_date_ls['date'])]
missing_dates

Unnamed: 0,date


In [18]:
file_list = []
count = 1
for filename in crossvalidation_files[:]:
    df_temp = pd.read_hdf(filename)
    file_list.append(df_temp)
    print("Loading...", count, "/", cv_len)
    count = count + 1

Loading... 1 / 23376
Loading... 2 / 23376
Loading... 3 / 23376
Loading... 4 / 23376
Loading... 5 / 23376
Loading... 6 / 23376
Loading... 7 / 23376
Loading... 8 / 23376
Loading... 9 / 23376
Loading... 10 / 23376
Loading... 11 / 23376
Loading... 12 / 23376
Loading... 13 / 23376
Loading... 14 / 23376
Loading... 15 / 23376
Loading... 16 / 23376
Loading... 17 / 23376
Loading... 18 / 23376
Loading... 19 / 23376
Loading... 20 / 23376
Loading... 21 / 23376
Loading... 22 / 23376
Loading... 23 / 23376
Loading... 24 / 23376
Loading... 25 / 23376
Loading... 26 / 23376
Loading... 27 / 23376
Loading... 28 / 23376
Loading... 29 / 23376
Loading... 30 / 23376
Loading... 31 / 23376
Loading... 32 / 23376
Loading... 33 / 23376
Loading... 34 / 23376
Loading... 35 / 23376
Loading... 36 / 23376
Loading... 37 / 23376
Loading... 38 / 23376
Loading... 39 / 23376
Loading... 40 / 23376
Loading... 41 / 23376
Loading... 42 / 23376
Loading... 43 / 23376
Loading... 44 / 23376
Loading... 45 / 23376
Loading... 46 / 233

In [19]:
df_crossvalidation = pd.concat(file_list)
df_crossvalidation['interpolated_rain_mm'] = df_crossvalidation['interpolated_rain_mm'].round(1).astype(float)
df_crossvalidation['rain_mm'] = df_crossvalidation['rain_mm'].round(1).astype(float)
df_crossvalidation['gauge_code'] = df_crossvalidation['gauge_code'].astype(str)
df_crossvalidation = df_crossvalidation[['gauge_code', 'datetime',	'interpolated_rain_mm',	'rain_mm']]
df_crossvalidation

Unnamed: 0,gauge_code,datetime,interpolated_rain_mm,rain_mm
0,00835022,1961-01-01,0.5,0.0
1,02147050,1961-01-01,14.2,1.3
2,02754006,1961-01-01,64.5,46.2
3,01641000,1961-01-01,0.1,0.0
4,01344007,1961-01-01,0.0,21.0
...,...,...,...,...
4563,56776500,2024-12-31,0.0,0.0
4564,58729800,2024-12-31,1.6,0.0
4565,355030860A,2024-12-31,0.8,0.0
4566,350970003A,2024-12-31,1.6,0.6


In [20]:
df_crossvalidation.describe()

Unnamed: 0,datetime,interpolated_rain_mm,rain_mm
count,106295715,106295700.0,106295700.0
mean,1992-05-07 16:45:38.075981056,3.739809,3.737769
min,1961-01-01 00:00:00,0.0,0.0
25%,1977-08-21 00:00:00,0.0,0.0
50%,1990-11-14 00:00:00,0.0,0.0
75%,2007-04-23 00:00:00,3.2,0.7
max,2024-12-31 00:00:00,587.0,600.0
std,,8.733825,10.69663


In [21]:
df_crossvalidation.to_hdf('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5'
                          , key = 'table_crossvalidation'
                          , mode = 'r+'
                          , complevel = 9
                          , encoding="utf-8"
                          , append = False)
df_crossvalidation

Unnamed: 0,gauge_code,datetime,interpolated_rain_mm,rain_mm
0,00835022,1961-01-01,0.5,0.0
1,02147050,1961-01-01,14.2,1.3
2,02754006,1961-01-01,64.5,46.2
3,01641000,1961-01-01,0.1,0.0
4,01344007,1961-01-01,0.0,21.0
...,...,...,...,...
4563,56776500,2024-12-31,0.0,0.0
4564,58729800,2024-12-31,1.6,0.0
4565,355030860A,2024-12-31,0.8,0.0
4566,350970003A,2024-12-31,1.6,0.6
