In [1]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
import geobr
import geopandas as gpd
import matplotlib as mpl

In [2]:
# Input HDF5 file; every h5py.File / pd.read_hdf call on `file_path` below opens this path.
file_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'

In [3]:
# Open the HDF5 file read-only, collect its top-level key names, and print them.
with h5py.File(file_path, mode='r') as h5_file:
    keys = [*h5_file.keys()]
print(keys)

['table_data', 'table_data_filtered', 'table_info', 'table_p_availability', 'table_preclassif', 'table_q1_gaps', 'table_q2_week', 'table_q3_outliers', 'table_qc_info']


In [4]:
# Load the table stored under 'table_info' and display it.
df_info = pd.read_hdf(file_path, 'table_info')
df_info

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
18981,SELVIRIA | S717,S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278
18980,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
18979,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
18978,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889
18977,NOVA ANDRADINA | S713,S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833
...,...,...,...,...,...,...,...,...,...
4,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
3,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
2,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
1,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [5]:
# Distinct gauge codes found in df_info; print how many there are, followed by the codes.
gauge_codes = df_info['gauge_code']
station_list = gauge_codes.unique()
print(len(station_list), station_list)

18370 ['S717' 'S716' 'S715' ... '00047003' '00047002' '00047000']


In [6]:
# Load 'table_data_filtered' chunk by chunk, keeping only the rows whose
# gauge_code is in station_list.
#
# The filtered chunks are gathered in a list and concatenated ONCE at the end.
# Concatenating onto a growing DataFrame inside the loop copies every row that
# was already loaded again on each iteration.
filtered_chunks = []

count = 0  # number of chunks read; stays 0 if the table yields no chunks
chunk_reader = pd.read_hdf(
    file_path,
    key='table_data_filtered',
    encoding='utf-8',
    chunksize=12400000,
    iterator=True,
)
for count, chunk in enumerate(chunk_reader, start=1):
    # Boolean indexing already returns a new frame, so no explicit copy is needed.
    chunk = chunk[chunk['gauge_code'].isin(station_list)]
    filtered_chunks.append(chunk)
    print(f"Processed chunk {count}")

# ignore_index=True renumbers the rows 0..n-1 across all chunks.
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
df_data = pd.concat(filtered_chunks, ignore_index=True) if filtered_chunks else pd.DataFrame()
del filtered_chunks  # the per-chunk frames are no longer needed once combined

# Calendar year of each row, used as a merge key together with gauge_code further down.
df_data['year'] = pd.to_datetime(df_data['datetime']).dt.year
df_data

Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10


Unnamed: 0,gauge_code,datetime,rain_mm,year
0,00047000,1961-01-01,0.0,1961
1,00047000,1961-01-02,0.0,1961
2,00047000,1961-01-03,0.0,1961
3,00047000,1961-01-04,0.0,1961
4,00047000,1961-01-05,0.0,1961
...,...,...,...,...
123594482,S717,2021-12-27,0.0,2021
123594483,S717,2021-12-28,0.0,2021
123594484,S717,2021-12-29,0.0,2021
123594485,S717,2021-12-30,0.0,2021


In [7]:
# `chunk` still refers to the last chunk produced by the loading loop; remove that name.
del chunk  # Delete the chunk variable to free up memory

In [8]:
# Number of distinct gauge_code values in the loaded data.
df_data['gauge_code'].nunique()

18370

In [9]:
# Keep only rows whose gauge_code is in station_list.
# (The chunked loading loop applies this same filter to every chunk.)
in_station_list = df_data['gauge_code'].isin(station_list)
df_data = df_data.loc[in_station_list]

In [10]:
# Load the QC table stored under 'table_qc_info'.
df_qc_info = pd.read_hdf(file_path, 'table_qc_info')

# Threshold for the per-criterion checks, which are currently disabled:
#   q1_gaps < lower_limit | q2_week < lower_limit | q3_outliers < lower_limit
lower_limit = 60

# A row is relabelled 'LQ' when ANY of the conditions below holds;
# otherwise it keeps the final_classif value it already had.
low_quality_index = df_qc_info['quality_index'] < 80
low_availability = df_qc_info['p_availability'] < 90
preclassified_lq = df_qc_info['preclassif'] == 'LQ'
already_lq = df_qc_info['final_classif'] == 'LQ'
relabel_as_lq = low_quality_index | low_availability | preclassified_lq | already_lq

df_qc_info['final_classif'] = np.where(relabel_as_lq, 'LQ', df_qc_info['final_classif'])
df_qc_info

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long,year,...,consecutive_dry_days,preclassif,p_availability,q1_gaps,q2_week,q3_outliers,quality_index,quality_label,region,final_classif
0,SALINÓPOLIS,00047000,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961,...,275.0,LQ,100.00000,100.0,71.803161,98.082192,92.471338,1 - Excellent Quality,North,LQ
1,SALINÓPOLIS,00047000,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1962,...,153.0,LQ,100.00000,100.0,69.402386,99.452055,92.213610,1 - Excellent Quality,North,LQ
2,SALINÓPOLIS,00047000,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1963,...,115.0,,100.00000,100.0,66.650234,99.726027,91.594065,1 - Excellent Quality,North,HQ
3,SALINÓPOLIS,00047000,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1964,...,145.0,,100.00000,100.0,87.798731,99.726776,96.881377,1 - Excellent Quality,North,HQ
4,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600,1977,...,6.0,LQ,6.30137,0.0,6.458565,91.304348,26.016071,5 - Very Low Quality,North,LQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346024,NOVA ANDRADINA | S713,S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833,2021,...,150.0,LQ,100.00000,100.0,62.583426,99.452055,90.508870,1 - Excellent Quality,Central-West,LQ
346025,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889,2021,...,75.0,,100.00000,100.0,65.175174,98.082192,90.814342,1 - Excellent Quality,Central-West,HQ
346026,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028,2021,...,76.0,,100.00000,100.0,78.359308,97.808219,94.041882,1 - Excellent Quality,Central-West,HQ
346027,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375,2021,...,68.0,,100.00000,100.0,77.324050,98.904110,94.057040,1 - Excellent Quality,Central-West,HQ


In [11]:
# Attach the preclassif / final_classif label of each (gauge_code, year) pair to the
# data rows. Inner join: rows with no matching QC entry are dropped.
loss_labels = df_qc_info[['gauge_code', 'year', 'preclassif', 'final_classif']]
df_data_loss = df_data.merge(loss_labels, how='inner', on=['gauge_code', 'year'])
df_data_loss

Unnamed: 0,gauge_code,datetime,rain_mm,year,preclassif,final_classif
0,00047000,1961-01-01,0.0,1961,LQ,LQ
1,00047000,1961-01-02,0.0,1961,LQ,LQ
2,00047000,1961-01-03,0.0,1961,LQ,LQ
3,00047000,1961-01-04,0.0,1961,LQ,LQ
4,00047000,1961-01-05,0.0,1961,LQ,LQ
...,...,...,...,...,...,...
123594482,S717,2021-12-27,0.0,2021,LQ,LQ
123594483,S717,2021-12-28,0.0,2021,LQ,LQ
123594484,S717,2021-12-29,0.0,2021,LQ,LQ
123594485,S717,2021-12-30,0.0,2021,LQ,LQ


In [12]:
# Drop the rows labelled 'LQ' in preclassif, then print the remaining row count
# and the number of distinct gauge codes left.
not_preclassif_lq = df_data_loss['preclassif'] != 'LQ'
df_data_loss = df_data_loss[not_preclassif_lq]
codes_after_preclassif = df_data_loss['gauge_code'].unique()
print(len(df_data_loss), len(codes_after_preclassif))

107974358 17627


In [13]:
# Additionally drop the rows labelled 'LQ' in final_classif and print the same two counts.
# NOTE(review): "110574472 17283" was left here as a bare comment, presumably the
# counts from an earlier run; they differ from the current output. TODO confirm.
not_final_lq = df_data_loss['final_classif'] != 'LQ'
df_data_loss = df_data_loss[not_final_lq]
codes_after_final = df_data_loss['gauge_code'].unique()
print(df_data_loss.shape[0], len(codes_after_final))

106295715 17130


In [14]:
# df_data_loss is not referenced again in the cells that follow; remove the name.
del df_data_loss

In [15]:
# Display df_data; the df_data_loss cells above only rebind df_data_loss, not df_data.
df_data 

Unnamed: 0,gauge_code,datetime,rain_mm,year
0,00047000,1961-01-01,0.0,1961
1,00047000,1961-01-02,0.0,1961
2,00047000,1961-01-03,0.0,1961
3,00047000,1961-01-04,0.0,1961
4,00047000,1961-01-05,0.0,1961
...,...,...,...,...
123594482,S717,2021-12-27,0.0,2021
123594483,S717,2021-12-28,0.0,2021
123594484,S717,2021-12-29,0.0,2021
123594485,S717,2021-12-30,0.0,2021


In [16]:
# Keep only the data rows whose (gauge_code, year) pair is classified 'HQ',
# and retain just the three data columns. The row labels of the merged frame
# are preserved (no reset_index).
final_labels = df_qc_info[['gauge_code', 'year', 'final_classif']]
df_data_qc = (
    df_data
    .merge(final_labels, how='inner', on=['gauge_code', 'year'])
    .loc[
        lambda merged: merged['final_classif'] == 'HQ',
        ['gauge_code', 'datetime', 'rain_mm'],
    ]
)
df_data_qc

Unnamed: 0,gauge_code,datetime,rain_mm
730,00047000,1963-01-01,0.0
731,00047000,1963-01-02,0.0
732,00047000,1963-01-03,0.0
733,00047000,1963-01-04,0.0
734,00047000,1963-01-05,0.0
...,...,...,...
123594117,S716,2021-12-27,0.0
123594118,S716,2021-12-28,0.0
123594119,S716,2021-12-29,0.0
123594120,S716,2021-12-30,4.4


In [17]:
# Rows of df_info whose gauge_code still appears in df_data_qc, renumbered from 0.
kept_codes = df_data_qc['gauge_code'].unique()
df_info_qc = df_info[df_info['gauge_code'].isin(kept_codes)].reset_index(drop=True)
df_info_qc

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
0,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
1,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
2,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889
3,NOVA ALVORADA DO SUL | S712,S712,NOVA ALVORADA DO SUL,MATO GROSSO DO SUL,INMET,INMET,MS,-21.450972,-54.341972
4,LAGUNA CARAPA | S711,S711,LAGUNA CARAPA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.575389,-55.160333
...,...,...,...,...,...,...,...,...,...
17125,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
17126,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
17127,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
17128,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [21]:
# Remove the names of the three source frames; the cells below use only
# df_info_qc and df_data_qc.
# NOTE(review): this cell's execution count (21) is higher than that of the cells
# that follow it, so it was last run after them, not in document order.
del df_info, df_data, df_qc_info

In [18]:
# Write df_info_qc under 'table_info' in the QC output file.
# mode='w' creates the file anew, replacing any existing file at this path.
df_info_qc.to_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5',
    key='table_info',
    mode='w',
    format='table',
    encoding='utf-8',
    index=False,
    complevel=9,
)

In [19]:
# Write df_data_qc to the 'table_data' node of the QC output file,
# chunk_size rows per append. The file must already exist (mode='r+').
chunk_size = 12000000  # Define the chunk size
with pd.HDFStore('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5', mode='r+', complevel=9) as store:
    # Appending is not idempotent: re-running this cell on its own would add the
    # same rows to the existing node a second time. Drop any 'table_data' node
    # left by a previous run so the node always ends up holding df_data_qc once.
    if 'table_data' in store:
        store.remove('table_data')

    for i in range(0, len(df_data_qc), chunk_size):
        # Positional slice; the final slice may be shorter than chunk_size.
        chunk = df_data_qc.iloc[i:i + chunk_size]
        chunk.to_hdf(
            store,
            key='table_data',
            format='table',
            append=True,
            index=False,
            encoding='utf-8',
            mode='a',
            complevel=9,
            # Fixed string width for gauge_code so every appended chunk fits the column.
            min_itemsize={'gauge_code': 15},
        )
        print(f"Appended chunk {i // chunk_size + 1}")

Appended chunk 1
Appended chunk 2
Appended chunk 3
Appended chunk 4
Appended chunk 5
Appended chunk 6
Appended chunk 7
Appended chunk 8
Appended chunk 9


In [20]:
# `chunk` still refers to the last slice handled by the append loop; remove that name.
del chunk

In [22]:
# Reload 'table_data' from the QC output file, replacing the in-memory df_data_qc,
# and display it.
df_data_qc = pd.read_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5',
    'table_data',
)
df_data_qc

Unnamed: 0,gauge_code,datetime,rain_mm
730,00047000,1963-01-01,0.0
731,00047000,1963-01-02,0.0
732,00047000,1963-01-03,0.0
733,00047000,1963-01-04,0.0
734,00047000,1963-01-05,0.0
...,...,...,...
123594117,S716,2021-12-27,0.0
123594118,S716,2021-12-28,0.0
123594119,S716,2021-12-29,0.0
123594120,S716,2021-12-30,4.4
