In [1]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
import geobr
import geopandas as gpd
import matplotlib as mpl

In [2]:
file_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'

In [3]:
with h5py.File(file_path, 'r') as hdf:
    keys = list(hdf.keys())
    print(keys)

['table_data', 'table_data_filtered', 'table_info', 'table_p_availability', 'table_preclassif', 'table_q1_gaps', 'table_q2_week', 'table_q3_outliers', 'table_qc_info']


In [4]:
df_info = pd.read_hdf(file_path, key='table_info')
df_info

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
18981,SELVIRIA | S717,S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278
18980,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
18979,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
18978,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889
18977,NOVA ANDRADINA | S713,S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833
...,...,...,...,...,...,...,...,...,...
4,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
3,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
2,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
1,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [5]:
station_list = df_info['gauge_code'].unique()
print(len(station_list), station_list) # len()

18370 ['S717' 'S716' 'S715' ... '00047003' '00047002' '00047000']


In [6]:
df_data = pd.DataFrame()  # Initialize an empty DataFrame

count = 0
# Read the HDF file in chunks
for chunk in pd.read_hdf(file_path, key='table_data_filtered', encoding='utf-8', chunksize=12400000, iterator=True):
    chunk = chunk[chunk['gauge_code'].isin(station_list)]
    if df_data.empty:
        df_data = chunk.copy(deep=True)  # If this is the first chunk, assign it directly to df_data
    else:
        df_data = pd.concat([df_data, chunk], ignore_index=True)
    count = count + 1
    print(f"Processed chunk {count}")
df_data['year'] = pd.to_datetime(df_data['datetime']).dt.year
df_data

Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10


Unnamed: 0,gauge_code,datetime,rain_mm,year
0,00047000,1961-01-01,0.0,1961
1,00047000,1961-01-02,0.0,1961
2,00047000,1961-01-03,0.0,1961
3,00047000,1961-01-04,0.0,1961
4,00047000,1961-01-05,0.0,1961
...,...,...,...,...
123593340,S717,2021-12-27,0.0,2021
123593341,S717,2021-12-28,0.0,2021
123593342,S717,2021-12-29,0.0,2021
123593343,S717,2021-12-30,0.0,2021


In [7]:
del chunk  # Delete the chunk variable to free up memory

In [8]:
df_qc_info = pd.read_hdf(file_path, key='table_qc_info')

lower_limit = 60

df_qc_info['final_classif'] = np.where(
    # (df_qc_info['q1_gaps'] < lower_limit) |
    # (df_qc_info['q2_week'] < lower_limit) |
    # (df_qc_info['q3_outliers'] < lower_limit) |
    (df_qc_info['quality_index'] < 80) |
    (df_qc_info['p_availability'] < 90) |
    (df_qc_info['preclassif'] == 'LQ') |
    (df_qc_info['final_classif'] == 'LQ'),
    'LQ',
    df_qc_info['final_classif']
)
df_qc_info

Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days,preclassif,p_availability,q1_gaps,q2_week,q3_outliers,...,name_station,city,state,responsible,source,state_abbreviation,lat,long,region,final_classif
0,00047000,1961,2186.0,365.0,265.0,LQ,100.00000,100.0,71.803161,98.082192,...,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,North,LQ
1,00047000,1962,273.8,365.0,93.0,,100.00000,100.0,69.402386,99.452055,...,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,North,HQ
2,00047000,1963,686.2,365.0,71.0,,100.00000,100.0,66.650234,99.726027,...,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,North,HQ
3,00047000,1964,597.5,366.0,93.0,,100.00000,100.0,87.798731,99.726776,...,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,North,HQ
4,00047002,1977,133.4,23.0,5.0,LQ,6.30137,0.0,6.458565,91.304348,...,SALINÓPOLIS,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600,North,LQ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345863,S713,2021,76.2,365.0,137.0,LQ,100.00000,100.0,62.583426,99.726027,...,NOVA ANDRADINA | S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833,Central-West,LQ
345864,S714,2021,828.0,365.0,58.0,,100.00000,100.0,63.664531,99.178082,...,PEDRO GOMES | S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889,Central-West,HQ
345865,S715,2021,1041.8,365.0,38.0,,100.00000,100.0,76.074189,98.630137,...,RIBAS DO RIO PARDO | S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028,Central-West,HQ
345866,S716,2021,928.8,365.0,32.0,,100.00000,100.0,74.501053,98.904110,...,SANTA RITA DO PARDO | S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375,Central-West,HQ


In [9]:
df_data_loss = df_qc_info[['gauge_code', 'year', 'preclassif', 'final_classif']]
df_data_loss = pd.merge(df_data, df_data_loss[['gauge_code', 'year', 'preclassif', 'final_classif']], on=['gauge_code', 'year'], how='left')
df_data_loss

Unnamed: 0,gauge_code,datetime,rain_mm,year,preclassif,final_classif
0,00047000,1961-01-01,0.0,1961,LQ,LQ
1,00047000,1961-01-02,0.0,1961,LQ,LQ
2,00047000,1961-01-03,0.0,1961,LQ,LQ
3,00047000,1961-01-04,0.0,1961,LQ,LQ
4,00047000,1961-01-05,0.0,1961,LQ,LQ
...,...,...,...,...,...,...
123593340,S717,2021-12-27,0.0,2021,LQ,LQ
123593341,S717,2021-12-28,0.0,2021,LQ,LQ
123593342,S717,2021-12-29,0.0,2021,LQ,LQ
123593343,S717,2021-12-30,0.0,2021,LQ,LQ


In [10]:
preclassif_count = df_data_loss[df_data_loss['preclassif'] != 'LQ'].shape[0]
final_classif_count = df_data_loss[df_data_loss['final_classif'] != 'LQ'].shape[0]

print("Rows where 'preclassif' != 'LQ':", preclassif_count)
print("Rows where 'final_classif' != 'LQ':", final_classif_count)

# Rows where 'preclassif' != 'LQ': 107835098
# Rows where 'final_classif' != 'LQ': 107431189

Rows where 'preclassif' != 'LQ': 112619782
Rows where 'final_classif' != 'LQ': 101479598


In [11]:

del preclassif_count  # Delete the preclassif_count variable to free up memory
del final_classif_count  # Delete the final_classif_count variable to free up memory

In [12]:
df_data_loss = df_data_loss[df_data_loss['preclassif'] != 'LQ']
print(df_data_loss.shape[0], len(df_data_loss['gauge_code'].unique()))  # len()

112619782 17833


In [13]:
df_data_loss.pop('preclassif')

365           
366           
367           
368           
369           
            ..
123592975     
123592976     
123592977     
123592978     
123592979     
Name: preclassif, Length: 112619782, dtype: object

In [14]:
df_data_loss = df_data_loss[df_data_loss['final_classif'] != 'LQ']
print(len(df_data_loss), len(df_data_loss['gauge_code'].unique()))

# 110574472 17283

101479598 17150


In [15]:
del df_data_loss

In [None]:
df_data 

Unnamed: 0,gauge_code,datetime,rain_mm,year
0,00047000,1961-01-01,0.0,1961
1,00047000,1961-01-02,0.0,1961
2,00047000,1961-01-03,0.0,1961
3,00047000,1961-01-04,0.0,1961
4,00047000,1961-01-05,0.0,1961
...,...,...,...,...
123593340,S717,2021-12-27,0.0,2021
123593341,S717,2021-12-28,0.0,2021
123593342,S717,2021-12-29,0.0,2021
123593343,S717,2021-12-30,0.0,2021


: 

In [17]:
df_data_qc = pd.merge(df_data, df_qc_info[['gauge_code', 'year', 'final_classif']], on=['gauge_code', 'year'], how='left')
df_data_qc = df_data_qc[df_data_qc['final_classif'] == 'HQ']
df_data_qc = df_data_qc[['gauge_code', 'datetime', 'rain_mm']]
df_data_qc

: 

: 

In [None]:
df_info_qc = df_info[df_info['gauge_code'].isin(df_data_qc['gauge_code'].unique())]
df_info_qc.reset_index(drop=True, inplace=True)
df_info_qc

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
0,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
1,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
2,NOVA ALVORADA DO SUL | S712,S712,NOVA ALVORADA DO SUL,MATO GROSSO DO SUL,INMET,INMET,MS,-21.450972,-54.341972
3,LAGUNA CARAPA | S711,S711,LAGUNA CARAPA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.575389,-55.160333
4,ITAPORA | S710,S710,ITAPORA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.092833,-54.798833
...,...,...,...,...,...,...,...,...,...
16836,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
16837,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
16838,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
16839,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [None]:
del df_info
del df_data, df_qc_info

In [None]:
df_info_qc.to_hdf('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5', key='table_info', mode='w', format='table', encoding='utf-8', index=False, complevel=9)

In [None]:
chunk_size = 12000000  # Define the chunk size
with pd.HDFStore('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5', mode='r+', complevel=9) as store:
    for i in range(0, len(df_data_qc), chunk_size):
        chunk = df_data_qc.iloc[i:i + chunk_size]
        chunk.to_hdf(store
                     , key='table_data'
                     , format='table'
                     , append=True
                     , index=False
                     , encoding='utf-8'
                     , mode='a'
                     , complevel=9
                     , min_itemsize={'gauge_code': 15})
        print(f"Appended chunk {i // chunk_size + 1}")

Appended chunk 1
Appended chunk 2
Appended chunk 3
Appended chunk 4
Appended chunk 5
Appended chunk 6


KeyboardInterrupt: 

In [None]:
del chunk, df_data

NameError: name 'df_data' is not defined

In [None]:
df_data_qc = pd.read_hdf('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5', key='table_data')
df_data_qc

Unnamed: 0,gauge_code,datetime,rain_mm
365,00047000,1962-01-01,0.0
366,00047000,1962-01-02,0.0
367,00047000,1962-01-03,0.0
368,00047000,1962-01-04,0.0
369,00047000,1962-01-05,0.0
...,...,...,...
123592975,S716,2021-12-27,0.0
123592976,S716,2021-12-28,0.0
123592977,S716,2021-12-29,0.0
123592978,S716,2021-12-30,4.4
