In [5]:
import pandas as pd
import numpy as np
import h5py
import datetime
from datetime import datetime
from datetime import timedelta 
import time

# Data processing

In [2]:
# Path to the cleaned daily rainfall HDF5 store used by the cells below.
cleaned_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'

# Open the file read-only and show the names of its top-level entries.
with h5py.File(cleaned_path, 'r') as hdf:
    keys = [entry_name for entry_name in hdf.keys()]
    print(keys)

['table_data', 'table_data_filtered', 'table_info', 'table_p_availability', 'table_preclassif', 'table_q1_gaps', 'table_q2_week', 'table_q3_outliers', 'table_qc_info']


In [3]:
chunk_size = 13000000  # Rows per read; adjust the chunk size as needed
chunks = []  # Pieces read so far; concatenated once after the loop

# Read the filtered data in chunks from the HDF5 file.
# The chunks are collected in a list and concatenated a single time at the end:
# calling pd.concat inside the loop re-copies everything read so far on every
# iteration, which makes later chunks progressively slower.
with pd.HDFStore(cleaned_path, mode='r') as store:
    n_rows = store.get_storer('table_data_filtered').nrows
    # Ceiling division, so an exact multiple of chunk_size is not over-counted.
    total_chunks = -(-n_rows // chunk_size)
    print(f"Rows in table_data_filtered: {n_rows}\n")
    start_time, step_time = time.time(), time.time()
    for i, chunk in enumerate(store.select('table_data_filtered', chunksize=chunk_size)):
        chunks.append(chunk)
        del chunk  # Drop the loop name; the list keeps the only reference
        print(f"Processed chunk {i + 1} of {total_chunks} | Time taken: {(time.time() - step_time):.1f} seconds")
        step_time = time.time()

# Single concatenation; an empty table yields an empty DataFrame instead of an error.
df_data = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
del chunks  # Release the per-chunk frames now that they are merged
print(f"Total time taken:  {(time.time() - start_time):.1f} seconds")
del store, total_chunks, n_rows
df_data

Rows in table_data_filtered: 123611008

Processed chunk 1 of 10 | Time taken: 8.3 seconds

Processed chunk 2 of 10 | Time taken: 6.9 seconds

Processed chunk 3 of 10 | Time taken: 7.3 seconds

Processed chunk 4 of 10 | Time taken: 8.6 seconds

Processed chunk 5 of 10 | Time taken: 29.3 seconds

Processed chunk 6 of 10 | Time taken: 11.2 seconds

Processed chunk 7 of 10 | Time taken: 15.6 seconds

Processed chunk 8 of 10 | Time taken: 60.2 seconds

Processed chunk 9 of 10 | Time taken: 40.0 seconds

Processed chunk 10 of 10 | Time taken: 63.0 seconds

Total time taken:  251.0 seconds


Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123611003,S717,2021-12-27,0.0
123611004,S717,2021-12-28,0.0
123611005,S717,2021-12-29,0.0
123611006,S717,2021-12-30,0.0


In [None]:
# Yearly data availability per gauge: the percentage of days in each calendar
# year that have a non-null rain_mm value.
df_data['year'] = pd.to_datetime(df_data['datetime']).dt.year

# 'count' ignores nulls, so rain_mm here is the number of valid records
# per (gauge_code, year).
df_p_availability = df_data.groupby(['gauge_code', 'year']).agg({'rain_mm': 'count'}).reset_index()

# Gregorian leap-year rule, evaluated on the whole column at once instead of
# row by row with DataFrame.apply (which is slow and misbehaves on an empty frame).
years = df_p_availability['year']
is_leap_year = (years % 4 == 0) & ((years % 100 != 0) | (years % 400 == 0))
df_p_availability['days_in_year'] = np.where(is_leap_year, 366, 365)
del years, is_leap_year  # Temporary helpers; keep the notebook namespace clean

df_p_availability['p_availability'] = df_p_availability['rain_mm'] / df_p_availability['days_in_year'] * 100
# Defensive clean-up: map any NaN or +/-inf result to 0.
df_p_availability['p_availability'] = df_p_availability['p_availability'].fillna(0).replace([np.inf, -np.inf], 0)
df_p_availability = df_p_availability[['gauge_code', 'year', 'p_availability']]
df_p_availability

Unnamed: 0,gauge_code,year,p_availability
0,00047000,1961,100.00000
1,00047000,1962,100.00000
2,00047000,1963,100.00000
3,00047000,1964,100.00000
4,00047002,1977,6.30137
...,...,...,...
346024,S713,2021,100.00000
346025,S714,2021,100.00000
346026,S715,2021,100.00000
346027,S716,2021,100.00000


In [45]:
# Store the availability table in the existing HDF5 file under
# 'table_p_availability'. mode='r+' opens the existing file for read/write,
# and append=False replaces any table already stored under that key.
df_p_availability.to_hdf(
    cleaned_path,
    key='table_p_availability',
    encoding='utf-8',
    mode='r+',
    format='table',
    complevel=9,
    append=False,
)

# Read the table back from disk so the displayed frame is what was stored.
df_p_availability = pd.read_hdf(cleaned_path, key='table_p_availability')
df_p_availability

Unnamed: 0,gauge_code,year,p_availability
0,00047000,1961,100.00000
1,00047000,1962,100.00000
2,00047000,1963,100.00000
3,00047000,1964,100.00000
4,00047002,1977,6.30137
...,...,...,...
346024,S713,2021,100.00000
346025,S714,2021,100.00000
346026,S715,2021,100.00000
346027,S716,2021,100.00000


In [46]:
# List the top-level entries of the HDF5 file again, after the write above.
with h5py.File(cleaned_path, 'r') as hdf:
    keys = [entry_name for entry_name in hdf.keys()]
    print(keys)

['table_data', 'table_data_filtered', 'table_info', 'table_p_availability', 'table_preclassif', 'table_q1_gaps', 'table_q2_week', 'table_q3_outliers', 'table_qc_info']
