In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time

# Data processing

In [6]:
cleaned_path = r'./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'

In [None]:
chunk_size = 13000000  # Adjust the chunk size as needed
chunks = []
df_data = pd.DataFrame()

# Read the filtered data in chunks  from the HDF5 file

with pd.HDFStore(cleaned_path, mode='r') as store:
    total_chunks = store.get_storer('table_data_filtered').nrows // chunk_size + 1
    print(f"Rows in table_data_filtered: {store.get_storer('table_data_filtered').nrows}\n")
    start_time, step_time = time.time(), time.time()
    for i, chunk in enumerate(store.select('table_data_filtered', chunksize=chunk_size)):
        if df_data.empty:
            df_data = chunk
        else:
            df_data = pd.concat([df_data, chunk], ignore_index=True)  # Concatenate chunk to df_data_filtered (inplace=True)
        del chunk  # Delete the chunk variable to free up memory
        print(f"Processed chunk {i + 1} of {total_chunks} | Time taken: {(time.time() - step_time):.1f} seconds")
        step_time = time.time()
print(f"Total time taken:  {(time.time() - start_time):.1f} seconds")
df_data

Rows in table_data_filtered: 123611008

Processed chunk 1 of 10 | Time taken: 6.4 seconds
Processed chunk 2 of 10 | Time taken: 7.6 seconds
Processed chunk 3 of 10 | Time taken: 7.4 seconds
Processed chunk 4 of 10 | Time taken: 8.1 seconds
Processed chunk 5 of 10 | Time taken: 9.7 seconds
Processed chunk 6 of 10 | Time taken: 9.1 seconds
Processed chunk 7 of 10 | Time taken: 13.1 seconds
Processed chunk 8 of 10 | Time taken: 20.5 seconds
Processed chunk 9 of 10 | Time taken: 44.4 seconds
Processed chunk 10 of 10 | Time taken: 24.4 seconds
Total time taken:  151.4 seconds


Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123611003,S717,2021-12-27,0.0
123611004,S717,2021-12-28,0.0
123611005,S717,2021-12-29,0.0
123611006,S717,2021-12-30,0.0


In [18]:
def calculateQ2(df):
    # Filter rows where rain_mm >= 1 mm (wet days)
    df_wet_days = df[df['rain_mm'] >= 1.0].copy()
    
    # Extract year and day of the week from datetime
    df_wet_days['year'] = df_wet_days['datetime'].dt.year
    df_wet_days['day_of_week'] = df_wet_days['datetime'].dt.dayofweek  # Monday=0, Sunday=6
    
    # Group by gauge_code, year, and day_of_week, then count occurrences
    df_grouped = df_wet_days.groupby(['gauge_code', 'year', 'day_of_week']).size().reset_index(name='count')

    df_pivot = df_grouped.pivot(index=['gauge_code', 'year']
                                , columns='day_of_week'
                                , values='count').fillna(0).reset_index()    

    # Calculate the coefficient of variation (CV)
    df_pivot['std'] = df_pivot[[0, 1, 2, 3, 4, 5, 6]].std(axis=1)
    df_pivot['mean'] = df_pivot[[0, 1, 2, 3, 4, 5, 6]].mean(axis=1)
    
    df_pivot['cv'] =  df_pivot.apply(lambda x: x['std'] / x['mean'] if x['mean'] != 0 else 1.0, axis=1)
    
    # Calculate the Q2 for each group
    df_pivot['q2_week'] = 100 - 100 * df_pivot['cv']
    df_pivot['q2_week'] = df_pivot['q2_week'].clip(lower=0, upper=100)  # Set values below 0 to 0
    
    return df_pivot

df_q2_week = calculateQ2(df_data)
df_q2_week

day_of_week,gauge_code,year,0,1,2,3,4,5,6,std,mean,cv,q2_week
0,00047000,1961,7.0,7.0,7.0,9.0,7.0,4.0,4.0,1.812654,6.428571,0.281968,71.803161
1,00047000,1962,1.0,2.0,3.0,3.0,3.0,3.0,3.0,0.786796,2.571429,0.305976,69.402386
2,00047000,1963,7.0,9.0,11.0,12.0,8.0,13.0,18.0,3.716117,11.142857,0.333498,66.650234
3,00047000,1964,10.0,9.0,11.0,9.0,12.0,12.0,10.0,1.272418,10.428571,0.122013,87.798731
4,00047002,1977,3.0,0.0,1.0,0.0,1.0,2.0,1.0,1.069045,1.142857,0.935414,6.458565
...,...,...,...,...,...,...,...,...,...,...,...,...,...
324411,S712,2021,8.0,9.0,7.0,12.0,12.0,6.0,10.0,2.340126,9.142857,0.255951,74.404870
324412,S713,2021,1.0,1.0,2.0,1.0,2.0,1.0,2.0,0.534522,1.428571,0.374166,62.583426
324413,S714,2021,12.0,4.0,5.0,8.0,7.0,9.0,8.0,2.636737,7.571429,0.348248,65.175174
324414,S715,2021,11.0,7.0,11.0,12.0,14.0,13.0,9.0,2.380476,11.000000,0.216407,78.359308


In [19]:
df_data['year'] = df_data['datetime'].dt.year
df_gauge_code = df_data[['gauge_code', 'year']].drop_duplicates().sort_values(by=['gauge_code', 'year'])
df_gauge_code = df_gauge_code.reset_index(drop=True)
df_gauge_code

Unnamed: 0,gauge_code,year
0,00047000,1961
1,00047000,1962
2,00047000,1963
3,00047000,1964
4,00047002,1977
...,...,...
346024,S713,2021
346025,S714,2021
346026,S715,2021
346027,S716,2021


In [22]:
df_q2_week = df_q2_week[['gauge_code', 'year', 'q2_week']]
df_q2_week = pd.merge(df_gauge_code, df_q2_week, on=['gauge_code', 'year'], how='left')
df_q2_week = df_q2_week.sort_values(by=['gauge_code', 'year']).fillna(0)
df_q2_week = df_q2_week.reset_index(drop=True)
df_q2_week

Unnamed: 0,gauge_code,year,q2_week
0,00047000,1961,71.803161
1,00047000,1962,69.402386
2,00047000,1963,66.650234
3,00047000,1964,87.798731
4,00047002,1977,6.458565
...,...,...,...
346024,S713,2021,62.583426
346025,S714,2021,65.175174
346026,S715,2021,78.359308
346027,S716,2021,77.324050


In [23]:
df_q2_week.to_hdf('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'
                  , key = 'table_q2_week'
                  , encoding = 'utf-8'
                  , mode='r+'
                  , format='table'
                  , append=False)
df_q2_week

Unnamed: 0,gauge_code,year,q2_week
0,00047000,1961,71.803161
1,00047000,1962,69.402386
2,00047000,1963,66.650234
3,00047000,1964,87.798731
4,00047002,1977,6.458565
...,...,...,...
346024,S713,2021,62.583426
346025,S714,2021,65.175174
346026,S715,2021,78.359308
346027,S716,2021,77.324050


In [24]:
df_teste = pd.read_hdf('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'
                       , key = 'table_q2_week'
                       , encoding = 'utf-8')
df_teste

Unnamed: 0,gauge_code,year,q2_week
0,00047000,1961,71.803161
1,00047000,1962,69.402386
2,00047000,1963,66.650234
3,00047000,1964,87.798731
4,00047002,1977,6.458565
...,...,...,...
346024,S713,2021,62.583426
346025,S714,2021,65.175174
346026,S715,2021,78.359308
346027,S716,2021,77.324050
