In [1]:
import pandas as pd
import numpy as np

# Data processing

In [2]:
# Read the daily rainfall table in chunks and concatenate exactly once.
# Calling pd.concat inside the loop re-copies every previously accumulated row
# on each iteration, which makes the load quadratic in the number of chunks.
data_chunks = list(
    pd.read_hdf(
        './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
        key='table_data_filtered',
        encoding='utf-8',
        chunksize=10000000,
        iterator=True,
    )
)

# ignore_index=True yields a single continuous 0..n-1 index across all chunks.
# Fall back to an empty frame when the table produced no chunks at all.
df_data = pd.concat(data_chunks, ignore_index=True) if data_chunks else pd.DataFrame()

# Drop the list so the individual chunks are not kept alive next to df_data.
del data_chunks

df_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123609861,S717,2021-12-27,0.0
123609862,S717,2021-12-28,0.0
123609863,S717,2021-12-29,0.0
123609864,S717,2021-12-30,0.0


In [3]:
# Load the station metadata table (one row per gauge, as shown in the output below).
df_info = pd.read_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
    key='table_info',
    encoding='utf-8',
)
df_info

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
18981,SELVIRIA | S717,S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278
18980,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
18979,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
18978,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889
18977,NOVA ANDRADINA | S713,S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833
...,...,...,...,...,...,...,...,...,...
4,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
3,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
2,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
1,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [5]:
# Attach the station metadata to every daily record. The inner join keeps only
# rows whose gauge_code is present in both frames.
df_complete_info = df_data.merge(df_info, on='gauge_code', how='inner')

# Release the two source frames to free memory. NOTE: after this, re-running
# this cell alone raises NameError until the loading cells are run again.
del df_data
del df_info

# Calendar year of each observation.
df_complete_info['year'] = df_complete_info['datetime'].dt.year
df_complete_info

Unnamed: 0,gauge_code,datetime,rain_mm,name_station,city,state,responsible,source,state_abbreviation,lat,long,year
0,00047000,1961-01-01,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
1,00047000,1961-01-02,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
2,00047000,1961-01-03,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
3,00047000,1961-01-04,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
4,00047000,1961-01-05,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
...,...,...,...,...,...,...,...,...,...,...,...,...
123593340,S717,2021-12-27,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021
123593341,S717,2021-12-28,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021
123593342,S717,2021-12-29,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021
123593343,S717,2021-12-30,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021


In [9]:
def calculateQ2(df, wet_day_threshold=1.0):
    """Score how evenly wet days are spread over the days of the week.

    For every (gauge_code, year) pair that has at least one wet day, the wet
    days are counted per weekday (Monday=0 ... Sunday=6). The score is
    ``100 - 100 * CV`` floored at 0, where CV is the coefficient of variation
    (sample standard deviation divided by mean) of those seven counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gauge_code``, ``datetime`` (datetime-like,
        accessed through ``.dt``) and ``rain_mm``.
    wet_day_threshold : float, default 1.0
        A day counts as wet when ``rain_mm`` is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        One row per (gauge_code, year) with columns ``gauge_code``, ``year``
        and ``q2_week``. Pairs without any wet day are absent.
    """
    # Keep only the wet days, and only the two columns needed downstream, so
    # the working copy stays small on very large inputs.
    wet_days = df.loc[df['rain_mm'] > wet_day_threshold, ['gauge_code', 'datetime']]

    if wet_days.empty:
        # Nothing to score: return an empty frame with the expected columns.
        return pd.DataFrame(columns=['gauge_code', 'year', 'q2_week'])

    # assign() returns a new frame, so the caller's DataFrame is never modified.
    wet_days = wet_days.assign(
        year=wet_days['datetime'].dt.year,
        day_of_week=wet_days['datetime'].dt.dayofweek,  # Monday=0, Sunday=6
    )

    # Wet-day counts per (gauge_code, year), one column per weekday.
    weekday_counts = (
        wet_days
        .groupby(['gauge_code', 'year', 'day_of_week'])
        .size()
        .unstack(fill_value=0)
        # unstack only creates columns for weekdays that occur somewhere in the
        # input; force all seven so that a weekday with no wet day counts as 0
        # instead of being left out of the CV.
        .reindex(columns=range(7), fill_value=0)
    )

    # Coefficient of variation across the seven weekday counts. The mean is
    # never zero because every group contains at least one wet day.
    cv = weekday_counts.std(axis=1) / weekday_counts.mean(axis=1)

    # q2_week = 100 - 100 * CV, floored at 0.
    q2_week = (100 - 100 * cv).clip(lower=0)

    return pd.DataFrame({
        'gauge_code': weekday_counts.index.get_level_values('gauge_code'),
        'year': weekday_counts.index.get_level_values('year'),
        'q2_week': q2_week.values,
    })

# Compute the per-gauge, per-year q2_week score from the merged daily records.
df_q2_week = calculateQ2(df_complete_info)

# Display the result (one row per gauge_code and year).
df_q2_week

Unnamed: 0,gauge_code,year,q2_week
0,00047000,1961,71.803161
1,00047000,1962,69.402386
2,00047000,1963,66.650234
3,00047000,1964,87.798731
4,00047002,1977,6.458565
...,...,...,...
324298,S712,2021,72.786988
324299,S713,2021,62.583426
324300,S714,2021,63.664531
324301,S715,2021,76.074189


In [10]:
# Store the q2_week scores under their own key in the dataset file.
# mode='r+' opens the existing file for reading and writing, so the file must
# already exist.
df_q2_week.to_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
    key='table_q2_week',
    encoding='utf-8',
    mode='r+',
)
df_q2_week

Unnamed: 0,gauge_code,year,q2_week
0,00047000,1961,71.803161
1,00047000,1962,69.402386
2,00047000,1963,66.650234
3,00047000,1964,87.798731
4,00047002,1977,6.458565
...,...,...,...
324298,S712,2021,72.786988
324299,S713,2021,62.583426
324300,S714,2021,63.664531
324301,S715,2021,76.074189


In [11]:
# Read the stored table back to check that the write round-trips.
df_teste = pd.read_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
    key='table_q2_week',
    encoding='utf-8',
)
df_teste

Unnamed: 0,gauge_code,year,q2_week
0,00047000,1961,71.803161
1,00047000,1962,69.402386
2,00047000,1963,66.650234
3,00047000,1964,87.798731
4,00047002,1977,6.458565
...,...,...,...
324298,S712,2021,72.786988
324299,S713,2021,62.583426
324300,S714,2021,63.664531
324301,S715,2021,76.074189
