In [1]:
import pandas as pd
import numpy as np
import h5py

# Data processing

In [2]:
# Location of the cleaned daily gauge dataset (HDF5) used throughout this notebook.
cleaned_path = (
    './1 - Organized data gauge/BRAZIL/DATASETS/'
    'BRAZIL_DAILY_1961_2024_CLEANED.h5'
)

In [3]:
# Open the file read-only just long enough to collect the names of the stored tables.
with h5py.File(cleaned_path, mode='r') as hdf:
    keys = [table_name for table_name in hdf.keys()]
print(keys)

['table_data', 'table_data_filtered', 'table_info', 'table_p_availability', 'table_preclassif', 'table_q1_gaps', 'table_q2_week', 'table_q3_outliers', 'table_qc_info']


In [4]:
# Load 'table_data_filtered' in chunks, then concatenate ONCE.
# Calling pd.concat inside the loop re-copies every previously loaded row on each
# iteration (quadratic work); collecting the chunks first keeps the copy linear.
chunks = []
for chunk in pd.read_hdf(cleaned_path, key='table_data_filtered', encoding='utf-8', chunksize=10_000_000, iterator=True):
    chunks.append(chunk)

# An empty table yields no chunks; fall back to an empty frame in that case.
df_data = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
del chunks  # Drop the list so only df_data (and the last `chunk`) keep the data alive

df_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123609861,S717,2021-12-27,0.0
123609862,S717,2021-12-28,0.0
123609863,S717,2021-12-29,0.0
123609864,S717,2021-12-30,0.0


In [5]:
# Remove the loop variable left over from the chunked read so it no longer holds a reference to the last chunk.
del chunk

In [6]:
def calculateQ3(df, wet_day_threshold=1.0, iqr_multiplier=1.5):
    """Score each gauge-year by the share of its days that are not rainfall outliers.

    For every (gauge_code, calendar month) pair, the first and third quartiles
    of wet-day rainfall (``rain_mm >= wet_day_threshold``) define an upper
    fence ``Q3 + iqr_multiplier * (Q3 - Q1)``. A day is flagged as an outlier
    when its ``rain_mm`` is strictly greater than the fence of its gauge and
    month. Gauge-months without any wet day have no fence, so none of their
    days are flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily records with columns ``gauge_code``, ``datetime`` (accessed
        through ``.dt``) and ``rain_mm``. The frame is NOT modified.
    wet_day_threshold : float, default 1.0
        Minimum ``rain_mm`` for a day to take part in the quartile estimate.
    iqr_multiplier : float, default 1.5
        Multiple of the inter-quartile range added to Q3 to build the fence.

    Returns
    -------
    pandas.DataFrame
        Columns ``gauge_code``, ``year`` and ``q3_outliers``, where
        ``q3_outliers = 100 - outliers / valid_days * 100`` clipped to
        [0, 100] and ``valid_days`` counts the non-null ``rain_mm`` values of
        the gauge-year (the score is NaN when that count is zero).
    """
    # Build a slim working frame instead of writing helper columns into `df`:
    # the caller's DataFrame must not gain 'month'/'year' as a side effect.
    daily = df[['gauge_code', 'rain_mm']].assign(
        month=df['datetime'].dt.month,
        year=df['datetime'].dt.year,
    )

    # Quartiles are estimated from wet days only. Series.quantile is applied per
    # group; it already returns NaN for an empty selection, so no guard is needed.
    wet_days = daily[daily['rain_mm'] >= wet_day_threshold]
    monthly_thresholds = (
        wet_days
        .groupby(['gauge_code', 'month'])['rain_mm']
        .agg(
            Q1=lambda x: x.quantile(0.25),
            Q3=lambda x: x.quantile(0.75),
        )
        .reset_index()
        .assign(
            # An undefined fence becomes +inf so that nothing exceeds it.
            upper_limit=lambda x: (x['Q3'] + iqr_multiplier * (x['Q3'] - x['Q1'])).fillna(np.inf)
        )
        [['gauge_code', 'month', 'upper_limit']]
    )

    # Left merge keeps every daily row; rows without a fence get NaN, and the
    # comparison below evaluates to False for them.
    flagged = pd.merge(daily, monthly_thresholds, on=['gauge_code', 'month'], how='left')
    flagged['is_outlier'] = flagged['rain_mm'] > flagged['upper_limit']

    # One pass per gauge-year: number of outliers and number of non-null readings.
    yearly = (
        flagged
        .groupby(['gauge_code', 'year'])
        .agg(n_outliers=('is_outlier', 'sum'), n_valid=('rain_mm', 'count'))
        .reset_index()
    )
    yearly['q3_outliers'] = (
        100 - yearly['n_outliers'] / yearly['n_valid'] * 100
    ).clip(lower=0, upper=100)

    return yearly[['gauge_code', 'year', 'q3_outliers']]


# Compute the per-gauge, per-year outlier score from the full daily table and display the result.
df_q3_outliers = calculateQ3(df_data)
df_q3_outliers

Unnamed: 0,gauge_code,year,q3_outliers
0,00047000,1961,98.082192
1,00047000,1962,99.452055
2,00047000,1963,99.726027
3,00047000,1964,99.726776
4,00047002,1977,91.304348
...,...,...,...
346024,S713,2021,99.726027
346025,S714,2021,99.178082
346026,S715,2021,98.630137
346027,S716,2021,98.904110


In [7]:
# Store the yearly outlier scores under 'table_q3_outliers' in the existing HDF5 file
# (mode 'r+' opens it for reading and writing), then display what was written.
df_q3_outliers.to_hdf(
    cleaned_path,
    key='table_q3_outliers',
    mode='r+',
    encoding='utf-8',
)
df_q3_outliers

Unnamed: 0,gauge_code,year,q3_outliers
0,00047000,1961,98.082192
1,00047000,1962,99.452055
2,00047000,1963,99.726027
3,00047000,1964,99.726776
4,00047002,1977,91.304348
...,...,...,...
346024,S713,2021,99.726027
346025,S714,2021,99.178082
346026,S715,2021,98.630137
346027,S716,2021,98.904110


In [8]:
# Read the stored table back from the HDF5 file and display it.
df_q3_outliers = pd.read_hdf(
    cleaned_path,
    key='table_q3_outliers',
    encoding='utf-8',
)
df_q3_outliers

Unnamed: 0,gauge_code,year,q3_outliers
0,00047000,1961,98.082192
1,00047000,1962,99.452055
2,00047000,1963,99.726027
3,00047000,1964,99.726776
4,00047002,1977,91.304348
...,...,...,...
346024,S713,2021,99.726027
346025,S714,2021,99.178082
346026,S715,2021,98.630137
346027,S716,2021,98.904110
