In [1]:
import pandas as pd
import numpy as np

# Data processing

In [2]:
# Read the HDF table in chunks and assemble it with ONE concatenation.
# Calling pd.concat inside the loop re-copies every row loaded so far on each
# iteration (quadratic in the number of chunks); collecting the chunks first
# and concatenating once copies each row a single time.
chunks = list(
    pd.read_hdf(
        './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
        key='table_data_filtered',
        encoding='utf-8',
        chunksize=10000000,
        iterator=True,
    )
)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# table yields no chunks. The index is always renumbered 0..n-1.
df_data = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

# Drop the per-chunk references so only the assembled frame stays in memory.
del chunks

df_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123609861,S717,2021-12-27,0.0
123609862,S717,2021-12-28,0.0
123609863,S717,2021-12-29,0.0
123609864,S717,2021-12-30,0.0


In [3]:
# Load the per-station table stored under the 'table_info' key of the same HDF file.
df_info = pd.read_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
    encoding='utf-8',
    key='table_info',
)
df_info

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
18981,SELVIRIA | S717,S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278
18980,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
18979,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
18978,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889
18977,NOVA ANDRADINA | S713,S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833
...,...,...,...,...,...,...,...,...,...
4,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
3,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
2,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
1,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [4]:
# Attach the station columns to every daily record. The inner join keeps only
# rows whose gauge_code appears in both tables.
df_complete_info = df_data.merge(df_info, how='inner', on='gauge_code')

# The two inputs are no longer needed once merged; drop their references.
del df_data, df_info

# Calendar year of each observation, taken from the datetime column.
df_complete_info['year'] = df_complete_info['datetime'].dt.year
df_complete_info

Unnamed: 0,gauge_code,datetime,rain_mm,name_station,city,state,responsible,source,state_abbreviation,lat,long,year
0,00047000,1961-01-01,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
1,00047000,1961-01-02,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
2,00047000,1961-01-03,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
3,00047000,1961-01-04,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
4,00047000,1961-01-05,0.0,SALINÓPOLIS,SALINÓPOLIS,PARÁ,INMET,HIDROWEB,PA,-0.650000,-47.550000,1961
...,...,...,...,...,...,...,...,...,...,...,...,...
123593340,S717,2021-12-27,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021
123593341,S717,2021-12-28,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021
123593342,S717,2021-12-29,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021
123593343,S717,2021-12-30,0.0,SELVIRIA | S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278,2021


In [5]:
def _longest_gap(calendar, has_data):
    """Locate the longest run of consecutive calendar days without data.

    Parameters
    ----------
    calendar : pandas.DatetimeIndex
        The consecutive days being inspected.
    has_data : array-like of bool
        Same length as ``calendar``; True where the day has a valid observation.

    Returns
    -------
    tuple
        ``(length, first_day, last_day)`` of the longest gap. Ties resolve to
        the earliest gap. ``(0, pd.NaT, pd.NaT)`` when no day is missing.
    """
    missing = ~np.asarray(has_data, dtype=bool)
    if not missing.any():
        return 0, pd.NaT, pd.NaT

    # Pad with zeros so gaps touching either end of the calendar still produce
    # a +1 step (gap opens) and a -1 step (gap closes) in the differences.
    padded = np.concatenate(([0], missing.astype(np.int8), [0]))
    steps = np.diff(padded)
    gap_starts = np.flatnonzero(steps == 1)
    gap_ends = np.flatnonzero(steps == -1)  # exclusive end positions
    gap_lengths = gap_ends - gap_starts

    longest = int(gap_lengths.argmax())  # argmax returns the first maximum
    return (
        int(gap_lengths[longest]),
        calendar[gap_starts[longest]],
        calendar[gap_ends[longest] - 1],
    )


def calculateQ1(df):
    """Summarise missing-data gaps per gauge and calendar year.

    For every ``(gauge_code, year)`` pair present in ``df`` the full calendar
    year (1 Jan - 31 Dec) is compared against the days that carry a non-null
    ``rain_mm`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``gauge_code``, ``datetime`` and ``rain_mm``.
        ``datetime`` may hold anything ``pd.to_datetime`` accepts. The frame is
        NOT modified; a narrow working copy is used instead.

    Returns
    -------
    pandas.DataFrame
        One row per ``(gauge_code, year)`` with the columns

        * ``days_in_a_year`` - 365 or 366,
        * ``active_days``    - distinct calendar days with a valid ``rain_mm``,
        * ``total_gap``      - ``days_in_a_year - active_days``,
        * ``max_gap``        - length of the longest run of missing days,
        * ``start_date`` / ``last_date`` - first and last day of that run
          (``NaT`` when the year has no gap).

        The columns are present even when ``df`` has no rows.

    Notes
    -----
    * A day counts once even if ``df`` holds duplicate rows for it, so
      ``active_days`` never exceeds ``days_in_a_year``.
    * Observations are matched to calendar days by exact timestamp, so a row
      whose timestamp is not at midnight does not mark its day as active.
    * Rows whose ``datetime`` is missing cannot be assigned to a year and are
      ignored.
    """
    result_columns = [
        'gauge_code', 'year', 'days_in_a_year', 'active_days',
        'total_gap', 'max_gap', 'start_date', 'last_date',
    ]

    # Build a three-column working copy so the caller's frame is left untouched.
    observations = (
        df[['gauge_code', 'datetime', 'rain_mm']]
        .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
        .dropna(subset=['datetime'])
        .assign(year=lambda frame: frame['datetime'].dt.year)
    )

    results = []
    for (gauge_code, year), group in observations.groupby(['gauge_code', 'year']):
        year = int(year)
        calendar = pd.date_range(
            start=pd.Timestamp(year=year, month=1, day=1),
            end=pd.Timestamp(year=year, month=12, day=31),
            freq='D',
        )

        # Membership test instead of a merge: duplicates cannot inflate the
        # counts and no per-group DataFrame has to be built.
        valid_days = group.loc[group['rain_mm'].notna(), 'datetime']
        has_data = calendar.isin(valid_days)

        days_in_a_year = len(calendar)
        active_days = int(has_data.sum())
        max_gap, start_date, last_date = _longest_gap(calendar, has_data)

        results.append({
            'gauge_code': gauge_code,
            'year': year,
            'days_in_a_year': days_in_a_year,
            'active_days': active_days,
            'total_gap': days_in_a_year - active_days,
            'max_gap': max_gap,
            'start_date': start_date,
            'last_date': last_date,
        })

    # Passing the column list keeps the schema stable for empty input.
    df_q1_gaps = pd.DataFrame(results, columns=result_columns)
    return df_q1_gaps

# Compute the per-gauge, per-year gap statistics from the merged frame,
# then drop the reference to the merged frame.
df_q1_gaps = calculateQ1(df=df_complete_info)
del df_complete_info
df_q1_gaps

Unnamed: 0,gauge_code,year,days_in_a_year,active_days,total_gap,max_gap,start_date,last_date
0,00047000,1961,365,365,0,0,NaT,NaT
1,00047000,1962,365,365,0,0,NaT,NaT
2,00047000,1963,365,365,0,0,NaT,NaT
3,00047000,1964,366,366,0,0,NaT,NaT
4,00047002,1977,365,23,342,342,1977-01-01,1977-12-08
...,...,...,...,...,...,...,...,...
345863,S713,2021,365,365,0,0,NaT,NaT
345864,S714,2021,365,365,0,0,NaT,NaT
345865,S715,2021,365,365,0,0,NaT,NaT
345866,S716,2021,365,365,0,0,NaT,NaT


In [6]:
# Gap score: 100 minus 100 * (2 * total_gap + max_gap) / active_days,
# with negative results floored at 0 in the same assignment.
df_q1_gaps['q1_gaps'] = (
    100.0
    - 100.0 * (((2.0 * df_q1_gaps['total_gap']) + df_q1_gaps['max_gap']) / df_q1_gaps['active_days'])
).clip(lower=0)
df_q1_gaps

Unnamed: 0,gauge_code,year,days_in_a_year,active_days,total_gap,max_gap,start_date,last_date,q1_gaps
0,00047000,1961,365,365,0,0,NaT,NaT,100.0
1,00047000,1962,365,365,0,0,NaT,NaT,100.0
2,00047000,1963,365,365,0,0,NaT,NaT,100.0
3,00047000,1964,366,366,0,0,NaT,NaT,100.0
4,00047002,1977,365,23,342,342,1977-01-01,1977-12-08,0.0
...,...,...,...,...,...,...,...,...,...
345863,S713,2021,365,365,0,0,NaT,NaT,100.0
345864,S714,2021,365,365,0,0,NaT,NaT,100.0
345865,S715,2021,365,365,0,0,NaT,NaT,100.0
345866,S716,2021,365,365,0,0,NaT,NaT,100.0


In [7]:
# Keep only the identifying keys and the final score.
df_q1_gaps = df_q1_gaps.loc[:, ['gauge_code', 'year', 'q1_gaps']]
df_q1_gaps

Unnamed: 0,gauge_code,year,q1_gaps
0,00047000,1961,100.0
1,00047000,1962,100.0
2,00047000,1963,100.0
3,00047000,1964,100.0
4,00047002,1977,0.0
...,...,...,...
345863,S713,2021,100.0
345864,S714,2021,100.0
345865,S715,2021,100.0
345866,S716,2021,100.0


In [8]:
# Store the scores under the 'table_q1_gaps' key of the existing HDF file
# (mode 'r+' opens it for reading and writing and requires the file to exist).
df_q1_gaps.to_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
    mode='r+',
    key='table_q1_gaps',
    encoding='utf-8',
)

# Read the table back from the file and display what was stored.
df_q1_gaps = pd.read_hdf(
    './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5',
    encoding='utf-8',
    key='table_q1_gaps',
)
df_q1_gaps

Unnamed: 0,gauge_code,year,q1_gaps
0,00047000,1961,100.0
1,00047000,1962,100.0
2,00047000,1963,100.0
3,00047000,1964,100.0
4,00047002,1977,0.0
...,...,...,...
345863,S713,2021,100.0
345864,S714,2021,100.0
345865,S715,2021,100.0
345866,S716,2021,100.0
