In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time

# Data processing

In [2]:
cleaned_path = r'./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'

In [None]:
chunk_size = 13000000  # Adjust the chunk size as needed
chunks = []
df_data = pd.DataFrame()

# Read the filtered data in chunks  from the HDF5 file

with pd.HDFStore(cleaned_path, mode='r') as store:
    total_chunks = store.get_storer('table_data_filtered').nrows // chunk_size + 1
    print(f"Rows in table_data_filtered: {store.get_storer('table_data_filtered').nrows}\n")
    start_time, step_time = time.time(), time.time()
    for i, chunk in enumerate(store.select('table_data_filtered', chunksize=chunk_size)):
        if df_data.empty:
            df_data = chunk
        else:
            df_data = pd.concat([df_data, chunk], ignore_index=True)  # Concatenate chunk to df_data_filtered (inplace=True)
        del chunk  # Delete the chunk variable to free up memory
        print(f"Processed chunk {i + 1} of {total_chunks} | Time taken: {(time.time() - step_time):.1f} seconds")
        step_time = time.time()
print(f"Total time taken:  {(time.time() - start_time):.1f} seconds")
df_data

Rows in table_data_filtered: 123611008

Processed chunk 1 of 10 | Time taken: 6.4 seconds
Processed chunk 2 of 10 | Time taken: 6.7 seconds
Processed chunk 3 of 10 | Time taken: 7.3 seconds
Processed chunk 4 of 10 | Time taken: 8.1 seconds
Processed chunk 5 of 10 | Time taken: 9.3 seconds
Processed chunk 6 of 10 | Time taken: 8.8 seconds
Processed chunk 7 of 10 | Time taken: 11.5 seconds
Processed chunk 8 of 10 | Time taken: 31.6 seconds
Processed chunk 9 of 10 | Time taken: 34.8 seconds
Processed chunk 10 of 10 | Time taken: 58.3 seconds
Total time taken:  183.1 seconds


Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123611003,S717,2021-12-27,0.0
123611004,S717,2021-12-28,0.0
123611005,S717,2021-12-29,0.0
123611006,S717,2021-12-30,0.0


In [4]:
del store, total_chunks, step_time, start_time

In [5]:
# Convert to datetime and sort
df_data = df_data.sort_values(['gauge_code', 'datetime'])

# Extract year for start/end gap calculations
df_data['year'] = df_data['datetime'].dt.year

# Group by station and year (to handle multi-year data)
df_data = df_data.groupby(['gauge_code', 'year'])



In [6]:
# Calculate gaps
def calculateQ1(group):
    # Internal gaps (between consecutive observations)
    time_diffs = group['datetime'].diff().dt.days - 1.0
    active_days = group['datetime'].nunique()  # Count of active days in the group

    # Start gap: (first observation) - (first day of year)
    first_day = pd.Timestamp(f"{group['year'].iloc[0]}-01-01")
    start_gap = (group['datetime'].iloc[0] - first_day).days
    
    # End gap: (last day of year) - (last observation)
    last_day = pd.Timestamp(f"{group['year'].iloc[0]}-12-31")
    end_gap = (last_day - group['datetime'].iloc[-1]).days
    
    # Combine all gaps and find max
    all_gaps = time_diffs.tolist() + [start_gap, end_gap]
    max_gap = np.nanmax(all_gaps) if not all(np.isnan(all_gaps)) else np.nan
    
    return pd.Series({
        'max_gap_days': max_gap,
        'start_gap_days': start_gap,
        'end_gap_days': end_gap,
        'active_days': active_days  
    })

# Apply to each group
df_q1_gaps = df_data.apply(calculateQ1).reset_index()
df_q1_gaps

  df_q1_gaps = df_data.apply(calculateQ1).reset_index()


Unnamed: 0,gauge_code,year,max_gap_days,start_gap_days,end_gap_days,active_days
0,00047000,1961,0.0,0.0,0.0,365.0
1,00047000,1962,0.0,0.0,0.0,365.0
2,00047000,1963,0.0,0.0,0.0,365.0
3,00047000,1964,0.0,0.0,0.0,366.0
4,00047002,1977,342.0,342.0,0.0,23.0
...,...,...,...,...,...,...
346024,S713,2021,0.0,0.0,0.0,365.0
346025,S714,2021,0.0,0.0,0.0,365.0
346026,S715,2021,0.0,0.0,0.0,365.0
346027,S716,2021,0.0,0.0,0.0,365.0


In [None]:
df_q1_gaps['days_in_year'] = df_q1_gaps.apply(lambda x: 366 if x['year'] % 4 == 0 and (x['year'] % 100 != 0 or x['year'] % 400 == 0) else 365, axis=1)
df_q1_gaps['total_gaps'] = df_q1_gaps['days_in_year'] - df_q1_gaps['active_days']
df_q1_gaps['max_gap_days'] = df_q1_gaps['max_gap_days'].astype(int)  # Convert to nullable integer type
df_q1_gaps['start_gap_days'] = df_q1_gaps['start_gap_days'].astype(int)  # Convert to nullable integer type
df_q1_gaps['end_gap_days'] = df_q1_gaps['end_gap_days'].astype(int)  # Convert to nullable integer type
df_q1_gaps['active_days'] = df_q1_gaps['active_days'].astype(int)  # Convert to nullable integer type
df_q1_gaps['total_gaps'] = df_q1_gaps['total_gaps'].astype(int)  # Convert to nullable integer type
df_q1_gaps[df_q1_gaps['total_gaps'] > 0].head(30)

Unnamed: 0,gauge_code,year,max_gap_days,start_gap_days,end_gap_days,active_days,days_in_year,total_gaps
4,47002,1977,342,342,0,23,365,342
48,47003,1981,181,181,0,184,365,181
88,47004,1982,48,48,0,317,365,48
127,47005,1989,232,232,0,133,365,232
149,47005,2011,1,0,0,363,365,2
156,47005,2018,1,0,0,364,365,1
159,47006,1989,241,241,0,124,365,241
188,47006,2018,1,0,0,364,365,1
191,47007,1995,320,320,0,45,365,320
255,48002,1971,263,263,0,102,365,263


In [11]:
df_q1_gaps['q1_gaps'] = 100.0 - 100.0 * (((2.0 * df_q1_gaps['total_gaps']) + df_q1_gaps['max_gap_days']) / df_q1_gaps['active_days'])
df_q1_gaps['q1_gaps'] = df_q1_gaps['q1_gaps'].clip(lower=0, upper=100)  # Clip to [0, 100]
df_q1_gaps[df_q1_gaps['total_gaps'] > 0].head(30)

Unnamed: 0,gauge_code,year,max_gap_days,start_gap_days,end_gap_days,active_days,days_in_year,total_gaps,q1_gaps
4,47002,1977,342,342,0,23,365,342,0.0
48,47003,1981,181,181,0,184,365,181,0.0
88,47004,1982,48,48,0,317,365,48,54.574132
127,47005,1989,232,232,0,133,365,232,0.0
149,47005,2011,1,0,0,363,365,2,98.62259
156,47005,2018,1,0,0,364,365,1,99.175824
159,47006,1989,241,241,0,124,365,241,0.0
188,47006,2018,1,0,0,364,365,1,99.175824
191,47007,1995,320,320,0,45,365,320,0.0
255,48002,1971,263,263,0,102,365,263,0.0


In [12]:
df_q1_gaps_export = df_q1_gaps[['gauge_code', 'year', 'q1_gaps']]
df_q1_gaps_export

Unnamed: 0,gauge_code,year,q1_gaps
0,00047000,1961,100.0
1,00047000,1962,100.0
2,00047000,1963,100.0
3,00047000,1964,100.0
4,00047002,1977,0.0
...,...,...,...
346024,S713,2021,100.0
346025,S714,2021,100.0
346026,S715,2021,100.0
346027,S716,2021,100.0


In [13]:
df_q1_gaps_export.to_hdf('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'
                  , key = 'table_q1_gaps'
                  , encoding = 'utf-8'
                  , mode='r+'
                  , append = False
                  , complevel=9
                  , format='table')
df_q1_gaps = pd.read_hdf('./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5', key = 'table_q1_gaps', encoding = 'utf-8')
df_q1_gaps

Unnamed: 0,gauge_code,year,q1_gaps
0,00047000,1961,100.0
1,00047000,1962,100.0
2,00047000,1963,100.0
3,00047000,1964,100.0
4,00047002,1977,0.0
...,...,...,...
346024,S713,2021,100.0
346025,S714,2021,100.0
346026,S715,2021,100.0
346027,S716,2021,100.0
