In [41]:
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
import os
import time
import h5py

# Data processing

In [42]:
neighboring_data_path = './1 - Organized data gauge/BRAZIL'
cleaned_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'

In [43]:
df_data = pd.read_hdf(cleaned_path, key = 'table_data', encoding = 'utf-8')
# df_data['year'] = df_data['datetime'].dt.year

df_data = df_data[(df_data['rain_mm']>= 0.0)
                  & (df_data['rain_mm']<= 600.0)
                  ].reset_index(drop=True)
df_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123612837,S717,2021-12-27,0.0
123612838,S717,2021-12-28,0.0
123612839,S717,2021-12-29,0.0
123612840,S717,2021-12-30,0.0


In [44]:
df_info = pd.read_hdf(cleaned_path, key = 'table_info', encoding = 'utf-8')
df_info

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
18981,SELVIRIA | S717,S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278
18980,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
18979,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
18978,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889
18977,NOVA ANDRADINA | S713,S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833
...,...,...,...,...,...,...,...,...,...
4,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
3,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
2,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
1,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [45]:
# Add a year column to the dataframe
df_data['year'] = df_data['datetime'].dt.year

# Group by gauge_code and year
grouped = df_data.groupby(['gauge_code', 'year'])

# Calculate annual_rainfall_mm, active_days, and consecutive_dry_days
def calculate_metrics(group):
    annual_rainfall_mm = group['rain_mm'].sum()
    active_days = (group['rain_mm'] >= 0.0).sum()
    
    # Calculate consecutive dry days
    dry_days = (group['rain_mm'] == 0.0).astype(int)
    # Calculate the maximum number of consecutive dry days
    consecutive_dry_days = (dry_days.groupby((dry_days != dry_days.shift()).cumsum()).cumsum() * dry_days).max()
    
    return pd.Series({
        'annual_rainfall_mm': annual_rainfall_mm,
        'active_days': active_days,
        'consecutive_dry_days': consecutive_dry_days
    })

In [46]:
df_preclassif = grouped.apply(calculate_metrics).reset_index()
df_preclassif

# 345868 rows × 5 columns

  df_preclassif = grouped.apply(calculate_metrics).reset_index()


Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days
0,00047000,1961,2186.0,365.0,275.0
1,00047000,1962,273.8,365.0,153.0
2,00047000,1963,686.2,365.0,115.0
3,00047000,1964,597.5,366.0,145.0
4,00047002,1977,133.4,23.0,6.0
...,...,...,...,...,...
346024,S713,2021,76.2,365.0,150.0
346025,S714,2021,828.0,365.0,75.0
346026,S715,2021,1041.8,365.0,76.0
346027,S716,2021,928.8,365.0,68.0


In [47]:
df_preclassif['preclassif'] = df_preclassif.apply(
    lambda row: 'LQ' if (row['annual_rainfall_mm'] < 300 or row['annual_rainfall_mm'] > 6000
                        #  or row['active_days'] < 305
                         or row['consecutive_dry_days'] > 200) else "", axis=1)
df_preclassif

Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days,preclassif
0,00047000,1961,2186.0,365.0,275.0,LQ
1,00047000,1962,273.8,365.0,153.0,LQ
2,00047000,1963,686.2,365.0,115.0,
3,00047000,1964,597.5,366.0,145.0,
4,00047002,1977,133.4,23.0,6.0,LQ
...,...,...,...,...,...,...
346024,S713,2021,76.2,365.0,150.0,LQ
346025,S714,2021,828.0,365.0,75.0,
346026,S715,2021,1041.8,365.0,76.0,
346027,S716,2021,928.8,365.0,68.0,


In [48]:
preclassif_counts = df_preclassif['preclassif'].value_counts()
print(preclassif_counts)

preclassif
      299777
LQ     46252
Name: count, dtype: int64


In [49]:
preclassif_counts['LQ'] / preclassif_counts.sum()

0.1336650974340301

In [50]:
df_preclassif.to_hdf(cleaned_path, key='table_preclassif', mode='r+', complevel=9, complib='zlib')
df_preclassif

Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days,preclassif
0,00047000,1961,2186.0,365.0,275.0,LQ
1,00047000,1962,273.8,365.0,153.0,LQ
2,00047000,1963,686.2,365.0,115.0,
3,00047000,1964,597.5,366.0,145.0,
4,00047002,1977,133.4,23.0,6.0,LQ
...,...,...,...,...,...,...
346024,S713,2021,76.2,365.0,150.0,LQ
346025,S714,2021,828.0,365.0,75.0,
346026,S715,2021,1041.8,365.0,76.0,
346027,S716,2021,928.8,365.0,68.0,


In [51]:
df_preclassif = pd.read_hdf(cleaned_path, key = 'table_preclassif', encoding = 'utf-8')
df_preclassif

Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days,preclassif
0,00047000,1961,2186.0,365.0,275.0,LQ
1,00047000,1962,273.8,365.0,153.0,LQ
2,00047000,1963,686.2,365.0,115.0,
3,00047000,1964,597.5,366.0,145.0,
4,00047002,1977,133.4,23.0,6.0,LQ
...,...,...,...,...,...,...
346024,S713,2021,76.2,365.0,150.0,LQ
346025,S714,2021,828.0,365.0,75.0,
346026,S715,2021,1041.8,365.0,76.0,
346027,S716,2021,928.8,365.0,68.0,


# Outlier treatment

In [52]:
# df_complete_info = pd.merge(df_data, df_info, on='gauge_code', how = 'inner').sort_values('lat', ascending = True)
# # .dropna(how='any')
# print("df_complete_info", len(df_complete_info['gauge_code'].unique()), 'df_info', len(df_info['gauge_code'].unique()))
# df_complete_info

# # left: 123612842 rows
# # inner: 123596321 rows

In [53]:
# df_outlier = pd.merge(df_complete_info[['gauge_code', 'rain_mm', 'datetime', 'year']], df_preclassif, on = ['gauge_code', 'year'], how = 'left')
# df_outlier = df_outlier[df_outlier['preclassif'] != 'LQ']
# df_outlier = df_outlier[['gauge_code',	'datetime',	'rain_mm']]
# df_outlier

In [54]:
# del df_complete_info

In [55]:
# def mark_outlier_rain(df, threshold_rain_mm=200):
#     # Garantir que a coluna 'datetime' está no formato datetime
#     df['datetime'] = pd.to_datetime(df['datetime'])

#     df_yesterday = df.copy(deep = True)
#     df_yesterday['datetime'] = df_yesterday['datetime'] + pd.Timedelta(days=1)
#     df_tomorrow = df.copy(deep = True)
#     df_tomorrow['datetime'] = df_tomorrow['datetime'] - pd.Timedelta(days=1)

#     df = pd.merge(df, df_yesterday[['gauge_code', 'datetime', 'rain_mm']], on=['gauge_code', 'datetime'], how='left', suffixes=('', '_yesterday'))
#     del df_yesterday
#     df = pd.merge(df, df_tomorrow[['gauge_code', 'datetime', 'rain_mm']], on=['gauge_code', 'datetime'], how='left', suffixes=('', '_tomorrow'))
#     del df_tomorrow
#     df_sorted = df.sort_values(['gauge_code', 'datetime']).reset_index(drop=True)
#     del df

#     # Calcular soma da chuva nos dias adjacentes
#     df_sorted['adjacent_days_mm'] = df_sorted['rain_mm_yesterday'] + df_sorted['rain_mm_tomorrow']

#     # Regra para identificar outlier
#     condition = (
#         (df_sorted['rain_mm'] > threshold_rain_mm) &
#         (df_sorted['adjacent_days_mm'] < 0.025 * df_sorted['rain_mm'])
#     )

#     df_sorted['outlier_status_1'] = np.where(condition, 1, 0)

#     return df_sorted

# # Aplicar na base df_outlier_filter_1
# df_outlier_filter_1 = mark_outlier_rain(df_outlier)
# df_outlier_filter_1

In [56]:
# Configurations
neighboring_data_path = './1 - Organized data gauge/BRAZIL'

In [57]:
# df_outlier_filter_1_export = df_outlier_filter_1[df_outlier_filter_1['outlier_status_1'] == 1]
# df_outlier_filter_1_export.to_hdf(
#         os.path.join(neighboring_data_path, "adjacent_day_analysis_filter_1.h5"),
#         key='table_data',
#         mode='w',
#         format='table',
#         complevel=9,
#         encoding='utf-8',
#         append=False,
#         min_itemsize={'gauge_code': 20}  # Adjust based on your max gauge code length
#     )
# df_outlier_filter_1_export

In [58]:
# df_outlier_filter_1 = df_outlier_filter_1[['gauge_code','datetime',	'rain_mm', 'outlier_status_1']]
# df_outlier_filter_1 = df_outlier_filter_1[df_outlier_filter_1['outlier_status_1']==0]
# df_outlier_filter_1 = pd.merge(df_outlier_filter_1, df_info[['gauge_code', 'lat', 'long']], on ='gauge_code', how='left')
# df_outlier_filter_1.reset_index(drop=True, inplace=True)
# df_outlier_filter_1

In [59]:
del df_info

In [60]:
# def idw_interpolation(latitude, longitude, df_temp_without_gauge, kdtree, p=2):
#     row = [latitude, longitude]
#     distances, indices = kdtree.query(row, k=5)
#     weights = 1 / (distances + 1e-6) ** p
#     values = df_temp_without_gauge.iloc[indices]['rain_mm'].values
#     return (np.sum(weights * values) / np.sum(weights))

# # Initialize empty DataFrame for results
# outlier_analysis_results = pd.DataFrame()

# # Configurations
# output_filename = os.path.join(neighboring_data_path, "neighboring_analysis.h5")
# rainfall_threshold = 200.0  # mm

# start_date = '2014-01-01'
# end_date = '2020-12-31'

# df_date_filter = df_outlier_filter_1.loc[(df_outlier_filter_1['datetime'] >= start_date) & (df_outlier_filter_1['datetime'] <= end_date)].sort_values('datetime', ignore_index=True, ascending=True)

# df_date_filter = df_date_filter[df_date_filter['rain_mm'] > 200.0].reset_index(drop=True) # Filter for high rainfall values

# # Get sorted unique dates
# analysis_dates = df_date_filter['datetime'].unique().tolist() # Get unique dates for the analysis
# del df_date_filter
# analysis_dates.sort()

# # Process each date
# for current_date in analysis_dates[:]:
#     # Filter data for current date
#     daily_data = df_outlier_filter_1[df_outlier_filter_1['datetime'] == current_date]
    
#     df_gauge_filter = daily_data[daily_data['rain_mm'] > 200.0].reset_index(drop=True) # Filter for high rainfall values
    
#     gauge_codes = df_gauge_filter['gauge_code'].unique() # Get unique gauge codes for the current date
    
#     date_results = []
    
#     for gauge in gauge_codes:
#         gauge_data = daily_data[daily_data['gauge_code'] == gauge].iloc[0]
#         lat, lon = gauge_data['lat'], gauge_data['long']
#         observed_rain = gauge_data['rain_mm']
        
#         # Initialize result row
#         result_row = {
#             'gauge_code': gauge,
#             'datetime': current_date,
#             'lat': lat,
#             'long': lon,
#             'observed_rain_mm': observed_rain,
#             'interpolated_rain_mm': np.nan
#         }
        
#         # Only interpolate for high rainfall values
#         if observed_rain > rainfall_threshold:
#             neighbor_data = daily_data[daily_data['gauge_code'] != gauge]
            
#             if len(neighbor_data) > 0:
#                 kd_tree = KDTree(neighbor_data[['lat', 'long']].values)
#                 result_row['interpolated_rain_mm'] = idw_interpolation(lat, lon, neighbor_data, kd_tree)
        
#         date_results.append(pd.DataFrame([result_row]))
    
#     # Combine results for current date
#     daily_results = pd.concat(date_results, ignore_index=True)
    
#     # Save to HDF5 with proper configuration
#     storage_mode = 'w' if current_date == analysis_dates[0] else 'a'
#     append_mode = False if current_date == analysis_dates[0] else True

#     # storage_mode = 'r+'
#     # append_mode = True
    
#     daily_results.to_hdf(
#         output_filename,
#         key='table_data',
#         mode=storage_mode,
#         format='table',
#         complevel=9,
#         encoding='utf-8',
#         append=append_mode,
#         min_itemsize={'gauge_code': 20}  # Adjust based on your max gauge code length
#     )
    
#     print(f"Saved results for {current_date} to {output_filename}")

In [61]:
# del df_outlier_filter_1

In [62]:
# output_filename = os.path.join(neighboring_data_path, "neighboring_analysis.h5")

# df_outlier_2 = pd.read_hdf(output_filename, key='table_data')
# df_outlier_2

In [63]:
# df_outlier_filter_2_export = df_outlier_2[df_outlier_2['interpolated_rain_mm'] >= 0.0].reset_index(drop = True)
# df_outlier_filter_2_export = df_outlier_filter_2_export[df_outlier_filter_2_export['interpolated_rain_mm'] >= 0.35 * df_outlier_filter_2_export['observed_rain_mm']]
# df_outlier_filter_2_export['outlier_status_2'] = 1
# df_outlier_filter_2_export

In [64]:
# del df_outlier_2

In [65]:
# df_outlier_filter_2_export.to_hdf(
#         os.path.join(neighboring_data_path, "neighboring_analysis_filter_2.h5"),
#         key='table_data',
#         mode='w',
#         format='table',
#         complevel=9,
#         encoding='utf-8',
#         append=False,
#         min_itemsize={'gauge_code': 20}  # Adjust based on your max gauge code length
#     )
# df_outlier_filter_2_export

# Data Deletion

In [66]:
df_filter_1 = pd.read_hdf(os.path.join(neighboring_data_path, "adjacent_day_analysis_filter_1.h5"), key='table_data')
df_filter_1 = df_filter_1[['gauge_code', 'datetime', 'outlier_status_1']]
df_filter_1

Unnamed: 0,gauge_code,datetime,outlier_status_1
51792,00047005,2011-01-23,1
51795,00047005,2011-01-26,1
102041,00048005,1980-01-14,1
102126,00048005,1980-04-08,1
102418,00048005,1981-01-25,1
...,...,...,...
111916372,86743950,2021-03-20,1
112047085,A110,2024-01-04,1
112322733,A606,2022-11-27,1
112371071,A667,2023-02-08,1


In [67]:

df_filter_2 = pd.read_hdf(os.path.join(neighboring_data_path, "neighboring_analysis_filter_2.h5"), key='table_data')
df_filter_2['outlier_status_2'] = 1
df_filter_2 = df_filter_2[['gauge_code', 'datetime', 'outlier_status_2']]
df_filter_2

Unnamed: 0,gauge_code,datetime,outlier_status_2
5,02548043,2014-02-15,1
6,02548044,2014-02-16,1
10,02247182,2014-03-05,1
11,02247196,2014-03-05,1
12,02247182,2014-03-07,1
...,...,...,...
208,02040011,2020-01-18,1
209,02040020,2020-01-18,1
217,00638071,2020-03-16,1
229,02949001,2020-07-08,1


In [68]:
df_data_filtered = pd.merge(df_data, df_filter_1, on = ['gauge_code', 'datetime'], how = 'left').merge(df_filter_2, on = ['gauge_code', 'datetime'], how = 'left')
df_data_filtered = df_data_filtered[(df_data_filtered['outlier_status_1'] != 1) & (df_data_filtered['outlier_status_2'] != 1)]
df_data_filtered = df_data_filtered[['gauge_code', 'datetime', 'rain_mm']].copy(deep = True)
df_data_filtered

Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123612837,S717,2021-12-27,0.0
123612838,S717,2021-12-28,0.0
123612839,S717,2021-12-29,0.0
123612840,S717,2021-12-30,0.0


In [69]:
del df_data, df_filter_1, df_filter_2, df_preclassif, grouped

In [70]:
key = 'table_data_filtered'
chunk_size = 13000000  # Adjust based on available memory

with pd.HDFStore(cleaned_path, mode='r+', complevel=9, complib='blosc:zstd') as store:
    for start in range(0, len(df_data_filtered), chunk_size):
        i = start // chunk_size
        end = start + chunk_size
        chunk = df_data_filtered.iloc[start:end]
        if i == 0:
            append_mode = False
        else:  
            append_mode = True
        # Append the chunk to the HDF5 file
        store.append(key, chunk, format='table', data_columns=True, encoding='utf-8', min_itemsize={'gauge_code': 20}, append=append_mode)

        print(f"Chunk {i + 1} of {len(df_data_filtered) // chunk_size + 1} completed.")

Chunk 1 of 10 completed.
Chunk 2 of 10 completed.
Chunk 3 of 10 completed.
Chunk 4 of 10 completed.
Chunk 5 of 10 completed.
Chunk 6 of 10 completed.
Chunk 7 of 10 completed.
Chunk 8 of 10 completed.
Chunk 9 of 10 completed.
Chunk 10 of 10 completed.


In [71]:
# Open the HDF5 file and list all keys
with h5py.File(cleaned_path, 'r') as h5file:
    keys = list(h5file.keys())
    print("Keys in the HDF5 file:", keys)

Keys in the HDF5 file: ['table_data', 'table_data_filtered', 'table_info', 'table_p_availability', 'table_preclassif', 'table_q1_gaps', 'table_q2_week', 'table_q3_outliers', 'table_qc_info']


In [72]:
chunk_size = 13000000  # Adjust the chunk size as needed
chunks = []
df_data_filtered = pd.DataFrame()

start_time = time.time()
# Read the filtered data in chunks  from the HDF5 file

with pd.HDFStore(cleaned_path, mode='r') as store:
    total_chunks = store.get_storer('table_data_filtered').nrows // chunk_size + 1
    print("Rows in table_data_filtered:", store.get_storer('table_data_filtered').nrows)
    for i, chunk in enumerate(store.select('table_data_filtered', chunksize=chunk_size)):
        chunks.append(chunk)
        if df_data_filtered.empty:
            df_data_filtered = chunk
        else:
            df_data_filtered = pd.concat([df_data_filtered, chunk], ignore_index=True)  # Concatenate chunk to df_data_filtered (inplace=True)
        del chunk  # Delete the chunk variable to free up memory
        print(f"Processed chunk {i + 1} of {total_chunks}")
        print(time.time() - start_time, "seconds")
print("Total time taken:", time.time() - start_time, "seconds")
df_data_filtered

Rows in table_data_filtered: 123611008
Processed chunk 1 of 10
7.133180379867554 seconds
Processed chunk 2 of 10
14.398112058639526 seconds
Processed chunk 3 of 10
23.7789568901062 seconds
Processed chunk 4 of 10
32.0426561832428 seconds
Processed chunk 5 of 10
41.69509768486023 seconds
Processed chunk 6 of 10
52.5738890171051 seconds
Processed chunk 7 of 10
62.37548875808716 seconds
Processed chunk 8 of 10
81.14187622070312 seconds
Processed chunk 9 of 10
105.8172333240509 seconds
Processed chunk 10 of 10
124.63157391548157 seconds
Total time taken: 124.90125918388367 seconds


Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123611003,S717,2021-12-27,0.0
123611004,S717,2021-12-28,0.0
123611005,S717,2021-12-29,0.0
123611006,S717,2021-12-30,0.0


In [73]:
df_data_filtered['gauge_code'].nunique()

18503

In [74]:
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

2025-04-20 15:21:43
