In [2]:
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
import os

# Data processing

In [None]:
neighboring_data_path = './1 - Organized data gauge/BRAZIL'
cleaned_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_CLEANED.h5'

In [None]:
df_data = pd.read_hdf(cleaned_path, key = 'table_data', encoding = 'utf-8')
df_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
1076753,S717,2021-12-27,0.0
1076754,S717,2021-12-28,0.0
1076755,S717,2021-12-29,0.0
1076756,S717,2021-12-30,0.0


In [None]:
df_info = pd.read_hdf(cleaned_path, key = 'table_info', encoding = 'utf-8')
df_info

Unnamed: 0,name_station,gauge_code,city,state,responsible,source,state_abbreviation,lat,long
18981,SELVIRIA | S717,S717,SELVIRIA,MATO GROSSO DO SUL,INMET,INMET,MS,-20.351389,-51.430278
18980,SANTA RITA DO PARDO | S716,S716,SANTA RITA DO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-21.305889,-52.820375
18979,RIBAS DO RIO PARDO | S715,S715,RIBAS DO RIO PARDO,MATO GROSSO DO SUL,INMET,INMET,MS,-20.466694,-53.763028
18978,PEDRO GOMES | S714,S714,PEDRO GOMES,MATO GROSSO DO SUL,INMET,INMET,MS,-18.072778,-54.548889
18977,NOVA ANDRADINA | S713,S713,NOVA ANDRADINA,MATO GROSSO DO SUL,INMET,INMET,MS,-22.078611,-53.465833
...,...,...,...,...,...,...,...,...,...
4,MARUDA,00047005,MARAPANIM,PARÁ,ANA,HIDROWEB,PA,-0.633600,-47.658300
3,PRIMAVERA,00047004,PRIMAVERA,PARÁ,ANA,HIDROWEB,PA,-0.929400,-47.099400
2,CURUÇA,00047003,CURUÇA,PARÁ,ANA,HIDROWEB,PA,-0.737500,-47.853600
1,SALINÓPOLIS,00047002,SALINÓPOLIS,PARÁ,ANA,HIDROWEB,PA,-0.623100,-47.353600


In [4]:
df_complete_info = pd.merge(df_data, df_info, on='gauge_code', how = 'inner').sort_values('lat', ascending = True)
# .dropna(how='any')
df_complete_info

# left: 123612842 rows
# inner: 123596321 rows

Unnamed: 0,gauge_code,datetime,rain_mm,name_station,city,state,responsible,source,state_abbreviation,lat,long
123521436,A899,2022-01-13,0.0,Santa Vitoria do Palmar - Barra do Chui | A899,SANTA VITORIA DO PALMAR - BARRA DO CHUI,RIO GRANDE DO SUL,INMET,INMET,RS,-33.742222,-53.372222
123521704,A899,2022-10-08,0.0,Santa Vitoria do Palmar - Barra do Chui | A899,SANTA VITORIA DO PALMAR - BARRA DO CHUI,RIO GRANDE DO SUL,INMET,INMET,RS,-33.742222,-53.372222
123521705,A899,2022-10-09,2.4,Santa Vitoria do Palmar - Barra do Chui | A899,SANTA VITORIA DO PALMAR - BARRA DO CHUI,RIO GRANDE DO SUL,INMET,INMET,RS,-33.742222,-53.372222
123521706,A899,2022-10-10,0.0,Santa Vitoria do Palmar - Barra do Chui | A899,SANTA VITORIA DO PALMAR - BARRA DO CHUI,RIO GRANDE DO SUL,INMET,INMET,RS,-33.742222,-53.372222
123521707,A899,2022-10-11,0.0,Santa Vitoria do Palmar - Barra do Chui | A899,SANTA VITORIA DO PALMAR - BARRA DO CHUI,RIO GRANDE DO SUL,INMET,INMET,RS,-33.742222,-53.372222
...,...,...,...,...,...,...,...,...,...,...,...
114849060,08460003,2017-01-07,0.0,ÁGUA FRIA,UIRAMUTA,RORAIMA,ANA,HIDROWEB,RR,4.642800,-60.496400
114849061,08460003,2017-01-08,0.0,ÁGUA FRIA,UIRAMUTA,RORAIMA,ANA,HIDROWEB,RR,4.642800,-60.496400
114849062,08460003,2017-01-09,0.0,ÁGUA FRIA,UIRAMUTA,RORAIMA,ANA,HIDROWEB,RR,4.642800,-60.496400
114849049,08460003,2016-12-27,0.0,ÁGUA FRIA,UIRAMUTA,RORAIMA,ANA,HIDROWEB,RR,4.642800,-60.496400


In [15]:
len(df_complete_info['gauge_code'].unique())

18370

In [16]:
len(df_info['gauge_code'].unique())

18370

In [8]:
# Add a year column to the dataframe
df_complete_info['year'] = df_complete_info['datetime'].dt.year

# Group by gauge_code and year
grouped = df_complete_info.groupby(['gauge_code', 'year'])

# Calculate annual_rainfall_mm, active_days, and consecutive_dry_days
def calculate_metrics(group):
    annual_rainfall_mm = group['rain_mm'].sum()
    active_days = (group['rain_mm'] >= 0.0).sum()
    
    # Calculate consecutive dry days
    dry_days = (group['rain_mm'] == 0.0).astype(int)
    # Calculate the maximum number of consecutive dry days
    consecutive_dry_days = (dry_days.groupby((dry_days != dry_days.shift()).cumsum()).cumsum() * dry_days).max()
    
    return pd.Series({
        'annual_rainfall_mm': annual_rainfall_mm,
        'active_days': active_days,
        'consecutive_dry_days': consecutive_dry_days
    })

In [18]:
df_preclassif = grouped.apply(calculate_metrics).reset_index()
df_preclassif

# 345868 rows × 5 columns

  df_preclassif = grouped.apply(calculate_metrics).reset_index()


Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days
0,00047000,1961,2186.0,365.0,265.0
1,00047000,1962,273.8,365.0,93.0
2,00047000,1963,686.2,365.0,71.0
3,00047000,1964,597.5,366.0,93.0
4,00047002,1977,133.4,23.0,5.0
...,...,...,...,...,...
345863,S713,2021,76.2,365.0,137.0
345864,S714,2021,828.0,365.0,58.0
345865,S715,2021,1041.8,365.0,38.0
345866,S716,2021,928.8,365.0,32.0


In [19]:
df_preclassif['preclassif'] = df_preclassif.apply(
    lambda row: 'LQ' if (row['annual_rainfall_mm'] < 200 or row['annual_rainfall_mm'] > 6000
                        #  or row['active_days'] < 305
                         or row['consecutive_dry_days'] > 200) else "", axis=1)
df_preclassif

Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days,preclassif
0,00047000,1961,2186.0,365.0,265.0,LQ
1,00047000,1962,273.8,365.0,93.0,
2,00047000,1963,686.2,365.0,71.0,
3,00047000,1964,597.5,366.0,93.0,
4,00047002,1977,133.4,23.0,5.0,LQ
...,...,...,...,...,...,...
345863,S713,2021,76.2,365.0,137.0,LQ
345864,S714,2021,828.0,365.0,58.0,
345865,S715,2021,1041.8,365.0,38.0,
345866,S716,2021,928.8,365.0,32.0,


In [20]:
preclassif_counts = df_preclassif['preclassif'].value_counts()
print(preclassif_counts)

preclassif
      313264
LQ     32604
Name: count, dtype: int64


In [21]:
preclassif_counts['LQ'] / preclassif_counts.sum()

0.09426717707333433

In [None]:
df_preclassif.to_hdf(cleaned_path, key='table_preclassif', mode='r+', complevel=9, complib='zlib')
df_preclassif

Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days,preclassif
0,00047000,1961,2186.0,365.0,265.0,LQ
1,00047000,1962,273.8,365.0,93.0,
2,00047000,1963,686.2,365.0,71.0,
3,00047000,1964,597.5,366.0,93.0,
4,00047002,1977,133.4,23.0,5.0,LQ
...,...,...,...,...,...,...
345863,S713,2021,76.2,365.0,137.0,LQ
345864,S714,2021,828.0,365.0,58.0,
345865,S715,2021,1041.8,365.0,38.0,
345866,S716,2021,928.8,365.0,32.0,


In [None]:
df_preclassif = pd.read_hdf(cleaned_path, key = 'table_preclassif', encoding = 'utf-8')
df_preclassif

Unnamed: 0,gauge_code,year,annual_rainfall_mm,active_days,consecutive_dry_days,preclassif
0,00047000,1961,2186.0,365.0,265.0,LQ
1,00047000,1962,273.8,365.0,93.0,
2,00047000,1963,686.2,365.0,71.0,
3,00047000,1964,597.5,366.0,93.0,
4,00047002,1977,133.4,23.0,5.0,LQ
...,...,...,...,...,...,...
345863,S713,2021,76.2,365.0,137.0,LQ
345864,S714,2021,828.0,365.0,58.0,
345865,S715,2021,1041.8,365.0,38.0,
345866,S716,2021,928.8,365.0,32.0,


# Outlier treatment

In [9]:
df_outlier = pd.merge(df_complete_info[['gauge_code', 'rain_mm', 'datetime', 'year']], df_preclassif, on = ['gauge_code', 'year'], how = 'left')
df_outlier = df_outlier[df_outlier['preclassif'] != 'LQ']
df_outlier = df_outlier[['gauge_code',	'datetime',	'rain_mm']]
df_outlier

Unnamed: 0,gauge_code,datetime,rain_mm
0,A899,2022-01-13,0.0
1,A899,2022-10-08,0.0
2,A899,2022-10-09,2.4
3,A899,2022-10-10,0.0
4,A899,2022-10-11,0.0
...,...,...,...
123596316,08460003,2017-01-07,0.0
123596317,08460003,2017-01-08,0.0
123596318,08460003,2017-01-09,0.0
123596319,08460003,2016-12-27,0.0


In [25]:
del df_complete_info

In [26]:
def mark_outlier_rain(df, threshold_rain_mm=200):
    # Garantir que a coluna 'datetime' está no formato datetime
    df['datetime'] = pd.to_datetime(df['datetime'])

    # Ordenar por estação e data
    df_sorted = df.sort_values(['gauge_code', 'datetime']).copy()

    # Calcular chuva do dia anterior e do dia seguinte
    df_sorted['yesterday_rain_mm'] = df_sorted.groupby('gauge_code')['rain_mm'].shift(1)
    df_sorted['tomorrow_rain_mm'] = df_sorted.groupby('gauge_code')['rain_mm'].shift(-1)

    # Calcular soma da chuva nos dias adjacentes
    df_sorted['adjacent_days_mm'] = df_sorted['yesterday_rain_mm'] + df_sorted['tomorrow_rain_mm']

    # Regra para identificar outlier
    condition = (
        (df_sorted['rain_mm'] > threshold_rain_mm) &
        (df_sorted['adjacent_days_mm'] < 0.025 * df_sorted['rain_mm'])
    )

    df_sorted['outlier_status_1'] = np.where(condition, 1, 0)

    return df_sorted

# Aplicar na base df_outlier_filter_1
df_outlier_filter_1 = mark_outlier_rain(df_outlier)
df_outlier_filter_1

Unnamed: 0,gauge_code,datetime,rain_mm,yesterday_rain_mm,tomorrow_rain_mm,adjacent_days_mm,outlier_status_1
122174860,00047000,1962-01-01,0.0,,0.0,,0
122174861,00047000,1962-01-02,0.0,0.0,0.0,0.0,0
122174862,00047000,1962-01-03,0.0,0.0,0.0,0.0,0
122174863,00047000,1962-01-04,0.0,0.0,0.0,0.0,0
122174864,00047000,1962-01-05,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...
48826156,S716,2021-12-27,0.0,0.0,0.0,0.0,0
48826157,S716,2021-12-28,0.0,0.0,0.0,0.0,0
48826158,S716,2021-12-29,0.0,0.0,4.4,4.4,0
48826159,S716,2021-12-30,4.4,0.0,3.2,3.2,0


In [28]:
# Configurations
neighboring_data_path = './1 - Organized data gauge/BRAZIL'

In [29]:
df_outlier_filter_1_export = df_outlier_filter_1[df_outlier_filter_1['outlier_status_1'] == 1]
df_outlier_filter_1_export.to_hdf(
        os.path.join(neighboring_data_path, "adjacent_day_analysis_filter_1.h5"),
        key='table_data',
        mode='w',
        format='table',
        complevel=9,
        encoding='utf-8',
        append=False,
        min_itemsize={'gauge_code': 20}  # Adjust based on your max gauge code length
    )
df_outlier_filter_1_export

Unnamed: 0,gauge_code,datetime,rain_mm,yesterday_rain_mm,tomorrow_rain_mm,adjacent_days_mm,outlier_status_1
122183545,00047005,2011-01-23,201.8,0.0,0.0,0.0,1
122183542,00047005,2011-01-26,201.2,0.0,3.1,3.1,1
122415828,00048005,1980-01-14,202.3,0.0,0.0,0.0,1
122415748,00048005,1980-04-08,229.3,0.0,0.0,0.0,1
122415694,00048005,1981-01-25,238.4,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...
3345677,86743950,2021-03-20,566.0,0.0,3.0,3.0,1
91180725,A110,2024-01-04,214.6,0.2,2.2,2.4,1
31942356,A606,2022-11-27,201.0,0.0,1.0,1.0,1
33597804,A667,2023-02-08,288.8,0.0,0.0,0.0,1


In [None]:
df_outlier_filter_1 = df_outlier_filter_1[['gauge_code','datetime',	'rain_mm', 'outlier_status_1']]
df_outlier_filter_1 = df_outlier_filter_1[df_outlier_filter_1['outlier_status_1']==0]
df_outlier_filter_1 = pd.merge(df_outlier_filter_1, df_info[['gauge_code', 'lat', 'long']], on ='gauge_code', how='left')

df_outlier_filter_1

Unnamed: 0,gauge_code,datetime,rain_mm,outlier_status_1,lat,long
0,00047000,1962-01-01,0.0,0,-0.650000,-47.550000
1,00047000,1962-01-02,0.0,0,-0.650000,-47.550000
2,00047000,1962-01-03,0.0,0,-0.650000,-47.550000
3,00047000,1962-01-04,0.0,0,-0.650000,-47.550000
4,00047000,1962-01-05,0.0,0,-0.650000,-47.550000
...,...,...,...,...,...,...
112620984,S716,2021-12-27,0.0,0,-21.305889,-52.820375
112620985,S716,2021-12-28,0.0,0,-21.305889,-52.820375
112620986,S716,2021-12-29,0.0,0,-21.305889,-52.820375
112620987,S716,2021-12-30,4.4,0,-21.305889,-52.820375


In [None]:
del df_info

In [None]:
# def idw_interpolation(latitude, longitude, df_temp_without_gauge, kdtree, p=2):
#     row = [latitude, longitude]
#     distances, indices = kdtree.query(row, k=5)
#     weights = 1 / (distances + 1e-6) ** p
#     values = df_temp_without_gauge.iloc[indices]['rain_mm'].values
#     return (np.sum(weights * values) / np.sum(weights))

# # Initialize empty DataFrame for results
# outlier_analysis_results = pd.DataFrame()

# # Configurations
# neighboring_data_path = './1 - Organized data gauge/BRAZIL'
# output_filename = os.path.join(neighboring_data_path, "neighboring_analysis.h5")
# rainfall_threshold = 200.0  # mm

# start_date = '1969-07-01'
# end_date = '2024-12-31'

# df_date = df_outlier_filter_1.loc[(df_outlier_filter_1['datetime'] >= start_date) & (df_outlier_filter_1['datetime'] <= end_date)].sort_values('datetime', ignore_index=True, ascending=True)

# # Get sorted unique dates
# analysis_dates = df_date['datetime'].unique().tolist()
# analysis_dates.sort()

# # Process each date
# for current_date in analysis_dates[:]:
#     # Filter data for current date
#     daily_data = df_outlier_filter_1[df_outlier_filter_1['datetime'] == current_date]
#     gauge_codes = daily_data['gauge_code'].unique()
    
#     date_results = []
    
#     for gauge in gauge_codes:
#         gauge_data = daily_data[daily_data['gauge_code'] == gauge].iloc[0]
#         lat, lon = gauge_data['lat'], gauge_data['long']
#         observed_rain = gauge_data['rain_mm']
        
#         # Initialize result row
#         result_row = {
#             'gauge_code': gauge,
#             'datetime': current_date,
#             'lat': lat,
#             'long': lon,
#             'observed_rain_mm': observed_rain,
#             'interpolated_rain_mm': np.nan
#         }
        
#         # Only interpolate for high rainfall values
#         if observed_rain > rainfall_threshold:
#             neighbor_data = daily_data[daily_data['gauge_code'] != gauge]
            
#             if len(neighbor_data) > 0:
#                 kd_tree = KDTree(neighbor_data[['lat', 'long']].values)
#                 result_row['interpolated_rain_mm'] = idw_interpolation(lat, lon, neighbor_data, kd_tree)
        
#         date_results.append(pd.DataFrame([result_row]))
    
#     # Combine results for current date
#     daily_results = pd.concat(date_results, ignore_index=True)
    
#     # Save to HDF5 with proper configuration
#     # storage_mode = 'w' if current_date == analysis_dates[0] else 'a'
#     # append_mode = False if current_date == analysis_dates[0] else True

#     storage_mode = 'r+'
#     append_mode = True
    
#     daily_results.to_hdf(
#         output_filename,
#         key='table_data',
#         mode=storage_mode,
#         format='table',
#         complevel=9,
#         encoding='utf-8',
#         append=append_mode,
#         min_itemsize={'gauge_code': 20}  # Adjust based on your max gauge code length
#     )
    
#     print(f"Saved results for {current_date} to {output_filename}")

Saved results for 1969-07-01 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-02 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-03 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-04 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-05 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-06 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-07 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-08 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-09 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved results for 1969-07-10 00:00:00 to ./1 - Organized data gauge/BRAZIL\neighboring_analysis.h5
Saved resu

In [36]:
del df_outlier_filter_1

In [None]:
output_filename = os.path.join(neighboring_data_path, "neighboring_analysis.h5")

df_outlier_2 = pd.read_hdf(output_filename, key='table_data')
df_outlier_2

Unnamed: 0,gauge_code,datetime,lat,long,observed_rain_mm,interpolated_rain_mm
0,00048000,1961-01-01,-0.727800,-48.515800,0.0,
1,00062000,1961-01-01,-0.971100,-62.928600,0.0,
2,00067000,1961-01-01,-0.125300,-67.061100,0.0,
3,00145004,1961-01-01,-1.667800,-45.368300,0.0,
4,00148001,1961-01-01,-1.450000,-48.500000,0.0,
...,...,...,...,...,...,...
4894,B803,2024-12-31,-24.570833,-52.800278,0.0,
4895,B804,2024-12-31,-25.368889,-52.391944,0.0,
4896,B806,2024-12-31,-25.322464,-49.157733,0.0,
4897,B807,2024-12-31,-30.186111,-51.178056,0.0,


In [None]:
df_outlier_filter_2_export = df_outlier_2[df_outlier_2['interpolated_rain_mm'] >= 0.0].reset_index(drop = True)
df_outlier_filter_2_export = df_outlier_filter_2_export[df_outlier_filter_2_export['interpolated_rain_mm'] >= 0.35 * df_outlier_filter_2_export['observed_rain_mm']]
df_outlier_filter_2_export['outlier_status_2'] = 1
df_outlier_filter_2_export

Unnamed: 0,gauge_code,datetime,lat,long,observed_rain_mm,interpolated_rain_mm
7,02346034,1961-01-23,-23.4167,-46.5667,200.30,128.216270
10,02346162,1961-01-26,-23.7000,-46.0667,231.00,182.781510
11,02346223,1961-01-26,-23.7000,-46.0167,272.60,151.228139
12,02345145,1961-01-27,-23.5997,-45.9089,201.40,71.139330
17,02447038,1961-02-16,-24.8000,-47.9667,252.60,113.109432
...,...,...,...,...,...,...
3029,82230815,2024-05-27,-25.7122,-48.9706,342.40,412.382794
3030,82234000,2024-05-27,-25.8181,-48.8072,563.00,314.877713
3060,62780700,2024-11-04,-21.8194,-48.8281,218.20,120.550931
3087,65824880,2024-12-09,-25.6406,-51.8603,209.60,78.153355


In [7]:
del df_outlier_2

In [9]:
df_outlier_filter_2_export.to_hdf(
        os.path.join(neighboring_data_path, "neighboring_analysis_filter_2.h5"),
        key='table_data',
        mode='w',
        format='table',
        complevel=9,
        encoding='utf-8',
        append=False,
        min_itemsize={'gauge_code': 20}  # Adjust based on your max gauge code length
    )
df_outlier_filter_2_export

Unnamed: 0,gauge_code,datetime,lat,long,observed_rain_mm,interpolated_rain_mm
7,02346034,1961-01-23,-23.4167,-46.5667,200.30,128.216270
10,02346162,1961-01-26,-23.7000,-46.0667,231.00,182.781510
11,02346223,1961-01-26,-23.7000,-46.0167,272.60,151.228139
12,02345145,1961-01-27,-23.5997,-45.9089,201.40,71.139330
17,02447038,1961-02-16,-24.8000,-47.9667,252.60,113.109432
...,...,...,...,...,...,...
3029,82230815,2024-05-27,-25.7122,-48.9706,342.40,412.382794
3030,82234000,2024-05-27,-25.8181,-48.8072,563.00,314.877713
3060,62780700,2024-11-04,-21.8194,-48.8281,218.20,120.550931
3087,65824880,2024-12-09,-25.6406,-51.8603,209.60,78.153355


# Data Deletion

In [37]:
df_filter_1 = pd.read_hdf(os.path.join(neighboring_data_path, "adjacent_day_analysis_filter_1.h5"), key='table_data')
df_filter_1 = df_filter_1[['gauge_code', 'datetime', 'outlier_status_1']]
df_filter_1

Unnamed: 0,gauge_code,datetime,outlier_status_1
122183545,00047005,2011-01-23,1
122183542,00047005,2011-01-26,1
122415828,00048005,1980-01-14,1
122415748,00048005,1980-04-08,1
122415694,00048005,1981-01-25,1
...,...,...,...
3345677,86743950,2021-03-20,1
91180725,A110,2024-01-04,1
31942356,A606,2022-11-27,1
33597804,A667,2023-02-08,1


In [38]:

df_filter_2 = pd.read_hdf(os.path.join(neighboring_data_path, "neighboring_analysis_filter_2.h5"), key='table_data')
df_filter_2['outlier_status_2'] = 1
df_filter_2 = df_filter_2[['gauge_code', 'datetime', 'outlier_status_2']]
df_filter_2

Unnamed: 0,gauge_code,datetime,outlier_status_2
7,02346034,1961-01-23,1
10,02346162,1961-01-26,1
11,02346223,1961-01-26,1
12,02345145,1961-01-27,1
17,02447038,1961-02-16,1
...,...,...,...
3029,82230815,2024-05-27,1
3030,82234000,2024-05-27,1
3060,62780700,2024-11-04,1
3087,65824880,2024-12-09,1


In [None]:
df_data_filtered = pd.merge(df_data, df_filter_1, on = ['gauge_code', 'datetime'], how = 'left').merge(df_filter_2, on = ['gauge_code', 'datetime'], how = 'left')
df_data_filtered = df_data_filtered[(df_data_filtered['outlier_status_1'] != 1) & (df_data_filtered['outlier_status_2'] != 1)]
df_data_filtered

Unnamed: 0,gauge_code,datetime,rain_mm,outlier_status_1,outlier_status_2
0,00047000,1961-01-01,0.0,,
1,00047000,1961-01-02,0.0,,
2,00047000,1961-01-03,0.0,,
3,00047000,1961-01-04,0.0,,
4,00047000,1961-01-05,0.0,,
...,...,...,...,...,...
123612837,S717,2021-12-27,0.0,,
123612838,S717,2021-12-28,0.0,,
123612839,S717,2021-12-29,0.0,,
123612840,S717,2021-12-30,0.0,,


In [None]:
key = 'table_data_filtered'
chunk_size = 10000000  # Adjust based on available memory

with pd.HDFStore(cleaned_path, mode='r+', complevel=9, complib='blosc') as store:
    for start in range(0, len(df_data_filtered_test), chunk_size):
        end = start + chunk_size
        chunk = df_data_filtered_test.iloc[start:end]
        
        store.append(key, chunk, format='table', data_columns=True, encoding='utf-8', min_itemsize={'gauge_code': 20})

        print(f"Chunk {start // chunk_size + 1} of {len(df_data_filtered_test) // chunk_size + 1} completed.")

Chunk 1 of 13 completed.
Chunk 2 of 13 completed.
Chunk 3 of 13 completed.
Chunk 4 of 13 completed.
Chunk 5 of 13 completed.
Chunk 6 of 13 completed.
Chunk 7 of 13 completed.
Chunk 8 of 13 completed.
Chunk 9 of 13 completed.
Chunk 10 of 13 completed.
Chunk 11 of 13 completed.
Chunk 12 of 13 completed.
Chunk 13 of 13 completed.


In [8]:
df_data_filtered_test = pd.read_hdf(cleaned_path, key = 'table_data_filtered', encoding = 'utf-8')
df_data_filtered_test

Unnamed: 0,gauge_code,datetime,rain_mm
0,00047000,1961-01-01,0.0
1,00047000,1961-01-02,0.0
2,00047000,1961-01-03,0.0
3,00047000,1961-01-04,0.0
4,00047000,1961-01-05,0.0
...,...,...,...
123612837,S717,2021-12-27,0.0
123612838,S717,2021-12-28,0.0
123612839,S717,2021-12-29,0.0
123612840,S717,2021-12-30,0.0


In [4]:
df_data_filtered_test['gauge_code'].nunique()

18503