In [1]:
from pathlib import Path
from statistics import mean

import numpy as np  # NumPy is the fundamental package for scientific computing with Python
import pandas as pd # pandas is a powerful data manipulation library
import xarray as xr # xarray is a powerful data structure that simplifies working with multi-dimensional arrays
# import h5py # h5py is a common package for working with HDF5 files
from joblib import Parallel, delayed
from scipy.spatial import cKDTree  # Replace KDTree with cKDTree for faster queries

In [2]:
# Path to the HDF5 store holding the quality-controlled daily gauge dataset.
cleaned_dataset_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5'

# Read the grid table from the store.
# NOTE: the output recorded for this cell is
# "KeyError: 'No object named table_grid in the file'", i.e. the store did not
# contain the 'table_grid' key when the cell was last executed.
table_grid = pd.read_hdf(path_or_buf=cleaned_dataset_path, key='table_grid')

# Preview the grid without duplicate rows, ordered by longitude then latitude
# (display only: the result is not assigned, so table_grid itself is unchanged).
table_grid.drop_duplicates().sort_values(by=['long', 'lat'])

KeyError: 'No object named table_grid in the file'

In [None]:
# Function for IDW interpolation

# In the IDW methodology, each of the nearest stations
# selected for the interpolation at a query point is weighted
# by W_k = d_k^(-p), where d_k is the distance between station k
# and the query point, and p is the power parameter.
# We use p = 2, as suggested by
# Dirks et al. (1998), Goovaerts (2000), Lloyd (2005), Ly et al. (2011) and Xavier et al. (2016).

def idw_interpolation(row, p=2, df_temp= pd.DataFrame(), grid_points=[]):
    """
    Estimate the rainfall of one grid cell by inverse-distance weighting (IDW).

    The cell centred on (row['lat'], row['long']) is sampled on a regular
    5 x 5 lattice of query points covering the 0.1-degree cell (0.025-degree
    spacing, both cell edges included). Every query point receives the IDW
    estimate computed from its nearest stations, and the 25 estimates are
    averaged into a single value for the cell.

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'lat' and 'long' for the cell centre.
    p : float, default 2
        Power parameter of the weights, w = 1 / (d + 1e-6) ** p.
    df_temp : pandas.DataFrame
        Stations to interpolate from; the columns 'lat', 'long' and 'rain_mm'
        are read. NaN handling is not performed here -- assumes the caller
        passes stations with valid coordinates and values (TODO confirm).
    grid_points : sequence
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    float
        Mean of the IDW estimates over the sub-grid, or NaN when ``df_temp``
        contains no rows.

    Notes
    -----
    Distances are Euclidean distances computed directly on the (lat, long)
    values. The KD-tree is rebuilt on every call because the stations arrive
    through ``df_temp``; a caller evaluating many cells against the same
    stations pays that construction cost once per cell.
    """
    max_neighbors = 5          # stations used per query point
    spatial_resolution = 0.1   # cell size
    points_per_axis = 5        # 0.025 spacing across the cell, edges included

    # No stations -> nothing to interpolate from.
    if len(df_temp) == 0:
        return float('nan')

    # Build KDTree from station locations
    locations = df_temp[['lat', 'long']].values
    station_values = df_temp['rain_mm'].values
    kdtree = cKDTree(locations)

    # Generate the sub-grid with a fixed number of points per axis. Deriving
    # the count from int((end - start) / step) is fragile: the quotient is a
    # float and truncation can silently drop the last point of an axis.
    half_cell = spatial_resolution / 2
    generated_latitudes = np.round(
        np.linspace(row['lat'] - half_cell, row['lat'] + half_cell, points_per_axis), 6
    )
    generated_longitudes = np.round(
        np.linspace(row['long'] - half_cell, row['long'] + half_cell, points_per_axis), 6
    )
    lat_mesh, lon_mesh = np.meshgrid(generated_latitudes, generated_longitudes, indexing='ij')
    query_points = np.column_stack([lat_mesh.ravel(), lon_mesh.ravel()])

    # Never ask for more neighbours than there are stations: cKDTree marks
    # missing neighbours with index == n, which is out of bounds for lookup.
    k = min(max_neighbors, len(locations))
    distances, indices = kdtree.query(query_points, k=k)
    # With k == 1 the query drops the neighbour axis; restore a 2-D layout.
    distances = np.asarray(distances).reshape(len(query_points), -1)
    indices = np.asarray(indices).reshape(len(query_points), -1)

    # Compute the inverse distance weights
    weights = 1 / (distances + 1e-6) ** p  # Adding a small value to prevent division by zero

    # Weighted average of the neighbouring station values at each query point
    neighbor_values = station_values[indices]
    estimates = np.sum(weights * neighbor_values, axis=1) / np.sum(weights, axis=1)

    # Cell value = mean over all sub-grid estimates
    return float(np.mean(estimates))

In [None]:
# Function to process a single date
def process_date(ref_date, df_data_info, df_coords_temp, grid_points):
    """
    Interpolate one date onto the grid with IDW and write the result to disk.

    Parameters
    ----------
    ref_date : datetime-like
        Date to process; compared against df_data_info['datetime'] and used,
        via ``ref_date.date()``, in the output file names.
    df_data_info : pandas.DataFrame
        Station records; rows whose 'datetime' equals ``ref_date`` are passed
        to ``idw_interpolation``.
    df_coords_temp : pandas.DataFrame
        Grid cells with 'lat' and 'long' columns; one value is computed per row.
    grid_points : array-like
        Forwarded unchanged to ``idw_interpolation``.

    Returns
    -------
    None
        Returns early, writing nothing, when no station has data for
        ``ref_date``. Otherwise writes two files named
        ``precipitation_idw_<date>`` (``.h5`` and ``.nc``) and prints each path.
    """
    # Filter stations with data for the current date
    df_temp = df_data_info[df_data_info['datetime'] == ref_date]

    if df_temp.empty:
        # print(f"No data for {ref_date}. Skipping.")
        return

    # IDW interpolation, one call per grid cell
    interpolated_rain = df_coords_temp.apply(
        lambda row: idw_interpolation(row, p=2, df_temp=df_temp, grid_points=grid_points),
        axis=1,
    )

    # Create output DataFrame (copy so the caller's grid frame is left untouched)
    df_precip = df_coords_temp.copy()
    df_precip['rain_mm'] = interpolated_rain
    df_precip['datetime'] = ref_date

    # Save to hdf5
    output_path = f'./1 - Organized data gauge/BRAZIL/NetCDF/IDW_optimization/precipitation_idw_{ref_date.date()}.h5'
    # Create the output folder on first use; exist_ok keeps this safe when
    # several dates are processed concurrently.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df_precip.to_hdf(output_path, key='table_data', mode='w', format='table', complevel=9, append=False)
    print(f"Saved: {output_path}")

    # Save to NetCDF
    ds = xr.Dataset.from_dataframe(df_precip.set_index(['lat', 'long', 'datetime']))
    ds['rain_mm'].attrs['units'] = 'mm'
    output_path = f'./1 - Organized data gauge/BRAZIL/NetCDF/IDW_optimization/precipitation_idw_{ref_date.date()}.nc'
    ds.to_netcdf(output_path)
    print(f"Saved: {output_path}")

In [None]:
# Main script
if __name__ == "__main__":
    # Load data
    cleaned_dataset_path = './1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5'
    df_data = pd.read_hdf(cleaned_dataset_path, key='table_data')
    df_info = pd.read_hdf(cleaned_dataset_path, key='table_info')
    df_coords = pd.read_hdf(cleaned_dataset_path, key='table_grid')

    # Merge data and info so every record carries its station coordinates
    df_data_info = pd.merge(df_data, df_info[['gauge_code', 'lat', 'long']], on='gauge_code', how='left')
    del df_data, df_info

    # One row per grid cell. Duplicated coordinates would be interpolated twice
    # and would give the (lat, long, datetime) index built for the NetCDF
    # export repeated entries, so drop them up front.
    df_coords_temp = df_coords[['lat', 'long']].drop_duplicates()

    # Define start_date and end_date
    start_date = '2011-01-01'
    end_date = '2011-01-01'

    # Filter dates of interest (done once; the date list is derived from the
    # already-filtered frame, so no second range filter is needed)
    df_data_info = df_data_info.query("datetime >= @start_date and datetime <= @end_date")
    df_date_list = pd.DataFrame(df_data_info['datetime'].drop_duplicates().sort_values())
    date_list = df_date_list['datetime'].tolist()

    # Precompute grid points
    grid_points = df_coords_temp[['lat', 'long']].values

    # Parallel processing with joblib (n_jobs=-2: all CPUs but one).
    # Each task receives only the records of its own date instead of the whole
    # merged frame, which keeps the data shipped to the workers small.
    # groupby yields the dates in sorted order, matching date_list.
    Parallel(n_jobs=-2)(
        delayed(process_date)(ref_date, df_day, df_coords_temp, grid_points)
        for ref_date, df_day in df_data_info.groupby('datetime')
    )