In [5]:
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
import os
from joblib import Parallel, delayed

In [6]:
def idw_interpolation(latitude, longitude, df_temp_without_gauge, kdtree, p=2, k=5):
    """Estimate rainfall at a point by inverse-distance weighting (IDW).

    Up to ``k`` nearest neighbours of ``(latitude, longitude)`` are looked up
    in ``kdtree`` and their ``rain_mm`` values are averaged with weights
    ``1 / (distance + 1e-6) ** p``. Distances are whatever ``kdtree.query``
    returns, i.e. measured in the coordinate space the tree was built in.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates of the target point, in the same order as the columns
        used to build ``kdtree``.
    df_temp_without_gauge : pandas.DataFrame
        Frame with a ``rain_mm`` column. Row ``i`` (by position) must be the
        observation located at point ``i`` of ``kdtree``.
    kdtree : scipy.spatial.KDTree
        Spatial index over the candidate neighbour locations.
    p : float, default 2
        Exponent applied to the distance in the weight.
    k : int, default 5
        Maximum number of neighbours to use. Fewer are used when the tree
        holds fewer than ``k`` points.

    Returns
    -------
    float
        Weighted mean of the neighbours' ``rain_mm``; ``nan`` when there is
        no neighbour at all.
    """
    if len(df_temp_without_gauge) == 0:
        return np.nan

    distances, indices = kdtree.query([latitude, longitude], k=k)
    # query() returns scalars for k=1; normalise to 1-D arrays.
    distances = np.atleast_1d(distances)
    indices = np.atleast_1d(indices)

    # When the tree has fewer than k points, query() pads the result with
    # infinite distances and an out-of-range index; drop those entries.
    found = np.isfinite(distances)
    if not found.any():
        return np.nan
    distances = distances[found]
    indices = indices[found]

    # The small epsilon keeps the weight finite for a zero distance.
    weights = 1 / (distances + 1e-6) ** p
    values = df_temp_without_gauge['rain_mm'].to_numpy()[indices]
    return np.sum(weights * values) / np.sum(weights)

In [7]:
def process_date(ref_date, df_total, crossvalidation_path):
    """Run leave-one-out IDW cross-validation for one date and save it as HDF5.

    Every gauge that has a row on ``ref_date`` is withheld in turn, its
    rainfall is re-estimated with ``idw_interpolation`` (default settings)
    from the other gauges of the same date, and the estimate is stored next
    to the observed value. The resulting table is written to
    ``<crossvalidation_path>/<YYYY_MM_DD>_crossvalidation.h5`` under the key
    ``table_crossvalidation``.

    Parameters
    ----------
    ref_date : datetime-like
        Date to process; anything ``pandas.Timestamp`` accepts.
    df_total : pandas.DataFrame
        Observations with columns ``datetime``, ``gauge_code``, ``lat``,
        ``long`` and ``rain_mm``. Rows of other dates are ignored.
    crossvalidation_path : str
        Output directory; it is created when missing.

    Returns
    -------
    None
        The result is only written to disk. Nothing is written when the date
        has no gauge rows.
    """
    # Normalise so that numpy datetimes / integer nanoseconds are accepted too.
    ref_date = pd.Timestamp(ref_date)
    df_temp = df_total[df_total['datetime'] == ref_date]

    records = []
    for gauge_code in df_temp['gauge_code'].dropna().unique():
        is_target = df_temp['gauge_code'] == gauge_code

        # First row of the withheld gauge supplies its position and observation.
        target = df_temp.loc[is_target, ['lat', 'long', 'rain_mm']].iloc[0]
        latitude = target['lat']
        longitude = target['long']
        ground_value = target['rain_mm']

        df_temp_without_gauge = df_temp[~is_target]
        if df_temp_without_gauge.empty:
            # A lone gauge has no neighbours to interpolate from.
            interpolated_value = np.nan
        else:
            kdtree = KDTree(df_temp_without_gauge[['lat', 'long']].values)
            interpolated_value = idw_interpolation(latitude, longitude, df_temp_without_gauge, kdtree)

        records.append({
            'gauge_code': gauge_code,
            'datetime': ref_date,
            'lat': latitude,
            'long': longitude,
            'interpolated_rain_mm': interpolated_value,
            'rain_mm': ground_value,
        })

    if not records:
        print(f"no gauge rows for {ref_date.strftime('%Y_%m_%d')}, nothing written")
        return

    # One row per unique gauge and a single date: no de-duplication or
    # sorting by date is required.
    df_final_result = pd.DataFrame.from_records(
        records,
        columns=['gauge_code', 'datetime', 'lat', 'long', 'interpolated_rain_mm', 'rain_mm'],
    )

    # exist_ok avoids a race when several workers create the directory at once.
    os.makedirs(crossvalidation_path, exist_ok=True)
    output_path = os.path.join(crossvalidation_path, f"{ref_date.strftime('%Y_%m_%d')}_crossvalidation.h5")
    print(output_path)
    df_final_result.to_hdf(output_path
                            , key = 'table_crossvalidation'
                            , mode = 'w'
                            , append = False
                            , complevel = 9
                            , encoding="utf-8")

In [8]:
def main(
    file_path='./1 - Organized data gauge/BRAZIL/DATASETS/BRAZIL_DAILY_1961_2024_QC.h5',
    crossvalidation_path='./1 - Organized data gauge/BRAZIL/CROSSVALIDATION',
    start_date='2021-01-01',
    end_date='2024-12-31',
    n_jobs=-2,
):
    """Cross-validate the gauge dataset date by date, in parallel.

    Reads ``table_data`` and ``table_info`` from ``file_path``, joins them on
    ``gauge_code``, keeps the rows whose ``datetime`` lies in
    ``[start_date, end_date]`` and runs ``process_date`` once per distinct
    date through joblib.

    Parameters
    ----------
    file_path : str
        HDF5 file holding the ``table_info`` and ``table_data`` keys.
    crossvalidation_path : str
        Directory that receives one result file per date; created if missing.
    start_date, end_date : str
        Inclusive bounds of the period to process.
    n_jobs : int, default -2
        Passed to ``joblib.Parallel``.
    """
    df_info = pd.read_hdf(file_path, key='table_info')
    df_data = pd.read_hdf(file_path, key='table_data')
    df_total = pd.merge(df_data, df_info, on='gauge_code', how='left')

    in_period = (df_total['datetime'] >= start_date) & (df_total['datetime'] <= end_date)
    df_total = df_total.loc[in_period].sort_values('datetime', ignore_index=True, ascending=True)

    # Create the output directory once, before the workers start writing.
    os.makedirs(crossvalidation_path, exist_ok=True)

    print('data processing start...')
    # Grouping once hands each task only the rows of its own date, instead of
    # shipping the whole multi-year frame to every task and re-filtering it.
    Parallel(n_jobs=n_jobs)(
        delayed(process_date)(ref_date, df_day, crossvalidation_path)
        for ref_date, df_day in df_total.groupby('datetime', sort=True)
    )

    print('\ndata processing end...')

if __name__ == "__main__":
    main()

data processing start...

data processing end...
