In [14]:
import os
import numpy as np
import pandas as pd
import json
from tqdm import tqdm

In [2]:
# Input/output locations. Paths are assembled with os.path.join so they
# resolve on any operating system rather than relying on "\" separators.

# Directory that holds the basin list file; outputs default to the same place.
basin_dir = os.path.join('..', 'data', 'camels', 'gauch_etal_2020')
basin_filename = 'basin_list_516.txt'
output_dir = basin_dir

# Directory of the per-basin USGS streamflow observation files.
obs_dir = os.path.join(basin_dir, 'usgs_streamflow')

# Directory of the per-basin NLDAS atmospheric forcing files.
forcing_path = os.path.join(basin_dir, 'nldas_hourly')

# JSON file that defines the spinup / calibration / testing periods.
time_split_file = os.path.join('..', 'data', 'model_common_configs', 'cal-val-test-period.json')


In [10]:

def load_basin_list(basin_file):
    """Read basin identifiers from a text file, one identifier per line.

    Parameters
    ----------
    basin_file : str or path-like
        Path to a plain-text file listing one basin id per line.

    Returns
    -------
    list of str
        Basin ids in file order with surrounding whitespace removed.
        Blank (or whitespace-only) lines are skipped so they never become
        an empty basin id that would later be turned into a bogus file name.
    """
    with open(basin_file, 'r') as file:
        # Iterate the handle lazily instead of materialising readlines().
        stripped = (line.strip() for line in file)
        return [basin for basin in stripped if basin]

def load_csv(file_path):
    """Load the CSV at ``file_path`` into a DataFrame with pandas' default options."""
    frame = pd.read_csv(filepath_or_buffer=file_path)
    return frame

def calc_nan_percentages_forcing(forcing_data, fields):
    """Return the NaN fraction of each requested forcing column.

    Despite the name, the values are fractions in [0, 1] (NaN count divided
    by column length), not percentages. The result list follows the order of
    ``fields``. A zero-length column raises ZeroDivisionError.
    """
    nan_fractions = []
    for field in fields:
        column = forcing_data[field]
        n_missing = np.count_nonzero(np.isnan(column))
        nan_fractions.append(n_missing / len(column))
    return nan_fractions

def calc_nan_percentages_obs(obs_data):
    """Return the NaN fraction of the 'QObs(mm/h)' column.

    The value is a fraction in [0, 1] (NaN count divided by the number of
    rows), not a percentage. A zero-length column raises ZeroDivisionError.
    """
    discharge = obs_data['QObs(mm/h)']
    n_missing = np.count_nonzero(np.isnan(discharge.values))
    return n_missing / len(discharge)

def calc_indices(data, datetime):
    """Locate the rows whose 'date' equals the period's start and end stamps.

    ``datetime`` is a mapping with "start_datetime" and "end_datetime" keys.
    Each returned element is the raw ``np.where`` result (a tuple holding one
    index array), so callers check ``result[0].size`` to detect a missing date.
    """
    dates = data['date']
    matches_start = dates == datetime["start_datetime"]
    matches_end = dates == datetime["end_datetime"]
    return np.where(matches_start), np.where(matches_end)

def initialize_nan_check():
    """Create the empty result table: one fresh list per report column.

    Columns are 'basin id' followed by '<phase> - <variable>' entries; the
    spinup phases carry only pet/precip, while calibration and testing also
    carry a usgs column. Insertion order is the column order.
    """
    # (phase, variables reported for that phase), in output column order.
    layout = (
        ('spinup-for-calibration', ('pet', 'precip')),
        ('spinup-for-testing', ('pet', 'precip')),
        ('calibration', ('pet', 'precip', 'usgs')),
        ('testing', ('pet', 'precip', 'usgs')),
    )
    nan_check = {'basin id': []}
    for phase, variables in layout:
        for variable in variables:
            nan_check[f'{phase} - {variable}'] = []
    return nan_check


In [15]:

def process_data(data_dir, obs_dir, forcing_path, time_split_file, output_dir,
                 basin_filename='basin_list_516.txt'):
    """Write a per-basin report of missing-value fractions for each period.

    For every basin in the basin list, the forcing and observation CSVs are
    loaded and, for every period in the time-split file, the fraction of NaN
    values between the period's start and end rows (inclusive) is recorded.
    If a period's start or end stamp is not found in a file, the fraction is
    recorded as 1 (treated as entirely missing) and a message is printed.

    Parameters
    ----------
    data_dir : str
        Directory containing the basin list file.
    obs_dir : str
        Directory containing '<basin>-usgs-hourly.csv' observation files.
    forcing_path : str
        Directory containing '<basin>_hourly_nldas.csv' forcing files.
    time_split_file : str
        JSON file mapping each phase name to a dict with "start_datetime"
        and "end_datetime". Phase names must match the columns created by
        ``initialize_nan_check``; an unknown phase raises KeyError.
    output_dir : str
        Directory where 'check_for_nan_in_data_hourly.csv' is written.
    basin_filename : str, optional
        Name of the basin list file inside ``data_dir``.

    Returns
    -------
    None
    """
    nan_check = initialize_nan_check()

    basin_list = load_basin_list(os.path.join(data_dir, basin_filename))
    # Context manager so the JSON file handle is closed deterministically.
    with open(time_split_file, 'r') as split_file:
        time_split = json.load(split_file)

    for basin in tqdm(basin_list):
        # Record the id once per basin. Without this the 'basin id' column
        # stays empty while every other column grows, and building the
        # DataFrame below fails with mismatched column lengths.
        nan_check['basin id'].append(basin)

        obs_data = load_csv(os.path.join(obs_dir, f'{basin}-usgs-hourly.csv'))
        forcing_data = load_csv(os.path.join(forcing_path, f'{basin}_hourly_nldas.csv'))

        for phase, fields in time_split.items():

            # Forcing
            start_idx_forcing, end_idx_forcing = calc_indices(forcing_data, fields)
            if start_idx_forcing[0].size == 0 or end_idx_forcing[0].size == 0:
                print(f"none or missing forcing data for {phase} period")
                nan_perc = [1, 1]
            else:
                # +1 makes the end row inclusive; the first match is used for each bound.
                sliced_forcing_data = forcing_data.iloc[start_idx_forcing[0][0]:end_idx_forcing[0][0]+1,:]
                nan_perc = calc_nan_percentages_forcing(sliced_forcing_data, ['potential_evaporation', 'total_precipitation'])

            # Observation
            start_idx_obs, end_idx_obs = calc_indices(obs_data, fields)
            if start_idx_obs[0].size == 0 or end_idx_obs[0].size == 0:
                print(f"none or missing observation data for {phase} period")
                nan_perc_obs = 1
            else:
                sliced_obs_data = obs_data.iloc[start_idx_obs[0][0]:end_idx_obs[0][0]+1,:]
                nan_perc_obs = calc_nan_percentages_obs(sliced_obs_data)

            nan_check[f'{phase} - pet'].append(nan_perc[0])
            nan_check[f'{phase} - precip'].append(nan_perc[1])
            # Spinup phases have no usgs column in the report.
            if "spinup" not in phase:
                nan_check[f'{phase} - usgs'].append(nan_perc_obs)

    df = pd.DataFrame(nan_check)
    df.to_csv(os.path.join(output_dir, "check_for_nan_in_data_hourly.csv"))

# Resolve every input location relative to the CAMELS data directory.
data_dir = os.path.join('..', 'data', 'camels', 'gauch_etal_2020')
output_dir = data_dir  # the report is written next to the input data
forcing_path = os.path.join(data_dir, 'nldas_hourly')
obs_dir = os.path.join(data_dir, 'usgs_streamflow')
time_split_file = os.path.join(
    '..', 'data', 'model_common_configs', 'cal-val-test-period.json'
)

# Run the NaN check for all listed basins; the CSV report lands in output_dir.
process_data(
    data_dir=data_dir,
    obs_dir=obs_dir,
    forcing_path=forcing_path,
    time_split_file=time_split_file,
    output_dir=output_dir,
)



  0%|          | 0/516 [00:00<?, ?it/s]

  7%|▋         | 37/516 [00:45<10:11,  1.28s/it]