In [1]:
import sys
import os
import platform
import importlib
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from concurrent.futures import ProcessPoolExecutor
from functools import partial

# Select the machine-specific data roots from the host operating system.
if platform.system() == 'Darwin':  # macOS
    base_FP = '/Users/hyunglokkim/Insync/hkim@geol.sc.edu/Google_Drive'
    cpuserver_data_FP = '/Users/hyunglokkim/cpuserver_data'
elif platform.system() == 'Linux':
    base_FP = '/home/subin/data'
    cpuserver_data_FP = '/home/subin/cpuserver_data'
else:
    base_FP = '/data'
    cpuserver_data_FP = '/data'

# Make the shared python_modules folder importable. The membership check keeps
# the cell idempotent: re-running it no longer stacks duplicate sys.path entries.
_python_modules_FP = base_FP + '/python_modules'
if _python_modules_FP not in sys.path:
    sys.path.append(_python_modules_FP)
print(base_FP, cpuserver_data_FP)

#hydroAI libs
import HydroAI.ASCAT_TUW as hASCAT_TUW
import HydroAI.Plot as hPlot
import HydroAI.Data as hData
# Reload each HydroAI helper in import order (ASCAT_TUW, Plot, Data) so that
# edits to their source files take effect without restarting the kernel
for _hydro_module in (hASCAT_TUW, hPlot, hData):
    importlib.reload(_hydro_module)

# Ignore runtime warnings only. A blanket "ignore" would also silence the
# deprecation/future warnings raised by pandas and numpy, which are the ones
# that announce upcoming breakage in this notebook.
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Define your directory where to save nc files
nc_save_dir = cpuserver_data_FP + '/extracted_nc'
# Directory that receives the per-cell CSV files
output_dir  = cpuserver_data_FP + '/ASCAT/TUW/csv'

/Users/hyunglokkim/Insync/hkim@geol.sc.edu/Google_Drive /Users/hyunglokkim/cpuserver_data


## 0. Create ASCAT data from raw files

In [2]:
# Locate the WARP5 Grid Information (GI) nc file below the ASCAT/TUW data root
ASCAT_FP = f'{cpuserver_data_FP}/ASCAT/TUW'
GI_file_path = os.path.join(ASCAT_FP, 'warp5_grid/TUW_WARP5_grid_info_2_1.nc')
# Print the name / long name / units table of the file's variables
hData.get_nc_variable_names_units(GI_file_path);

# Read the four grid variables in one pass (same order as the target names)
gpi, lat, lon, cell = (
    hData.get_variable_from_nc(GI_file_path, grid_var)
    for grid_var in ('gpi', 'lat', 'lon', 'cell')
)

+-----------+------------------+--------------+
| Name      | Long Name        | Units        |
| cell      |                  |              |
+-----------+------------------+--------------+
| gpi       | Grid point index |              |
+-----------+------------------+--------------+
| land_flag | Land flag        |              |
+-----------+------------------+--------------+
| lat       | Latitude         | degree_north |
+-----------+------------------+--------------+
| lon       | Cell             |              |
+-----------+------------------+--------------+


In [3]:
# ASCAT nc files
# Collect the nc files of both product folders: h119 first, then h120
h119, h120 = (
    hData.get_file_list(os.path.join(ASCAT_FP, product), 'nc')
    for product in ('h119', 'h120')
)
# Left pointing at the h120 folder, the last folder that was listed
nc_file_path = os.path.join(ASCAT_FP, 'h120')

# One combined list; positions in it are the file indices used further down
ASCAT_file_path = h119 + h120
# Print the variable table of the first file
hData.get_nc_variable_names_units(ASCAT_file_path[0]);
nof = len(ASCAT_file_path)

+----------------------+-----------------------------------------+--------------------------------+
| Name                 | Long Name                               | Units                          |
| row_size             | number of observations at this location |                                |
+----------------------+-----------------------------------------+--------------------------------+
| lon                  | location longitude                      | degrees_east                   |
+----------------------+-----------------------------------------+--------------------------------+
| lat                  | location latitude                       | degrees_north                  |
+----------------------+-----------------------------------------+--------------------------------+
| alt                  | vertical distance above the surface     | m                              |
+----------------------+-----------------------------------------+--------------------------------+


In [4]:
# Group the positions of ASCAT_file_path by cell number.
# index_map maps a cell identifier to the list of file indices that carry it.
index_map = {}

for index, path in enumerate(ASCAT_file_path):
    # Text between the last '_' and the following '.', e.g. '0032' for H119_0032.nc
    identifier = path.split('_')[-1].split('.')[0]
    # Dict lookup instead of a list membership test keeps the loop linear
    index_map.setdefault(identifier, []).append(index)

# Dicts keep insertion order, so this is the order of first appearance
unique_cell_numbers = list(index_map)

In [5]:
# Function to convert fraction of days since 1900-01-01 00:00:00 UTC to local time
def convert_to_local_time(df):
    """Shift UTC observation times to a longitude-based local time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'time' column (fractional days since
        1900-01-01 00:00:00 UTC) and a 'lon' column (degrees; 15 degrees
        are treated as one hour of offset). Object-dtype columns holding
        numbers are accepted.

    Returns
    -------
    pandas.Series
        Local time as fractional days since 1900-01-01 00:00:00, indexed
        like ``df``.
    """
    base_date = datetime(1900, 1, 1, 0, 0, 0)

    # The columns can arrive as object dtype (frames assembled cell by cell),
    # so make them numeric before any timedelta arithmetic
    days_since_base = pd.to_numeric(df['time'])
    lon_degrees = pd.to_numeric(df['lon'])

    # Calculate UTC time
    utc_time = base_date + pd.to_timedelta(days_since_base, unit='D')

    # Calculate timezone offset based on longitude (15 degrees per hour).
    # Lowercase 'h' is required: the uppercase 'H' alias is deprecated in pandas 2.2
    timezone_offset = pd.to_timedelta(lon_degrees / 15, unit='h')

    # Convert to local time, then back to fraction of days since the base date
    local_time = utc_time + timezone_offset
    return (local_time - base_date) / timedelta(days=1)

In [6]:
# Per-location variables read from every file; 'row_size' is the number of
# observations stored for each location
geo_vars = ['row_size', 'lon', 'lat', 'location_id']
# Per-observation variables read from every file
obs_vars = ['time', 'sm', 'sm_noise', 'ssf', 'dir', 'proc_flag', 'corr_flag', 'conf_flag','sigma40', 'sigma40_noise']

# Serial run for one cell: k is a position in unique_cell_numbers.
# All files of that cell are flattened into one table and written as one CSV.
for k in [2]:  # TODO: use range(len(unique_cell_numbers)) to process every cell

    # Positions in ASCAT_file_path of the files that belong to this cell
    h_file_index = index_map.get(unique_cell_numbers[k], [])
    
    obs_df_list = []
    for kk in h_file_index:
        t_ASCAT_file_name = ASCAT_file_path[kk]
        t_cell_number = t_ASCAT_file_name[-7:-3] # cell #: the 4 characters before '.nc'
        #t_h_number = t_ASCAT_file_name[-11:-8]   # H119 or H120
    
        print(t_ASCAT_file_name)
        # Extract geolocation related data
        # NOTE(review): every array is stored as a module-level variable named
        # t_<var> (t_lon, t_sm, ...) through globals(); this only works because
        # the cell runs at module scope
        geo_df = pd.DataFrame(columns=geo_vars).drop(columns='row_size')
        for var in geo_vars:
            globals()[f't_{var}'] = hData.get_variable_from_nc(t_ASCAT_file_name, var)
        
        # Extract observation related data
        for var in obs_vars:
            globals()[f't_{var}'] = hData.get_variable_from_nc(t_ASCAT_file_name, var)
        
        t_dir = t_dir.astype(float)  # Convert orbit_dir to float
        # Drop the NaN entries of row_size
        # NOTE(review): t_lon/t_lat/t_location_id are indexed by position in the
        # filtered array below - assumes the NaN entries are trailing; confirm
        t_row_size = t_row_size[~np.isnan(t_row_size)]
        # Empty table with one row per observation in this file
        obs_partial_df = pd.DataFrame(index=np.arange(t_row_size.sum()), columns=obs_vars+['lon', 'lat', 'location_id'])
        
        # Save them to dataframes
        # row_start[i]:row_end[i] is the half-open range of observation rows of location i
        row_start = np.zeros(len(t_row_size), dtype=int)
        row_end = np.zeros(len(t_row_size), dtype=int)
    
        for i in range(len(t_row_size)):
    
            geo_df.loc[i, 'lon'] = t_lon[i]
            geo_df.loc[i, 'lat'] = t_lat[i]
            geo_df.loc[i, 'location_id'] = t_location_id[i]
                
            # Running offsets: each location starts where the previous one ended
            if i == 0:
                row_start[0] = 0
                row_end[0] = t_row_size[0]
            else:
                row_start[i] = row_end[i-1]
                row_end[i] = row_start[i] + t_row_size[i]
        
            # .loc slices include the end label, hence row_end[i]-1 on the left side
            for var in obs_vars:
                obs_partial_df.loc[row_start[i]:row_end[i]-1, var] = globals()[f't_{var}'][row_start[i]:row_end[i]]
    
            # Repeat the location's coordinates and id on each of its observation rows
            obs_partial_df.loc[row_start[i]:row_end[i]-1, 'lon'] = geo_df.loc[i, 'lon']
            obs_partial_df.loc[row_start[i]:row_end[i]-1, 'lat'] = geo_df.loc[i, 'lat']
            obs_partial_df.loc[row_start[i]:row_end[i]-1, 'location_id'] = geo_df.loc[i, 'location_id']
             
        # Append the partial DataFrame to the list
        obs_df_list.append(obs_partial_df)
    
    # Concatenate all partial DataFrames into a single DataFrame
    obs_df = pd.concat(obs_df_list, ignore_index=True)
    obs_df['local_time'] = convert_to_local_time(obs_df)
    # The file name uses the cell number taken from the last file processed above
    obs_df.to_csv(os.path.join(output_dir, 'h119_h120_'+t_cell_number+'.csv'))

/Users/hyunglokkim/cpuserver_data/ASCAT/TUW/h119/H119_0032.nc
/Users/hyunglokkim/cpuserver_data/ASCAT/TUW/h120/H120_0032.nc


In [10]:
def process_cell(k, index_map, unique_cell_numbers, ASCAT_file_path, geo_vars, obs_vars, hData, output_dir):
    """Flatten all ASCAT files of one cell into a single CSV file.

    Parameters
    ----------
    k : int
        Position of the cell in ``unique_cell_numbers``.
    index_map : dict
        Maps a cell identifier to the positions of its files in ``ASCAT_file_path``.
    unique_cell_numbers : list
        Cell identifiers.
    ASCAT_file_path : list of str
        Paths of the nc files; the four characters before the 3-character
        extension are used as the cell number in the output file name.
    geo_vars : list of str
        Per-location variables to read; must contain 'row_size', 'lon', 'lat'
        and 'location_id'.
    obs_vars : list of str
        Per-observation variables to read; must contain 'dir'.
    hData : module-like object or str
        Object exposing ``get_variable_from_nc(path, var)``, or the dotted
        import name of such a module. The name form exists because a module
        object cannot be pickled and sent to a worker process.
    output_dir : str
        Directory receiving ``h119_h120_<cell>.csv``; created when missing.

    Raises
    ------
    ValueError
        If no file is mapped to the requested cell.
    """
    if isinstance(hData, str):
        hData = importlib.import_module(hData)

    h_file_index = index_map.get(unique_cell_numbers[k], [])
    if not h_file_index:
        raise ValueError(f'no ASCAT files are mapped to cell {unique_cell_numbers[k]!r}')

    out_columns = list(obs_vars) + ['lon', 'lat', 'location_id']

    obs_df_list = []
    for kk in h_file_index:
        t_ASCAT_file_name = ASCAT_file_path[kk]
        t_cell_number = t_ASCAT_file_name[-7:-3]  # cell #

        print(t_ASCAT_file_name)

        # Keep the arrays in a local dict. Writing them to globals() and then
        # assigning t_dir / t_row_size inside the function made those names
        # locals, so reading them raised UnboundLocalError.
        raw = {}
        for var in list(geo_vars) + list(obs_vars):
            raw[var] = hData.get_variable_from_nc(t_ASCAT_file_name, var)

        raw['dir'] = raw['dir'].astype(float)  # Convert orbit_dir to float

        # Observations per location, without the NaN entries
        row_size = raw['row_size']
        row_size = row_size[~np.isnan(row_size)].astype(int)

        # Half-open row range [row_start[i], row_end[i]) of each location
        row_end = np.cumsum(row_size)
        row_start = row_end - row_size

        obs_partial_df = pd.DataFrame(index=np.arange(int(row_size.sum())), columns=out_columns)

        for i, (start, end) in enumerate(zip(row_start, row_end)):
            # .loc slices include the end label, hence end - 1 on the left side
            for var in obs_vars:
                obs_partial_df.loc[start:end - 1, var] = raw[var][start:end]

            # Repeat the location's coordinates and id on each of its rows
            obs_partial_df.loc[start:end - 1, 'lon'] = raw['lon'][i]
            obs_partial_df.loc[start:end - 1, 'lat'] = raw['lat'][i]
            obs_partial_df.loc[start:end - 1, 'location_id'] = raw['location_id'][i]

        # Append the partial DataFrame to the list
        obs_df_list.append(obs_partial_df)

    # Concatenate all partial DataFrames into a single DataFrame
    obs_df = pd.concat(obs_df_list, ignore_index=True)
    obs_df['local_time'] = convert_to_local_time(obs_df)

    # exist_ok also covers several workers creating the directory at once
    os.makedirs(output_dir, exist_ok=True)
    obs_df.to_csv(os.path.join(output_dir, 'h119_h120_' + t_cell_number + '.csv'))

def main():
    """Run process_cell for every cell in a pool of five worker processes.

    Reads index_map, unique_cell_numbers, ASCAT_file_path, geo_vars, obs_vars,
    hData and output_dir from module scope. The first exception raised by a
    worker is re-raised here instead of being silently discarded.
    """
    process_cell_partial = partial(
        process_cell,
        index_map=index_map,
        unique_cell_numbers=unique_cell_numbers,
        ASCAT_file_path=ASCAT_file_path,
        geo_vars=geo_vars,
        obs_vars=obs_vars,
        # Send the module by name: module objects cannot be pickled
        hData=hData.__name__,
        output_dir=output_dir,
    )
    # NOTE(review): process_cell itself must be resolvable inside the worker
    # processes - confirm for the multiprocessing start method in use
    with ProcessPoolExecutor(max_workers=5) as executor:
        # executor.map only raises a worker's exception when its result is
        # retrieved, so the iterator has to be drained
        for _ in executor.map(process_cell_partial, range(len(unique_cell_numbers))):
            pass

# Start the parallel extraction only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()