In [1]:
#!/usr/bin/env python3
import os
import pandas as pd
import datetime
from timezonefinder import TimezoneFinder
from zoneinfo import ZoneInfo  # Replaces pytz

# -----------------------------------------------------------
# Step 1. Load the station catalogue, restricted to gauges that have a .nc file
# -----------------------------------------------------------
# Input locations
attributes_file = '/p/scratch/cesmtst/zhang36/NeuralFAS_dataset/GRDC_Caravan/GRDC_Caravan_extension_csv/attributes/grdc/GRDC_Stations.csv'
nc_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/inference_aifas/kfold_splits_google_filtered'

# A gauge ID is the .nc file stem with the "GRDC_" marker stripped out
gauge_ids_to_keep = {
    os.path.splitext(nc_name)[0].replace("GRDC_", "")
    for nc_name in os.listdir(nc_dir)
    if nc_name.endswith(".nc")
}

print(f"📦 Found {len(gauge_ids_to_keep)} gauge IDs from .nc files.")

# IDs taken from file names are strings, so the catalogue IDs are cast to
# strings before the membership filter is applied
df_attributes = (
    pd.read_csv(attributes_file, encoding='latin1')
    .assign(grdc_no=lambda frame: frame['grdc_no'].astype(str))
    .loc[lambda frame: frame['grdc_no'].isin(gauge_ids_to_keep)]
)

print(f"✅ Retained {len(df_attributes)} gauge IDs with matching metadata.")

# Lookup table: gauge_id -> (lat, lon)
gauge_coords = dict(
    zip(df_attributes['grdc_no'], zip(df_attributes['lat'], df_attributes['long']))
)

# -----------------------------------------------------------
# Step 2. Look up one UTC offset (in hours) per gauge
# -----------------------------------------------------------
tf = TimezoneFinder()
gauge_tz_offset = {}  # gauge_id -> UTC offset in hours
# The offset is evaluated on this single date only, so seasonal (DST)
# changes of the offset are not represented.
rep_date = datetime.datetime(2020, 1, 1)

print("\n🌍 Calculating timezone offsets ...\n")

for gauge_id, (lat, lon) in gauge_coords.items():
    tz_str = tf.timezone_at(lng=lon, lat=lat)

    # No zone at these coordinates: fall back to UTC and move on
    if tz_str is None:
        print(f"⚠️ Timezone not found for {gauge_id} (lat={lat}, lon={lon}). Using UTC (offset = 0).")
        gauge_tz_offset[gauge_id] = 0
        continue

    try:
        offset_hours = rep_date.replace(tzinfo=ZoneInfo(tz_str)).utcoffset().total_seconds() / 3600
        gauge_tz_offset[gauge_id] = offset_hours
        print(f"✅ Gauge {gauge_id}: Timezone {tz_str}, Offset {offset_hours} hours")
    except Exception as e:
        # Any failure resolving the zone is reported and treated as UTC
        print(f"⚠️ Failed to process timezone for {gauge_id} ({tz_str}): {e}. Using UTC (offset = 0).")
        gauge_tz_offset[gauge_id] = 0

print("\n✅ Timezone offset calculation completed.")

📦 Found 2957 gauge IDs from .nc files.
✅ Retained 2957 gauge IDs with matching metadata.

🌍 Calculating timezone offsets ...

✅ Gauge 1104150: Timezone Africa/Algiers, Offset 1.0 hours
✅ Gauge 1104800: Timezone Africa/Algiers, Offset 1.0 hours
✅ Gauge 1112200: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1112340: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1112480: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1112500: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134030: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134040: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134080: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134100: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134110: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134220: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134250: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134400: Timezone Africa/Bamako, Offset 0.0 hours
✅ Gauge 1134700: Timezone Africa/Bamako, Offset 0.0 hours
✅ 

In [3]:
import xarray as xr

# Scratch check: open one gauge file from the kfold input folder for inspection.
# NOTE(review): the dataset is never closed here; call `test.close()` when done.
test = xr.open_dataset('/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered/GRDC_1591231.nc')

In [1]:
import pandas as pd
import os

# Input locations: station catalogue and the folder of per-gauge NetCDF files
attributes_file = '/p/scratch/cesmtst/zhang36/NeuralFAS_dataset/GRDC_Caravan/GRDC_Caravan_extension_csv/attributes/grdc/GRDC_Stations.csv'
nc_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered'

# Step 1. A gauge ID is the .nc file stem with the "GRDC_" marker stripped out
gauge_ids_in_folder = {
    os.path.splitext(nc_name)[0].replace("GRDC_", "")
    for nc_name in os.listdir(nc_dir)
    if nc_name.endswith(".nc")
}

# Step 2. Read the catalogue with string IDs, then keep only gauges found on disk
df_attributes = pd.read_csv(attributes_file, encoding='latin1').assign(
    grdc_no=lambda frame: frame['grdc_no'].astype(str)
)
df_matched = df_attributes.loc[df_attributes['grdc_no'].isin(gauge_ids_in_folder)]

# Step 3. Lookup table: gauge_id -> value of the catalogue's 'area' column
# (used downstream as the catchment area in km²)
gauge_area = dict(zip(df_matched['grdc_no'], df_matched['area']))

print(f"✅ Created gauge_area for {len(gauge_area)} gauges.")

✅ Created gauge_area for 2957 gauges.


In [None]:
import os
import xarray as xr
import numpy as np
from timezonefinder import TimezoneFinder
from zoneinfo import ZoneInfo
import datetime

def apply_utc_shift(ds, offset):
    """Re-bin daily local-time values onto UTC days by blending adjacent days.

    Works on a copy of ``ds``. The ``time`` coordinate is first moved back by
    one day, then a ``streamflow_utc`` variable is derived from
    ``google_prediction``:

    * ``offset == 0``: the values are used unchanged.
    * ``offset > 0``: each day is mixed with the following day using weights
      ``(24 - offset) / 24`` and ``offset / 24``; the last time step, which
      has no following day, is removed.
    * ``offset < 0``: each day is mixed with the preceding day using weights
      ``(24 - |offset|) / 24`` and ``|offset| / 24``; the first time step,
      which has no preceding day, is removed.

    Args:
        ds: dataset-like object with ``time`` and ``google_prediction``.
        offset: UTC offset of the gauge in hours.

    Returns:
        The modified copy, including ``streamflow_utc``.
    """
    shifted_ds = ds.copy()

    # Timestamps are treated as right-labelled, so move every label back a day
    shifted_ds['time'] = shifted_ds['time'] - np.timedelta64(1, 'D')
    local_flow = shifted_ds['google_prediction']

    if offset == 0:
        shifted_ds['streamflow_utc'] = local_flow
        return shifted_ds

    if offset > 0:
        following_day = local_flow.shift(time=-1)
        shifted_ds['streamflow_utc'] = (
            ((24 - offset) / 24) * local_flow + (offset / 24) * following_day
        )
        # The final step has no following day to blend with
        return shifted_ds.isel(time=slice(0, -1))

    if offset < 0:
        hours_behind = abs(offset)
        preceding_day = local_flow.shift(time=1)
        shifted_ds['streamflow_utc'] = (
            (hours_behind / 24) * preceding_day
            + ((24 - hours_behind) / 24) * local_flow
        )
        # The first step has no preceding day to blend with
        return shifted_ds.isel(time=slice(1, None))

    return shifted_ds

def convert_units(ds, area_km2):
    """Turn ``streamflow_utc`` (mm/day) into discharge in m³/s.

    1 mm/day over 1 km² equals 1000 m³ per 86400 s, hence the factor
    ``area_km2 / 86.4``. The result is stored on a copy of ``ds`` as
    ``google_prediction_m3s_utc0`` with ``units`` and ``description``
    attributes, and the intermediate ``streamflow_utc`` variable is removed.

    Args:
        ds: dataset-like object containing ``streamflow_utc``.
        area_km2: catchment area in km².

    Returns:
        The copy with the converted variable and without ``streamflow_utc``.
    """
    target = 'google_prediction_m3s_utc0'
    result = ds.copy()
    result[target] = (result['streamflow_utc'] * area_km2) / 86.4
    result[target].attrs.update({
        'units': 'm3/s',
        'description': 'Google forecast streamflow converted to m³/s and aligned to UTC',
    })
    return result.drop_vars('streamflow_utc')

def process_netcdf_file(filepath, offset, area_km2, output_dir):
    """Run the UTC-shift and unit-conversion pipeline on one NetCDF file.

    The input is opened with a context manager so its file handle is released
    as soon as the result has been written; this matters when the function is
    called in a loop over thousands of files.

    Args:
        filepath: path of the input NetCDF file.
        offset: UTC offset of the gauge in hours, passed to ``apply_utc_shift``.
        area_km2: catchment area in km², passed to ``convert_units``.
        output_dir: existing directory; the output keeps the input file name.

    Returns:
        None. The processed dataset is written to ``output_dir``.
    """
    output_path = os.path.join(output_dir, os.path.basename(filepath))

    with xr.open_dataset(filepath) as source_ds:
        # Write while the source is still open: its data may be loaded lazily
        shifted = apply_utc_shift(source_ds, offset)
        converted = convert_units(shifted, area_km2)
        converted.to_netcdf(output_path)

    print(f"✅ Processed and saved: {output_path}")

In [4]:
input_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered'
output_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted'
os.makedirs(output_dir, exist_ok=True)

# Shift to UTC and convert units for every gauge file that has both a
# catchment area and a timezone offset available.
for file in os.listdir(input_dir):
    if not file.endswith('.nc'):
        continue

    # Derive the ID the same way the lookup tables were keyed
    gauge_id = os.path.splitext(file)[0].replace("GRDC_", "")
    area = gauge_area.get(gauge_id)
    offset = gauge_tz_offset.get(gauge_id)

    # pd.isna is true for a missing dictionary entry (None) and for NaN, which
    # is what an empty catalogue cell becomes; a NaN area would otherwise
    # produce an output file containing only NaN.
    # NOTE(review): if the catalogue marks missing areas with a sentinel
    # number instead of an empty cell, that needs its own check — confirm.
    if pd.isna(area) or pd.isna(offset):
        print(f"Skipping {gauge_id}: missing area or offset")
        continue

    filepath = os.path.join(input_dir, file)
    process_netcdf_file(filepath, offset, area, output_dir)

✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted/GRDC_1591231.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted/GRDC_6544100.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted/GRDC_6348500.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted/GRDC_4116301.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted/GRDC_1733500.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted/GRDC_4208487.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/worki

In [5]:
# Spot check: open the same gauge (GRDC_1134700) from the kfold input folder
# and from the "_converted" output folder so the two can be compared.
# NOTE(review): neither dataset is closed afterwards.
original = xr.open_dataset('/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered/GRDC_1134700.nc')
converted = xr.open_dataset('/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_converted/GRDC_1134700.nc')

In [6]:
# Display the input dataset (variables, coordinates, attributes).
original

In [7]:
# Display the processed dataset for comparison with the input.
converted

In [9]:
# NOTE: this import rebinds the global name `tf`, which an earlier cell
# assigned to a TimezoneFinder instance; re-run that cell before using `tf`
# for timezone lookups again.
import tensorflow as tf

# Load the exported model directory for inspection.
model = tf.saved_model.load('/p/scratch/cesmtst/zhang36/trainer_fold_0')

# Print top-level object structure
# (the full attribute list is long; the output is for exploration only)
print(dir(model))

# If signatures are available (usually under 'serving_default')
# print the input/output description of that signature.
if 'serving_default' in model.signatures:
    print(model.signatures['serving_default'].pretty_printed_signature())

2025-04-24 22:42:55.466855: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


['__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_add_trackable_child', '_add_variable_with_custom_getter', '_checkpoint_adapter', '_checkpoint_dependencies', '_copy_trackable_to_cpu', '_default_save_signature', '_deferred_dependencies', '_delete_tracking', '_deserialization_dependencies', '_deserialize_from_proto', '_export_to_saved_model_graph', '_gather_saveables_for_checkpoint', '_handle_deferred_dependencies', '_inner_layer', '_lookup_dependency', '_maybe_initialize_trackable', '_name_based_attribute_restore', '_name_based_restores', '_no_dependency', '_object_identifier', '_preload_simple_restoration', '_restore_from_tensors', '_self_name_bas

In [15]:
####### Unit conversion ONLY for the Google-paper predictions (no UTC shift) ####

#!/usr/bin/env python3
import os
import pandas as pd
import datetime
from timezonefinder import TimezoneFinder
from zoneinfo import ZoneInfo  # Replaces pytz

# -----------------------------------------------------------
# Step 1. Load the station catalogue, restricted to gauges that have a .nc file
# -----------------------------------------------------------
# Input locations
attributes_file = '/p/scratch/cesmtst/zhang36/NeuralFAS_dataset/GRDC_Caravan/GRDC_Caravan_extension_csv/attributes/grdc/GRDC_Stations.csv'
nc_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered'

# A gauge ID is the .nc file stem with the "GRDC_" marker stripped out
gauge_ids_to_keep = {
    os.path.splitext(nc_name)[0].replace("GRDC_", "")
    for nc_name in os.listdir(nc_dir)
    if nc_name.endswith(".nc")
}

print(f"📦 Found {len(gauge_ids_to_keep)} gauge IDs from .nc files.")

# IDs taken from file names are strings, so the catalogue IDs are cast to
# strings before the membership filter is applied
df_attributes = (
    pd.read_csv(attributes_file, encoding='latin1')
    .assign(grdc_no=lambda frame: frame['grdc_no'].astype(str))
    .loc[lambda frame: frame['grdc_no'].isin(gauge_ids_to_keep)]
)

print(f"✅ Retained {len(df_attributes)} gauge IDs with matching metadata.")

# Lookup table: gauge_id -> (lat, lon)
gauge_coords = dict(
    zip(df_attributes['grdc_no'], zip(df_attributes['lat'], df_attributes['long']))
)



📦 Found 2957 gauge IDs from .nc files.
✅ Retained 2957 gauge IDs with matching metadata.


In [16]:
import pandas as pd
import os

# Input locations: station catalogue and the folder of per-gauge NetCDF files
attributes_file = '/p/scratch/cesmtst/zhang36/NeuralFAS_dataset/GRDC_Caravan/GRDC_Caravan_extension_csv/attributes/grdc/GRDC_Stations.csv'
nc_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered'

# Step 1. A gauge ID is the .nc file stem with the "GRDC_" marker stripped out
gauge_ids_in_folder = {
    os.path.splitext(nc_name)[0].replace("GRDC_", "")
    for nc_name in os.listdir(nc_dir)
    if nc_name.endswith(".nc")
}

# Step 2. Read the catalogue with string IDs, then keep only gauges found on disk
df_attributes = pd.read_csv(attributes_file, encoding='latin1').assign(
    grdc_no=lambda frame: frame['grdc_no'].astype(str)
)
df_matched = df_attributes.loc[df_attributes['grdc_no'].isin(gauge_ids_in_folder)]

# Step 3. Lookup table: gauge_id -> value of the catalogue's 'area' column
# (used downstream as the catchment area in km²)
gauge_area = dict(zip(df_matched['grdc_no'], df_matched['area']))

print(f"✅ Created gauge_area for {len(gauge_area)} gauges.")

✅ Created gauge_area for 2957 gauges.


In [17]:
import os
import xarray as xr
import numpy as np
from timezonefinder import TimezoneFinder
from zoneinfo import ZoneInfo
import datetime


def convert_units(ds, area_km2):
    """Add ``google_prediction`` converted from mm/day to m³/s.

    1 mm/day over 1 km² equals 1000 m³ per 86400 s, hence the factor
    ``area_km2 / 86.4``. The result is stored on a copy of ``ds`` as
    ``google_prediction_m3s``; the original ``google_prediction`` variable is
    kept. No time shift is applied in this variant, so the ``description``
    attribute deliberately makes no claim about UTC alignment.

    Args:
        ds: dataset-like object containing ``google_prediction``.
        area_km2: catchment area in km².

    Returns:
        The copy with the additional ``google_prediction_m3s`` variable.
    """
    ds = ds.copy()
    var_name = 'google_prediction_m3s'
    ds[var_name] = (ds['google_prediction'] * area_km2) / 86.4
    ds[var_name].attrs['units'] = 'm3/s'
    # Previously said "and aligned to UTC", which is untrue for this pipeline
    ds[var_name].attrs['description'] = 'Google forecast streamflow converted to m³/s'
    return ds

def process_netcdf_file(filepath, area_km2, output_dir):
    """Run the unit-conversion pipeline on one NetCDF file.

    The input is opened with a context manager so its file handle is released
    as soon as the result has been written; this matters when the function is
    called in a loop over thousands of files.

    Args:
        filepath: path of the input NetCDF file.
        area_km2: catchment area in km², passed to ``convert_units``.
        output_dir: existing directory; the output keeps the input file name.

    Returns:
        None. The processed dataset is written to ``output_dir``.
    """
    output_path = os.path.join(output_dir, os.path.basename(filepath))

    with xr.open_dataset(filepath) as source_ds:
        # Write while the source is still open: its data may be loaded lazily
        converted = convert_units(source_ds, area_km2)
        converted.to_netcdf(output_path)

    print(f"✅ Processed and saved: {output_path}")

In [18]:
input_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered'
output_dir = '/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit'
os.makedirs(output_dir, exist_ok=True)

# Convert units for every gauge file that has a catchment area available.
for file in os.listdir(input_dir):
    if not file.endswith('.nc'):
        continue

    # Derive the ID the same way the gauge_area lookup was keyed
    gauge_id = os.path.splitext(file)[0].replace("GRDC_", "")
    area = gauge_area.get(gauge_id)

    # pd.isna is true for a missing dictionary entry (None) and for NaN, which
    # is what an empty catalogue cell becomes; a NaN area would otherwise
    # produce an output variable containing only NaN.
    # NOTE(review): if the catalogue marks missing areas with a sentinel
    # number instead of an empty cell, that needs its own check — confirm.
    if pd.isna(area):
        # Only the area is looked up in this pipeline; no offset is involved
        print(f"Skipping {gauge_id}: missing area")
        continue

    filepath = os.path.join(input_dir, file)
    process_netcdf_file(filepath, area, output_dir)

✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit/GRDC_1591231.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit/GRDC_6544100.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit/GRDC_6348500.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit/GRDC_4116301.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit/GRDC_1733500.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit/GRDC_4208487.nc
✅ Processed and saved: /p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/full_run_google_filtered_unit/G

In [11]:
# Spot check: open the unconverted file for gauge GRDC_1134700.
# NOTE(review): this reads from kfold_splits_google_filtered, whereas the
# unit-only conversion loop in this notebook reads from
# full_run_google_filtered — confirm this is the intended input.
original = xr.open_dataset('/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered/GRDC_1134700.nc')


In [13]:
# Display the unconverted dataset (variables, coordinates, attributes).
original

In [12]:
# Spot check: open the unit-converted file for gauge GRDC_1134700.
# NOTE(review): this reads from kfold_splits_google_filtered_unit, whereas the
# unit-only conversion loop in this notebook writes to
# full_run_google_filtered_unit — confirm which run produced this file.
converted = xr.open_dataset('/p/largedata2/detectdata/CentralDB/projects/d05/working_directory/NeuralFAS/kfold_splits_google_filtered_unit/GRDC_1134700.nc')

In [20]:
# Values before unit conversion (presumably mm/day, per the convert_units
# docstring — confirm against the data source).
converted.google_prediction.values

array([[0.28422776, 0.28468296, 0.29238507, ..., 0.26818314, 0.26227573,
        0.25757208],
       [0.27932087, 0.27936926, 0.28655589, ..., 0.26308247, 0.25738582,
        0.25290066],
       [0.27403697, 0.27316561, 0.2798377 , ..., 0.25697491, 0.25183326,
        0.24771179],
       ...,
       [0.37576112, 0.38125119, 0.37432683, ..., 0.36824456, 0.36289883,
        0.35844946],
       [0.38026181, 0.38021648, 0.372996  , ..., 0.36207998, 0.35681936,
        0.35208297],
       [0.3784512 , 0.37821403, 0.37344131, ..., 0.36221251, 0.36013675,
        0.35594034]])

In [21]:
# Values after unit conversion: google_prediction * area / 86.4, in m³/s.
converted.google_prediction_m3s.values

array([[1118.48886504, 1120.28016398, 1150.5894018 , ..., 1055.35032711,
        1032.10354983, 1013.59385169],
       [1099.17933328, 1099.3697922 , 1127.65047837, ..., 1035.27825602,
        1012.86086633,  995.21093071],
       [1078.38623916, 1074.957275  , 1101.21316242, ..., 1011.24384199,
         991.01051412,  974.79177639],
       ...,
       [1478.68959854, 1500.2940195 , 1473.04537672, ..., 1449.11053142,
        1428.07408615, 1410.56500099],
       [1496.40063583, 1496.22225651, 1467.80834292, ..., 1424.85176524,
        1404.15026496, 1385.51167868],
       [1489.27554902, 1488.34225342, 1469.56070568, ..., 1425.37329898,
        1417.20479285, 1400.69116045]])