## Finalize the output files

Edit variable names, metadata, etc.

### Imports

In [1]:
# --- Import Modules --- #

# Import Python Core Modules
import sys
import os
import time
import datetime

# Import Additional Modules
import numpy as np
import xarray as xr
import pandas as pd
import geopandas as gpd

# Start the wall-clock timer; the final clean-up cell reports the elapsed time from `tic`
tic = time.time()
start_stamp = time.ctime()
print('Process initiated at {0}'.format(start_stamp))
# --- End Import Modules --- #

Process initiated at Mon Apr 22 10:48:22 2024


In [2]:
# --- Run configuration --- #

# Input file: combined HUC12 water budget netCDF to finalize.
# Inputs used for earlier runs (kept for reference):
#in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_20091001_20190930.nc'
#in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_20191001_20210930.nc'
#in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_20091001_20210930.nc'
in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_19791001_20220930_2.nc'

# Output directory
outDir = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget'

# Output files share one base name; only the extension differs.
# Base names used for earlier runs (kept for reference):
#out_stem = 'huc12_monthly_wb_iwaa_wrfhydro_WY2010_2019'
#out_stem = 'huc12_monthly_wb_iwaa_wrfhydro_WY2020_2021'
#out_stem = 'huc12_monthly_wb_iwaa_wrfhydro_WY2010_2021'
out_stem = 'huc12_monthly_wb_iwaa_wrfhydro_WY1980_2022_2'
out_nc = os.path.join(outDir, out_stem + '.nc')
out_csv = os.path.join(outDir, out_stem + '.csv')

# Select output formats (netCDF first, then CSV)
write_NC, write_CSV = True, True

# Names of the zone dimension and the time dimension in the input file
zone_name = 'WBDHU12'
time_coord = 'time'

### Dictionaries to rename variables and set attributes

In [3]:
# Dictionary to rename variables (input name -> output name). Also used to subset the
# dataset: only the output names listed here (the dict values) are kept.
#
# NOTE: 'totPRECIP' used to be listed twice ('Precip', then 'PrecipLand'). Python keeps only
# the last value for a repeated key in a dict literal, so the 'Precip' mapping was silently
# ignored. The dead entry has been removed; the resulting dictionary is unchanged.
# The output 'Precip' variable comes from the input 'Precip' variable (identity entry below).
var_rename_dict = {'totPRECIP':'PrecipLand',
                   'deltaACSNOW':'Snowfall',
                   'totqSfcLatRunoff':'Surfaceflow',
                   'totqBucket':'Baseflow',
                   'deltaACCET':'ET',
                   'avgSNEQV':'SWE',
                   'avgSOILM':'SoilWater',
                   'avgSOILSAT':'SoilSat',
                   'deltaUGDRNOFF':'Recharge',
                   'avgDepth':'GWStore',
                   'Area_sqkm':'CatchmentArea',
                   'Precip':'Precip',
                   'landmask':'LandFraction',
                   # Identity entries: names are unchanged, listed so the variables are kept
                   'total_gridded_area': 'total_gridded_area',
                   'avgSOILM_wltadj_depthmean': 'avgSOILM_wltadj_depthmean',
                   'avgSOILSAT_wltadj_top1': 'avgSOILSAT_wltadj_top1',}

# Rename dimensions: maps the zone dimension name configured in `zone_name`
# to the name used in the output files ('huc_id')
rename_dim_dict = {zone_name:'huc_id'}

# Variable attributes, keyed by OUTPUT variable name. Each entry replaces the
# attributes of the matching variable when attributes are set later in the notebook.
var_atts_dict = {
    'Precip': dict(units='mm',
                   long_name="Total monthly precipitation (land & water)"),
    'PrecipLand': dict(units='mm',
                       long_name="Total monthly precipitation (land only)"),
    'Snowfall': dict(units='mm',
                     long_name="Total monthly snowfall (land only)"),
    'Surfaceflow': dict(units='mm',
                        long_name="Total monthly surface flow"),
    'Baseflow': dict(units='mm',
                     long_name="Total monthly baseflow"),
    'ET': dict(units='mm',
               long_name="Total monthly evapotranspiration (land only)"),
    'SWE': dict(units='mm',
                long_name="Average monthly snow water equivalent (land only)"),
    'SoilWater': dict(units='mm',
                      long_name="Average monthly soil moisture in 2m soil column (land only)"),
    'SoilSat': dict(units='fraction',
                    long_name="Average monthly fractional soil saturation in 2m soil column (land only)"),
    'Recharge': dict(units='mm',
                     long_name="Total monthly recharge (land only)"),
    'GWStore': dict(units='mm',
                    long_name="Average monthly groundwater store"),
    'LandFraction': dict(units='fraction',
                         standard_name='area_fraction',
                         long_name="Land fraction of HUC12 from gridded data"),
    'CatchmentArea': dict(units='square kilometers',
                          long_name="Total NWM catchment area (square kilometers)"),
}

# Global (file-level) attributes written to the output dataset.
# NOTE(review): 'history' is a fixed string, not the time of this run - confirm that is intended.
out_global_atts = dict(
    title="HUC12 monthly water budget components from WRF-Hydro IWAA v1.0",
    institution="NCAR",
    source1="https://github.com/NOAA-Big-Data-Program/bdp-data-docs/blob/main/nwm/README.md",
    source2="https://www.sciencebase.gov/catalog/file/get/60cb5edfd34e86b938a373f4?name=WBD_National_GDB.zip",
    history="A. Dugger, Tue Mar 14 20:37:45 2023",
)

In [4]:
# Open the combined water budget netCDF file configured in `in_nc`
ds = xr.open_dataset(in_nc)
# Show the dataset summary as the cell output
ds

### Add back the character (string) HUC IDs from the source file

In [5]:
%%time

# Read the attribute table of the HUC12 layer from the GeoPackage.
# ignore_geometry=True means the polygon geometries are not read, and no
# reprojection is done here; only the attribute columns are needed below.
HUC_gpkg = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/HUCs/HUC12.gpkg'
HUC_gdf = gpd.read_file(HUC_gpkg, layer='WBDHU12_CONUS', ignore_geometry=True)

CPU times: user 1.09 s, sys: 1.37 s, total: 2.46 s
Wall time: 3.68 s


### Assign the coordinates to match the string-type HUC12 IDs from the input feature class

In [6]:
%%time

# Create a new dataframe with just the HUC12 data
HUC_df = HUC_gdf[['HUC12', 'AREASQKM', 'STATES', 'TOHUC']].copy()

# Create new field that will match to the datatype in the input file
HUC_df['HUC12_int'] = HUC_df['HUC12'].astype(np.int64)

# Export the netCDF coordinate to a dataframe
nc_df = ds['WBDHU12'].to_dataframe()
nc_df.index = np.arange(nc_df.shape[0]) #reset_index()
combined_df = pd.merge(nc_df, HUC_df,  how='inner', left_on=['WBDHU12'], right_on=['HUC12_int'])

# Deal with duplicates
combined_df = combined_df[~combined_df.duplicated(subset=['HUC12_int'], keep='last')]

# Make sure they have the same number of values
assert combined_df['HUC12'].unique().shape == nc_df['WBDHU12'].unique().shape

# Make sure they are identical
assert (combined_df['HUC12_int'] == ds['WBDHU12'].data).sum() == ds['WBDHU12'].data.shape

da = xr.DataArray(combined_df['HUC12'].astype('S12'), coords={'WBDHU12': combined_df['HUC12'].astype('S12')},dims=['WBDHU12'])
ds['WBDHU12'] = da
#del da, HUC_df, combined_df, nc_df, HUC_gdf
ds

CPU times: user 91.9 ms, sys: 1.11 ms, total: 93 ms
Wall time: 121 ms


### Rename variables

In [7]:
# Rename the input variables to their output names (input name -> output name
# pairs come from var_rename_dict); the result starts the output dataset `ds_out`
ds_out = ds.rename_vars(var_rename_dict)
#ds_out

### Rename dimensions and coordinate variables

Using `Dataset.rename` instead of `Dataset.rename_dims` ensures that any coordinate variables are also renamed

In [8]:
# Rename the zone dimension (and its coordinate variable) using rename_dim_dict
ds_out = ds_out.rename(rename_dim_dict)
#ds_out

### Subset variables

In [9]:
# Keep only the output variables named in var_rename_dict (its values), in that order
output_vars = list(var_rename_dict.values())
ds_out = ds_out[output_vars]
ds_out

### Change data types

In [10]:
# Collect the double-precision data variables first, then down-cast each to float32
float64_vars = [name for name, data_array in ds_out.data_vars.items()
                if data_array.dtype == np.float64]
for variable in float64_vars:
    print('Found a float64 for variable {0}'.format(variable))
    ds_out[variable] = ds_out[variable].astype(np.float32)
ds_out

Found a float64 for variable Surfaceflow
Found a float64 for variable Baseflow
Found a float64 for variable SoilWater
Found a float64 for variable GWStore
Found a float64 for variable CatchmentArea
Found a float64 for variable LandFraction
Found a float64 for variable total_gridded_area


### Re-order dimensions

In [11]:
# Put the zone dimension first for every variable stored as (time, zone).
# Dimension names come from the configuration (`time_coord`, `rename_dim_dict`) rather than
# repeated literals, and the target order is passed to transpose() explicitly instead of
# relying on a bare transpose() reversing the dimensions.
zone_dim = rename_dim_dict[zone_name]
wrong_order = (time_coord, zone_dim)
right_order = (zone_dim, time_coord)

# Iterate over a snapshot of the names because variables are reassigned inside the loop
for variable in list(ds_out.data_vars):
    if ds_out[variable].dims == wrong_order:
        print('Var {0} not correct: {1}'.format(variable, ds_out[variable].dims))
        ds_out[variable] = ds_out[variable].transpose(*right_order)

# Load the data into memory (in place) now that the layout is final
ds_out.load()
ds_out

Var Surfaceflow not correct: ('time', 'huc_id')
Var Baseflow not correct: ('time', 'huc_id')
Var GWStore not correct: ('time', 'huc_id')


### Set variable and global attributes

In [12]:
# Replace the attributes of every data variable that has an entry in var_atts_dict.
# Variables without an entry keep whatever attributes they already have.
for name, atts in var_atts_dict.items():
    if name in ds_out.data_vars:
        ds_out[name].attrs = atts

# Replace the global attributes with the curated set
ds_out.attrs = out_global_atts

ds_out

### Add a year-month (`yrmo`) string variable along the time dimension

In [13]:
# Parse the time coordinate once and format it two ways
time_index = pd.to_datetime(ds_out['time'])

# Full timestamps as strings - for later input to CSV files as a time index
# NOTE(review): not used in the cells visible here; kept in case another cell relies on it
datetime_strings = [time_index.strftime('%Y%m%d%H')]

# year-month strings ('YYYY-MM'), wrapped in a one-element list so the array below is 2-D
yearmo_strings = [time_index.strftime('%Y-%m')]

# Store the labels as a (yrmo_index, time) variable; yrmo_index has length 1
yrmo_values = np.array(yearmo_strings, dtype='U')
ds_out['yrmo'] = xr.DataArray(yrmo_values, dims=('yrmo_index', time_coord), name='yrmo')
ds_out

### Export NetCDF

In [16]:
%%time

# Read into memory before writing to disk?
ds_out.compute()
    
# Write output file (netCDF)
if write_NC:
    tic1 = time.time()
    print('  Writing output to {0}'.format(out_nc))
    ds_out.to_netcdf(out_nc, 
                     mode='w', 
                     format="NETCDF4", 
                     compute=True)
    print('\tExport to netCDF completed in {0:3.2f} seconds.'.format(time.time()-tic1))

  Writing output to /glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/huc12_monthly_wb_iwaa_wrfhydro_WY1980_2022_2.nc
	Export to netCDF completed in 2.20 seconds.
CPU times: user 438 ms, sys: 1.58 s, total: 2.02 s
Wall time: 2.2 s


### Export CSV

In [17]:
%%time

# Remove certain variables (non-temporal) from the output CSV file
remove_vars = ['CatchmentArea', 'yrmo']    # 'LandFraction'     
    
# Write output file (CSV)
if write_CSV:
    tic1 = time.time()    
    
    # Output to Pandas DataFrame
    df_out = ds_out.to_dataframe()
    
    # Remove variables we do not want
    if remove_vars is not None:
        df_out = df_out.drop(columns=remove_vars)
        
    df_out.index = df_out.index.droplevel(2)
    df_out = df_out.reset_index()
    
    # Add year and month columns
    df_out.insert(2, 'year',  pd.DatetimeIndex(df_out[time_coord]).year)
    df_out.insert(3, 'month',  pd.DatetimeIndex(df_out[time_coord]).month)
    #df_out['year'] = pd.DatetimeIndex(df_out[time_coord]).year
    #df_out['month'] = pd.DatetimeIndex(df_out[time_coord]).month
    df_out = df_out.drop(columns='time')

    df_out.to_csv(out_csv, index=False)
    print('\tExport to CSV completed in {0:3.2f} seconds.'.format(time.time()-tic1))
df_out

	Export to CSV completed in 488.08 seconds.
CPU times: user 7min 27s, sys: 8.83 s, total: 7min 36s
Wall time: 8min 8s


Unnamed: 0,huc_id,year,month,PrecipLand,Snowfall,Surfaceflow,Baseflow,ET,SWE,SoilWater,SoilSat,Recharge,GWStore,Precip,LandFraction,total_gridded_area,avgSOILM_wltadj_depthmean,avgSOILSAT_wltadj_top1
0,b'010100020101',1979,10,98.146187,10.533096,5.523248,49.086155,26.784851,0.030718,776.428162,0.782396,54.641285,28.178392,98.221153,0.941176,64.0,0.304792,0.672568
1,b'010100020101',1979,11,94.964050,13.133768,6.367284,59.504997,14.086288,1.602692,779.849609,0.785838,63.613102,31.195091,94.945091,0.941176,64.0,0.306503,0.680506
2,b'010100020101',1979,12,78.661407,40.858681,2.486018,38.301289,8.190549,13.337690,767.924438,0.773827,38.900734,25.234598,78.534531,0.941176,64.0,0.300540,0.662999
3,b'010100020101',1980,1,31.070864,16.470680,0.478237,32.838886,7.153370,29.792389,762.371216,0.768230,30.651800,23.338078,30.971245,0.941176,64.0,0.297764,0.651460
4,b'010100020101',1980,2,26.385855,26.370274,0.000000,16.991766,11.235175,39.781162,745.327271,0.751053,12.465616,16.612938,26.416483,0.941176,64.0,0.289242,0.632264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44694367,b'181002041600',2022,5,0.000000,0.000000,0.000000,0.045813,0.772527,0.000000,461.642395,0.667096,0.067336,0.812371,0.000000,0.062975,58.0,0.174442,0.166360
44694368,b'181002041600',2022,6,1.625857,0.000000,0.000000,0.043191,2.783360,0.000000,460.667572,0.665666,0.062526,0.791524,1.763754,0.062975,58.0,0.173954,0.171355
44694369,b'181002041600',2022,7,0.049715,0.000000,0.000000,0.043483,1.500470,0.000000,459.090576,0.663342,0.062012,0.771303,0.035178,0.062975,58.0,0.173166,0.169341
44694370,b'181002041600',2022,8,1.189113,0.000000,0.000000,0.042347,1.899218,0.000000,458.147491,0.661971,0.059702,0.751274,0.860141,0.062975,58.0,0.172694,0.179474


### Clean up

In [18]:
# Close the input and output datasets
for dataset in (ds, ds_out):
    dataset.close()

# Report total run time since the timer was started in the imports cell
elapsed = time.time() - tic
print('Process completed in {0: 3.2f} seconds.'.format(elapsed))

Process completed in  578.92 seconds.
