# Demo Notebook to Test Metadata Functions

In [1]:
import pandas as pd
import numpy as np
import os
import xarray as xr
import socket
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Optional
import netCDF4

In [2]:

def get_python_path():
    """
    Locate the default Python utilities directory for the current machine.

    The machine is identified by its hostname, which is looked up in a small
    table of known hosts to find the root folder holding the code. The
    utilities directory is ``<root>/nadata/python``.

    Returns:
        pathlib.Path of the utilities directory, or None when the hostname is
        not in the table or the directory does not exist (a message is printed
        in either case).
    """
    # Known machines and the root folder under which their code lives
    roots_by_host = {
        "NECMAC04363461.local": "/Users/kimberly.hyde/Documents/",  # Mac laptop
        "nefscsatdata": "/mnt/EDAB_Archive/",                       # Satdata
        "guihyde": "/mnt/EDAB_Archive/"                             # Kim's Satdata container
    }

    host = socket.gethostname()
    root = roots_by_host.get(host)
    if not root:
        print(f"Unknown hostname: {host}")
        return None

    utilities_dir = Path(root) / "nadata/python"
    if utilities_dir.is_dir():
        print(f"Default utilities path: {utilities_dir}")
        return utilities_dir

    print(f"Directory not found: {utilities_dir}")
    return None

python_path = get_python_path()
# get_python_path() returns None for an unknown host or a missing directory.
# Guard against that so the literal string "None" is never put on sys.path.
if python_path is not None and str(python_path) not in sys.path:
    sys.path.insert(0, str(python_path))

from utilities import date_utilities, gridding_utilities, file_utilities, import_utilities, calc_daylength, metadata_utilities, calc_primprod

Default utilities path: /Users/kimberly.hyde/Documents/nadata/python


### Get Sample Data and Create "New" Product

In [None]:
from utilities import build_pp_date_map
from utilities import load_all_metadata
from utilities import get_dates
from utilities import get_metadata_table
from utilities import get_source_metadata
from utilities import build_product_attributes
from utilities import get_lut_products
from utilities import get_default_metadata
from utilities import get_reference_metadata
from utilities import parse_dataset_info
from utilities import get_geospatial_metadata
from utilities import get_lut_metadata
from utilities import get_python_dir
from utilities import get_temporal_metadata
from utilities import process_daily_pp
from utilities import regrid_wrapper
from utilities import get_nc_prod
from utilities import get_prod_files
from utilities import run_pp_pipeline


In [None]:
# Invoke the utilities' PP pipeline with CORALSST selected as the SST dataset.
# NOTE(review): which files are read/written is decided inside run_pp_pipeline — not visible here.
run_pp_pipeline(sst_dataset="CORALSST")

In [None]:

# Get the input and output files for primary productivity
# pmap is a mapping keyed by date; each value unpacks into five items below.
pmap = build_pp_date_map(get_date_prod="CHL", sst_dataset='CORALSST')
# Take the first (date, info) pair in the map's iteration order
date, first_date_info = next(iter(pmap.items()))
# The fifth item of each entry is not used in this notebook
chl_file, sst_file, par_file, ppd_file, _ = first_date_info

# The return value is replaced on the next line by the dataset opened from ppd_file
# (presumably process_daily_pp writes ppd_file — confirm in utilities).
ppd = process_daily_pp(date,chl_file,sst_file,par_file,ppd_file)
ppd = xr.open_dataset(ppd_file)
chl = xr.open_dataset(chl_file)
# Display the PP dataset as the cell output
ppd


In [None]:
# Same workflow as for the first date, but for the second entry of the date map.
pmap = build_pp_date_map(get_date_prod="CHL", sst_dataset='CORALSST')
# Index 1 = second (date, info) pair; raises IndexError if the map has fewer than two dates
second_date, second_date_info = list(pmap.items())[1]
chl_file, sst_file, par_file, ppd_file, _ = second_date_info
# The return value is replaced on the next line by the dataset opened from ppd_file
ppd = process_daily_pp(second_date,chl_file,sst_file,par_file,ppd_file)
ppd = xr.open_dataset(ppd_file)
chl = xr.open_dataset(chl_file)


In [None]:
# Process the third entry of the date map (index 2).
pmap = build_pp_date_map(get_date_prod="CHL", sst_dataset='CORALSST')
date3, third_date_info = list(pmap.items())[2]
chl_file, sst_file, par_file, ppd_file, _ = third_date_info
# Unlike the earlier cells, ppd keeps whatever process_daily_pp returns (ppd_file is not re-opened)
ppd = process_daily_pp(date3,chl_file,sst_file,par_file,ppd_file)


In [None]:
# Regrid a single SST file using the matching CHL file (first date in the map).
pmap = build_pp_date_map(get_date_prod="CHL", sst_dataset='CORALSST')

# Each map entry is a sequence: index 0 is the CHL file, index 1 the SST file
sst_files = [entry[1] for entry in pmap.values()]
sst_file = sst_files[0]
chl_files = [entry[0] for entry in pmap.values()]
chl_file = chl_files[0]

# parse_dataset_info returns a mapping that includes a 'dataset' key (used below)
chl_info = parse_dataset_info(chl_file)
sst_info = parse_dataset_info(sst_file)

# Look up the variable name used inside each dataset's netCDF files for the product
chl_nc_var = get_nc_prod(chl_info['dataset'],'CHL')
sst_nc_var = get_nc_prod(sst_info['dataset'],'SST')

# NOTE(review): presumably the first argument supplies the target grid — confirm in regrid_wrapper
sst_rg = regrid_wrapper(chl_file,sst_file,source_vars=[sst_nc_var])
sst_rg


In [4]:
# Regrid every SST file in the date map, passing the full lists of CHL and SST files.
pmap = build_pp_date_map(get_date_prod="CHL", sst_dataset='CORALSST')

# Each map entry is a sequence: index 0 is the CHL file, index 1 the SST file
sst_files = [entry[1] for entry in pmap.values()]
sst_file = sst_files[0]
chl_files = [entry[0] for entry in pmap.values()]
chl_file = chl_files[0]

# Dataset info is taken from the first file of each list only
chl_info = parse_dataset_info(chl_file)
sst_info = parse_dataset_info(sst_file)
#par_info = parse_dataset_info(par_file)

chl_nc_var = get_nc_prod(chl_info['dataset'],'CHL')
sst_nc_var = get_nc_prod(sst_info['dataset'],'SST')
#par_nc_var = get_nc_prod(par_info['dataset'],'PAR')

# Lists (not single paths) are passed here, unlike the single-file cell
sst_rgs = regrid_wrapper(chl_files,sst_files,source_vars=[sst_nc_var])
sst_rgs


📅 Mapped 30 dates with complete product files.


  result_var = func(*data_vars)
  intermediate = blockwise(


Unnamed: 0,Array,Chunk
Bytes,8.34 GiB,569.53 kiB
Shape,"(30, 4320, 8640)","(1, 270, 270)"
Dask graph,15360 chunks in 68 graph layers,15360 chunks in 68 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 8.34 GiB 569.53 kiB Shape (30, 4320, 8640) (1, 270, 270) Dask graph 15360 chunks in 68 graph layers Data type float64 numpy.ndarray",8640  4320  30,

Unnamed: 0,Array,Chunk
Bytes,8.34 GiB,569.53 kiB
Shape,"(30, 4320, 8640)","(1, 270, 270)"
Dask graph,15360 chunks in 68 graph layers,15360 chunks in 68 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [5]:
# Display the 'analysed_sst' variable of the regridded result
sst_rgs.analysed_sst

Unnamed: 0,Array,Chunk
Bytes,8.34 GiB,569.53 kiB
Shape,"(30, 4320, 8640)","(1, 270, 270)"
Dask graph,15360 chunks in 68 graph layers,15360 chunks in 68 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 8.34 GiB 569.53 kiB Shape (30, 4320, 8640) (1, 270, 270) Dask graph 15360 chunks in 68 graph layers Data type float64 numpy.ndarray",8640  4320  30,

Unnamed: 0,Array,Chunk
Bytes,8.34 GiB,569.53 kiB
Shape,"(30, 4320, 8640)","(1, 270, 270)"
Dask graph,15360 chunks in 68 graph layers,15360 chunks in 68 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [None]:
# dtype.str looks like '<f4'; its last two characters ('f4') are the key format of netCDF4.default_fillvals
print(netCDF4.default_fillvals[ppd.Z_eu.dtype.str[-2:]])
print(netCDF4.default_fillvals[ppd.PP_Eppley.dtype.str[-2:]])
# Display the full table of default fill values
netCDF4.default_fillvals

In [None]:
def get_fill_value(var) -> Optional[float]:
    """
    Retrieve the _FillValue for a given xarray or netCDF4 variable.

    Lookup order:
      1. ``var.attrs['_FillValue']`` (xarray, undecoded data)
      2. ``var.encoding['_FillValue']`` (xarray moves the attribute here when
         it decodes a file with masking enabled)
      3. ``var.getncattr('_FillValue')`` (netCDF4 variables, which expose
         attributes through ``ncattrs()`` / ``getncattr()`` instead of ``attrs``)
      4. The netCDF default fill value for the variable's dtype.

    Args:
        var: An object with a ``dtype`` and, optionally, ``attrs`` / ``encoding``
            mappings or the netCDF4 ``ncattrs`` / ``getncattr`` methods.

    Returns:
        The explicit fill value if one is set, otherwise the netCDF default.

    Raises:
        ValueError: If no explicit value is set and netCDF4 defines no default
            for the dtype.
    """
    # xarray-style containers; either may be missing on other variable types
    for container in (getattr(var, "attrs", None), getattr(var, "encoding", None)):
        if container and "_FillValue" in container:
            return container["_FillValue"]

    # netCDF4-style attribute access
    if hasattr(var, "ncattrs") and "_FillValue" in var.ncattrs():
        return var.getncattr("_FillValue")

    dtype = var.dtype
    # Map dtype to NetCDF fill key (e.g. 'f4', 'i4')
    np_dtype = np.dtype(dtype)
    key = f"{np_dtype.kind}{np_dtype.itemsize}"
    try:
        return netCDF4.default_fillvals[key]
    except KeyError as err:
        raise ValueError(f"Unknown default _FillValue for dtype {dtype}") from err
    
# Report the resolved fill value for every data variable in the PP dataset
for var_name in ppd.data_vars:
    var = ppd[var_name]
    fv = get_fill_value(var)
    print(f"Variable: {var_name}, dtype: {var.dtype}, _FillValue: {fv}")



In [None]:
def resolve_fill_value(var):
    """
    Return the _FillValue of an xarray or netCDF4 variable.

    xarray variables are checked through their ``attrs`` and ``encoding``
    mappings; netCDF4 variables (which have no ``attrs``) through
    ``ncattrs()`` / ``getncattr()``. When no explicit value is set, the
    netCDF default for the variable's dtype is returned.

    Raises:
        ValueError: If netCDF4 defines no default fill value for the dtype.
    """
    attrs = getattr(var, "attrs", None)
    if attrs is not None:
        # xarray path: read from the mapping (xarray objects have no getncattr)
        if "_FillValue" in attrs:
            return attrs["_FillValue"]
        encoding = getattr(var, "encoding", None) or {}
        if "_FillValue" in encoding:
            return encoding["_FillValue"]
    elif hasattr(var, "ncattrs") and "_FillValue" in var.ncattrs():
        # netCDF4 path
        return var.getncattr("_FillValue")

    dtype = var.dtype
    try:
        # dtype.str is e.g. '<f4'; drop the byte-order character to get 'f4', 'i4', ...
        return netCDF4.default_fillvals[np.dtype(dtype).str[1:]]
    except KeyError as err:
        raise ValueError(f"Unknown default _FillValue for dtype {dtype}") from err

# Report the fill value found by resolve_fill_value for every data variable in the PP dataset
for var_name in ppd.data_vars:
    var = ppd[var_name]
    fv = resolve_fill_value(var)
    print(f"Variable: {var_name}, dtype: {var.dtype}, _FillValue: {fv}")



In [None]:
# Display the attribute dictionary built for the "PP_Eppley" product
build_product_attributes("PP_Eppley")

In [None]:
# Parse each input file once, then look up the matching source metadata.
# The source_prefix determines the key names read below (e.g. 'source_chl_title').
chl_info = parse_dataset_info(chl_file)
sst_info = parse_dataset_info(sst_file)
par_info = parse_dataset_info(par_file)
chl_source = get_source_metadata(chl_info["dataset"], dataset_version=chl_info["version"], source_prefix="source_chl")
sst_source = get_source_metadata(sst_info["dataset"], dataset_version=sst_info["version"], source_prefix="source_sst")
par_source = get_source_metadata(par_info["dataset"], dataset_version=par_info["version"], source_prefix="source_par")

# Pieces reused in the history text and the product attributes
ppd_long_name = build_product_attributes("PPD")["long_name"]
vgpm_citation = get_reference_metadata('VGPM')[0]['citation']
vgpm_eppley_citation = get_reference_metadata('VGPM_EPPLEY')[0]['citation']

# Build the ppd history
# Sentences are joined with a single space, so they carry no trailing spaces of
# their own; a space also separates the second citation from "models".
History = " ".join([
    f"{ppd_long_name} is calculated using the VGPM {vgpm_citation} and VGPM-EPPLEY {vgpm_eppley_citation} models.",
    f"The input chlorophyll file ({os.path.basename(chl_file)}) is from the {chl_source['source_chl_title']}.",
    f"The input sea surface temperature file ({os.path.basename(sst_file)}) is from the {sst_source['source_sst_title']}.",
    f"The input photosynthetic active radiation file ({os.path.basename(par_file)}) is from the {par_source['source_par_title']}.",
    "The SST and PAR data were regridded to the CHL grid using xesmf bilinear regridding.",
    "Day length was calculated according to Kirk (1994)",
])

# Global Attributes (later merges override earlier keys of the same name)
attrs = get_default_metadata(sheet="Global")
attrs = attrs | get_lut_metadata(
    add_program="Ecosystem Dynamics and Assessment Branch",
    add_project="State of the Ecosystem")

# Geospatial Attributes (taken from the chlorophyll input file)
attrs = attrs | get_geospatial_metadata(use_inputdata_path=chl_file)

# Temporal Attributes (taken from the PP dataset)
attrs = attrs | get_temporal_metadata(ds=ppd)

# Product Specific Attributes
attrs["history"] = History
attrs["references"] = get_reference_metadata(['VGPM_EPPLEY','KIRK','ZEU'],refs_only=True)
# TODO: populate attrs["keywords"] (a get_keywords() helper is not imported in the cells shown)
attrs["product_name"] = ppd_long_name

# TODO: merge the source metadata (chl_source, sst_source, par_source) into attrs
attrs


In [None]:
# Display the source metadata for 'ROMS_NWA' (no dataset_version or source_prefix given)
get_source_metadata('ROMS_NWA')

In [None]:

# Display the reference metadata for two reference keys at once
get_reference_metadata(["KIRK","VGPM_EPPLEY"])

In [None]:
# Build the attributes for the "DOC" product, passing an explicit _FillValue
build_product_attributes("DOC",_FillValue=-9999)


In [None]:

# Display the attribute dictionary built for the "PP_Eppley" product
build_product_attributes("PP_Eppley")

# NOTE: the sample below shows the shape of the returned dict, but its values
# (dissolved organic carbon) look like "DOC" output, not "PP_Eppley":
# → {'units': 'mg m^-3', 'standard_name': 'ocean_mass_content_of_dissolved_organic_carbon', ...}

In [None]:
from utilities import get_source_metadata

# Exercise the dataset_version argument: explicit version, default version, unknown version.
# The first two results are discarded (not the last expression of the cell, and not printed).
get_source_metadata("OCCCI", dataset_version="V4.2")  # ✅ returns V4.2 if present
get_source_metadata("OCCCI")                          # ✅ returns default V6.0
get_source_metadata("OCCCI", dataset_version="V9.9")  # ❌ raises error

In [None]:
# Display the metadata table read from the "LUT_Programs" sheet
get_metadata_table(sheet = "LUT_Programs")


In [None]:
import pandas as pd

# Locate the metadata workbook inside the resources directory.
# (Named resources_dir so the builtin dir() is not shadowed for the rest of the session.)
resources_dir = get_python_dir(resources=True)
metapath = os.path.join(resources_dir,'metadata','EDAB_metadata.xlsx')

# The context manager closes the workbook handle once the sheet has been read
with pd.ExcelFile(metapath) as xls:
    print(xls.sheet_names)  # Confirm "Temporal" is listed exactly as expected
    df = xls.parse("Temporal")

# Normalise the column names so lookups below are case/whitespace-insensitive
df.columns = df.columns.str.strip().str.lower()
print(df.columns)

attribute = df["attribute"]
# Element-wise comparison: one boolean per row. str(df["required"]) would
# stringify the whole column at once and could never equal "true".
required = df["required"].astype(str).str.strip().str.lower() == "true"
default = df.get("default", None)

print("Parsed Temporal metadata:")
# DataFrame.items() iterates over columns: each pair is (column name, column values)
for column_name, column_values in df.items():
    print(f"  {column_name}: {column_values}")

In [None]:


# Look up the metadata for one program and one project by name, then display it
program = get_lut_metadata(
    add_program="Ecosystem Dynamics and Assessment Branch",
    add_project="State of the Ecosystem"
)
program

In [None]:
from utilities import get_default_metadata

# Fetch the defaults defined on the "Global" sheet and print them one per line
required_meta = get_default_metadata(sheet="Global")
print("✅ Required metadata with defaults:")
for k, v in required_meta.items():
    print(f"  {k}: {v}")

In [None]:
from utilities import get_geospatial_metadata

# Geospatial metadata derived from a file path (the chlorophyll input file)
meta = get_geospatial_metadata(use_inputdata_path=chl_file)
meta

In [None]:
from utilities import get_geospatial_metadata

# Geospatial metadata derived from an object already in memory (ppd) rather than a file path
meta = get_geospatial_metadata(use_current_data=ppd)
meta

In [None]:
# Open the SST input file and display its global attributes
sst = xr.open_dataset(sst_file)
sst.attrs

In [None]:
# Open the PAR input file and display its global attributes
par = xr.open_dataset(par_file)
par.attrs

#### Global attributes to add or update
* acknowledgement
* license
* institution
* naming_authority
* conventions
* 

##### Processing date information
* date_created
* date_issued
* date_metadata_modified
* date_modified
##### Creator
The individual or organization with primary responsibility for creating the data. This would be the entity directly generating the scientific data itself.
* creator_email
* creator_name
* creator_url
* creator_type (person or group)
##### Contributor
An individual or institution that made a significant but secondary or indirect contribution to the data creation. This might include someone who provided funding, data analysis, or quality control, but wasn't directly responsible for generating the core data. A contributor should not also be listed as a creator; if an agent acts in multiple capacities, clarification should be provided in the role section.
* contributor_name
* contributor_role
##### Project
The specific scientific project for which the data was collected or created.
##### Program
The overarching program or initiative of which the project is a part. This provides context for the data's origin and may be important for understanding its scope or intended use. The specific project that produced the data is recorded separately in the 'project' attribute.
* program_name
* program_url
* program_email
##### Publisher
The entity responsible for making the data available to the public. This might be a data repository, a scientific journal, or an institution hosting the data. The 'publisher_name', 'publisher_url', and 'publisher_email' attributes describe the publisher's contact information.
* publisher_email
* publisher_institution
* publisher_name
* publisher_type
* publisher_url

##### New product specific information
* comment
* keywords
* title
* summary
* id
* product_name (file name)
* product_type (temporal resolution e.g. day)
* product_version
* product_level (L3, L4, etc)

##### Project specific information (SOE?)
* project

##### Merge from all source files
* platform
* sensor
* instruments
* source
* references

#### Attributes to copy from "grid" source
* 'geospatial_lat_max': np.float32(90.0),
* 'geospatial_lat_min': np.float32(-90.0),
* 'geospatial_lat_resolution': '.04166666666666666666',
* 'geospatial_lat_units': 'decimal degrees north',
* 'geospatial_lon_max': np.float32(180.0),
* 'geospatial_lon_min': np.float32(-180.0),
* 'geospatial_lon_resolution': '.04166666666666666666',
* 'geospatial_lon_units': 'decimal degrees east',
* 'geospatial_vertical_max': np.float32(0.0),
* 'geospatial_vertical_min': np.float32(0.0),
* 'time_coverage_duration': 'P1D',
* 'time_coverage_end': '199801012359Z',
* 'time_coverage_resolution': 'P1D',
* 'time_coverage_start': '199801010000Z',

#### Attributes to check
* keywords_vocabulary


#### Attributes to remove?
* git_commit_hash
* tracking_id
* number_of_bands_used_to_classify
* number_of_optical_water_types

In [None]:
# Attributes to update




In [None]:
""" 
    ds_out.attrs.update({
        "title": "Estimated Primary Production from Chlorophyll",
        "summary": "Derived using Eppley and VGPM models from satellite chlorophyll data",
        "history": f"{chl_ds.attrs.get('history', '')}; Processed with PP models on {pd.Timestamp.now()}",
        "references": "Behrenfeld & Falkowski (1997), Morel (1991)",
        "processing_level": "L3",
        "creator_name": "Kim",
        "software_version": "v1.0",
    })
"""


### Read Metadata Spreadsheet

In [None]:
from utilities import get_python_dir

# Locate the metadata workbook inside the resources directory.
# (Named resources_dir so the builtin dir() is not shadowed for the rest of the session.)
resources_dir = get_python_dir(resources=True)
metapath = os.path.join(resources_dir,'metadata','EDAB_metadata.xlsx')

# sheet_name=None reads every sheet and returns {sheet name: DataFrame}
metadict = pd.read_excel(metapath,sheet_name=None)
# Stack all sheets into one table; the original row indices are discarded
allmeta = pd.concat(metadict.values(), ignore_index=True)
print(allmeta.head())

In [None]:
def read_metadata_lookup(excel_path: str) -> Dict[str, Dict[str, Any]]:
    """
    Read an Excel workbook whose sheets each hold an attribute/value mapping.

    Every sheet must contain the columns 'Attribute' and 'Value'.

    Args:
        excel_path: Path to the Excel workbook.

    Returns:
        A dictionary of the form {sheet_name: {attribute_name: value, ...}, ...}.

    Raises:
        ValueError: If any sheet lacks an 'Attribute' or 'Value' column.
    """
    required_columns = ("Attribute", "Value")
    metadata_dict: Dict[str, Dict[str, Any]] = {}
    # The context manager closes the workbook handle even when a sheet is rejected
    with pd.ExcelFile(excel_path) as xls:
        for sheet in xls.sheet_names:
            df = xls.parse(sheet)
            if not all(column in df.columns for column in required_columns):
                raise ValueError(f"Sheet '{sheet}' must contain 'Attribute' and 'Value' columns.")
            metadata_dict[sheet] = dict(zip(df["Attribute"], df["Value"]))
    return metadata_dict

In [None]:
def extract_netcdf_metadata(nc_path: str) -> Dict[str, Any]:
    """
    Return the global attributes of a NetCDF file as a plain dictionary.

    The file is opened with xarray and closed again before returning.
    """
    with xr.open_dataset(nc_path) as dataset:
        global_attrs = {name: value for name, value in dataset.attrs.items()}
    return global_attrs

In [None]:
def update_netcdf_metadata(nc_path: str, updates: Dict[str, Any], output_path: Optional[str] = None) -> None:
    """
    Update or add global attributes in a NetCDF file.

    Args:
        nc_path: Path of the NetCDF file to read.
        updates: Attribute names and values to merge into the global attributes.
        output_path: Where to write the result. If omitted, the original file
            is overwritten.

    Note:
        When the result overwrites the input file, the dataset is loaded fully
        into memory first so the source file can be closed before it is
        rewritten; writing to a file that is still open for reading is not
        reliable.
    """
    target_path = output_path or nc_path
    overwrite_in_place = Path(target_path).resolve() == Path(nc_path).resolve()

    with xr.open_dataset(nc_path) as ds:
        ds.attrs.update(updates)
        if not overwrite_in_place:
            # Different destination: the source can stay open while writing
            ds.to_netcdf(target_path)
            return
        # Same destination: pull all data into memory before the handle closes
        ds.load()

    # Source file is closed now, so it can be replaced safely
    ds.to_netcdf(target_path)

In [None]:
def apply_metadata_updates(nc_path: str, excel_path: str, sheet: str = 'global', output_path: str = None) -> None:
    """
    Apply the attribute/value pairs from one sheet of an Excel workbook to a NetCDF file.

    Args:
        nc_path: NetCDF file to update.
        excel_path: Workbook read with read_metadata_lookup.
        sheet: Name of the sheet whose mapping is applied.
        output_path: Passed through to update_netcdf_metadata.

    Raises:
        KeyError: If the workbook has no sheet named *sheet*.
    """
    lookup_by_sheet = read_metadata_lookup(excel_path)
    if sheet in lookup_by_sheet:
        update_netcdf_metadata(nc_path, lookup_by_sheet[sheet], output_path)
        return
    raise KeyError(f"Sheet '{sheet}' not found in Excel file.")

In [None]:
# Example call: apply the "global" sheet of metadata_lookup.xlsx to data.nc and
# write the result to a separate file, leaving data.nc untouched.
# NOTE(review): the file names look like placeholders — confirm they exist before running.
apply_metadata_updates(
    nc_path="data.nc",
    excel_path="metadata_lookup.xlsx",
    sheet="global",  # or any other sheet name
    output_path="data_updated.nc"
)