-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #47 from andersy005/master
Add utilities for building catalogs
- Loading branch information
Showing
16 changed files
with
475 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Scripts to build catalogs | ||
|
||
- [Scripts to build catalogs](#scripts-to-build-catalogs) | ||
- [CMIP5 and CMIP6](#cmip5-and-cmip6) | ||
- [TODO: CESM](#todo-cesm) | ||
|
||
## CMIP5 and CMIP6 | ||
|
||
```bash | ||
$ python cmip.py --help | ||
Usage: cmip.py [OPTIONS] | ||
|
||
Options: | ||
--root-path PATH Root path of the CMIP project output. | ||
-d, --depth INTEGER Recursion depth. Recursively walk root_path to a | ||
specified depth [default: 4] | ||
--pick-latest-version Whether to only catalog lastest version of data | ||
assets or keep all versions [default: False] | ||
-v, --cmip-version INTEGER CMIP phase (e.g. 5 for CMIP5 or 6 for CMIP6) | ||
--csv-filepath TEXT File path to use when saving the built catalog | ||
--help Show this message and exit. | ||
``` | ||
|
||
**Example:** | ||
|
||
```bash | ||
$ python cmip.py --root-path /glade/collections/cmip/CMIP6 --pick-latest-version --cmip-version 6 --csv-filepath ../catalogs/glade-cmip6.csv.gz --depth 5 | ||
``` | ||
|
||
## TODO: CESM |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import os | ||
|
||
import yaml | ||
from core import Builder, extract_attr_with_regex, get_asset_list | ||
from requests.structures import CaseInsensitiveDict | ||
|
||
with open('cesm2_cmip6.yaml') as f: | ||
cesm2_cmip6_definitions = yaml.safe_load(f) | ||
|
||
component_streams = cesm2_cmip6_definitions['component_streams'] | ||
cesm2_cmip6_exps = CaseInsensitiveDict(cesm2_cmip6_definitions['experiments']) | ||
date_str_regex = r'\d{4}\-\d{4}|\d{6}\-\d{6}|\d{8}\-\d{8}|\d{10}Z\-\d{10}Z|\d{12}Z\-\d{12}Z|\d{10}\-\d{10}|\d{12}\-\d{12}' | ||
|
||
|
||
def cesm2_cmip6_parser(filepath): | ||
file_basename = os.path.basename(filepath) | ||
date_str = extract_attr_with_regex(file_basename, date_str_regex) | ||
for component, streams in component_streams.items(): | ||
# Loop over stream strings | ||
# NOTE: The order matters here! | ||
for stream in sorted(streams, key=lambda s: len(s), reverse=True): | ||
s = file_basename.find(stream) | ||
if s >= 0: | ||
file_stream = stream | ||
file_component = component | ||
x = file_basename.split(stream) | ||
case = x[0].strip('.') | ||
file_experiment = '.'.join(case.split('.')[4:-1]) | ||
y = x[-1].strip('.').split(date_str) | ||
variable = y[0].strip('.') | ||
break | ||
|
||
attrs = {} | ||
attrs['path'] = filepath | ||
attrs['case'] = case | ||
attrs['variable'] = variable | ||
attrs['date_range'] = date_str | ||
attrs['stream'] = file_stream | ||
attrs['component'] = file_component | ||
attrs['experiment'] = file_experiment | ||
try: | ||
case_members = cesm2_cmip6_exps[file_experiment]['case_members'] | ||
attrs.update(case_members[case]) | ||
except Exception: | ||
pass | ||
return attrs | ||
|
||
|
||
def build_cesm(root_path, depth=4, exclude_patterns=['*.nc_temp_.nc']): | ||
columns = [ | ||
'experiment', | ||
'case', | ||
'component', | ||
'stream', | ||
'variable', | ||
'date_range', | ||
'member_id', | ||
'path', | ||
'ctrl_branch_year', | ||
'ctrl_experiment', | ||
'ctrl_member_id', | ||
] | ||
filelist = get_asset_list(root_path, depth=depth) | ||
b = Builder(columns, exclude_patterns) | ||
df = b(filelist, cesm2_cmip6_parser) | ||
return df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
import os | ||
|
||
import click | ||
import dask | ||
import numpy as np | ||
import requests | ||
from core import Builder, extract_attr_with_regex, get_asset_list, reverse_filename_format | ||
from dask.diagnostics import ProgressBar | ||
|
||
|
||
def cmip6_parser(filepath): | ||
""" | ||
Extract attributes of a file using information from CMI6 DRS. | ||
References | ||
CMIP6 DRS: http://goo.gl/v1drZl | ||
Controlled Vocabularies (CVs) for use in CMIP6: https://github.com/WCRP-CMIP/CMIP6_CVs | ||
Directory structure = | ||
<mip_era>/ | ||
<activity_id>/ | ||
<institution_id>/ | ||
<source_id>/ | ||
<experiment_id>/ | ||
<member_id>/ | ||
<table_id>/ | ||
<variable_id>/ | ||
<grid_label>/ | ||
<version> | ||
file name = <variable_id>_<table_id>_<source_id>_<experiment_id >_<member_id>_<grid_label>[_<time_range>].nc For time-invariant fields, the last segment (time_range) above is omitted. Example when there is no sub-experiment: tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc Example with a sub-experiment: pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc | ||
""" | ||
basename = os.path.basename(filepath) | ||
filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc' | ||
|
||
gridspec_template = ( | ||
'{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc' | ||
) | ||
templates = [filename_template, gridspec_template] | ||
fileparts = reverse_filename_format(basename, templates=templates) | ||
try: | ||
parent = os.path.dirname(filepath).strip('/') | ||
parent_split = parent.split(f"/{fileparts['source_id']}/") | ||
part_1 = parent_split[0].strip('/').split('/') | ||
grid_label = parent.split(f"/{fileparts['variable_id']}/")[1].strip('/').split('/')[0] | ||
fileparts['grid_label'] = grid_label | ||
fileparts['activity_id'] = part_1[-2] | ||
fileparts['institution_id'] = part_1[-1] | ||
version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}' | ||
version = extract_attr_with_regex(parent, regex=version_regex) or 'v0' | ||
fileparts['version'] = version | ||
fileparts['path'] = filepath | ||
if fileparts['member_id'].startswith('s'): | ||
fileparts['dcpp_init_year'] = float(fileparts['member_id'].split('-')[0][1:]) | ||
fileparts['member_id'] = fileparts['member_id'].split('-')[-1] | ||
else: | ||
fileparts['dcpp_init_year'] = np.nan | ||
|
||
except Exception: | ||
pass | ||
|
||
return fileparts | ||
|
||
|
||
def cmip5_parser(filepath): | ||
""" Extract attributes of a file using information from CMIP5 DRS. | ||
Notes | ||
----- | ||
Reference: | ||
- CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27 | ||
""" | ||
|
||
freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/' | ||
realm_regex = r'aerosol|atmos|land|landIce|ocean|ocnBgchem|seaIce' | ||
version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}' | ||
|
||
file_basename = os.path.basename(filepath) | ||
|
||
filename_template = ( | ||
'{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc' | ||
) | ||
gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc' | ||
|
||
templates = [filename_template, gridspec_template] | ||
fileparts = reverse_filename_format(file_basename, templates) | ||
frequency = extract_attr_with_regex(filepath, regex=freq_regex, strip_chars='/') | ||
realm = extract_attr_with_regex(filepath, regex=realm_regex) | ||
version = extract_attr_with_regex(filepath, regex=version_regex) or 'v0' | ||
fileparts['frequency'] = frequency | ||
fileparts['modeling_realm'] = realm | ||
fileparts['version'] = version | ||
fileparts['path'] = filepath | ||
try: | ||
part1, part2 = os.path.dirname(filepath).split(fileparts['experiment']) | ||
part1 = part1.strip('/').split('/') | ||
fileparts['institute'] = part1[-2] | ||
fileparts['product_id'] = part1[-3] | ||
except Exception: | ||
pass | ||
|
||
return fileparts | ||
|
||
|
||
def _pick_latest_version(df): | ||
import itertools | ||
|
||
grpby = list(set(df.columns.tolist()) - {'path', 'version'}) | ||
groups = df.groupby(grpby) | ||
|
||
@dask.delayed | ||
def _pick_latest_v(group): | ||
idx = [] | ||
if group.version.nunique() > 1: | ||
idx = group.sort_values(by=['version'], ascending=False).index[1:].values.tolist() | ||
return idx | ||
|
||
idx_to_remove = [_pick_latest_v(group) for _, group in groups] | ||
print('Getting latest version...\n') | ||
with ProgressBar(): | ||
idx_to_remove = dask.compute(*idx_to_remove) | ||
|
||
idx_to_remove = list(itertools.chain(*idx_to_remove)) | ||
df = df.drop(index=idx_to_remove) | ||
print('\nDone....\n') | ||
return df | ||
|
||
|
||
def build_cmip( | ||
root_path, | ||
cmip_version, | ||
depth=4, | ||
columns=None, | ||
exclude_patterns=['*/files/*', '*/latest/*'], | ||
pick_latest_version=False, | ||
): | ||
parsers = {'6': cmip6_parser, '5': cmip5_parser} | ||
cmip_columns = { | ||
'6': [ | ||
'activity_id', | ||
'institution_id', | ||
'source_id', | ||
'experiment_id', | ||
'member_id', | ||
'table_id', | ||
'variable_id', | ||
'grid_label', | ||
'dcpp_init_year', | ||
'version', | ||
'time_range', | ||
'path', | ||
], | ||
'5': [ | ||
'product_id', | ||
'institute', | ||
'model', | ||
'experiment', | ||
'frequency', | ||
'modeling_realm', | ||
'mip_table', | ||
'ensemble_member', | ||
'variable', | ||
'temporal_subset', | ||
'version', | ||
'path', | ||
], | ||
} | ||
|
||
filelist = get_asset_list(root_path, depth=depth) | ||
cmip_version = str(cmip_version) | ||
if columns is None: | ||
columns = cmip_columns[cmip_version] | ||
b = Builder(columns, exclude_patterns) | ||
df = b(filelist, parsers[cmip_version]) | ||
|
||
if cmip_version == '6': | ||
# Some entries are invalid: Don't conform to the CMIP6 Data Reference Syntax | ||
cmip6_activity_id_url = ( | ||
'https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_activity_id.json' | ||
) | ||
resp = requests.get(cmip6_activity_id_url) | ||
activity_ids = list(resp.json()['activity_id'].keys()) | ||
# invalids = df[~df.activity_id.isin(activity_ids)] | ||
df = df[df.activity_id.isin(activity_ids)] | ||
if pick_latest_version: | ||
df = _pick_latest_version(df) | ||
return df | ||
|
||
|
||
@click.command() | ||
@click.option( | ||
'--root-path', type=click.Path(exists=True), help='Root path of the CMIP project output.' | ||
) | ||
@click.option( | ||
'-d', | ||
'--depth', | ||
default=4, | ||
type=int, | ||
show_default=True, | ||
help='Recursion depth. Recursively walk root_path to a specified depth', | ||
) | ||
@click.option( | ||
'--pick-latest-version', | ||
default=False, | ||
is_flag=True, | ||
show_default=True, | ||
help='Whether to only catalog lastest version of data assets or keep all versions', | ||
) | ||
@click.option('-v', '--cmip-version', type=int, help='CMIP phase (e.g. 5 for CMIP5 or 6 for CMIP6)') | ||
@click.option('--csv-filepath', type=str, help='File path to use when saving the built catalog') | ||
def cli(root_path, depth, pick_latest_version, cmip_version, csv_filepath): | ||
|
||
if cmip_version not in set([5, 6]): | ||
raise ValueError( | ||
f'cmip_version = {cmip_version} is not valid. Valid options include: 5 and 6.' | ||
) | ||
|
||
if csv_filepath is None: | ||
raise ValueError("Please provide csv-filepath. e.g.: './cmip5.csv.gz'") | ||
|
||
df = build_cmip(root_path, cmip_version, depth=depth, pick_latest_version=pick_latest_version) | ||
|
||
df.to_csv(csv_filepath, compression='gzip', index=False) | ||
|
||
|
||
if __name__ == '__main__': | ||
cli() |
Oops, something went wrong.