Merge pull request #47 from andersy005/master

Add utilities for building catalogs
NCAR · Jan 7, 2020 · 3e46a0e · 3e46a0e
2 parents 59a3dbf + ba58c2d
commit 3e46a0e
Show file tree

Hide file tree

Showing 16 changed files with 475 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -109,3 +109,8 @@ worker-*
 slurm-*
 dask-worker*
 *.txt
+builders/*csv.gz
+builders/Convert nc stores to json.ipynb
+builders/Untitled.ipynb
+builders/Untitled1.ipynb
+builders/notebooks/glade-cmip6_catalog_builder copy.ipynb
diff --git a/.isort.cfg b/.isort.cfg
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.3.0
+    rev: v2.4.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -10,13 +10,13 @@ repos:
       - id: double-quote-string-fixer
 
   - repo: https://github.com/ambv/black
-    rev: 19.3b0
+    rev: 19.10b0
     hooks:
       - id: black
         args: ["--line-length", "100", "--skip-string-normalization"]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.3.0
+    rev: v2.4.0
     hooks:
       - id: flake8
   - repo: https://github.com/asottile/seed-isort-config
@@ -29,6 +29,15 @@ repos:
       - id: isort
 
   - repo: https://github.com/prettier/prettier
-    rev: 1.18.2
+    rev: 1.19.1
     hooks:
       - id: prettier
+  # - repo: local
+  #   #rev: v0.8.0
+  #   hooks:
+  #     - id: wily
+  #       name: wily
+  #       entry: wily diff
+  #       verbose: true
+  #       language: python
+  #       additional_dependencies: [wily]
diff --git a/builders/README.md b/builders/README.md
@@ -0,0 +1,30 @@
+# Scripts to build catalogs
+
+- [Scripts to build catalogs](#scripts-to-build-catalogs)
+  - [CMIP5 and CMIP6](#cmip5-and-cmip6)
+  - [TODO: CESM](#todo-cesm)
+
+## CMIP5 and CMIP6
+
+```bash
+$ python cmip.py --help
+Usage: cmip.py [OPTIONS]
+
+Options:
+  --root-path PATH            Root path of the CMIP project output.
+  -d, --depth INTEGER         Recursion depth. Recursively walk root_path to a
+                              specified depth  [default: 4]
+  --pick-latest-version       Whether to only catalog lastest version of data
+                              assets or keep all versions  [default: False]
+  -v, --cmip-version INTEGER  CMIP phase (e.g. 5 for CMIP5 or 6 for CMIP6)
+  --csv-filepath TEXT         File path to use when saving the built catalog
+  --help                      Show this message and exit.
+```
+
+**Example:**
+
+```bash
+$ python cmip.py --root-path /glade/collections/cmip/CMIP6 --pick-latest-version --cmip-version 6 --csv-filepath ../catalogs/glade-cmip6.csv.gz --depth 5
+```
+
+## TODO: CESM
diff --git a/builders/cesm.py b/builders/cesm.py
@@ -0,0 +1,66 @@
+import os
+
+import yaml
+from core import Builder, extract_attr_with_regex, get_asset_list
+from requests.structures import CaseInsensitiveDict
+
+with open('cesm2_cmip6.yaml') as f:
+    cesm2_cmip6_definitions = yaml.safe_load(f)
+
+component_streams = cesm2_cmip6_definitions['component_streams']
+cesm2_cmip6_exps = CaseInsensitiveDict(cesm2_cmip6_definitions['experiments'])
+date_str_regex = r'\d{4}\-\d{4}|\d{6}\-\d{6}|\d{8}\-\d{8}|\d{10}Z\-\d{10}Z|\d{12}Z\-\d{12}Z|\d{10}\-\d{10}|\d{12}\-\d{12}'
+
+
+def cesm2_cmip6_parser(filepath):
+    file_basename = os.path.basename(filepath)
+    date_str = extract_attr_with_regex(file_basename, date_str_regex)
+    for component, streams in component_streams.items():
+        # Loop over stream strings
+        # NOTE: The order matters here!
+        for stream in sorted(streams, key=lambda s: len(s), reverse=True):
+            s = file_basename.find(stream)
+            if s >= 0:
+                file_stream = stream
+                file_component = component
+                x = file_basename.split(stream)
+                case = x[0].strip('.')
+                file_experiment = '.'.join(case.split('.')[4:-1])
+                y = x[-1].strip('.').split(date_str)
+                variable = y[0].strip('.')
+                break
+
+    attrs = {}
+    attrs['path'] = filepath
+    attrs['case'] = case
+    attrs['variable'] = variable
+    attrs['date_range'] = date_str
+    attrs['stream'] = file_stream
+    attrs['component'] = file_component
+    attrs['experiment'] = file_experiment
+    try:
+        case_members = cesm2_cmip6_exps[file_experiment]['case_members']
+        attrs.update(case_members[case])
+    except Exception:
+        pass
+    return attrs
+
+
+def build_cesm(root_path, depth=4, exclude_patterns=['*.nc_temp_.nc']):
+    columns = [
+        'experiment',
+        'case',
+        'component',
+        'stream',
+        'variable',
+        'date_range',
+        'member_id',
+        'path',
+        'ctrl_branch_year',
+        'ctrl_experiment',
+        'ctrl_member_id',
+    ]
+    filelist = get_asset_list(root_path, depth=depth)
+    b = Builder(columns, exclude_patterns)
+    df = b(filelist, cesm2_cmip6_parser)
+    return df
diff --git a/builders/cmip.py b/builders/cmip.py
@@ -0,0 +1,225 @@
+import os
+
+import click
+import dask
+import numpy as np
+import requests
+from core import Builder, extract_attr_with_regex, get_asset_list, reverse_filename_format
+from dask.diagnostics import ProgressBar
+
+
+def cmip6_parser(filepath):
+    """
+    Extract attributes of a file using information from CMI6 DRS.
+    References
+
+    CMIP6 DRS: http://goo.gl/v1drZl
+    Controlled Vocabularies (CVs) for use in CMIP6: https://github.com/WCRP-CMIP/CMIP6_CVs
+    Directory structure =
+
+    <mip_era>/
+        <activity_id>/
+            <institution_id>/
+                <source_id>/
+                    <experiment_id>/
+                        <member_id>/
+                            <table_id>/
+                                <variable_id>/
+                                    <grid_label>/
+                                        <version>
+    file name = <variable_id>_<table_id>_<source_id>_<experiment_id >_<member_id>_<grid_label>[_<time_range>].nc For time-invariant fields, the last segment (time_range) above is omitted. Example when there is no sub-experiment: tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc Example with a sub-experiment: pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc
+    """
+    basename = os.path.basename(filepath)
+    filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'
+
+    gridspec_template = (
+        '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'
+    )
+    templates = [filename_template, gridspec_template]
+    fileparts = reverse_filename_format(basename, templates=templates)
+    try:
+        parent = os.path.dirname(filepath).strip('/')
+        parent_split = parent.split(f"/{fileparts['source_id']}/")
+        part_1 = parent_split[0].strip('/').split('/')
+        grid_label = parent.split(f"/{fileparts['variable_id']}/")[1].strip('/').split('/')[0]
+        fileparts['grid_label'] = grid_label
+        fileparts['activity_id'] = part_1[-2]
+        fileparts['institution_id'] = part_1[-1]
+        version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'
+        version = extract_attr_with_regex(parent, regex=version_regex) or 'v0'
+        fileparts['version'] = version
+        fileparts['path'] = filepath
+        if fileparts['member_id'].startswith('s'):
+            fileparts['dcpp_init_year'] = float(fileparts['member_id'].split('-')[0][1:])
+            fileparts['member_id'] = fileparts['member_id'].split('-')[-1]
+        else:
+            fileparts['dcpp_init_year'] = np.nan
+
+    except Exception:
+        pass
+
+    return fileparts
+
+
+def cmip5_parser(filepath):
+    """ Extract attributes of a file using information from CMIP5 DRS.
+    Notes
+    -----
+    Reference:
+    - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27
+    """
+
+    freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'
+    realm_regex = r'aerosol|atmos|land|landIce|ocean|ocnBgchem|seaIce'
+    version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'
+
+    file_basename = os.path.basename(filepath)
+
+    filename_template = (
+        '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'
+    )
+    gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'
+
+    templates = [filename_template, gridspec_template]
+    fileparts = reverse_filename_format(file_basename, templates)
+    frequency = extract_attr_with_regex(filepath, regex=freq_regex, strip_chars='/')
+    realm = extract_attr_with_regex(filepath, regex=realm_regex)
+    version = extract_attr_with_regex(filepath, regex=version_regex) or 'v0'
+    fileparts['frequency'] = frequency
+    fileparts['modeling_realm'] = realm
+    fileparts['version'] = version
+    fileparts['path'] = filepath
+    try:
+        part1, part2 = os.path.dirname(filepath).split(fileparts['experiment'])
+        part1 = part1.strip('/').split('/')
+        fileparts['institute'] = part1[-2]
+        fileparts['product_id'] = part1[-3]
+    except Exception:
+        pass
+
+    return fileparts
+
+
+def _pick_latest_version(df):
+    import itertools
+
+    grpby = list(set(df.columns.tolist()) - {'path', 'version'})
+    groups = df.groupby(grpby)
+
+    @dask.delayed
+    def _pick_latest_v(group):
+        idx = []
+        if group.version.nunique() > 1:
+            idx = group.sort_values(by=['version'], ascending=False).index[1:].values.tolist()
+        return idx
+
+    idx_to_remove = [_pick_latest_v(group) for _, group in groups]
+    print('Getting latest version...\n')
+    with ProgressBar():
+        idx_to_remove = dask.compute(*idx_to_remove)
+
+    idx_to_remove = list(itertools.chain(*idx_to_remove))
+    df = df.drop(index=idx_to_remove)
+    print('\nDone....\n')
+    return df
+
+
+def build_cmip(
+    root_path,
+    cmip_version,
+    depth=4,
+    columns=None,
+    exclude_patterns=['*/files/*', '*/latest/*'],
+    pick_latest_version=False,
+):
+    parsers = {'6': cmip6_parser, '5': cmip5_parser}
+    cmip_columns = {
+        '6': [
+            'activity_id',
+            'institution_id',
+            'source_id',
+            'experiment_id',
+            'member_id',
+            'table_id',
+            'variable_id',
+            'grid_label',
+            'dcpp_init_year',
+            'version',
+            'time_range',
+            'path',
+        ],
+        '5': [
+            'product_id',
+            'institute',
+            'model',
+            'experiment',
+            'frequency',
+            'modeling_realm',
+            'mip_table',
+            'ensemble_member',
+            'variable',
+            'temporal_subset',
+            'version',
+            'path',
+        ],
+    }
+
+    filelist = get_asset_list(root_path, depth=depth)
+    cmip_version = str(cmip_version)
+    if columns is None:
+        columns = cmip_columns[cmip_version]
+    b = Builder(columns, exclude_patterns)
+    df = b(filelist, parsers[cmip_version])
+
+    if cmip_version == '6':
+        # Some entries are invalid: Don't conform to the CMIP6 Data Reference Syntax
+        cmip6_activity_id_url = (
+            'https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_activity_id.json'
+        )
+        resp = requests.get(cmip6_activity_id_url)
+        activity_ids = list(resp.json()['activity_id'].keys())
+        # invalids = df[~df.activity_id.isin(activity_ids)]
+        df = df[df.activity_id.isin(activity_ids)]
+    if pick_latest_version:
+        df = _pick_latest_version(df)
+    return df
+
+
+@click.command()
+@click.option(
+    '--root-path', type=click.Path(exists=True), help='Root path of the CMIP project output.'
+)
+@click.option(
+    '-d',
+    '--depth',
+    default=4,
+    type=int,
+    show_default=True,
+    help='Recursion depth. Recursively walk root_path to a specified depth',
+)
+@click.option(
+    '--pick-latest-version',
+    default=False,
+    is_flag=True,
+    show_default=True,
+    help='Whether to only catalog lastest version of data assets or keep all versions',
+)
+@click.option('-v', '--cmip-version', type=int, help='CMIP phase (e.g. 5 for CMIP5 or 6 for CMIP6)')
+@click.option('--csv-filepath', type=str, help='File path to use when saving the built catalog')
+def cli(root_path, depth, pick_latest_version, cmip_version, csv_filepath):
+
+    if cmip_version not in set([5, 6]):
+        raise ValueError(
+            f'cmip_version = {cmip_version} is not valid. Valid options include: 5 and 6.'
+        )
+
+    if csv_filepath is None:
+        raise ValueError("Please provide csv-filepath. e.g.: './cmip5.csv.gz'")
+
+    df = build_cmip(root_path, cmip_version, depth=depth, pick_latest_version=pick_latest_version)
+
+    df.to_csv(csv_filepath, compression='gzip', index=False)
+
+
+if __name__ == '__main__':
+    cli()