Skip to content

Commit

Permalink
Merge pull request #47 from andersy005/master
Browse files Browse the repository at this point in the history
Add utilities for building catalogs
  • Loading branch information
andersy005 committed Jan 7, 2020
2 parents 59a3dbf + ba58c2d commit 3e46a0e
Show file tree
Hide file tree
Showing 16 changed files with 475 additions and 6 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,8 @@ worker-*
slurm-*
dask-worker*
*.txt
builders/*csv.gz
builders/Convert nc stores to json.ipynb
builders/Untitled.ipynb
builders/Untitled1.ipynb
builders/notebooks/glade-cmip6_catalog_builder copy.ipynb
2 changes: 0 additions & 2 deletions .isort.cfg

This file was deleted.

17 changes: 13 additions & 4 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
rev: v2.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand All @@ -10,13 +10,13 @@ repos:
- id: double-quote-string-fixer

- repo: https://github.com/ambv/black
rev: 19.3b0
rev: 19.10b0
hooks:
- id: black
args: ["--line-length", "100", "--skip-string-normalization"]

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
rev: v2.4.0
hooks:
- id: flake8
- repo: https://github.com/asottile/seed-isort-config
Expand All @@ -29,6 +29,15 @@ repos:
- id: isort

- repo: https://github.com/prettier/prettier
rev: 1.18.2
rev: 1.19.1
hooks:
- id: prettier
# - repo: local
# #rev: v0.8.0
# hooks:
# - id: wily
# name: wily
# entry: wily diff
# verbose: true
# language: python
# additional_dependencies: [wily]
30 changes: 30 additions & 0 deletions builders/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Scripts to build catalogs

- [Scripts to build catalogs](#scripts-to-build-catalogs)
- [CMIP5 and CMIP6](#cmip5-and-cmip6)
- [TODO: CESM](#todo-cesm)

## CMIP5 and CMIP6

```bash
$ python cmip.py --help
Usage: cmip.py [OPTIONS]

Options:
--root-path PATH Root path of the CMIP project output.
-d, --depth INTEGER Recursion depth. Recursively walk root_path to a
specified depth [default: 4]
--pick-latest-version Whether to only catalog lastest version of data
assets or keep all versions [default: False]
-v, --cmip-version INTEGER CMIP phase (e.g. 5 for CMIP5 or 6 for CMIP6)
--csv-filepath TEXT File path to use when saving the built catalog
--help Show this message and exit.
```

**Example:**

```bash
$ python cmip.py --root-path /glade/collections/cmip/CMIP6 --pick-latest-version --cmip-version 6 --csv-filepath ../catalogs/glade-cmip6.csv.gz --depth 5
```

## TODO: CESM
66 changes: 66 additions & 0 deletions builders/cesm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os

import yaml
from core import Builder, extract_attr_with_regex, get_asset_list
from requests.structures import CaseInsensitiveDict

with open('cesm2_cmip6.yaml') as f:
cesm2_cmip6_definitions = yaml.safe_load(f)

component_streams = cesm2_cmip6_definitions['component_streams']
cesm2_cmip6_exps = CaseInsensitiveDict(cesm2_cmip6_definitions['experiments'])
date_str_regex = r'\d{4}\-\d{4}|\d{6}\-\d{6}|\d{8}\-\d{8}|\d{10}Z\-\d{10}Z|\d{12}Z\-\d{12}Z|\d{10}\-\d{10}|\d{12}\-\d{12}'


def cesm2_cmip6_parser(filepath):
file_basename = os.path.basename(filepath)
date_str = extract_attr_with_regex(file_basename, date_str_regex)
for component, streams in component_streams.items():
# Loop over stream strings
# NOTE: The order matters here!
for stream in sorted(streams, key=lambda s: len(s), reverse=True):
s = file_basename.find(stream)
if s >= 0:
file_stream = stream
file_component = component
x = file_basename.split(stream)
case = x[0].strip('.')
file_experiment = '.'.join(case.split('.')[4:-1])
y = x[-1].strip('.').split(date_str)
variable = y[0].strip('.')
break

attrs = {}
attrs['path'] = filepath
attrs['case'] = case
attrs['variable'] = variable
attrs['date_range'] = date_str
attrs['stream'] = file_stream
attrs['component'] = file_component
attrs['experiment'] = file_experiment
try:
case_members = cesm2_cmip6_exps[file_experiment]['case_members']
attrs.update(case_members[case])
except Exception:
pass
return attrs


def build_cesm(root_path, depth=4, exclude_patterns=['*.nc_temp_.nc']):
columns = [
'experiment',
'case',
'component',
'stream',
'variable',
'date_range',
'member_id',
'path',
'ctrl_branch_year',
'ctrl_experiment',
'ctrl_member_id',
]
filelist = get_asset_list(root_path, depth=depth)
b = Builder(columns, exclude_patterns)
df = b(filelist, cesm2_cmip6_parser)
return df
225 changes: 225 additions & 0 deletions builders/cmip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
import os

import click
import dask
import numpy as np
import requests
from core import Builder, extract_attr_with_regex, get_asset_list, reverse_filename_format
from dask.diagnostics import ProgressBar


def cmip6_parser(filepath):
"""
Extract attributes of a file using information from CMI6 DRS.
References
CMIP6 DRS: http://goo.gl/v1drZl
Controlled Vocabularies (CVs) for use in CMIP6: https://github.com/WCRP-CMIP/CMIP6_CVs
Directory structure =
<mip_era>/
<activity_id>/
<institution_id>/
<source_id>/
<experiment_id>/
<member_id>/
<table_id>/
<variable_id>/
<grid_label>/
<version>
file name = <variable_id>_<table_id>_<source_id>_<experiment_id >_<member_id>_<grid_label>[_<time_range>].nc For time-invariant fields, the last segment (time_range) above is omitted. Example when there is no sub-experiment: tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc Example with a sub-experiment: pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc
"""
basename = os.path.basename(filepath)
filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'

gridspec_template = (
'{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'
)
templates = [filename_template, gridspec_template]
fileparts = reverse_filename_format(basename, templates=templates)
try:
parent = os.path.dirname(filepath).strip('/')
parent_split = parent.split(f"/{fileparts['source_id']}/")
part_1 = parent_split[0].strip('/').split('/')
grid_label = parent.split(f"/{fileparts['variable_id']}/")[1].strip('/').split('/')[0]
fileparts['grid_label'] = grid_label
fileparts['activity_id'] = part_1[-2]
fileparts['institution_id'] = part_1[-1]
version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'
version = extract_attr_with_regex(parent, regex=version_regex) or 'v0'
fileparts['version'] = version
fileparts['path'] = filepath
if fileparts['member_id'].startswith('s'):
fileparts['dcpp_init_year'] = float(fileparts['member_id'].split('-')[0][1:])
fileparts['member_id'] = fileparts['member_id'].split('-')[-1]
else:
fileparts['dcpp_init_year'] = np.nan

except Exception:
pass

return fileparts


def cmip5_parser(filepath):
""" Extract attributes of a file using information from CMIP5 DRS.
Notes
-----
Reference:
- CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27
"""

freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'
realm_regex = r'aerosol|atmos|land|landIce|ocean|ocnBgchem|seaIce'
version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'

file_basename = os.path.basename(filepath)

filename_template = (
'{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'
)
gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'

templates = [filename_template, gridspec_template]
fileparts = reverse_filename_format(file_basename, templates)
frequency = extract_attr_with_regex(filepath, regex=freq_regex, strip_chars='/')
realm = extract_attr_with_regex(filepath, regex=realm_regex)
version = extract_attr_with_regex(filepath, regex=version_regex) or 'v0'
fileparts['frequency'] = frequency
fileparts['modeling_realm'] = realm
fileparts['version'] = version
fileparts['path'] = filepath
try:
part1, part2 = os.path.dirname(filepath).split(fileparts['experiment'])
part1 = part1.strip('/').split('/')
fileparts['institute'] = part1[-2]
fileparts['product_id'] = part1[-3]
except Exception:
pass

return fileparts


def _pick_latest_version(df):
import itertools

grpby = list(set(df.columns.tolist()) - {'path', 'version'})
groups = df.groupby(grpby)

@dask.delayed
def _pick_latest_v(group):
idx = []
if group.version.nunique() > 1:
idx = group.sort_values(by=['version'], ascending=False).index[1:].values.tolist()
return idx

idx_to_remove = [_pick_latest_v(group) for _, group in groups]
print('Getting latest version...\n')
with ProgressBar():
idx_to_remove = dask.compute(*idx_to_remove)

idx_to_remove = list(itertools.chain(*idx_to_remove))
df = df.drop(index=idx_to_remove)
print('\nDone....\n')
return df


def build_cmip(
root_path,
cmip_version,
depth=4,
columns=None,
exclude_patterns=['*/files/*', '*/latest/*'],
pick_latest_version=False,
):
parsers = {'6': cmip6_parser, '5': cmip5_parser}
cmip_columns = {
'6': [
'activity_id',
'institution_id',
'source_id',
'experiment_id',
'member_id',
'table_id',
'variable_id',
'grid_label',
'dcpp_init_year',
'version',
'time_range',
'path',
],
'5': [
'product_id',
'institute',
'model',
'experiment',
'frequency',
'modeling_realm',
'mip_table',
'ensemble_member',
'variable',
'temporal_subset',
'version',
'path',
],
}

filelist = get_asset_list(root_path, depth=depth)
cmip_version = str(cmip_version)
if columns is None:
columns = cmip_columns[cmip_version]
b = Builder(columns, exclude_patterns)
df = b(filelist, parsers[cmip_version])

if cmip_version == '6':
# Some entries are invalid: Don't conform to the CMIP6 Data Reference Syntax
cmip6_activity_id_url = (
'https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/master/CMIP6_activity_id.json'
)
resp = requests.get(cmip6_activity_id_url)
activity_ids = list(resp.json()['activity_id'].keys())
# invalids = df[~df.activity_id.isin(activity_ids)]
df = df[df.activity_id.isin(activity_ids)]
if pick_latest_version:
df = _pick_latest_version(df)
return df


@click.command()
@click.option(
'--root-path', type=click.Path(exists=True), help='Root path of the CMIP project output.'
)
@click.option(
'-d',
'--depth',
default=4,
type=int,
show_default=True,
help='Recursion depth. Recursively walk root_path to a specified depth',
)
@click.option(
'--pick-latest-version',
default=False,
is_flag=True,
show_default=True,
help='Whether to only catalog lastest version of data assets or keep all versions',
)
@click.option('-v', '--cmip-version', type=int, help='CMIP phase (e.g. 5 for CMIP5 or 6 for CMIP6)')
@click.option('--csv-filepath', type=str, help='File path to use when saving the built catalog')
def cli(root_path, depth, pick_latest_version, cmip_version, csv_filepath):

if cmip_version not in set([5, 6]):
raise ValueError(
f'cmip_version = {cmip_version} is not valid. Valid options include: 5 and 6.'
)

if csv_filepath is None:
raise ValueError("Please provide csv-filepath. e.g.: './cmip5.csv.gz'")

df = build_cmip(root_path, cmip_version, depth=depth, pick_latest_version=pick_latest_version)

df.to_csv(csv_filepath, compression='gzip', index=False)


if __name__ == '__main__':
cli()
Loading

0 comments on commit 3e46a0e

Please sign in to comment.