In [1]:
import pandas as pd
import fnmatch
import dask.dataframe as dd
from intake.source.utils import reverse_format
import os
import re
import subprocess
from tqdm.auto import tqdm
from pathlib import Path
import shutil
import numpy as np

## Create text file containing all files available

In [2]:
def get_file_list(persist_path):
    """Write one text file per top-level CMIP5 directory listing its netCDF files.

    For every sub-directory of ``/glade/collections/cmip/cmip5/`` this runs
    ``find -L <directory> -name "*.nc"`` and stores the command's standard
    output in ``<persist_path>/<directory stem>.txt``.

    Parameters
    ----------
    persist_path : str or path-like
        Directory that receives the listing files. It is created, together
        with any missing parents, when it does not exist yet.
    """
    root = Path("/glade/collections/cmip/cmip5/")
    p_path = Path(persist_path)
    p_path.mkdir(parents=True, exist_ok=True)
    dirs = [x for x in root.iterdir() if x.is_dir()]
    for directory in tqdm(dirs):
        print(directory)
        stem = directory.stem
        cmd = ["find", "-L", directory.as_posix(), "-name", "*.nc"]
        # The context manager closes the listing file even if ``find`` fails.
        with open(p_path / f"{stem}.txt", "w") as f:
            # stderr is discarded instead of piped: a pipe that nobody reads
            # can fill up and block ``find`` indefinitely.
            subprocess.run(cmd, stdout=f, stderr=subprocess.DEVNULL)
        

In [3]:
# Directory that get_file_list writes its listing files into.
persist_path = "./CMIP5_filelist"
# The call is left disabled; uncomment it to regenerate the listings.
#get_file_list(persist_path)


## Extract attributes of a file using information from CMIP6 DRS.


References
 1. CMIP6 DRS: http://goo.gl/v1drZl
 2. Controlled Vocabularies (CVs) for use in CMIP6:
    https://github.com/WCRP-CMIP/CMIP6_CVs
    
    
Directory structure =
```
<mip_era>/
    <activity_id>/
        <institution_id>/
            <source_id>/
                <experiment_id>/
                    <member_id>/
                        <table_id>/
                            <variable_id>/
                                <grid_label>/
                                    <version>
```
file name =
```<variable_id>_<table_id>_<source_id>_<experiment_id>_<member_id>_<grid_label>[_<time_range>].nc```
For time-invariant fields, the last segment (time_range) above is omitted.
Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`
Example with a sub-experiment:   `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`


In [35]:
# Exploratory look at the entries directly under the output1 directory.
a = Path("/glade/collections/cmip/cmip5/output1/")
a  # has no visible effect: only the last expression of a cell is displayed
for d in a.iterdir():
    print(d)

/glade/collections/cmip/cmip5/output1/CNRM-CERFACS
/glade/collections/cmip/cmip5/output1/UNSW
/glade/collections/cmip/cmip5/output1/BCC
/glade/collections/cmip/cmip5/output1/NCAR
/glade/collections/cmip/cmip5/output1/FIO
/glade/collections/cmip/cmip5/output1/MOHC
/glade/collections/cmip/cmip5/output1/CSIRO-BOM
/glade/collections/cmip/cmip5/output1/NOAA-GFDL
/glade/collections/cmip/cmip5/output1/NIMR-KMA
/glade/collections/cmip/cmip5/output1/NASA-GISS
/glade/collections/cmip/cmip5/output1/CSIRO-QCCCE
/glade/collections/cmip/cmip5/output1/CCCma
/glade/collections/cmip/cmip5/output1/INPE
/glade/collections/cmip/cmip5/output1/LASG-CESS
/glade/collections/cmip/cmip5/output1/INM
/glade/collections/cmip/cmip5/output1/ICHEC
/glade/collections/cmip/cmip5/output1/NSF-DOE-NCAR
/glade/collections/cmip/cmip5/output1/CMCC
/glade/collections/cmip/cmip5/output1/BNU
/glade/collections/cmip/cmip5/output1/LASG-IAP
/glade/collections/cmip/cmip5/output1/MRI
/glade/collections/cmip/cmip5/output1/MIROC
/glad

In [4]:
# Collect every .txt file found under persist_path (searched recursively) ...
activity_ids = list(Path(persist_path).rglob("*.txt"))
# ... and keep only the file stems, i.e. the names without the .txt suffix.
activity_ids = [activity_id.stem for activity_id in activity_ids]
activity_ids

['output1', 'output2', 'output']

In [18]:
# Read all listing files with dask (no header row) and materialize the result
# as a single in-memory DataFrame.
df = dd.read_csv(f"{persist_path}/*.txt", header=None).compute()
df.columns = ["path"]
df.head()

Unnamed: 0,path
0,/glade/collections/cmip/cmip5/output/CCCma/Can...
1,/glade/collections/cmip/cmip5/output/CCCma/Can...
2,/glade/collections/cmip/cmip5/output/CCCma/Can...
3,/glade/collections/cmip/cmip5/output/CCCma/Can...
4,/glade/collections/cmip/cmip5/output/CCCma/Can...


In [19]:
len(df)

927318

In [20]:
def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):
    """
    Uses intake's ``reverse_format`` utility to reverse the string method format.
    Given format_string and resolved_string, find arguments
    that would give format_string.format(arguments) == resolved_string

    Parameters
    ----------
    file_basename : str
        File name (without directory) to parse.
    filename_template : str, optional
        Format string tried first.
    gridspec_template : str, optional
        Format string tried when ``filename_template`` raises ``ValueError``.

    Returns
    -------
    dict
        Fields recovered from ``file_basename``; an empty dict when the
        fallback template fails too (a message is printed in that case).
    """
    try:
        return reverse_format(filename_template, file_basename)
    except ValueError:
        try:
            return reverse_format(gridspec_template, file_basename)
        # ``Exception`` keeps the best-effort fallback for any parsing error
        # but, unlike a bare ``except``, lets KeyboardInterrupt/SystemExit
        # propagate so a long-running map over many files can be interrupted.
        except Exception:
            print(
                f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'
            )
            return {}
            
def _extract_attr_with_regex(input_str, regex, strip_chars=None):
    pattern = re.compile(regex, re.IGNORECASE)
    match = re.findall(pattern, input_str)
    if match:
        match = max(match, key=len)
        if strip_chars:
            match = match.strip(strip_chars)

        else:
            match = match.strip()

        return match

    else:
        return None
    

exclude_patterns = ['*/files/*', '*/latest/*']
def _filter_func(path):
    return not any(
        fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns
    )



In [21]:
%%time
# Keep only the paths that match none of the globs in exclude_patterns.
files = df.path.tolist()
filelist = list(filter(_filter_func, files))

CPU times: user 3.68 s, sys: 94 ms, total: 3.77 s
Wall time: 3.76 s


In [22]:
len(filelist)

629942

In [23]:
# def get_attrs(filepath):
#     basename = os.path.basename(filepath)
#     dirname = os.path.dirname(filepath)
#     filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'

#     gridspec_template = (
#                 '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'
#             )
    
#     f = _reverse_filename_format(
#             basename, filename_template=filename_template, gridspec_template=gridspec_template
#         )

#     fileparts = {}
#     fileparts.update(f)
#     parent = os.path.dirname(filepath).strip('/')
#     parent_split = parent.split(f"/{fileparts['source_id']}/")
#     part_1 = parent_split[0].strip('/').split('/')
#     grid_label = parent.split(f"/{fileparts['variable_id']}/")[1].strip('/').split('/')[0]
#     fileparts['grid_label'] = grid_label
#     fileparts['activity_id'] = part_1[-2]
#     fileparts['institution_id'] = part_1[-1]
#     version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'
#     version = _extract_attr_with_regex(parent, regex=version_regex) or 'v0'
#     fileparts['version'] = version
#     fileparts['path'] = filepath
#     return fileparts 

def get_attrs(filepath):
    """ Extract attributes of a file using information from CMIP5 DRS.

    The file name is parsed with ``_reverse_filename_format``; frequency,
    modeling realm and version are searched for in the full path with
    ``_extract_attr_with_regex``.

    Parameters
    ----------
    filepath : str
        Path of the file to describe.

    Returns
    -------
    dict
        ``path`` plus whatever fields could be recovered from the file name
        (none when parsing failed), and ``frequency``, ``modeling_realm`` and
        ``version``. ``frequency`` and ``modeling_realm`` are ``None`` when
        not found in the path; ``version`` falls back to ``'v0'``.

    Notes
    -----
    Reference:
    - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27
    """

    fileparts = {}

    freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'
    # 'landIce' must come before 'land': regex alternation takes the first
    # alternative that matches, so the shorter prefix would otherwise win and
    # landIce paths would be reported as 'land'.
    realm_regex = r'aerosol|atmos|landIce|land|ocean|ocnBgchem|seaIce'
    version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'

    file_basename = os.path.basename(filepath)
    fileparts['path'] = filepath

    # Template for files that carry a trailing time-range segment ...
    filename_template = (
        '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'
    )
    # ... and the fallback for file names without that segment.
    gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'
    f = _reverse_filename_format(
        file_basename, filename_template=filename_template, gridspec_template=gridspec_template
    )
    fileparts.update(f)

    # The frequency alternatives are slash-delimited; strip the slashes again.
    frequency = _extract_attr_with_regex(
        filepath, regex=freq_regex, strip_chars='/'
    )
    realm = _extract_attr_with_regex(filepath, regex=realm_regex)
    version = _extract_attr_with_regex(filepath, regex=version_regex) or 'v0'
    fileparts['frequency'] = frequency
    fileparts['modeling_realm'] = realm
    fileparts['version'] = version

    return fileparts

In [24]:
get_attrs(filelist[0])

{'path': '/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r10i1p1/v20130331/hfls/hfls_Amon_CanCM4_historical_r10i1p1_196101-200512.nc',
 'variable': 'hfls',
 'mip_table': 'Amon',
 'model': 'CanCM4',
 'experiment': 'historical',
 'ensemble_member': 'r10i1p1',
 'temporal_subset': '196101-200512',
 'frequency': 'mon',
 'modeling_realm': 'atmos',
 'version': 'v20130331'}

In [25]:
%%time
# Build one attribute dict per retained path.
entries = list(map(get_attrs, filelist))

CPU times: user 28.6 s, sys: 639 ms, total: 29.2 s
Wall time: 29.3 s


In [26]:
entries[10]

{'path': '/glade/collections/cmip/cmip5/output/CCCma/CanCM4/historical/mon/atmos/r9i1p1/v20130331/tas/tas_Amon_CanCM4_historical_r9i1p1_196101-200512.nc',
 'variable': 'tas',
 'mip_table': 'Amon',
 'model': 'CanCM4',
 'experiment': 'historical',
 'ensemble_member': 'r9i1p1',
 'temporal_subset': '196101-200512',
 'frequency': 'mon',
 'modeling_realm': 'atmos',
 'version': 'v20130331'}

In [27]:
len(entries)

629942

In [28]:
# One row per attribute dict; note that this rebinds `df`, which previously
# held the raw path listing.
df = pd.DataFrame(entries)
# If a path occurs more than once, keep its last occurrence only.
df = df.drop_duplicates(subset=['path'], keep='last').reset_index(drop=True)
df.head()

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version
0,/glade/collections/cmip/cmip5/output/CCCma/Can...,hfls,Amon,CanCM4,historical,r10i1p1,196101-200512,mon,atmos,v20130331
1,/glade/collections/cmip/cmip5/output/CCCma/Can...,tas,Amon,CanCM4,historical,r10i1p1,196101-200512,mon,atmos,v20130331
2,/glade/collections/cmip/cmip5/output/CCCma/Can...,pr,Amon,CanCM4,historical,r10i1p1,196101-200512,mon,atmos,v20130331
3,/glade/collections/cmip/cmip5/output/CCCma/Can...,hfls,Amon,CanCM4,historical,r1i1p1,196101-200512,mon,atmos,v20130331
4,/glade/collections/cmip/cmip5/output/CCCma/Can...,tas,Amon,CanCM4,historical,r1i1p1,196101-200512,mon,atmos,v20130331


In [29]:
len(df)

629942

In [30]:
# Some entries are invalid
invalids = df[~df.activity_id.isin(activity_ids)]
#df = df[df.activity_id.isin(activity_ids)]

AttributeError: 'DataFrame' object has no attribute 'activity_id'

In [15]:
invalids

NameError: name 'invalids' is not defined

In [18]:
len(df)

609904

## Separate Decadal Predictions from the rest of activities

- Decadal prediction catalog requires additional columns (`start_year`)

In [19]:
# Split the rows into the DCPP activity and everything else, each as an
# independent copy with a fresh 0..n-1 index.
# NOTE(review): relies on an 'activity_id' column that get_attrs does not
# produce -- confirm this cell still applies to this notebook.
dcpp = df[df.activity_id=="DCPP"].copy().reset_index(drop=True)
rest = df[~(df.activity_id=="DCPP")].copy().reset_index(drop=True)

In [20]:
# Select the catalog columns in their final order.
columns = ["activity_id", "institution_id", "source_id", "experiment_id", "member_id", "table_id", "variable_id",
           "grid_label", "version", "time_range", "path"]
rest = rest[columns]
rest.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,version,time_range,path
0,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,hfls,gn,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
1,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,va,gn,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
2,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,tas,gn,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
3,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,rsds,gn,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...
4,AerChemMIP,BCC,BCC-ESM1,ssp370,r2i1p1f1,Amon,pr,gn,v20190624,201501-205512,/glade/collections/cmip/CMIP6/AerChemMIP/BCC/B...


In [21]:
# Write the non-DCPP catalog as a gzip-compressed CSV without the index column.
# NOTE(review): the file name says cmip6 while the listings in this notebook
# come from the cmip5 root -- confirm the intended output name.
rest.to_csv("../catalogs/glade-cmip6.csv.gz", compression="gzip", index=False)

In [22]:
dcpp.head()

Unnamed: 0,variable_id,table_id,source_id,experiment_id,member_id,grid_label,time_range,activity_id,institution_id,version,path
0,tas,day,CanESM5,dcppA-hindcast,s2015-r7i1p2f1,gn,20160101-20251231,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
1,tas,day,CanESM5,dcppA-hindcast,s1977-r2i1p2f1,gn,19780101-19871231,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
2,tas,Amon,CanESM5,dcppA-hindcast,s1977-r2i1p2f1,gn,197801-198712,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
3,tas,day,CanESM5,dcppA-hindcast,s1975-r8i1p2f1,gn,19760101-19851231,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
4,tas,Amon,CanESM5,dcppA-hindcast,s1975-r8i1p2f1,gn,197601-198512,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...


In [23]:
# Note: For 'dcppA-assim' experiment_id, there's no start year
dcpp["start_year"] = dcpp.member_id.map(lambda x: float(x.split("-")[0][1:] if x.startswith("s") else np.nan))
dcpp["member_id"] = dcpp["member_id"].map(lambda x: x.split("-")[-1] if x.startswith("s") else x)
dcpp.head()

Unnamed: 0,variable_id,table_id,source_id,experiment_id,member_id,grid_label,time_range,activity_id,institution_id,version,path,start_year
0,tas,day,CanESM5,dcppA-hindcast,r7i1p2f1,gn,20160101-20251231,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...,2015.0
1,tas,day,CanESM5,dcppA-hindcast,r2i1p2f1,gn,19780101-19871231,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...,1977.0
2,tas,Amon,CanESM5,dcppA-hindcast,r2i1p2f1,gn,197801-198712,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...,1977.0
3,tas,day,CanESM5,dcppA-hindcast,r8i1p2f1,gn,19760101-19851231,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...,1975.0
4,tas,Amon,CanESM5,dcppA-hindcast,r8i1p2f1,gn,197601-198512,DCPP,CCCma,v20190429,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...,1975.0


In [24]:
# Same column order as the `rest` catalog, with start_year inserted after member_id.
columns = ["activity_id", "institution_id", "source_id", "experiment_id", "member_id", "start_year", "table_id", "variable_id",
           "grid_label", "version", "time_range", "path"]
dcpp = dcpp[columns]
dcpp.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,start_year,table_id,variable_id,grid_label,version,time_range,path
0,DCPP,CCCma,CanESM5,dcppA-hindcast,r7i1p2f1,2015.0,day,tas,gn,v20190429,20160101-20251231,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
1,DCPP,CCCma,CanESM5,dcppA-hindcast,r2i1p2f1,1977.0,day,tas,gn,v20190429,19780101-19871231,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
2,DCPP,CCCma,CanESM5,dcppA-hindcast,r2i1p2f1,1977.0,Amon,tas,gn,v20190429,197801-198712,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
3,DCPP,CCCma,CanESM5,dcppA-hindcast,r8i1p2f1,1975.0,day,tas,gn,v20190429,19760101-19851231,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...
4,DCPP,CCCma,CanESM5,dcppA-hindcast,r8i1p2f1,1975.0,Amon,tas,gn,v20190429,197601-198512,/glade/collections/cmip/CMIP6/DCPP/CCCma/CanES...


In [25]:
# Write the DCPP catalog as a gzip-compressed CSV without the index column.
dcpp.to_csv("../catalogs/glade-cmip6-dcpp.csv.gz", compression="gzip", index=False)