In [1]:
import pandas as pd
import fnmatch
import dask.dataframe as dd
from intake.source.utils import reverse_format
import os
import re
import subprocess
from tqdm.auto import tqdm
from pathlib import Path
import shutil
import numpy as np

## Create text file containing all files available

In [2]:
def get_file_list(persist_path):
    persist_path = Path(persist_path)
    persist_path.mkdir(exist_ok=True)
    root = Path("/work/ik1017/CMIP6/data/CMIP6")
    dirs = [x for x in root.iterdir() if x.is_dir()]
    for directory in tqdm(dirs):
        print(directory)
        stem = directory.stem
        f = open(f"{persist_path}/{stem}.txt", "w")
        cmd = ["find", "-L", directory.as_posix(), "-name", "*.nc"]
        p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=f)
        p.wait()

In [3]:
persist_path = "./CMIP6_filelist"
get_file_list(persist_path)

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))

/work/ik1017/CMIP6/data/CMIP6/GeoMIP
/work/ik1017/CMIP6/data/CMIP6/HighResMIP
/work/ik1017/CMIP6/data/CMIP6/DAMIP
/work/ik1017/CMIP6/data/CMIP6/C4MIP
/work/ik1017/CMIP6/data/CMIP6/ScenarioMIP
/work/ik1017/CMIP6/data/CMIP6/ISMIP6
/work/ik1017/CMIP6/data/CMIP6/PMIP
/work/ik1017/CMIP6/data/CMIP6/GMMIP
/work/ik1017/CMIP6/data/CMIP6/AerChemMIP
/work/ik1017/CMIP6/data/CMIP6/LS3MIP
/work/ik1017/CMIP6/data/CMIP6/CMIP
/work/ik1017/CMIP6/data/CMIP6/RFMIP
/work/ik1017/CMIP6/data/CMIP6/DCPP
/work/ik1017/CMIP6/data/CMIP6/FAFMIP
/work/ik1017/CMIP6/data/CMIP6/OMIP
/work/ik1017/CMIP6/data/CMIP6/CDRMIP
/work/ik1017/CMIP6/data/CMIP6/PAMIP
/work/ik1017/CMIP6/data/CMIP6/LUMIP
/work/ik1017/CMIP6/data/CMIP6/CFMIP




## Extract attributes of a file using information from CMI6 DRS.


References
 1. CMIP6 DRS: http://goo.gl/v1drZl
 2. Controlled Vocabularies (CVs) for use in CMIP6:
    https://github.com/WCRP-CMIP/CMIP6_CVs
    
    
Directory structure =
```<mip_era>/
    <activity_id>/
        <institution_id>/
            <source_id>/
                <experiment_id>/
                    <member_id>/
                        <table_id>/
                            <variable_id>/
                                <grid_label>/
                                    <version>
```
file name =
```<variable_id>_<table_id>_<source_id>_<experiment_id >_<member_id>_<grid_label>[_<time_range>].nc```
For time-invariant fields, the last segment (time_range) above is omitted.
Example when there is no sub-experiment: `tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc`
Example with a sub-experiment:   `pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc`


In [4]:
activity_ids = list(Path(persist_path).rglob("*.txt"))
activity_ids = [activity_id.stem for activity_id in activity_ids]
activity_ids

['CDRMIP',
 'PAMIP',
 'ISMIP6',
 'ScenarioMIP',
 'DCPP',
 'RFMIP',
 'LS3MIP',
 'FAFMIP',
 'DAMIP',
 'C4MIP',
 'OMIP',
 'LUMIP',
 'AerChemMIP',
 'CMIP',
 'GeoMIP',
 'HighResMIP',
 'PMIP',
 'GMMIP',
 'CFMIP']

In [5]:
df = dd.read_csv(f"{persist_path}/*.txt", header=None).compute()
df.columns = ["path"]
df.head()

Unnamed: 0,path
0,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
1,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
2,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
3,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
4,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...


- 5.11.2019: 1868201

In [6]:
len(df)

1868201

In [7]:
# new
len(df)

1868201

In [8]:
def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):
    """
    Uses intake's ``reverse_format`` utility to reverse the string method format.
    Given format_string and resolved_string, find arguments
    that would give format_string.format(arguments) == resolved_string
    """
    try:
        return reverse_format(filename_template, file_basename)
    except ValueError:
        try:
            return reverse_format(gridspec_template, file_basename)
        except:
            print(
                f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'
            )
            return {}


def _extract_attr_with_regex(input_str, regex, strip_chars=None):
    pattern = re.compile(regex, re.IGNORECASE)
    match = re.findall(pattern, input_str)
    if match:
        match = max(match, key=len)
        if strip_chars:
            match = match.strip(strip_chars)

        else:
            match = match.strip()

        return match

    else:
        return None


exclude_patterns = ['*/files/*', '*/latest/*']


def _filter_func(path):
    return not any(fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns)

In [9]:
%%time
files = df.path.tolist()
filelist = list(filter(_filter_func, files))

CPU times: user 7.45 s, sys: 0 ns, total: 7.45 s
Wall time: 7.44 s


In [10]:
len(filelist)

1868201

In [11]:
def get_attrs(filepath):
    basename = os.path.basename(filepath)
    dirname = os.path.dirname(filepath)
    filename_template = (
        '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'
    )

    gridspec_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'

    f = _reverse_filename_format(
        basename, filename_template=filename_template, gridspec_template=gridspec_template
    )

    fileparts = {}
    fileparts.update(f)
    parent = os.path.dirname(filepath).strip('/')
    parent_split = parent.split(f"/{fileparts['source_id']}/")
    part_1 = parent_split[0].strip('/').split('/')
    grid_label = parent.split(f"/{fileparts['variable_id']}/")[1].strip('/').split('/')[0]
    fileparts['grid_label'] = grid_label
    fileparts['activity_id'] = part_1[-2]
    fileparts['institution_id'] = part_1[-1]
    version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'
    version = _extract_attr_with_regex(parent, regex=version_regex) or 'v0'
    fileparts['version'] = version
    fileparts['path'] = filepath
    return fileparts

In [12]:
%%time
entries = list(map(get_attrs, filelist))

CPU times: user 51.6 s, sys: 1.49 s, total: 53.1 s
Wall time: 53.1 s


In [13]:
entries[0]

{'variable_id': 'mrsos',
 'table_id': 'Lmon',
 'source_id': 'GFDL-ESM4',
 'experiment_id': 'piClim-aer',
 'member_id': 'r1i1p1f1',
 'grid_label': 'gr1',
 'time_range': '000101-003012',
 'activity_id': 'AerChemMIP',
 'institution_id': 'NOAA-GFDL',
 'version': 'v20180701',
 'path': '/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-GFDL/GFDL-ESM4/piClim-aer/r1i1p1f1/Lmon/mrsos/gr1/v20180701/mrsos_Lmon_GFDL-ESM4_piClim-aer_r1i1p1f1_gr1_000101-003012.nc'}

In [14]:
len(entries)

1868201

In [15]:
df1 = pd.DataFrame(entries)
df1.head()

Unnamed: 0,variable_id,table_id,source_id,experiment_id,member_id,grid_label,time_range,activity_id,institution_id,version,path
0,mrsos,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
1,mrso,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
2,mrro,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
3,mrros,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
4,rlus,Amon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...


In [16]:
len(df1)

1868201

In [17]:
# Some entries are invalid
invalids = df1[~df1.activity_id.isin(activity_ids)]
df = df1[df1.activity_id.isin(activity_ids)]
invalids

Unnamed: 0,variable_id,table_id,source_id,experiment_id,member_id,grid_label,time_range,activity_id,institution_id,version,path
14173,orog,fx,piClim-SO2,NorESM2-LM,r1i1p1f1,gn,,NCC,NorESM2-LM,v20190815,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/N...
14174,areacella,fx,piClim-SO2,NorESM2-LM,r1i1p1f1,gn,,NCC,NorESM2-LM,v20190815,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/N...
14175,sftlf,fx,piClim-SO2,NorESM2-LM,r1i1p1f1,gn,,NCC,NorESM2-LM,v20190815,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/N...
14179,areacella,fx,piClim-OC,NorESM2-LM,r1i1p1f1,gn,,NCC,NorESM2-LM,v20190815,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/N...
14180,sftlf,fx,piClim-OC,NorESM2-LM,r1i1p1f1,gn,,NCC,NorESM2-LM,v20190815,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/N...
...,...,...,...,...,...,...,...,...,...,...,...
1432782,uo,Omon,land-hist,GISS-E2-1-G,r4i1p1f1,gn,187101-189012,NASA-GISS,GISS-E2-1-G,v20180914,/work/ik1017/CMIP6/data/CMIP6/LUMIP/NASA-GISS/...
1432783,uo,Omon,land-hist,GISS-E2-1-G,r4i1p1f1,gn,201101-201412,NASA-GISS,GISS-E2-1-G,v20180914,/work/ik1017/CMIP6/data/CMIP6/LUMIP/NASA-GISS/...
1432784,uo,Omon,land-hist,GISS-E2-1-G,r4i1p1f1,gn,193101-195012,NASA-GISS,GISS-E2-1-G,v20180914,/work/ik1017/CMIP6/data/CMIP6/LUMIP/NASA-GISS/...
1461577,areacella,fx,piClim-ghg,NorESM2-LM,r1i1p1f1,gn,,NCC,NorESM2-LM,v20190815,/work/ik1017/CMIP6/data/CMIP6/RFMIP/NCC/NorESM...


In [18]:
invalids.path.tolist()

## Keep latest version

['/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/NorESM2-LM/piClim-SO2/r1i1p1f1/fx/orog/gn/v20190815/orog_fx_piClim-SO2_NorESM2-LM_r1i1p1f1_gn.nc',
 '/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/NorESM2-LM/piClim-SO2/r1i1p1f1/fx/areacella/gn/v20190815/areacella_fx_piClim-SO2_NorESM2-LM_r1i1p1f1_gn.nc',
 '/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/NorESM2-LM/piClim-SO2/r1i1p1f1/fx/sftlf/gn/v20190815/sftlf_fx_piClim-SO2_NorESM2-LM_r1i1p1f1_gn.nc',
 '/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/NorESM2-LM/piClim-OC/r1i1p1f1/fx/areacella/gn/v20190815/areacella_fx_piClim-OC_NorESM2-LM_r1i1p1f1_gn.nc',
 '/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/NorESM2-LM/piClim-OC/r1i1p1f1/fx/sftlf/gn/v20190815/sftlf_fx_piClim-OC_NorESM2-LM_r1i1p1f1_gn.nc',
 '/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/NorESM2-LM/piClim-BC/r1i1p1f1/fx/orog/gn/v20190815/orog_fx_piClim-BC_NorESM2-LM_r1i1p1f1_gn.nc',
 '/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NCC/NorESM2-LM/piClim-BC/r1i1p1f1/fx/areacella/gn/v20190815/are

In [19]:
grpby = list(set(df.columns.tolist()) - {'path', 'version'})
groups = df.groupby(grpby)

In [20]:
%%time
idx_to_remove = []
for _, group in groups:
    if group.version.nunique() > 1:
        idx_to_remove.extend(group.sort_values(by=['version'], ascending=False).index[1:].values.tolist())

CPU times: user 11min 55s, sys: 13.1 s, total: 12min 8s
Wall time: 11min 48s


In [21]:
len(idx_to_remove)

62389

In [22]:
len(df)

1867298

In [23]:
df = df.drop(index=idx_to_remove)
len(df)

1804909

In [24]:
df["dcpp_init_year"] = df.member_id.map(lambda x: float(x.split("-")[0][1:] if x.startswith("s") else np.nan))
df["member_id"] = df["member_id"].map(lambda x: x.split("-")[-1] if x.startswith("s") else x)
df.head()

Unnamed: 0,variable_id,table_id,source_id,experiment_id,member_id,grid_label,time_range,activity_id,institution_id,version,path,dcpp_init_year
0,mrsos,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...,
1,mrso,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...,
2,mrro,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...,
3,mrros,Lmon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...,
4,rlus,Amon,GFDL-ESM4,piClim-aer,r1i1p1f1,gr1,000101-003012,AerChemMIP,NOAA-GFDL,v20180701,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...,


In [25]:
columns = [
    "activity_id",
    "institution_id",
    "source_id",
    "experiment_id",
    "member_id",
    "table_id",
    "variable_id",
    "grid_label",
    "dcpp_init_year",
    "version",
    "time_range",
    "path",
]
df = df[columns]
df.head()

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,dcpp_init_year,version,time_range,path
0,AerChemMIP,NOAA-GFDL,GFDL-ESM4,piClim-aer,r1i1p1f1,Lmon,mrsos,gr1,,v20180701,000101-003012,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
1,AerChemMIP,NOAA-GFDL,GFDL-ESM4,piClim-aer,r1i1p1f1,Lmon,mrso,gr1,,v20180701,000101-003012,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
2,AerChemMIP,NOAA-GFDL,GFDL-ESM4,piClim-aer,r1i1p1f1,Lmon,mrro,gr1,,v20180701,000101-003012,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
3,AerChemMIP,NOAA-GFDL,GFDL-ESM4,piClim-aer,r1i1p1f1,Lmon,mrros,gr1,,v20180701,000101-003012,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...
4,AerChemMIP,NOAA-GFDL,GFDL-ESM4,piClim-aer,r1i1p1f1,Amon,rlus,gr1,,v20180701,000101-003012,/work/ik1017/CMIP6/data/CMIP6/AerChemMIP/NOAA-...


In [26]:
!pwd

/mnt/lustre01/pf/zmaw/m300524/intake-esm-datastore/builders


In [27]:
df.to_csv("../catalogs/mistral-cmip6.csv.gz", compression="gzip", index=False)

In [28]:
df.memory_usage(index=True).sum()

187710536

In [29]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1804909 entries, 0 to 1868200
Data columns (total 12 columns):
activity_id       object
institution_id    object
source_id         object
experiment_id     object
member_id         object
table_id          object
variable_id       object
grid_label        object
dcpp_init_year    float64
version           object
time_range        object
path              object
dtypes: float64(1), object(11)
memory usage: 1.5 GB


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1804909 entries, 0 to 1868200
Data columns (total 12 columns):
activity_id       object
institution_id    object
source_id         object
experiment_id     object
member_id         object
table_id          object
variable_id       object
grid_label        object
dcpp_init_year    float64
version           object
time_range        object
path              object
dtypes: float64(1), object(11)
memory usage: 179.0+ MB
