In [1]:
from dask.distributed import Client
import multiprocessing
# Size the local dask cluster from the number of CPUs reported for this host.
ncpu = multiprocessing.cpu_count()
threads = 6  # threads given to each worker
# Floor division yields 0 on hosts with fewer CPUs than `threads`;
# always request at least one worker.
nworker = max(1, ncpu // threads)
print(ncpu, threads, nworker)

48 6 8


In [2]:
# Start a local dask cluster inside this process (threads only, no worker
# processes), sized with the worker/thread counts computed above.
client = Client(
    n_workers=nworker,
    threads_per_worker=threads,
    processes=False,
    memory_limit='256GB',
)
client

0,1
Client  Scheduler: tcp://127.0.0.1:33165  Dashboard: http://localhost:8888/proxy/8787/status,Cluster  Workers: 8  Cores: 48  Memory: 515.40 GB


In [1]:
import pandas as pd
import fnmatch
import dask.dataframe as dd
from intake.source.utils import reverse_format
import os
import re
import subprocess
from tqdm.auto import tqdm
from pathlib import Path
import shutil
import numpy as np

## Create text file containing all files available

In [2]:
def get_file_list(persist_path, root="/work/mh1007/CMOR/MPI-GE"):
    """Write one text listing of ``*.nc`` files per sub-directory of ``root``.

    For every directory directly below ``root`` the external ``find`` command
    (following symlinks, ``-L``) is run and its output is stored in
    ``<persist_path>/<directory name>.txt``, one path per line.

    Parameters
    ----------
    persist_path : str or Path
        Directory that receives the listing files; created if missing.
    root : str or Path, optional
        Directory whose immediate sub-directories are scanned. Defaults to the
        location that was previously hard-coded.
    """
    root = Path(root)
    p_path = Path(persist_path)
    p_path.mkdir(parents=True, exist_ok=True)
    dirs = [x for x in root.iterdir() if x.is_dir()]
    for directory in tqdm(dirs):
        print(directory)
        cmd = ["find", "-L", directory.as_posix(), "-name", "*.nc"]
        # `.name` (not `.stem`) keeps directory names that contain dots intact.
        # The context manager guarantees the listing is flushed and closed.
        with open(p_path / f"{directory.name}.txt", "w") as listing:
            # subprocess.run drains the stderr pipe while waiting, so a chatty
            # `find` (e.g. many "Permission denied" lines) cannot block forever.
            result = subprocess.run(cmd, stdout=listing, stderr=subprocess.PIPE)
        if result.returncode != 0:
            # Best effort: keep whatever was listed, but do not hide the failure.
            print(f"find exited with status {result.returncode} for {directory}")
        

In [3]:
# Directory (relative to the notebook) that receives one "<name>.txt" listing
# for each directory scanned by get_file_list.
persist_path = "./MPIGE_filelist"
get_file_list(persist_path)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

/work/mh1007/CMOR/MPI-GE/output1




## Extract attributes of a file using information from CMIP5 DRS.



Reference:
- CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27

Directory:
```
  <activity>/
    <product>/
        <institute>/
            <model>/
                <experiment>/
                    <frequency>/
                        <modeling realm>/
                            <MIP table>/
                                <ensemble member>/
                                    <version number>/
                                        <variable name>/
                                            <CMOR filename>.nc
```
                                                
CMOR filename: `<variable name>_<MIP table>_<model>_<experiment>_<ensemble member>[_<temporal subset>][_<geographical info>].nc`

In [4]:
# Product names are the base names (without ".txt") of the listing files
# found below persist_path.
products = [listing.stem for listing in Path(persist_path).rglob("*.txt")]
products

['output1']

In [5]:
# Read every listing file into one table and materialise it as an in-memory
# pandas DataFrame right away.
# NOTE(review): assumes no path contains a comma, otherwise read_csv would
# split it into several columns - confirm.
df = dd.read_csv(f"{persist_path}/*.txt", header=None).compute()
df.columns = ["path"]
df.head()

Unnamed: 0,path
0,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
1,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
2,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
3,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
4,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...


Number of files found as of 06/11/2019: 70124

In [7]:
len(df)

70124

In [8]:
def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):
    """
    Recover the template fields of a file name with intake's ``reverse_format``.

    Given format_string and resolved_string, ``reverse_format`` finds the arguments
    that would give format_string.format(arguments) == resolved_string.

    Parameters
    ----------
    file_basename : str
        File name (without directory) to parse.
    filename_template : str, optional
        Template tried first.
    gridspec_template : str, optional
        Fallback template, tried when ``filename_template`` raises ``ValueError``.

    Returns
    -------
    dict
        The parsed fields, or an empty dict when the fallback template fails
        as well (a message is printed in that case).
    """
    try:
        return reverse_format(filename_template, file_basename)
    except ValueError:
        try:
            return reverse_format(gridspec_template, file_basename)
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop a long-running scan.
        except Exception:
            print(
                f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'
            )
            return {}
            
def _extract_attr_with_regex(input_str, regex, strip_chars=None):
    pattern = re.compile(regex, re.IGNORECASE)
    match = re.findall(pattern, input_str)
    if match:
        match = max(match, key=len)
        if strip_chars:
            match = match.strip(strip_chars)

        else:
            match = match.strip()

        return match

    else:
        return None
    

exclude_patterns = ['*/files/*', '*/latest/*']
def _filter_func(path):
    return not any(
        fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns
    )



In [9]:
%%time
# Keep only the paths that match none of the exclude patterns.
files = df.path.tolist()
filelist = [path for path in files if _filter_func(path)]

CPU times: user 323 ms, sys: 1 ms, total: 324 ms
Wall time: 319 ms


In [10]:
len(filelist)

70124

In [11]:

def get_attrs(filepath):
    """Extract attributes of a file using information from CMIP5 DRS.

    The file name is parsed with ``_reverse_filename_format`` (variable, MIP
    table, model, experiment, ensemble member and, when present, temporal
    subset). Frequency, modeling realm and version are searched for in the
    full path. Institute and product are taken from the directory components
    that precede the parsed experiment name.

    Parameters
    ----------
    filepath : str
        Full path of the file.

    Returns
    -------
    dict
        Always contains 'path', 'frequency', 'modeling_realm' (both ``None``
        when not found) and 'version' ('v0' when no version string is found),
        plus whatever fields could be parsed from the file name. 'institute'
        and 'product_id' are only present when the directory could be split on
        the parsed experiment name; otherwise the partial dict is printed and
        returned as is.

    Notes
    -----
    Reference:
    - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27
    """

    fileparts = {}

    freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'
    # Alternation picks the first alternative that matches, so 'landIce' has to
    # come before its prefix 'land' or it could never be returned.
    realm_regex = r'aerosol|atmos|landIce|land|ocean|ocnBgchem|seaIce'
    version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'

    file_basename = os.path.basename(filepath)
    fileparts['path'] = filepath

    filename_template = (
        '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'
    )
    # Fallback for file names that carry no temporal subset.
    gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'
    f = _reverse_filename_format(
        file_basename, filename_template=filename_template, gridspec_template=gridspec_template
    )
    fileparts.update(f)

    frequency = _extract_attr_with_regex(
        filepath, regex=freq_regex, strip_chars='/'
    )
    realm = _extract_attr_with_regex(filepath, regex=realm_regex)
    version = _extract_attr_with_regex(filepath, regex=version_regex) or 'v0'
    fileparts['frequency'] = frequency
    fileparts['modeling_realm'] = realm
    fileparts['version'] = version
    try:
        # The directory must contain the experiment name exactly once; the
        # components before it end with .../<product>/<institute>/<model>/.
        part1, part2 = os.path.dirname(filepath).split(fileparts['experiment'])
        part1 = part1.strip("/").split("/")
        fileparts['institute'] = part1[-2]
        fileparts['product_id'] = part1[-3]
    except Exception:
        # Best effort: report the partially parsed entry and return it without
        # 'institute'/'product_id' so the caller can filter it out.
        print(fileparts)


    return fileparts

In [12]:
get_attrs(filelist[0])

{'path': '/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM/piControl/mon/atmos/prsn/r001i1850p3/v20190123/prsn_Amon_MPI-ESM_piControl_r001i1850p3_310001-319912.nc',
 'variable': 'prsn',
 'mip_table': 'Amon',
 'model': 'MPI-ESM',
 'experiment': 'piControl',
 'ensemble_member': 'r001i1850p3',
 'temporal_subset': '310001-319912',
 'frequency': 'mon',
 'modeling_realm': 'atmos',
 'version': 'v20190123',
 'institute': 'MPI-M',
 'product_id': 'output1'}

In [13]:
%%time
# Parse the attributes of every remaining file, one dict per path.
entries = [get_attrs(path) for path in filelist]

{'path': '/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM/rcp85/mon/ocnBgchem/prelim/rcp85_036/CMIP5/output/MPI-M/MPI-ESM/rcp85/mon/ocnBgchem/ph/r036i2005p3/ph_Omon_MPI-ESM_rr036i2005p3_200601-209912.nc', 'variable': 'ph', 'mip_table': 'Omon', 'model': 'MPI-ESM', 'experiment': 'rr036i2005p3', 'ensemble_member': '200601-209912', 'frequency': 'mon', 'modeling_realm': 'ocnBgchem', 'version': 'v0'}
{'path': '/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM/rcp85/mon/ocnBgchem/prelim/rcp85_036/CMIP5/output/MPI-M/MPI-ESM/rcp85/mon/ocnBgchem/o2/r036i2005p3/o2_Omon_MPI-ESM_rr036i2005p3_200601-209912.nc', 'variable': 'o2', 'mip_table': 'Omon', 'model': 'MPI-ESM', 'experiment': 'rr036i2005p3', 'ensemble_member': '200601-209912', 'frequency': 'mon', 'modeling_realm': 'ocnBgchem', 'version': 'v0'}
{'path': '/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM/rcp85/mon/ocnBgchem/prelim/rcp85_036/CMIP5/output/MPI-M/MPI-ESM/rcp85/mon/ocnBgchem/dcalc/r036i2005p3/dcalc_Omon_MPI-ESM_rr036i2005p3_200601-209912.n

In [14]:
entries[10]

{'path': '/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM/piControl/mon/atmos/prsn/r001i1850p3/v20190123/prsn_Amon_MPI-ESM_piControl_r001i1850p3_340001-349912.nc',
 'variable': 'prsn',
 'mip_table': 'Amon',
 'model': 'MPI-ESM',
 'experiment': 'piControl',
 'ensemble_member': 'r001i1850p3',
 'temporal_subset': '340001-349912',
 'frequency': 'mon',
 'modeling_realm': 'atmos',
 'version': 'v20190123',
 'institute': 'MPI-M',
 'product_id': 'output1'}

In [15]:
len(entries)

70124

In [16]:
# One row per parsed file; if a path occurs more than once only its last
# entry is kept, and the index is renumbered from 0.
df = (
    pd.DataFrame(entries)
    .drop_duplicates(subset=['path'], keep='last')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id
0,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,310001-319912,mon,atmos,v20190123,MPI-M,output1
1,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,380001-385012,mon,atmos,v20190123,MPI-M,output1
2,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,230001-239912,mon,atmos,v20190123,MPI-M,output1
3,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,280001-289912,mon,atmos,v20190123,MPI-M,output1
4,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,190001-199912,mon,atmos,v20190123,MPI-M,output1


In [17]:
len(df)

70124

In [19]:
# Some entries are invalid
invalids = df[~df.product_id.isin(products)]
invalids
#df = df[df.activity_id.isin(activity_ids)]

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id
14170,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,ph,Omon,MPI-ESM,rr036i2005p3,200601-209912,,mon,ocnBgchem,v0,,
14171,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,o2,Omon,MPI-ESM,rr036i2005p3,200601-209912,,mon,ocnBgchem,v0,,
14172,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,dcalc,Omon,MPI-ESM,rr036i2005p3,200601-209912,,mon,ocnBgchem,v0,,
14173,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,bfe,Omon,MPI-ESM,rr036i2005p3,200601-209912,,mon,ocnBgchem,v0,,
14174,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,co3,Omon,MPI-ESM,rr036i2005p3,200601-209912,,mon,ocnBgchem,v0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
38350,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,fgo2,Omon,MPI-ESM,1pcr029i1850p3,185001-200512,,mon,ocnBgchem,v0,,
38352,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,fgo2,Omon,MPI-ESM,1pcr016i1850p3,185001-200512,,mon,ocnBgchem,v0,,
38354,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,fgo2,Omon,MPI-ESM,1pcr090i1850p3,185001-200512,,mon,ocnBgchem,v0,,
38356,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,fgo2,Omon,MPI-ESM,1pcr009i1850p3,185001-200512,,mon,ocnBgchem,v0,,


In [20]:
# Keep only the rows whose product_id is one of the scanned products.
df = df[df.product_id.isin(products)]
len(df)

67788

In [21]:
df.ensemble_member.unique()

array(['r001i1850p3', 'r097i2005p3', 'r046i2005p3', 'r001i2005p3',
       'r086i2005p3', 'r002i2005p3', 'r015i2005p3', 'r024i2005p3',
       'r018i2005p3', 'r083i2005p3', 'r032i2005p3', 'r007i2005p3',
       'r013i2005p3', 'r019i2005p3', 'r068i2005p3', 'r082i2005p3',
       'r016i2005p3', 'r056i2005p3', 'r031i2005p3', 'r064i2005p3',
       'r017i2005p3', 'r069i2005p3', 'r045i2005p3', 'r070i2005p3',
       'r072i2005p3', 'r099i2005p3', 'r067i2005p3', 'r043i2005p3',
       'r081i2005p3', 'r094i2005p3', 'r051i2005p3', 'r057i2005p3',
       'r063i2005p3', 'r009i2005p3', 'r095i2005p3', 'r042i2005p3',
       'r030i2005p3', 'r079i2005p3', 'r050i2005p3', 'r073i2005p3',
       'r025i2005p3', 'r084i2005p3', 'r036i2005p3', 'r003i2005p3',
       'r040i2005p3', 'r089i2005p3', 'r066i2005p3', 'r028i2005p3',
       'r062i2005p3', 'r090i2005p3', 'r078i2005p3', 'r011i2005p3',
       'r100i2005p3', 'r087i2005p3', 'r044i2005p3', 'r047i2005p3',
       'r076i2005p3', 'r098i2005p3', 'r065i2005p3', 'r055i2005

## Pick the latest versions only

In [22]:
# Group on every attribute except the file path and the version, so that each
# group holds the different versions of one logical file.
# The key list follows the column order instead of a set difference, whose
# ordering is not stable from one interpreter session to the next.
grpby = [column for column in df.columns if column not in ('path', 'version')]
groups = df.groupby(grpby)

In [23]:
%%time
# Collect the index labels of every row that is not the highest-sorting
# version within its group.
idx_to_remove = []
for _, group in groups:
    if group.version.nunique() <= 1:
        continue
    newest_first = group.sort_values(by=['version'], ascending=False)
    idx_to_remove.extend(newest_first.index[1:].values.tolist())

CPU times: user 27.2 s, sys: 280 ms, total: 27.5 s
Wall time: 27.1 s


In [24]:
len(idx_to_remove)

4

In [25]:
len(df)

67788

In [26]:
# Keep an untouched copy of the frame, then drop the superseded versions
# collected in idx_to_remove.
df1 = df.copy()
df = df.drop(index=idx_to_remove)
len(df)

67784

In [27]:
df.head()

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id
0,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,310001-319912,mon,atmos,v20190123,MPI-M,output1
1,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,380001-385012,mon,atmos,v20190123,MPI-M,output1
2,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,230001-239912,mon,atmos,v20190123,MPI-M,output1
3,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,280001-289912,mon,atmos,v20190123,MPI-M,output1
4,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...,prsn,Amon,MPI-ESM,piControl,r001i1850p3,190001-199912,mon,atmos,v20190123,MPI-M,output1


In [28]:
df.columns.shape

(12,)

In [29]:
# Re-arange columns
columns = ["product_id", "institute", "model", "experiment", "frequency", 
           "modeling_realm", "mip_table", "ensemble_member", "variable", 
           "temporal_subset", "version", "path"]
df = df[columns]
df.head()

Unnamed: 0,product_id,institute,model,experiment,frequency,modeling_realm,mip_table,ensemble_member,variable,temporal_subset,version,path
0,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,r001i1850p3,prsn,310001-319912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
1,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,r001i1850p3,prsn,380001-385012,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
2,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,r001i1850p3,prsn,230001-239912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
3,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,r001i1850p3,prsn,280001-289912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
4,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,r001i1850p3,prsn,190001-199912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...


#### Convert ensemble member IDs to integer realization numbers

In [None]:
# backup
df2=df.copy()
#df = df2

In [41]:
# Reduce ensemble member ids such as 'r001i1850p3' to the integer realization
# number (1). The pattern accepts any 'i<digits>p<digits>' suffix and also
# plain numbers, so re-running this cell on the converted column is harmless.
realization = (
    df['ensemble_member']
    .astype(str)
    .str.extract(r'^r?(\d+)(?:i\d+p\d+)?$', expand=False)
)
# Values that do not match become NaN and make astype('int') raise, so
# unexpected ids are not silently accepted.
# assign() builds a new frame instead of writing into a possibly sliced one.
df = df.assign(ensemble_member=realization.astype('int'))

In [42]:
df

Unnamed: 0,product_id,institute,model,experiment,frequency,modeling_realm,mip_table,ensemble_member,variable,temporal_subset,version,path
0,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,1,prsn,310001-319912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
1,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,1,prsn,380001-385012,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
2,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,1,prsn,230001-239912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
3,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,1,prsn,280001-289912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
4,output1,MPI-M,MPI-ESM,piControl,mon,atmos,Amon,1,prsn,190001-199912,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
...,...,...,...,...,...,...,...,...,...,...,...,...
70119,output1,MPI-M,MPI-ESM,rcp26,yr,ocnBgchem,Oyr,93,po4,2006-2099,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
70120,output1,MPI-M,MPI-ESM,rcp26,yr,ocnBgchem,Oyr,29,po4,2006-2099,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
70121,output1,MPI-M,MPI-ESM,rcp26,yr,ocnBgchem,Oyr,60,po4,2006-2099,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...
70122,output1,MPI-M,MPI-ESM,rcp26,yr,ocnBgchem,Oyr,20,po4,2006-2099,v20190123,/work/mh1007/CMOR/MPI-GE/output1/MPI-M/MPI-ESM...


In [44]:
df.experiment.unique()

array(['piControl', 'rcp85', 'historical', '1pctCO2', 'rcp45', 'rcp26'],
      dtype=object)

In [45]:
df.mip_table.unique()

array(['Amon', 'Omon', 'Oyr', 'Lmon', 'OImon'], dtype=object)

In [43]:
df.variable.unique()

array(['prsn', 'rsus', 'vas', 'wap', 'uas', 'hfls', 'rlus', 'sfcWind',
       'rlut', 'rlutcs', 'zg', 'clt', 'ta', 'rsdt', 'va', 'ua', 'hfss',
       'ps', 'prc', 'pr', 'rsut', 'hus', 'ts', 'tas', 'psl', 'tauu',
       'evspsbl', 'tauv', 'clivi', 'clwvi', 'prw', 'wmo', 'umo', 'vmo',
       'so', 'thetao', 'uo', 'vo', 'rhopoto', 'dpco2', 'spco2', 'intpp',
       'fgco2', 'fgo2', 'dissic', 'o2', 'co3', 'no3', 'talk', 'si', 'po4',
       'cVegLut', 'grassFracC4', 'cLitter', 'grassFrac', 'cropFracC4',
       'et', 'mrsos', 'rsds', 'snm', 'fracLut', 'grassFracC3', 'mrro',
       'cVegGrass', 'rhGrass', 'raTree', 'npp', 'nwdFracLut', 'cSoilTree',
       'vegFrac', 'gppGrass', 'nppGrass', 'ra', 'lai', 'cropFracC3',
       'snc', 'nbp', 'shrubFrac', 'cropFrac', 'gppShrub', 'c4PftFrac',
       'nppTree', 'nppLut', 'pastureFrac', 'snw', 'tran', 'nep', 'cVeg',
       'rh', 'cLitterGrass', 'mrros', 'baresoilFrac', 'gpp', 'cSoilLut',
       'raShrub', 'mrso', 'gppTree', 'rlds', 'tsl', 'cSoilGrass',

In [46]:
# Write the catalog as a gzip-compressed CSV, without the index column.
df.to_csv("../catalogs/mistral-MPI-GE.csv.gz", compression="gzip", index=False)