In [1]:
import os 
from dask.distributed import Client
from dask import delayed
import dask
from pathlib import Path
import re
import pandas as pd
from dask_jobqueue import SLURMCluster
PROJECT = os.environ["PBS_ACCOUNT"]

In [2]:
dask.config.set({'distributed.dashboard.link':'http://localhost:8877/proxy/{port}/status'})

<dask.config.set at 0x2aab62d4f5f8>

In [3]:
cluster = SLURMCluster(project=PROJECT, processes=6, cores=12, memory="5GB",
                           env_extra=['export LANG="en_US.utf8"',
                                      'export LANGUAGE="en_US.utf8"',
                                      'export LC_ALL="en_US.utf8"',
                                      'export LD_LIBRARY_PATH=""',])

In [4]:
cluster

VBox(children=(HTML(value='<h2>SLURMCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

In [5]:
!squeue -u $USER

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1949765       dav     srun  abanihi  R      13:20      1 pronghorn01
           1949785       dav dask-wor  abanihi  R       0:02      1 casper10
           1949786       dav dask-wor  abanihi  R       0:02      1 casper10
           1949787       dav dask-wor  abanihi  R       0:02      1 casper10
           1949788       dav dask-wor  abanihi  R       0:02      1 casper10
           1949789       dav dask-wor  abanihi  R       0:02      1 casper10
           1949790       dav dask-wor  abanihi  R       0:02      1 casper11
           1949791       dav dask-wor  abanihi  R       0:02      1 casper11
           1949792       dav dask-wor  abanihi  R       0:02      1 casper11
           1949793       dav dask-wor  abanihi  R       0:02      1 casper11
           1949794       dav dask-wor  abanihi  R       0:02      1 casper11
           1949795       dav dask-wor  abanihi  R       0:02     

In [6]:
client = Client(cluster)

In [7]:
client

0,1
Client  Scheduler: tcp://10.12.202.17:36607  Dashboard: http://localhost:8877/proxy/8787/status,Cluster  Workers: 72  Cores: 144  Memory: 60.00 GB


In [8]:
institutions = ['BCC', 'BNU', 'CCCma', 'CMCC', 'CNRM-CERFACS', 'CSIRO-BOM',
                'CSIRO-QCCCE', 'FIO', 'ICHEC', 'INM', 'INPE', 'IPSL',
                'LASG-CESS', 'LASG-IAP', 'MIROC', 'MOHC', 'MPI-M', 'MRI',
                'NASA-GISS', 'NCC', 'NIMR-KMA', 'NOAA-GFDL',
                'NSF-DOE-NCAR', 'UNSW', 'NCAR', ]
realms = ['atmos','land','landIce','ocean','ocnBgchem','seaIce']
frequencies = ['fx','day','mon','yr']
cmip5_root = "/glade/collections/cmip/cmip5"

In [9]:
modeling_groups = [os.path.join(cmip5_root, activity, institution) for institution in institutions for activity in
                   os.listdir(cmip5_root)
                   if os.path.isdir(os.path.join(cmip5_root, activity, institution))]

models = [os.path.join(modeling_group, model) for modeling_group in modeling_groups for model in os.listdir(modeling_group)]

In [10]:
modeling_groups[:2]

['/glade/collections/cmip/cmip5/output1/BCC',
 '/glade/collections/cmip/cmip5/output1/BNU']

In [11]:
models[:2]

['/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1',
 '/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1-m']

In [12]:
experiments = [os.path.join(model, exp) for model in models for exp in os.listdir(model)]

In [13]:
experiments[:2]

['/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1/esmControl',
 '/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1/historical']

In [14]:
freqs = [os.path.join(exp, freq) for exp in experiments for freq in os.listdir(exp)]
freqs[:2]

['/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1/esmControl/mon',
 '/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1/historical/6hr']

In [15]:
realm_dirs = [os.path.join(freq, realm) for freq in freqs for realm in os.listdir(freq)]
realm_dirs[0:2]

['/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1/esmControl/mon/ocean',
 '/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1/esmControl/mon/ocnBgchem']

In [16]:
len(modeling_groups), len(models), len(experiments), len(freqs), len(realm_dirs)

(38, 81, 418, 756, 1448)

In [22]:
def get_entry(directory):
    dir_split = directory.split('/')
    entry = {}
    entry['realm'] = dir_split[-1]
    entry['frequency'] = dir_split[-2]
    entry['experiment'] = dir_split[-3]
    entry['model'] = dir_split[-4]
    entry['institution'] = dir_split[-5]
    return entry

@delayed
def parse_files(directory):
    exclude = set(["files", "latests"]) # directories to exclude
    
    columns = ["ensemble", "experiment", "file_basename", "file_fullpath", 
                  "frequency", "institution", "model", "root", "realm", "varname"]
    df = pd.DataFrame(columns=columns)
    
    entry = get_entry(directory)
    
    for root, dirs, files in os.walk(directory):
        print(root)
        dirs[:] = [d for d in dirs if d not in exclude]
        if not files:
            continue
        sfiles = sorted([f for f in files if os.path.splitext(f)[1] == ".nc"])
        if not sfiles: continue
            
        fs = []
        for f in sfiles:
            try:
                f_split = f.split("_")
                entry['varname'] = f_split[0]
                entry['ensemble'] = f_split[-2]
                entry['root'] = root
                entry['file_basename'] = f
                entry['file_fullpath'] = os.path.join(root, f)
                fs.append(entry)
            except:
                continue
        if fs:
            temp_df = pd.DataFrame(fs)
                
        else:
            temp_df = pd.DataFrame()
            temp_df.columns = df.columns
        df = pd.concat([temp_df, df], ignore_index=True)
    return df           

In [23]:
dfs = [parse_files(directory) for directory in realm_dirs]
len(dfs)

1448

In [24]:
import dask.dataframe as dd

In [25]:
df = dd.from_delayed(dfs)

In [27]:
%time df.head()

CPU times: user 55.6 ms, sys: 7.42 ms, total: 63.1 ms
Wall time: 199 ms


Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname
0,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
1,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
2,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
3,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
4,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao


In [28]:
%time len(df)

CPU times: user 1min 16s, sys: 6.47 s, total: 1min 23s
Wall time: 5min 36s


602260

In [29]:
%time pdf = df.compute()

CPU times: user 1min 7s, sys: 5.66 s, total: 1min 12s
Wall time: 4min 43s


In [30]:
pdf.head()

Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname
0,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
1,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
2,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
3,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao
4,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao


In [31]:
pdf.model.nunique()

55

In [32]:
pdf.realm.unique()

array(['ocean', 'ocnBgchem', 'atmos', 'land', 'seaIce', 'landIce',
       'aerosol'], dtype=object)

In [33]:
pdf.varname.nunique()

454

In [34]:
import re

In [35]:
vYYYYMMDD = r'v\d{4}\d{2}\d{2}'
vN = r'v\d{1}'
v = re.compile( "|".join([vYYYYMMDD, vN])) # Combine both regex into one

In [36]:
pdf["version"] = pdf.root.str.findall(v)

In [37]:
pdf.head()

Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname,version
0,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao,[v20120202]
1,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao,[v20120202]
2,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao,[v20120202]
3,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao,[v20120202]
4,r1i1p1,esmControl,thetao_Omon_bcc-csm1-1_esmControl_r1i1p1_02410...,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,mon,BCC,bcc-csm1-1,ocean,/glade/collections/cmip/cmip5/output1/BCC/bcc-...,thetao,[v20120202]


In [38]:
pdf.root[~pdf.root.str.contains(v)].iloc[0]

'/glade/collections/cmip/cmip5/output1/CCCma/CanESM2/historical/mon/atmos/Amon/r4i1p1'

In [39]:
pdf.root[pdf.root.str.contains(v)].iloc[0]

'/glade/collections/cmip/cmip5/output1/BCC/bcc-csm1-1/esmControl/mon/ocean/Omon/r1i1p1/v20120202/thetao'

In [40]:
pdf.version = pdf.version.apply(lambda x: x[0] if x else 'v0')
pdf.tail()

Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname,version
530,r1i1p1,rcp85,rhopoto_Omon_CCSM4_rcp85_r1i1p1_209001-210012.nc,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,mon,NCAR,CCSM4,ocean,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,rhopoto,v20120205
531,r1i1p1,rcp85,rhopoto_Omon_CCSM4_rcp85_r1i1p1_209001-210012.nc,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,mon,NCAR,CCSM4,ocean,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,rhopoto,v20120205
532,r1i1p1,rcp85,rhopoto_Omon_CCSM4_rcp85_r1i1p1_209001-210012.nc,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,mon,NCAR,CCSM4,ocean,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,rhopoto,v20120205
533,r1i1p1,rcp85,rhopoto_Omon_CCSM4_rcp85_r1i1p1_209001-210012.nc,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,mon,NCAR,CCSM4,ocean,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,rhopoto,v20120205
534,r1i1p1,rcp85,rhopoto_Omon_CCSM4_rcp85_r1i1p1_209001-210012.nc,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,mon,NCAR,CCSM4,ocean,/glade/collections/cmip/cmip5/output2/NCAR/CCS...,rhopoto,v20120205


In [41]:
%time len(pdf)

CPU times: user 17 µs, sys: 1 µs, total: 18 µs
Wall time: 27.7 µs


602260

In [42]:
sorted_df = pdf.sort_values('version').drop_duplicates(subset='file_basename', keep='last')

In [43]:
%time len(sorted_df)

CPU times: user 19 µs, sys: 2 µs, total: 21 µs
Wall time: 30.5 µs


87565

In [44]:
sorted_df.to_csv("/glade/u/home/abanihi/sorted_cmip5_database.csv", index=False)
pdf.to_csv("/glade/u/home/abanihi/unsorted_cmip5_database.csv", index=False)

In [45]:
sorted_df.groupby('model').nunique()

Unnamed: 0_level_0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname,version
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ACCESS1-0,3,3,190,190,4,1,1,4,190,59,11
ACCESS1-3,3,6,213,213,4,1,1,3,213,53,17
ACCESS1.0,2,3,3,3,1,1,1,1,3,1,1
BNU-ESM,1,6,75,75,4,1,1,6,75,41,3
CCSM4,87,44,55336,55336,5,1,1,5,55336,190,95
CESM1-BGC,15,12,3804,3804,4,1,1,6,3804,219,22
CESM1-CAM5,37,16,7075,7075,5,1,1,6,7075,181,52
CESM1-FASTCHEM,5,2,633,633,3,1,1,5,633,152,6
CESM1-WACCM,12,5,3214,3214,3,1,1,5,3214,156,19
CMCC-CESM,1,2,75,75,2,1,1,5,75,46,6
