# GLADE - CMIP5 Database

This notebook demonstrates how to generate a CMIP5 database using `intake_cmip` for the CMIP5 datasets on NCAR's GLADE file system.

In [1]:
# Import Packages
import os 
from dask.distributed import Client
from dask import delayed
import dask
from pathlib import Path
import re
import pandas as pd
from dask_jobqueue import SLURMCluster
# Account/project code handed to the job-queue cluster created below.
# Read from the PBS_ACCOUNT environment variable; raises KeyError if it is unset.
# NOTE(review): the cluster below is SLURM-based — confirm PBS_ACCOUNT is
# exported in that environment.
PROJECT = os.environ["PBS_ACCOUNT"]

In [2]:
# Point the dashboard link template at the local proxy on port 8877.
dask.config.set(
    {"distributed.dashboard.link": "http://localhost:8877/proxy/{port}/status"}
)

<dask.config.set at 0x2ac7932aae10>

In [3]:
# Start a Dask cluster whose workers are submitted as SLURM jobs (dask-jobqueue).
cluster = SLURMCluster(
    project=PROJECT,
    processes=6,
    cores=12,
    memory="2GB",
    # Shell lines for the job environment: UTF-8 locale, empty LD_LIBRARY_PATH.
    env_extra=[
        'export LANG="en_US.utf8"',
        'export LANGUAGE="en_US.utf8"',
        'export LC_ALL="en_US.utf8"',
        'export LD_LIBRARY_PATH=""',
    ],
)

In [4]:
# Let the cluster scale adaptively between the given minimum and maximum.
# NOTE(review): 72/108 are presumably worker counts (the client summary in this
# notebook reports 102 workers) — confirm the unit for this dask-jobqueue version.
cluster.adapt(minimum=72, maximum=108)

<distributed.deploy.adaptive.Adaptive at 0x2ac7baa9feb8>

In [5]:
# List the current user's jobs in the SLURM queue to check on the worker jobs.
!squeue -u $USER

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
           1981391       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981392       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981393       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981394       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981395       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981396       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981397       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981398       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981399       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981400       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981401       dav dask-wor  abanihi PD       0:00      1 (Priority)
           1981402       dav dask-wor  abanihi

In [6]:
# Connect a Dask distributed client to the job-queue cluster.
client = Client(cluster)

In [7]:
# Display the client summary (scheduler address, dashboard link, cluster resources).
client

0,1
Client  Scheduler: tcp://10.12.205.200:46459  Dashboard: http://localhost:8877/proxy/8787/status,Cluster  Workers: 102  Cores: 204  Memory: 34.00 GB


In [8]:
import intake_cmip
from intake_cmip.database import create_cmip5_database

In [9]:
# Root directory of the CMIP5 datasets on GLADE.
cmip5_root = "/glade/collections/cmip/cmip5"

In [10]:
# Build the CMIP5 database as a DataFrame and time the call.
# In the recorded run the result was also persisted to
# ~/.intake_cmip/cmip5.csv and the call took about 8 minutes of wall time.
%time df = create_cmip5_database(root_dir=cmip5_root)

**** Persisting CMIP5 database: /glade/u/home/abanihi/.intake_cmip/cmip5.csv ****
CPU times: user 1min 11s, sys: 4.16 s, total: 1min 15s
Wall time: 8min 2s


In [11]:
# Preview the first rows of the generated database.
df.head()

Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname,version
0,r2i1p1,rcp85,ua_Amon_CanESM2_rcp85_r2i1p1_200601-210012.nc,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,mon,CCCma,CanESM2,atmos,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,ua,v0
1,r5i1p1,rcp85,ua_Amon_CanESM2_rcp85_r5i1p1_200601-210012.nc,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,mon,CCCma,CanESM2,atmos,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,ua,v0
2,r3i1p1,historical,ua_Amon_GFDL-CM3_historical_r3i1p1_200501-2005...,/glade/collections/cmip/cmip5/output1/NOAA-GFD...,mon,NOAA-GFDL,GFDL-CM3,atmos,/glade/collections/cmip/cmip5/output1/NOAA-GFD...,ua,v0
3,r4i1p1,rcp85,ua_Amon_CanESM2_rcp85_r4i1p1_200601-210012.nc,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,mon,CCCma,CanESM2,atmos,/glade/collections/cmip/cmip5/output1/CCCma/Ca...,ua,v0
4,r1i1p1,historical,ua_Amon_GFDL-CM3_historical_r1i1p1_200501-2005...,/glade/collections/cmip/cmip5/output1/NOAA-GFD...,mon,NOAA-GFDL,GFDL-CM3,atmos,/glade/collections/cmip/cmip5/output1/NOAA-GFD...,ua,v0


In [12]:
# Count the distinct climate models present in the database.
df["model"].nunique()

55

In [13]:
# List the distinct realms present in the database.
df["realm"].unique()

array(['atmos', 'seaIce', 'aerosol', 'landIce', 'land', 'ocean',
       'ocnBgchem'], dtype=object)

In [14]:
# Count the distinct variable names present in the database.
df["varname"].nunique()

454

In [15]:
# Preview the last rows of the generated database.
df.tail()

Unnamed: 0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname,version
87560,r1i1p1,rcp85,tasmax_day_ACCESS1-3_rcp85_r1i1p1_20810101-210...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-3,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,tasmax,v4
87561,r1i1p1,rcp85,tas_day_ACCESS1-3_rcp85_r1i1p1_20810101-210012...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-3,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,tas,v4
87562,r1i1p1,historical,ua_day_ACCESS1-0_historical_r1i1p1_20050101-20...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-0,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,ua,v4
87563,r1i1p1,rcp85,va_day_ACCESS1-3_rcp85_r1i1p1_20960101-2100123...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-3,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,va,v4
87564,r1i1p1,historical,va_day_ACCESS1-0_historical_r1i1p1_20050101-20...,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,day,CSIRO-BOM,ACCESS1-0,atmos,/glade/collections/cmip/cmip5/output1/CSIRO-BO...,va,v4


In [16]:
# Total number of rows (one per file) in the generated CMIP5 database.
%time len(df)

CPU times: user 23 µs, sys: 2 µs, total: 25 µs
Wall time: 35.8 µs


87565

In [17]:
# For each model, count the distinct values found in every column.
df.groupby('model').nunique()

Unnamed: 0_level_0,ensemble,experiment,file_basename,file_fullpath,frequency,institution,model,realm,root,varname,version
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ACCESS1-0,3,3,190,190,4,1,1,4,190,59,11
ACCESS1-3,3,6,213,213,4,1,1,3,213,53,17
ACCESS1.0,2,3,3,3,1,1,1,1,3,1,1
BNU-ESM,1,6,75,75,4,1,1,6,75,41,3
CCSM4,87,44,55336,55336,5,1,1,5,55336,190,95
CESM1-BGC,15,12,3804,3804,4,1,1,6,3804,219,22
CESM1-CAM5,37,16,7075,7075,5,1,1,6,7075,181,52
CESM1-FASTCHEM,5,2,633,633,3,1,1,5,633,152,6
CESM1-WACCM,12,5,3214,3214,3,1,1,5,3214,156,19
CMCC-CESM,1,2,75,75,2,1,1,5,75,46,6


In [18]:
# Load the watermark extension, which provides the %watermark magic.
%load_ext watermark

In [27]:
# Record the environment for reproducibility: versions of imported packages,
# git hash, host name, machine info, Python/IPython versions and the date of
# the last update.
# NOTE(review): watermark documents this flag as `--iversions`; the shortened
# `--iversion` presumably works through argparse prefix matching — confirm.
%watermark --iversion -g -h -m -v -u -d

dask        0.20.0
re          2.2.1
pandas      0.23.4
intake_cmip  0+untagged.77.gf8d74ed.dirty
last updated: 2018-12-26 

CPython 3.6.6
IPython 7.0.1

compiler   : GCC 4.8.2 20140120 (Red Hat 4.8.2-15)
system     : Linux
release    : 3.10.0-693.21.1.el7.x86_64
machine    : x86_64
processor  : x86_64
CPU cores  : 72
interpreter: 64bit
host name  : casper01
Git hash   : f8d74ed4e3ee566582ed98a542584338b04e1d4b
