# Build a database of CESM-LE files

In [1]:
import intake
import intake_esm
import yaml

import sys

import pandas as pd

import os
from subprocess import check_call

import config

Show the directories used to store the database and cached files (set in `.config-intake-cesm.yml`).

In [2]:
# Display the intake-esm options currently in effect; the output lists the
# database directory and the cache directory.
intake_esm.get_options()

{'database_directory': '/glade/work/mclong/intake-collections',
 'cache_directory': '/glade/scratch/mclong/intake-cesm-data'}

### Query the collection using the intake plugin interface

Connect to database

In [3]:
# Keyword arguments handed to the metadata store as `build_args`.
build_kwargs = {
    'collection_input_file': 'intake-collection-input-cesm1_le.yml',
    'collection_type_def_file': 'intake-cesm-definitions.yml',
    'overwrite_existing': False,
    'include_cache_dir': True,
}

# Open the 'cesm1_le' collection; the bare name on the last line displays it.
col = intake.open_cesm_metadatastore('cesm1_le', build_args=build_kwargs)
col

<Intake catalog: cesm1_le>

In [4]:
# Summarise the collection's DataFrame: column names, non-null counts, dtypes
# and memory usage.
col.df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262092 entries, 0 to 262091
Data columns (total 18 columns):
resource            262092 non-null object
resource_type       262092 non-null object
direct_access       262092 non-null bool
experiment          262092 non-null object
case                262092 non-null object
component           262092 non-null object
stream              262092 non-null object
variable            262092 non-null object
date_range          262092 non-null object
ensemble            262092 non-null int64
files               262092 non-null object
files_basename      262092 non-null object
files_dirname       262092 non-null object
ctrl_branch_year    0 non-null float64
year_offset         34112 non-null float64
sequence_order      262092 non-null int64
has_ocean_bgc       262092 non-null bool
grid                52764 non-null object
dtypes: bool(2), float64(2), int64(2), object(12)
memory usage: 34.5+ MB


Determine which ensembles have ocean biogeochemistry variables.

In [5]:
# Experiments to inspect.
experiments = ['20C', 'RCP85']

# Unique ensemble identifiers among entries of those experiments that are
# flagged as having ocean biogeochemistry.
ensembles = (
    col.search(experiment=experiments, has_ocean_bgc=True)
    .results
    .ensemble
    .unique()
    .tolist()
)
print(ensembles)


[1, 2, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 34, 35, 101, 102, 103, 104, 105]


Determine what ocean variables are available at monthly and daily resolution. The POP model is not smart enough to write the same variable to two different streams, so it is not necessary to include `stream` in database queries for POP. We can, however, view all the variables defined in each stream.

In [6]:
# Output frequency -> POP stream name(s); a value may be one name or a list.
stream_def = {
    'mon': 'pop.h',
    'day': ['pop.h.nday1', 'pop.h.ecosys.nday1'],
}

# For each frequency, run one ocean-component search restricted to its
# stream(s) and collect the unique variable names.
all_variables = {
    freq: (
        col.search(component='ocn', stream=streams)
        .results
        .variable
        .unique()
        .tolist()
    )
    for freq, streams in stream_def.items()
}
print(yaml.dump(all_variables))

day: [CaCO3_form_zint, DpCO2_2, ECOSYS_IFRAC_2, ECOSYS_XKW_2, FG_CO2_2, HBLT_2, HMXL_2,
  SSH_2, SST, SST2, STF_O2_2, TAUX_2, TAUY_2, WVEL_50m, XBLT_2, XMXL_2, diatC_zint_100m,
  diatChl_SURF, diazC_zint_100m, diazChl_SURF, photoC_diat_zint, photoC_diaz_zint,
  photoC_sp_zint, spC_zint_100m, spCaCO3_zint_100m, spChl_SURF, zooC_zint_100m]
mon: [ADVS, ADVS_ISOP, ADVS_SUBM, ADVT, ADVT_ISOP, ADVT_SUBM, ALK, AOU, ATM_ALT_CO2,
  ATM_CO2, BSF, CFC11, CFC12, CFC_ATM_PRESS, CFC_IFRAC, CFC_XKW, CO2STAR, CO3, CaCO3_FLUX_IN,
  CaCO3_PROD, CaCO3_form, DCO2STAR, DCO2STAR_ALT_CO2, DENITRIF, DIA_DEPTH, DIA_IMPVF_CFC11,
  DIA_IMPVF_CFC12, DIA_IMPVF_IAGE, DIA_IMPVF_SALT, DIA_IMPVF_TEMP, DIC, DIC_ALT_CO2,
  DOC, DOC_prod, DOC_remin, DOFe, DOFe_prod, DON, DON_prod, DOP, DOP_prod, DpCO2,
  DpCO2_ALT_CO2, ECOSYS_ATM_PRESS, ECOSYS_IFRAC, ECOSYS_XKW, EVAP_F, FG_ALT_CO2, FG_CO2,
  FW, Fe, Fe_scavenge, Fe_scavenge_rate, FvICE_ALK, FvICE_DIC, FvPER_ALK, FvPER_DIC,
  H2CO3, HBLT, HCO3, HDIFB_CFC11, HDIFB_CFC12, H

In [7]:
# Load the `watermark` IPython extension so the %watermark magic is available.
%load_ext watermark

In [8]:
# Record provenance for this run: imported package versions, Python/IPython
# versions, machine details, host name, git hash and the last-updated date.
%watermark --iversion -g -h -m -v -u -d

intake     0.4.1
pandas     0.24.1
yaml       3.13
intake_esm v2019.2.0+87.gbadcef2
last updated: 2019-02-15 

CPython 3.6.7
IPython 7.1.1

compiler   : GCC 7.3.0
system     : Linux
release    : 3.12.62-60.64.8-default
machine    : x86_64
processor  : x86_64
CPU cores  : 72
interpreter: 64bit
host name  : r6i6n30
Git hash   : 8e761764629eb94ca02fda92af1a5ea2524a6208
