In [1]:
import pandas as pd
import fnmatch
import dask.dataframe as dd
from intake.source.utils import reverse_format
import os
import re
import subprocess
from tqdm.auto import tqdm
from pathlib import Path
import shutil
import numpy as np

In [2]:
!ls /work/bmx828/miklip-ces/data4miklip/model/global/miklip

baseline0  baseline1  preop  preop-dcpp  prototype


## Create text file containing all files available

In [5]:
def get_file_list(persist_path):
    root = Path("/work/bmx828/miklip-ces/data4miklip/model/global/miklip/")
    p_path = Path(persist_path)
    p_path.mkdir(exist_ok=True)
    dirs = [x for x in root.iterdir() if x.is_dir()]
    for directory in tqdm(dirs):
        print(directory)
        stem = directory.stem
        f = open(f"{persist_path}/{stem}.txt", "w")
        cmd = ["find", "-L", directory.as_posix(), "-name", "*.nc"]
        p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=f)
        p.wait()

In [6]:
persist_path = "./miklip_filelist"
#get_file_list(persist_path)


## Extract attributes of a file using information from CMIP5 DRS.



Reference:
- CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27

Directory:
```
  <activity>/
    <product>/
        <institute>/
            <model>/
                <experiment>/
                    <frequency>/
                        <modeling realm>/
                            <MIP table>/
                                <ensemble member>/
                                    <version number>/
                                        <variable name>/
                                            <CMOR filename>.nc
```
                                                
CMOR filename: `<variable name>_<MIP table>_<model>_<experiment>_ <ensemble member>[_<temporal subset>][_<geographical info>].nc`
"""

In [7]:
generations = list(Path(persist_path).rglob("*.txt"))
generations = [generation.stem for generation in generations]
generations

['prototype',
 'baseline1',
 'baseline0',
 'preop',
 'preop-dcpp',
 'baseline0-checkpoint']

In [8]:
df = dd.read_csv(f"{persist_path}/*.txt", header=None).compute()
df.columns = ["path"]
df.head()

Unnamed: 0,path
0,/work/bmx828/miklip-ces/data4miklip/model/glob...
1,/work/bmx828/miklip-ces/data4miklip/model/glob...
2,/work/bmx828/miklip-ces/data4miklip/model/glob...
3,/work/bmx828/miklip-ces/data4miklip/model/glob...
4,/work/bmx828/miklip-ces/data4miklip/model/glob...


In [9]:
len(df)

1859769

In [11]:
def _reverse_filename_format(file_basename, filename_template=None, gridspec_template=None):
    """
    Uses intake's ``reverse_format`` utility to reverse the string method format.
    Given format_string and resolved_string, find arguments
    that would give format_string.format(arguments) == resolved_string
    """
    try:
        return reverse_format(filename_template, file_basename)
    except ValueError:
        try:
            return reverse_format(gridspec_template, file_basename)
        except:
            print(
                f'Failed to parse file: {file_basename} using patterns: {filename_template} and {gridspec_template}'
            )
            return {}
            
def _extract_attr_with_regex(input_str, regex, strip_chars=None):
    pattern = re.compile(regex, re.IGNORECASE)
    match = re.findall(pattern, input_str)
    if match:
        match = max(match, key=len)
        if strip_chars:
            match = match.strip(strip_chars)

        else:
            match = match.strip()

        return match

    else:
        return None
    

exclude_patterns = ['*/files/*', '*/latest/*']
def _filter_func(path):
    return not any(
        fnmatch.fnmatch(path, pat=exclude_pattern) for exclude_pattern in exclude_patterns
    )



In [12]:
%%time
files = df.path.tolist()
filelist = list(filter(_filter_func, files))

CPU times: user 7.74 s, sys: 9 ms, total: 7.74 s
Wall time: 7.73 s


In [13]:
len(filelist)

1859769

In [16]:
filelist[2]

'/work/bmx828/miklip-ces/data4miklip/model/global/miklip/baseline0/output1/MPI-M/MPI-ESM-LR/decadal2001/day/atmos/day/r2i1p1/v20111122/psl/psl_day_MPI-ESM-LR_decadal2001_r2i1p1_20020101-20111231.nc'

In [22]:
def get_attrs(filepath):
    """ Extract attributes of a file using information from CMIP5 DRS.
    Notes
    -----
    Reference:
    - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27
    """

    fileparts = {}

    freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'
    realm_regex = r'aerosol|atmos|land|landIce|ocean|ocnBgchem|seaIce'
    version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'

    file_basename = os.path.basename(filepath)
    fileparts['path'] = filepath

    filename_template = (
        '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'
    )
    gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'
    f = _reverse_filename_format(
        file_basename, filename_template=filename_template, gridspec_template=gridspec_template
    )
    fileparts.update(f)

    frequency = _extract_attr_with_regex(
        filepath, regex=freq_regex, strip_chars='/'
    )
    realm = _extract_attr_with_regex(filepath, regex=realm_regex)
    version = _extract_attr_with_regex(filepath, regex=version_regex) or 'v0'
    fileparts['frequency'] = frequency
    fileparts['modeling_realm'] = realm
    fileparts['version'] = version
    try:
        part1, part2 = os.path.dirname(filepath).split(fileparts['experiment'])
        part1 = part1.strip("/").split("/")
        #print(part1,part2)
        fileparts['institute'] = part1[-2]
        fileparts['product_id'] = part1[-3]
        fileparts['generation'] = part1[-4]
    except Exception:
        print(fileparts)
    

    return fileparts

In [23]:
get_attrs(filelist[0])

{'path': '/work/bmx828/miklip-ces/data4miklip/model/global/miklip/baseline0/output1/MPI-M/MPI-ESM-LR/decadal2001/day/atmos/day/r2i1p1/v20111122/sfcWind/sfcWind_day_MPI-ESM-LR_decadal2001_r2i1p1_20020101-20111231.nc',
 'variable': 'sfcWind',
 'mip_table': 'day',
 'model': 'MPI-ESM-LR',
 'experiment': 'decadal2001',
 'ensemble_member': 'r2i1p1',
 'temporal_subset': '20020101-20111231',
 'frequency': 'day',
 'modeling_realm': 'atmos',
 'version': 'v20111122',
 'institute': 'MPI-M',
 'product_id': 'output1',
 'generation': 'baseline0'}

In [24]:
%%time
entries = list(map(get_attrs, filelist))

Failed to parse file: test.nc using patterns: {variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc and {variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc
{'path': '/work/bmx828/miklip-ces/data4miklip/model/global/miklip/preop-dcpp/output/MPIM-DWD/MPI-ESM-HR/hicast2006/mon/atmos/zg/r6i1p1/test.nc', 'frequency': 'mon', 'modeling_realm': 'atmos', 'version': 'v0'}
Failed to parse file: evspsbl_6hr_MPI-ESM-HR_dcppA2001_r7i1p1f using patterns: {variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc and {variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc
{'path': '/work/bmx828/miklip-ces/data4miklip/model/global/miklip/preop/output/DWD/MPI-ESM-HR/dcppA2001/6hr/atmos/evspsbl/r7i1p1f1/evspsbl_6hr_MPI-ESM-HR_dcppA2001_r7i1p1f', 'frequency': '6hr', 'modeling_realm': 'atmos', 'version': 'v0'}
CPU times: user 1min 36s, sys: 1.61 s, total: 1min 38s
Wall time: 1min 37s


In [25]:
entries[10]

{'path': '/work/bmx828/miklip-ces/data4miklip/model/global/miklip/baseline0/output1/MPI-M/MPI-ESM-LR/decadal2001/day/atmos/day/r5i1p1/v20111122/tasmin/tasmin_day_MPI-ESM-LR_decadal2001_r5i1p1_20020101-20111231.nc',
 'variable': 'tasmin',
 'mip_table': 'day',
 'model': 'MPI-ESM-LR',
 'experiment': 'decadal2001',
 'ensemble_member': 'r5i1p1',
 'temporal_subset': '20020101-20111231',
 'frequency': 'day',
 'modeling_realm': 'atmos',
 'version': 'v20111122',
 'institute': 'MPI-M',
 'product_id': 'output1',
 'generation': 'baseline0'}

In [26]:
len(entries)

1859769

In [56]:
df = pd.DataFrame(entries)

In [80]:
df[df['variable']=='ps']

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id,generation
93,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,Amon,MPI-ESM-LR,decadal2001,r2i1p1,200201-201112,mon,atmos,v20111122,MPI-M,output1,baseline0
141,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,Amon,MPI-ESM-LR,decadal2001,r2i1p1,200201-201112,mon,atmos,v20120529,MPI-M,output1,baseline0
188,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,Amon,MPI-ESM-LR,decadal2001,r5i1p1,200201-201112,mon,atmos,v20111122,MPI-M,output1,baseline0
236,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,Amon,MPI-ESM-LR,decadal2001,r5i1p1,200201-201112,mon,atmos,v20120529,MPI-M,output1,baseline0
283,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,Amon,MPI-ESM-LR,decadal2001,r1i1p1,200201-201112,mon,atmos,v20111122,MPI-M,output1,baseline0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1856509,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,6hrPlev,MPI-ESM-LR,dffg2e2000,r15i1p1,2009010100-2009123118,6hr,atmos,v0,MPI-M,miklip-extra,prototype
1856510,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,6hrPlev,MPI-ESM-LR,dffg2e2000,r15i1p1,2005010100-2005123118,6hr,atmos,v0,MPI-M,miklip-extra,prototype
1856511,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,6hrPlev,MPI-ESM-LR,dffg2e2000,r15i1p1,2002010100-2002123118,6hr,atmos,v0,MPI-M,miklip-extra,prototype
1856512,/work/bmx828/miklip-ces/data4miklip/model/glob...,ps,6hrPlev,MPI-ESM-LR,dffg2e2000,r15i1p1,2007010100-2007123118,6hr,atmos,v0,MPI-M,miklip-extra,prototype


In [73]:
df.product_id.unique()

array(['output1', 'output', 'miklip-extra', nan, 'preop'], dtype=object)

In [74]:
df.generation.unique()

array(['baseline0', 'baseline1', 'preop-dcpp', nan, 'preop', 'miklip',
       'prototype'], dtype=object)

In [75]:
df[df.generation=='preop-dcpp']['variable'].unique()

array(['mrso', 'tauu', 'ps', 'rsdt', 'rlut', 'zg', 'va', 'clt', 'pr',
       'rsut', 'wap', 'rsds', 'rlds', 'uas', 'rls', 'ta', 'hus',
       'sfcWind', 'hfss', 'rsus', 'tas', 'prsn', 'tdps', 'tasmax',
       'sfcWindmax', 'rlus', 'psl', 'prc', 'ua', 'vas', 'hur', 'tasmin',
       'hfls', 'tauv', 'evspsbl', 'zos', 'sic', 'so', 'vsi', 'uo', 'usi',
       'snd', 'mlotst', 'sivol', 'strairx', 'msftmyz', 'vsf', 'zossq',
       'thetao', 'tos', 'strairy', 'wmo', 'hfds', 'sos', 'tossq', 'vo',
       'mrro', 'prw', 'albedo', 'rldscs', 'rsutcs', 'ts', 'rss', 'al',
       'rsdscs', 'rsuscs', 'rlutcs', 'clivi', 'rtmt', 'clwvi', 'seaice',
       'areacello', 'deptho', 'sftof', 'mlotstsq', 'hfy', 'hfxba', 'pbo',
       'tauvo', 'wfo', 'hfyba', 'hfxdiff', 'msftbarot', 'hfx', 'rsntds',
       'tauuo', 'hfydiff', 'snm', 'snw', 'transiy', 'sim', 'sit',
       'transix'], dtype=object)

In [76]:
df.ensemble_member.unique()

array(['r2i1p1', 'r5i1p1', 'r1i1p1', 'r10i1p1', 'r6i1p1', 'r8i1p1',
       'r4i1p1', 'r7i1p1', 'r3i1p1', 'r9i1p1', 'r0i0p0', 'r1i1p2',
       'r8i2p1', 'r5i2p1', 'r7i2p1', 'r9i2p1', 'r6i2p1', 'r10i2p1',
       'r2i2p1', 'r3i2p1', 'r1i2p1', 'r4i2p1', 'r1i1p8', nan,
       'dpes4e1968', 'dpes4e2015', 'dpes4e1978', 'dpes4e1989',
       'dpes4e1967', 'dpes4e1999', 'dpes4e2003', 'dpes4e1964',
       'dpes4e1960', 'dpes4e1976', 'dpes4e1998', 'dpes4e1977',
       'dpes4e1992', 'dpes4e2004', 'dpes4e1962', 'dpes4e1988',
       'dpes4e1984', 'dpes4e2001', 'dpes4e2014', 'dpes4e1970',
       'dpes4e1983', 'dpes4e1961', 'dpes4e2000', 'dpes4e2011',
       'dpes4e1987', 'dpes4e2009', 'dpes4e1972', 'dpes4e1985',
       'dpes4e2010', 'dpes4e1991', 'dpes4e1971', 'asORAaERAf',
       'dpes4e1982', 'dpes4e1990', 'dpes4e1993', 'dpes4e2002',
       'dpes4e1965', 'dpes4e2013', 'r11i1p1', 'r0i1p1', 'historical',
       'dpes4e2016', 'dpes4e2012', 'dpes4e1997', 'dpes4e1994',
       'dpes4e1979', 'dpes4e2006', 

In [34]:
#df[df['generation']=='nan']

In [35]:
df = df.drop_duplicates(subset=['path'], keep='last').reset_index(drop=True)
df.head()

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id,generation
0,/work/bmx828/miklip-ces/data4miklip/model/glob...,sfcWind,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
1,/work/bmx828/miklip-ces/data4miklip/model/glob...,tas,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
2,/work/bmx828/miklip-ces/data4miklip/model/glob...,psl,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
3,/work/bmx828/miklip-ces/data4miklip/model/glob...,pr,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
4,/work/bmx828/miklip-ces/data4miklip/model/glob...,tasmin,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0


In [36]:
len(df)

1859769

In [37]:
df.head()

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id,generation
0,/work/bmx828/miklip-ces/data4miklip/model/glob...,sfcWind,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
1,/work/bmx828/miklip-ces/data4miklip/model/glob...,tas,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
2,/work/bmx828/miklip-ces/data4miklip/model/glob...,psl,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
3,/work/bmx828/miklip-ces/data4miklip/model/glob...,pr,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
4,/work/bmx828/miklip-ces/data4miklip/model/glob...,tasmin,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0


In [49]:
products=df.product_id.unique()
products

array(['output1', 'output', 'miklip-extra', nan, 'preop'], dtype=object)

In [51]:
products=list(products)
products.remove(np.nan)
products

['output1', 'output', 'miklip-extra', 'preop']

In [52]:
# Some entries are invalid
invalids = df[~df.product_id.isin(products)]
invalids
#df = df[df.activity_id.isin(activity_ids)]

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id,generation
748414,/work/bmx828/miklip-ces/data4miklip/model/glob...,,,,,,,mon,atmos,v0,,,
1093507,/work/bmx828/miklip-ces/data4miklip/model/glob...,,,,,,,6hr,atmos,v0,,,


In [53]:
df = df[df.product_id.isin(products)]
len(df)

1859767

In [81]:
df.ensemble_member.unique()

array(['r2i1p1', 'r5i1p1', 'r1i1p1', 'r10i1p1', 'r6i1p1', 'r8i1p1',
       'r4i1p1', 'r7i1p1', 'r3i1p1', 'r9i1p1', 'r0i0p0', 'r1i1p2',
       'r8i2p1', 'r5i2p1', 'r7i2p1', 'r9i2p1', 'r6i2p1', 'r10i2p1',
       'r2i2p1', 'r3i2p1', 'r1i2p1', 'r4i2p1', 'r1i1p8', nan,
       'dpes4e1968', 'dpes4e2015', 'dpes4e1978', 'dpes4e1989',
       'dpes4e1967', 'dpes4e1999', 'dpes4e2003', 'dpes4e1964',
       'dpes4e1960', 'dpes4e1976', 'dpes4e1998', 'dpes4e1977',
       'dpes4e1992', 'dpes4e2004', 'dpes4e1962', 'dpes4e1988',
       'dpes4e1984', 'dpes4e2001', 'dpes4e2014', 'dpes4e1970',
       'dpes4e1983', 'dpes4e1961', 'dpes4e2000', 'dpes4e2011',
       'dpes4e1987', 'dpes4e2009', 'dpes4e1972', 'dpes4e1985',
       'dpes4e2010', 'dpes4e1991', 'dpes4e1971', 'asORAaERAf',
       'dpes4e1982', 'dpes4e1990', 'dpes4e1993', 'dpes4e2002',
       'dpes4e1965', 'dpes4e2013', 'r11i1p1', 'r0i1p1', 'historical',
       'dpes4e2016', 'dpes4e2012', 'dpes4e1997', 'dpes4e1994',
       'dpes4e1979', 'dpes4e2006', 

In [82]:
df.experiment.unique()

array(['decadal2001', 'amip4K', 'decadal1982', 'noVolc1975',
       'decadal1961', 'esmHistorical', 'aqua4xCO2', 'noVolc1990',
       'decadal2003', 'esmFixClim1', 'decadal1973', 'decadal1963',
       'historical', 'esmrcp85', 'decadal1975', 'sstClim', 'decadal1996',
       'decadal2005', 'sstClimAerosol', 'decadal2000', 'decadal1988',
       'decadal2002', 'decadal1972', 'decadal1977', 'sstClimSulfate',
       'piControl', 'decadal2008', 'noVolc1960', 'decadal1986',
       'decadal1968', 'decadal1999', 'rcp26', 'decadal1967',
       'decadal1992', 'decadal1990', 'decadal1978', 'amip4xCO2', 'amip',
       'decadal1960', 'decadal1984', 'noVolc1985', 'decadal1976',
       'esmFdbk1', 'sstClim4xCO2', 'decadal1980', 'decadal1995',
       'decadal1987', 'decadal1970', 'decadal1989', 'decadal2004',
       'decadal1979', 'decadal1998', 'rcp85', 'decadal1962', '1pctCO2',
       'decadal2007', 'decadal1966', 'decadal1991', 'volcIn2010',
       'decadal1985', 'decadal1994', 'aqua4K', 'decadal200

In [83]:
df.head()

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id,generation
0,/work/bmx828/miklip-ces/data4miklip/model/glob...,sfcWind,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
1,/work/bmx828/miklip-ces/data4miklip/model/glob...,tas,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
2,/work/bmx828/miklip-ces/data4miklip/model/glob...,psl,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
3,/work/bmx828/miklip-ces/data4miklip/model/glob...,pr,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
4,/work/bmx828/miklip-ces/data4miklip/model/glob...,tasmin,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0


## Pick the latest versions only

In [84]:
grpby = list(set(df.columns.tolist()) - {'path', 'version'})
groups = df.groupby(grpby)

In [85]:
%%time
idx_to_remove = []
for _, group in groups:
    if group.version.nunique() > 1:
        idx_to_remove.extend(group.sort_values(by=['version'], ascending=False).index[1:].values.tolist())

CPU times: user 11min 50s, sys: 13.7 s, total: 12min 4s
Wall time: 11min 48s


In [86]:
len(idx_to_remove)

86487

In [87]:
len(df)

1859769

In [88]:
df1 = df.copy()
df = df.drop(index=idx_to_remove)
len(df)

1773282

In [91]:
df.head()

Unnamed: 0,path,variable,mip_table,model,experiment,ensemble_member,temporal_subset,frequency,modeling_realm,version,institute,product_id,generation
0,/work/bmx828/miklip-ces/data4miklip/model/glob...,sfcWind,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
1,/work/bmx828/miklip-ces/data4miklip/model/glob...,tas,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
2,/work/bmx828/miklip-ces/data4miklip/model/glob...,psl,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
3,/work/bmx828/miklip-ces/data4miklip/model/glob...,pr,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0
4,/work/bmx828/miklip-ces/data4miklip/model/glob...,tasmin,day,MPI-ESM-LR,decadal2001,r2i1p1,20020101-20111231,day,atmos,v20111122,MPI-M,output1,baseline0


In [95]:
df.columns.shape

(13,)

In [96]:
# Re-arange columns
columns = ["generation","product_id", "institute", "model", "experiment", "frequency", 
           "modeling_realm", "mip_table", "ensemble_member", "variable", 
           "temporal_subset", "version", "path"]
df = df[columns]
df.head()

Unnamed: 0,generation,product_id,institute,model,experiment,frequency,modeling_realm,mip_table,ensemble_member,variable,temporal_subset,version,path
0,baseline0,output1,MPI-M,MPI-ESM-LR,decadal2001,day,atmos,day,r2i1p1,sfcWind,20020101-20111231,v20111122,/work/bmx828/miklip-ces/data4miklip/model/glob...
1,baseline0,output1,MPI-M,MPI-ESM-LR,decadal2001,day,atmos,day,r2i1p1,tas,20020101-20111231,v20111122,/work/bmx828/miklip-ces/data4miklip/model/glob...
2,baseline0,output1,MPI-M,MPI-ESM-LR,decadal2001,day,atmos,day,r2i1p1,psl,20020101-20111231,v20111122,/work/bmx828/miklip-ces/data4miklip/model/glob...
3,baseline0,output1,MPI-M,MPI-ESM-LR,decadal2001,day,atmos,day,r2i1p1,pr,20020101-20111231,v20111122,/work/bmx828/miklip-ces/data4miklip/model/glob...
4,baseline0,output1,MPI-M,MPI-ESM-LR,decadal2001,day,atmos,day,r2i1p1,tasmin,20020101-20111231,v20111122,/work/bmx828/miklip-ces/data4miklip/model/glob...


In [97]:
df.to_csv("../catalogs/mistral-miklip.csv.gz", compression="gzip", index=False)

In [99]:
!ls -all ../catalogs/

total 78000
drwxr-xr-x 2 m300524 mpioes     4096 Oct 23 16:52 .
drwxr-xr-x 6 m300524 mpioes     4096 Oct 17 09:24 ..
-rw-r--r-- 1 m300524 mpioes     2116 Oct 20 16:10 aws-cesm1-le.csv.gz
-rw-r--r-- 1 m300524 mpioes  3189244 Oct 20 16:10 glade-cmip5.csv.gz
-rw-r--r-- 1 m300524 mpioes     1818 Oct 23 16:52 glade-cmip5.json
-rw-r--r-- 1 m300524 mpioes 13167014 Oct 20 16:10 glade-cmip6.csv.gz
-rw-r--r-- 1 m300524 mpioes     2386 Oct 17 09:24 glade-cmip6.json
-rw-r--r-- 1 m300524 mpioes 32141206 Oct 21 10:40 mistral-cmip5.csv.gz
-rw-r--r-- 1 m300524 mpioes     1613 Oct 25 13:04 mistral-cmip5.json
-rw-r--r-- 1 m300524 mpioes 16301936 Oct 17 10:52 mistral-cmip6.csv.gz
-rw-r--r-- 1 m300524 mpioes     2379 Oct 17 09:31 mistral-cmip6.json
-rw-r--r-- 1 m300524 mpioes 15031962 Oct 25 13:03 mistral-miklip.csv.gz
-rw-r--r-- 1 m300524 mpioes     1711 Oct 25 13:03 mistral-miklip.json
-rw-r--r-- 1 m300524 mpioes     2181 Oct 17 09:24 pangeo-cmip6.json


In [3]:
df=pd.read_csv("../catalogs/mistral-miklip.csv.gz")

In [14]:
pd.read_csv("../catalogs/mistral-cmip5.csv.gz").mip_table.unique()

array(['day', 'Amon', 'Omon', 'LImon', 'Lmon', 'aero', 'OImon', '6hrPlev',
       'Oyr', 'fx', 'cfDay', 'cfOff', 'cfMon', 'cf3hr', 'cfSites', '3hr',
       '6hrLev', 'Oclim'], dtype=object)

In [19]:
df.variable.unique()

array(['indopacific', 'global', 'atlantic', 'tntr'], dtype=object)

In [17]:
df.tail(1)

Unnamed: 0,product_id,institute,model,experiment,frequency,modeling_realm,mip_table,ensemble_member,variable,temporal_subset,version,path
2525,preop,output,Omon,MPI-ESM-LR,mon,ocean,moc,dpes4e1963,atlantic,r7i1p1_196311-197312,v0,/work/bmx828/miklip-ces/data4miklip/model/glob...


In [None]:
columns = ["generation", "product_id", "mip_table", "model", "frequency", 
           "modeling_realm", "variable", "ensemble_member", "variable", 
           "temporal_subset", "version", "path"]
df = df[columns]
df.head()