# Check CMIP6 data availability on Google Cloud

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from scipy.signal import detrend
from matplotlib import pyplot as plt
from scipy import signal
import pandas as pd
import xarray as xr
import intake
import pprint 
import util 

# Open the Pangeo CMIP6 catalog (a JSON description hosted on Google Cloud
# Storage) as an intake-esm datastore; every later cell queries it through `col`.
col_url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
col = intake.open_esm_datastore(col_url)
# Bare last expression: shows the catalog summary as the cell output.
col


pangeo-cmip6-ESM Collection with 270390 entries:
	> 15 activity_id(s)

	> 33 institution_id(s)

	> 73 source_id(s)

	> 103 experiment_id(s)

	> 189 member_id(s)

	> 29 table_id(s)

	> 370 variable_id(s)

	> 10 grid_label(s)

	> 270390 zstore(s)

	> 60 dcpp_init_year(s)

In [2]:
# Quick probe of the catalog: which entries of table 'Amon' exist for the
# 'piClim-4xCO2' experiment and the four variables listed below?
cat = col.search(
    experiment_id='piClim-4xCO2',
    variable_id=['tas', 'rlut', 'rsut', 'rsdt'],
    table_id='Amon',
)
# The matching catalog rows are exposed as a pandas DataFrame.
cat.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,zstore,dcpp_init_year
220621,RFMIP,CCCma,CanESM5,piClim-4xCO2,r1i1p2f1,Amon,rsdt,gn,gs://cmip6/RFMIP/CCCma/CanESM5/piClim-4xCO2/r1...,
220877,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r1i1p1f1,Amon,rlut,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220880,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r1i1p1f1,Amon,rsdt,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220882,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r1i1p1f1,Amon,rsut,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220888,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r2i1p1f1,Amon,rlut,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220891,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r2i1p1f1,Amon,rsdt,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220893,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r2i1p1f1,Amon,rsut,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220899,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r3i1p1f1,Amon,rlut,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220902,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r3i1p1f1,Amon,rsdt,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,
220904,RFMIP,IPSL,IPSL-CM6A-LR,piClim-4xCO2,r3i1p1f1,Amon,rsut,gr,gs://cmip6/RFMIP/IPSL/IPSL-CM6A-LR/piClim-4xCO...,


In [2]:
# Experiments to survey; each one contributes two columns to the table below.
exp_list = ['piControl', 'historical', 'ssp126', 'ssp245', 'ssp370', 'ssp585', 'abrupt-4xCO2']

# Models that make `.to_dataset_dict` fail, keyed by experiment. They are left
# out of the bulk load and flagged in the table so they can be checked by hand.
problem_models = {'piControl': ['IPSL-CM6A-LR', 'MRI-ESM2-0', 'MPI-ESM1-2-LR']}

# Availability table: one row per model (source_id) and, for every experiment,
# a '<experiment> (yrs)' and a '<experiment> (ens.mem.)' column.
df = pd.DataFrame()
for chosen_exp in exp_list:
    print(chosen_exp)
    yrs_col = chosen_exp + ' (yrs)'
    ens_col = chosen_exp + ' (ens.mem.)'

    # This first search is only used to list the models that provide 'ts' in
    # table 'Amon' for the experiment.
    cat = col.search(experiment_id=chosen_exp, variable_id='ts', table_id='Amon')
    source_ids = list(cat.unique(['source_id'])['source_id']['values'])

    remove_models = problem_models.get(chosen_exp, [])
    for mod in remove_models:
        if mod not in source_ids:
            # The catalog is a remote resource that changes over time: a model
            # missing from it must neither abort the survey (list.remove would
            # raise ValueError) nor be reported as a loading problem.
            continue
        source_ids.remove(mod)
        df.loc[mod, ens_col] = 'Problem when loading more than 1'
        df.loc[mod, yrs_col] = 'need manual check'

    # Repeat the search restricted to the models that are expected to load.
    cat = col.search(source_id=source_ids, experiment_id=chosen_exp, variable_id='ts', table_id='Amon')
    dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True}, cdf_kwargs={'chunks': {}})

    # NOTE(review): the rows are keyed by model only, so if one model appears in
    # several dataset groups the group visited last overwrites the earlier ones.
    for ds in dset_dict.values():
        model = ds.source_id
        # The first four characters of the stringified time stamps are the year.
        start_year = int(str(ds['time'][0].values)[:4])
        end_year = int(str(ds['time'][-1].values)[:4])
        # Counts every calendar year touched by the time axis, so a run whose
        # last time stamp falls early in a new year is counted one year longer.
        run_length = end_year + 1 - start_year
        df.loc[model, yrs_col] = run_length
        df.loc[model, ens_col] = ds.member_id.size  # number of ensemble members

    

piControl
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 52 group(s)
TaiESM1
AWI-CM-1-1-MR
AWI-ESM-1-1-LR
BCC-CSM2-MR
BCC-ESM1
CAMS-CSM1-0
FGOALS-f3-L
FGOALS-g3
IITM-ESM
CanESM5
CanESM5-CanOE
CNRM-CM6-1
CNRM-CM6-1-HR
CNRM-ESM2-1
ACCESS-ESM1-5
ACCESS-CM2
E3SM-1-0
E3SM-1-1
E3SM-1-1-ECA
EC-Earth3
EC-Earth3-LR
EC-Earth3-Veg
EC-Earth3-Veg-LR
FIO-ESM-2-0
MPI-ESM-1-2-HAM
INM-CM4-8
INM-CM5-0
MIROC-ES2L
MIROC6
HadGEM3-GC31-LL
HadGEM3-GC31-MM
UKESM1-0-LL
MPI-ESM1-2-HR
GISS-E2-1-G
GISS-E2-1-G-CC
GISS-E2-1-H
GISS-E2-2-G
CESM2
CESM2-FV2
CESM2-WACCM
CESM2-WACCM-FV2
NorCPM1
NorESM1-F
NorESM2-LM
NorESM2-MM
KACE-1-0-G
GFDL-CM4
GFDL-ESM4
NESM3
SAM0-UNICON
CIESM
MCM-UA-1-0
historical
--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'

--> There will be 50 group(s)
TaiESM1
AWI-CM-

In [3]:
# Show the availability table filled in by the survey loop: one row per model,
# with a '(yrs)' and an '(ens.mem.)' column for every experiment.
df

Unnamed: 0,piControl (ens.mem.),piControl (yrs),historical (yrs),historical (ens.mem.),ssp126 (yrs),ssp126 (ens.mem.),ssp245 (yrs),ssp245 (ens.mem.),ssp370 (yrs),ssp370 (ens.mem.),ssp585 (yrs),ssp585 (ens.mem.),abrupt-4xCO2 (yrs),abrupt-4xCO2 (ens.mem.)
IPSL-CM6A-LR,Problem when loading more than 1,need manual check,165.0,32.0,86.0,6.0,86.0,11.0,86.0,11.0,86.0,6.0,300.0,12.0
MRI-ESM2-0,Problem when loading more than 1,need manual check,165.0,6.0,86.0,1.0,86.0,5.0,86.0,5.0,86.0,2.0,151.0,14.0
MPI-ESM1-2-LR,Problem when loading more than 1,need manual check,165.0,10.0,86.0,10.0,86.0,10.0,86.0,10.0,86.0,10.0,165.0,1.0
TaiESM1,1,500,165.0,1.0,,,,,,,,,150.0,1.0
AWI-CM-1-1-MR,1,500,165.0,5.0,86.0,1.0,86.0,1.0,86.0,5.0,86.0,1.0,151.0,1.0
AWI-ESM-1-1-LR,1,100,165.0,1.0,,,,,,,,,,
BCC-CSM2-MR,1,600,165.0,3.0,86.0,1.0,86.0,1.0,86.0,1.0,86.0,1.0,151.0,1.0
BCC-ESM1,1,451,165.0,3.0,,,,,41.0,3.0,,,151.0,1.0
CAMS-CSM1-0,1,250,165.0,3.0,85.0,2.0,85.0,2.0,85.0,2.0,85.0,2.0,155.0,2.0
FGOALS-f3-L,1,561,,,86.0,1.0,86.0,1.0,86.0,1.0,86.0,1.0,,


In [4]:
# Regroup the columns so that all ensemble-size columns come first, followed by
# all run-length columns; within each group the existing column order is kept.
column_names = df.columns.tolist()
ens_cols = [name for name in column_names if name.endswith('(ens.mem.)')]
yr_cols = [name for name in column_names if name.endswith('(yrs)')]
df2 = df[ens_cols + yr_cols]
df2
# Optional export of the reordered table as a tab-separated text file.
# NOTE(review): the file name says 'tas', but the table was built from
# variable_id='ts' - confirm the intended name before enabling this line.
#df2.to_csv('available_tas_data_cloud_june12th_2020.txt', sep='\t')