In [1]:
# Imports
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path
from os.path import dirname, join
from scipy.interpolate import RegularGridInterpolator

# Earthdata
from earthdata import DataCollections, DataGranules, Auth, Store

#### We are going to use [earthdata](https://github.com/nsidc/earthdata) to interact with CMR

First we initialize Auth() to get the cloud credentials, if we have a `.netrc` file this can be done automatically

In [2]:
auth = Auth().login(strategy='netrc')
if auth.authenticated is None:
    # we ask for credentials
    auth.login()

You're now authenticated with NASA Earthdata Login


# Define important parameters

In [3]:
# beaufort sea region
lonrange = [-160, -130]
latrange = [68, 80]
bounding_box = (lonrange[0], latrange[0], lonrange[1], latrange[1])
# Data collection short name
ShortName_sss = "SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5"

# Define paths and directories

> NOTE: Change this to match your path

In [7]:
insitu_dir=Path('/home/jovyan/Data/SASSIE/collocation/insitu')  
output_dir=Path('/home/jovyan/Data/SASSIE/collocation/satellite/smap_jpl_l3')

#### Querying CMR using `earthdata` to get the `concept_id` for the cloud hosted collection

In [8]:
# See: https://github.com/nsidc/earthdata
CollectionQuery = DataCollections().short_name(ShortName_sss).cloud_hosted(True)
collections = CollectionQuery.get()

for collection in collections:
    concept_id = collection.concept_id()
    print(concept_id)

C2208422957-POCLOUD


# Load in situ SIZRS file for 1 year

In [9]:
filename=str(insitu_dir) + '/SIZRS_2016.nc'
insitu = xr.open_dataset(filename)

In [10]:
# round the time to the closest day
days_from_insitu = pd.to_datetime(insitu.t).round('D')
days = days_from_insitu.values
days

array(['2016-06-16T00:00:00.000000000', '2016-06-16T00:00:00.000000000',
       '2016-06-16T00:00:00.000000000', '2016-06-16T00:00:00.000000000',
       '2016-07-14T00:00:00.000000000', '2016-07-14T00:00:00.000000000',
       '2016-07-14T00:00:00.000000000', '2016-07-14T00:00:00.000000000',
       '2016-07-14T00:00:00.000000000', '2016-08-19T00:00:00.000000000',
       '2016-08-19T00:00:00.000000000', '2016-09-15T00:00:00.000000000',
       '2016-09-15T00:00:00.000000000', '2016-09-15T00:00:00.000000000',
       '2016-09-15T00:00:00.000000000', '2016-09-15T00:00:00.000000000',
       '2016-09-15T00:00:00.000000000', '2016-09-16T00:00:00.000000000',
       '2016-09-16T00:00:00.000000000', '2016-09-16T00:00:00.000000000',
       '2016-10-07T00:00:00.000000000', '2016-10-07T00:00:00.000000000',
       '2016-10-07T00:00:00.000000000', '2016-10-07T00:00:00.000000000',
       '2016-10-07T00:00:00.000000000', '2016-10-07T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [11]:
# we use a Set to avoid repeating queries to the same day
date_ranges = set()
for day in days:
    start_date = str(day)
    end_date = str(day + np.timedelta64(1, 'D'))
    # or end_date = str(day + np.timedelta64(1, 'D') - np.timedelta64(1, 's')) for 23:59:59 of the same day, the 
    # search on CMR is the same.
    date_range = (start_date, end_date)
    date_ranges.add(date_range)
date_ranges

{('2016-06-16T00:00:00.000000000', '2016-06-17T00:00:00.000000000'),
 ('2016-07-14T00:00:00.000000000', '2016-07-15T00:00:00.000000000'),
 ('2016-08-19T00:00:00.000000000', '2016-08-20T00:00:00.000000000'),
 ('2016-09-15T00:00:00.000000000', '2016-09-16T00:00:00.000000000'),
 ('2016-09-16T00:00:00.000000000', '2016-09-17T00:00:00.000000000'),
 ('2016-10-07T00:00:00.000000000', '2016-10-08T00:00:00.000000000')}

#### We query CMR for the granules we want

In [12]:
matching_granules = [] #list of granules to load

for dt in date_ranges:
    download_size = 0
    GranuleQuery = DataGranules().parameters(
        concept_id=concept_id,
        bounding_box=bounding_box,
        temporal=dt)
    granules = GranuleQuery.get()
    for granule in granules:
        # we use extend to aggregate the results in the same list.
        matching_granules.extend(granule.data_links(s3_only=True))
        download_size += granule.size()
    print(f"date: {dt} , total granules: {len(granules)}, size(MB): {round(download_size, 2)}")

date: ('2016-09-15T00:00:00.000000000', '2016-09-16T00:00:00.000000000') , total granules: 9, size(MB): 97.98
date: ('2016-06-16T00:00:00.000000000', '2016-06-17T00:00:00.000000000') , total granules: 9, size(MB): 89.04
date: ('2016-10-07T00:00:00.000000000', '2016-10-08T00:00:00.000000000') , total granules: 9, size(MB): 97.56
date: ('2016-09-16T00:00:00.000000000', '2016-09-17T00:00:00.000000000') , total granules: 9, size(MB): 97.95
date: ('2016-07-14T00:00:00.000000000', '2016-07-15T00:00:00.000000000') , total granules: 9, size(MB): 94.85
date: ('2016-08-19T00:00:00.000000000', '2016-08-20T00:00:00.000000000') , total granules: 9, size(MB): 97.5


In [13]:
print(f"Total granules found: {len(matching_granules)}")

In [15]:
matching_granules

['s3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/251/SMAP_L3_SSS_20160911_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/252/SMAP_L3_SSS_20160912_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/253/SMAP_L3_SSS_20160913_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/254/SMAP_L3_SSS_20160914_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/255/SMAP_L3_SSS_20160915_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/256/SMAP_L3_SSS_20160916_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/257/SMAP_L3_SSS_20160917_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulus-protected/SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5/2016/258/SMAP_L3_SSS_20160918_8DAYS_V5.0.nc',
 's3://podaac-ops-cumulu

### We use earthdata to get a S3FS session for the PODAAC cloud data
> NOTE: This session is valid for 1 hour

In [None]:
# IMPORTANT: This session will be valid for 1 hour
fs = Store(auth).get_s3fs_session('POCLOUD')
fs

# Opening the files directly in S3!
> NOTE: this could take up to ~1 minute the first time

We are opening the files via s3fs and xarray is doing a subsetting on-the-fly, this is possible because we're working with L3 data in a format xarray understands. 

In [None]:
%%time
# we open the files in the cloud and pass the S3FS fileset to xarray
fileset = [fs.open(s3_granule )for s3_granule in matching_granules]

ds_smap_L3 = xr.open_mfdataset(
    fileset,
    combine='nested',
    concat_dim='time',
    decode_cf=True,
    coords='minimal',
    chunks={'time': 1}
    ).sel(longitude=slice(lonrange[0],lonrange[1]), latitude=slice(latrange[1],latrange[0]))
ds_smap_L3

In [None]:
# We can create a deduplicated dataset.
# only one granule per time point so we can plot it with xarray using the time dimension
unique_ts_ds = ds_smap_L3.sel(time=~ds_smap_L3.indexes['time'].duplicated())
unique_ts_ds

### We plot a variable

> NOTE: plotting using xarray can take considerable memory, restarting the kernel after plotting is a good idea.

From here we can do all the other xarray operations.

In [None]:
unique_ts_ds.smap_sss.plot.pcolormesh(
    "longitude",
    "latitude",
    col="time",
    col_wrap=3,
    robust=True,
    add_colorbar=False
)