The first step is to obtain a full listing of the NRT data:

```bash
s3-find 's3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/**/*' | tee /dev/stderr | gzip -9 > s2nrt.txt.gz
```

The code below groups these paths into datasets using the S2-specific path pattern:

```
{s3_prefix}/{date:day precision}/{dataset_dir}/...
```

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import xarray as xr
import gzip
from itertools import islice
import tqdm
from types import SimpleNamespace
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()


def get_paths(f, prefix=''):
    """Yield text paths from a stream of byte lines, with ``prefix`` removed.

    Parameters
    ----------
    f : iterable of bytes
        Source of UTF-8 encoded lines, one path per line (e.g. an open
        binary file object).
    prefix : str, optional
        Leading text to strip from each path. It is removed only from lines
        that actually start with it; any other line is yielded unchanged
        instead of losing its first ``len(prefix)`` characters.

    Yields
    ------
    str
        The decoded line without its trailing newline and without ``prefix``.
    """
    skip = len(prefix)
    for raw in f:
        line = raw.decode('utf8').rstrip('\n')
        # Slice by length only after confirming the prefix is really there,
        # otherwise an unrelated line would be silently truncated.
        if skip and line.startswith(prefix):
            line = line[skip:]
        yield line

def path_split(s):
    """Split a relative path into ``(date, dataset_dir, file_path)``.

    The first two ``/``-separated components are returned as they are and
    everything after the second ``/`` is returned as one string (empty when
    nothing follows). A path with fewer than two components is reported via
    ``print`` and ``(None, None, None)`` is returned instead.
    """
    pieces = s.split('/', 2)

    # Need at least the date and the dataset directory.
    if len(pieces) < 2:
        print(f'Bad: "{s}"')
        return (None, None, None)

    date, ds = pieces[0], pieces[1]
    remainder = pieces[2] if len(pieces) == 3 else ''
    return (date, ds, remainder)

def load_from_gz(fname, prefix, verbose=True):
    """Read a gzipped path listing and group its entries by dataset directory.

    Each line of ``fname`` is turned into ``(day, dataset, file)`` by
    ``get_paths`` and ``path_split``; entries for which ``path_split`` reports
    no day are skipped.

    Parameters
    ----------
    fname : str
        Path of the gzip-compressed listing, one path per line.
    prefix : str
        Passed to ``get_paths`` to strip the common leading part of each path.
    verbose : bool, optional
        When true, wrap the iteration in a ``tqdm`` notebook progress bar.

    Returns
    -------
    types.SimpleNamespace
        ``ds_per_day`` maps a day to the number of ``ARD-METADATA.yaml``
        files seen for it; ``files_per_ds`` maps ``"{day}/{dataset}"`` to the
        list of files found under that directory.
    """
    ds_per_day = {}
    files_per_ds = {}

    with gzip.open(fname, 'rb') as src:
        records = map(path_split, get_paths(src, prefix))
        if verbose:
            records = tqdm.tqdm_notebook(records)

        for day, ds, rel_path in records:
            if day is None:
                continue

            # One metadata yaml is taken to mean one dataset for that day.
            if rel_path == 'ARD-METADATA.yaml':
                ds_per_day[day] = ds_per_day.get(day, 0) + 1

            files_per_ds.setdefault(f'{day}/{ds}', []).append(rel_path)

    return SimpleNamespace(ds_per_day=ds_per_day, files_per_ds=files_per_ds)

def compute_stats(oo, expect_count=None):
    """Compute summary statistics for a listing and store them on ``oo``.

    Parameters
    ----------
    oo : namespace-like
        Object with ``ds_per_day`` (day -> dataset count) and ``files_per_ds``
        (dataset directory -> list of files) attributes. It is modified in
        place.
    expect_count : int, optional
        Number of files each dataset directory should contain. When omitted
        no file-count check is made.

    Returns
    -------
    None
        Results are attached to ``oo``:

        * ``ds_count`` - ``xarray.DataArray`` of dataset counts along ``time``
        * ``total_ds`` - sum of ``ds_count``
        * ``total_ds_dirs`` - number of dataset directories
        * ``no_yaml_dirs`` - directories whose file list has no
          ``ARD-METADATA.yaml``
        * ``other_bad`` - directories that do have the yaml but whose file
          count differs from ``expect_count``; always present, and empty when
          ``expect_count`` is ``None``
    """
    files_per_ds = oo.files_per_ds

    # Sort by day so the time coordinate is monotonically increasing.
    count_by_day = sorted(oo.ds_per_day.items())

    tt = np.asarray([d for d, _ in count_by_day], dtype='datetime64[s]')
    cc = np.asarray([c for _, c in count_by_day], dtype='uint32')
    ds_count = xr.DataArray(cc,
                            dims=('time',),
                            coords=dict(time=tt),
                            name='ds_count')

    oo.ds_count = ds_count
    oo.total_ds = ds_count.sum().data.item()
    oo.total_ds_dirs = len(files_per_ds)
    oo.no_yaml_dirs = {k: v for k, v in files_per_ds.items() if 'ARD-METADATA.yaml' not in v}

    # Always define ``other_bad`` so callers can read it without first
    # checking whether an expected count was supplied.
    if expect_count is None:
        oo.other_bad = {}
    else:
        skip_set = set(oo.no_yaml_dirs)
        oo.other_bad = {
            k: v
            for k, v in files_per_ds.items()
            if len(v) != expect_count and k not in skip_set
        }

In [None]:
# Common S3 prefix shared by every path in the listing.
s3_prefix = 's3://dea-public-data/L2/sentinel-2-nrt/S2MSIARD/'

# Same location expressed as a browsable HTTP link prefix.
http_prefix = s3_prefix.replace(
    's3://dea-public-data/',
    'https://data.dea.ga.gov.au/?prefix=',
)

# Number of files a dataset directory is expected to contain.
n_expect = 64

# Gzipped listing to analyse.
file_list_gz = 's2nrt.txt.gz'

In [None]:
%%time
s3 = load_from_gz(file_list_gz, s3_prefix, verbose=True)
compute_stats(s3, n_expect);

In [None]:
# Count straight from the mapping that is listed below, so the headline
# number and the printed directories cannot disagree.
n_missing_yamls = len(s3.no_yaml_dirs)
# ``other_bad`` holds directories whose file count differs from ``n_expect``
# in either direction, hence the "!=" in the labels below.
n_incomplete = len(s3.other_bad)
print(f'''Directories without yamls:
   {n_missing_yamls:3} out of {s3.total_ds_dirs:,d}
Directories with unexpected number of files (!= {n_expect}):
   {n_incomplete:3} out of {s3.total_ds_dirs:,d}''')

if n_missing_yamls:
    print("## Directories with missing yamls")
    for p in s3.no_yaml_dirs:
        print(f'{http_prefix}{p}')

if n_incomplete:
    print(f"## Directories with unexpected number of files (!= {n_expect})")
    for p in s3.other_bad:
        print(f'{http_prefix}{p}')

# Dataset count per day over time.
fig, ax = plt.subplots(1, figsize=(18, 6))
ax.plot(s3.ds_count.time, s3.ds_count.data, '.', markersize=4);