In [6]:
import dask.array as da
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

In [3]:
import os
from typing import Union

import s3fs
import zarr

# S3 URL of the zarr store this notebook reads; it is the `path_to_zarr`
# argument used when `root` is opened further down.
# NOTE(review): the path suggests the SDOML v2 HMI dataset — confirm.
AWS_ZARR_ROOT = (
    "s3://gov-nasa-hdrl-data1/contrib/fdl-sdoml/fdl-sdoml-v2/sdomlv2_hmi.zarr/"
)


def s3_connection(path_to_zarr: str) -> s3fs.S3Map:
    """
    Instantiate a connection to AWS S3 for the location `path_to_zarr`.

    Parameters
    ----------
    path_to_zarr : str
        Root of the store (e.g. an ``s3://bucket/prefix/`` URL). It is passed
        unchanged as ``root`` to ``s3fs.S3Map``.

    Returns
    -------
    s3fs.S3Map
        Mapping rooted at `path_to_zarr`, backed by an anonymous
        ``s3fs.S3FileSystem``.
    """
    return s3fs.S3Map(
        root=path_to_zarr,
        # anonymous access requires no credentials
        s3=s3fs.S3FileSystem(anon=True),
        # do not probe the location when the mapping is created
        # (see the `check` parameter of fsspec's FSMap)
        check=False,
    )


def load_single_aws_zarr(
    path_to_zarr: str,
    cache_max_single_size: Union[int, None] = None,
) -> Union[zarr.Array, zarr.Group]:
    """
    Open a zarr store on S3 read-only, behind an LRU store cache.

    Parameters
    ----------
    path_to_zarr : str
        Location of the store; forwarded to `s3_connection`.
    cache_max_single_size : int or None, optional
        Forwarded as ``max_size`` to ``zarr.LRUStoreCache``. The default
        ``None`` is passed through as-is; per the zarr documentation this
        means the cache has no size limit.

    Returns
    -------
    zarr.Array or zarr.Group
        Whatever ``zarr.open`` finds at `path_to_zarr`.
    """
    return zarr.open(
        zarr.LRUStoreCache(
            store=s3_connection(path_to_zarr),
            max_size=cache_max_single_size,
        ),
        # read-only: the store is never written to through this handle
        mode="r",
    )


# Open the store at AWS_ZARR_ROOT read-only. No cache size is given, so the
# loader's default (`None`) is what reaches the LRU cache.
root = load_single_aws_zarr(path_to_zarr=AWS_ZARR_ROOT)

In [4]:
def get_header(data, index):
    """
    Pick element `index` out of every attribute of `data`.

    Returns a new dict with the same keys as ``data.attrs``, each mapped to
    ``value[index]``. Any error raised by indexing an attribute value with
    `index` (e.g. ``IndexError`` for a value that is too short) propagates.
    """
    header = {}
    for name, per_record in data.attrs.items():
        header[name] = per_record[index]
    return header

def get_all_headers(data):
    """
    Return the attributes of `data` as a plain dict.

    The result is a new dict built from ``data.attrs.items()``; the values
    themselves are not copied.
    """
    return dict(data.attrs.items())

In [8]:

# Build a lookup from normalized T_REC timestamp to where that record lives:
# the key of its group in `root`, its position in the group's attribute
# lists, and the attribute values at that position.
timestamp_info_dict = {}

for year in tqdm(root):
    data = root[year]["Bx"]
    # Read the attributes once per group; the per-record slicing below reuses
    # this dict instead of going back to `data.attrs` for every timestamp.
    header = get_all_headers(data)

    timestamps = header["T_REC"]

    for i, raw_timestamp in enumerate(timestamps):
        # Drop the last four characters, then turn "_" into " " and "." into "-".
        # NOTE(review): presumably T_REC looks like "YYYY.MM.DD_hh:mm:ss_TAI",
        # giving "YYYY-MM-DD hh:mm:ss" — confirm against the data.
        timestamp = raw_timestamp[:-4].replace("_", " ").replace(".", "-")

        # A timestamp that occurs more than once overwrites its earlier entry.
        timestamp_info_dict[timestamp] = {
            "year": year,
            # "harps": timestamps_harps[timestamp],
            #   ^ disabled: `timestamps_harps` is not defined in the visible cells
            "index": i,
            # Attributes whose value has no element at position `i` are left out.
            "header": {
                key: values[i] for key, values in header.items() if len(values) > i
            },
        }

100%|██████████| 11/11 [02:03<00:00, 11.25s/it]
