In [1]:
import dcachefs
import gcsfs
import getpass
import os
import tqdm

# Copying tiles from GCS to dCache storage

In order to access a Google Cloud Storage (GCS) bucket via [GCSFS](https://gcsfs.readthedocs.io) ([Filesystem Spec](https://filesystem-spec.readthedocs.io) compatible file system module for GCS), one needs to set up authentication credentials:
* Make sure to have sufficient "Bucket" and "Object" permissions (being "Bucket Owner" is not sufficient to download the data); 
* Download and uncompress the [*Google Cloud SDK*](https://cloud.google.com/sdk/docs/quickstart) tarball;
* Run `./google-cloud-sdk/bin/gcloud init --no-launch-browser` and provide authentication credentials and default project information (a new project can be created if needed);
* Install `gcsfs` using `pip`:
```shell
pip install gcsfs
```

In [2]:
# Ask for the Google account name; it is used below to locate the matching
# `gcloud` credentials file. An explicit prompt replaces getpass' default
# "Password: " text, which is misleading here.
google_account = getpass.getpass("Google account name (the part before @gmail.com): ")

In [3]:
# Use the default account only when the prompt was left empty, so that a value
# typed at the prompt is not silently overwritten.
google_account = google_account or 'izeboud.maaike'

In [4]:
# Read the authentication credentials created by `gcloud`.
# The credentials file is resolved relative to the current user's home
# directory ("~") rather than a hard-coded absolute path, so the same cell works
# for any local user account.
gcs_fs = gcsfs.GCSFileSystem(
    token=os.path.expanduser(
        f"~/.config/gcloud/legacy_credentials/{google_account}@gmail.com/adc.json"
    )
)

In [5]:
# List the tiles stored in the bucket that match the export pattern.
# Patterns used for other exports:
#   "gs://ee-data_export/S2_composite_2019-11-1_2020-3-1_tile_*.tif"
#   "gs://ee-data_export/data_update/S2_composite_2019-11-1_2020-3-1_tile_*.tif"
#   "gs://ee-data_export/S1_mosaic_2020-01-01_2020-02-02/S1_*.tif"
tiles = gcs_fs.glob("gs://ee-data_export/data_S1_pineIsland/S1*.tif")

# report how many tiles were found and show the first few of them
print(len(tiles))
tiles[:10]

273


['ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20150526_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20150607_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20150619_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20150701_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20180110_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20180122_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20180203_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20180215_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20180227_orbit-65_sliceMosaic_10m.tif',
 'ee-data_export/data_S1_pineIsland/S1A_IW_GRDH_1SSH_20180311_orbit-65_sliceMosaic_10m.tif']

In [6]:
# Set up the dCache file system client with username/password authentication;
# both values are asked for interactively (username first, then password).
dcache_fs = dcachefs.dCacheFileSystem(
    username=getpass.getpass("username"),
    password=getpass.getpass("password"),
    webdav_url="https://webdav.grid.surfsara.nl:2880",
    api_url="https://dcacheview.grid.surfsara.nl:22880/api/v1/",
)


In [7]:
# show the current working directory (displayed as the cell output)
os.getcwd()

'/Users/tud500158/Library/Mobile Documents/com~apple~CloudDocs/Documents/Documents - TUD500158/github/AutomatedDamageDetection/notebooks'

In [8]:
def _get_token(rclone_config_file: str):
    """ Extract token from file (rclone config or plain file) """
    with open(rclone_config_file) as f:
        content = f.read()

    token = None

    for line in content.splitlines():
        # try rclone config file
        if line.startswith("bearer_token"):
            token = line.split()[-1]

    if token is None:
        # assume plain text file
        token = content.strip()
    return token

# configure access to dCache file system
# The token file lives below the current user's home directory ("~" is expanded
# instead of hard-coding an absolute /Users/<name> path). Its location can be
# overridden by setting the DCACHE_TOKEN_FILE environment variable.
homedir = os.path.expanduser('~/Library/Mobile Documents/com~apple~CloudDocs/Documents/Documents - TUD500158/')
dcache_token_file = os.environ.get(
    "DCACHE_TOKEN_FILE",
    os.path.join(homedir, 'github/iceshelf_2022-08-04_365D_maaike.conf'),
)
dcache_api_url = "https://dcacheview.grid.surfsara.nl:22880/api/v1/"
dcache_webdav_url = "https://webdav.grid.surfsara.nl:2880"

# token-based authentication
dcache_fs = dcachefs.dCacheFileSystem(
    token=_get_token(dcache_token_file),
    api_url=dcache_api_url,
    webdav_url=dcache_webdav_url,
    block_size=0  # will open file in stream mode
)


In [9]:
# Copy every tile from the GCS bucket to dCache, skipping the tiles that are
# already present at the destination with the same size as the source.
for tile in tqdm.tqdm(tiles):

    _, filename = os.path.split(tile)

    source_uri = f"gs://{tile}"
    # destinations used for other exports:
    #   f"/pnfs/grid.sara.nl/data/iceshelf/disk/S2_composite_2019-11-1_2020-3-1/{filename}"
    #   f"/pnfs/grid.sara.nl/data/iceshelf/disk/S1_mosaic_2020-01-01_2020-02-01/{filename}"
    destination_uri = f"/pnfs/grid.sara.nl/data/iceshelf/disk/S1_pineIsland_temporal/{filename}"

    # sizes are only compared when the destination file is already there
    already_exists = dcache_fs.exists(destination_uri)
    if already_exists:
        same_size = gcs_fs.size(source_uri) == dcache_fs.size(destination_uri)
    else:
        same_size = False

    # nothing to do for tiles that were already transferred completely
    if already_exists and same_size:
        continue

    # (re-)transfer missing/incomplete tiles; destination is opened in stream mode
    with gcs_fs.open(source_uri) as f_read, \
            dcache_fs.open(destination_uri, "wb", block_size=0, timeout=900) as f:
        f.write(f_read)

100%|██████████| 273/273 [3:59:00<00:00, 52.53s/it]   
