# Convert GLAD GLCLU2020 to COGs
The raw GeoTIFFs hosted by Google are not Cloud-Optimized GeoTIFFs (COGs), so we need to convert them and upload the results to S3 for consumption by MAAP users.

In [None]:
%pip install obstore 

In [1]:
import io
import os
import pathlib
import urllib.request
from typing import Union

import boto3
import obstore as obs
import rasterio
import tqdm.notebook
from obstore.store import HTTPStore, LocalStore, S3Store
from rasterio.io import MemoryFile
from rio_cogeo.cogeo import cog_translate, cog_validate
from rio_cogeo.profiles import cog_profiles

from maap.maap import MAAP

# MAAP client; used later in this notebook to request workspace bucket credentials.
maap = MAAP()

In [2]:
# Each "<year>.txt" manifest lists one source GeoTIFF URL per line.
file_list_url_format = "https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/{year}.txt"

urls = []
for year in [
    "2000",
    "2005",
    "2010",
    "2015",
    "2020",
    "2000-2020change",  # net change layer
]:
    file_list_url = file_list_url_format.format(year=year)
    # Close the HTTP response deterministically instead of leaking one
    # connection per manifest.
    with urllib.request.urlopen(file_list_url) as orig_urls:
        stripped_lines = (line.decode("utf-8").strip() for line in orig_urls)
        # Skip blank lines so an empty string never ends up in the URL list.
        urls.extend(line for line in stripped_lines if line)

print(f"found {len(urls)} urls")
urls[:10]

found 1680 urls


['https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_000E.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_010E.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_020E.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_030E.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_040E.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_040W.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_050W.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_060W.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_070W.tif',
 'https://storage.googleapis.com/earthenginepartners-hansen/GLCLU2000-2020/v2/2000/00N_080W.tif']

In [3]:
# original file store info
store_base_url = "https://storage.googleapis.com/earthenginepartners-hansen"
orig_store = HTTPStore.from_url(url=store_base_url)
# Paths relative to the store root. Each one keeps its leading "/", which the
# destination key construction later in the notebook relies on when it appends
# the path directly to the S3 key prefix.
orig_paths = [url.replace(store_base_url, "") for url in urls]

# Store types accepted by convert_to_cog. HTTPStore is included because the
# source files are read from orig_store, which is an HTTPStore.
ObjectStore = Union[
    HTTPStore,
    LocalStore,
    S3Store,
]

def convert_to_cog(
    src_path: str,
    src_store: ObjectStore,
    dst_path: str,
    dst_store: ObjectStore,
):
    """Convert one raster from a source store into a COG in a destination store.

    The object at ``src_path`` is downloaded from ``src_store`` in full and held
    in memory, translated with rio-cogeo's "deflate" profile into an in-memory
    file, and then uploaded to ``dst_store`` at ``dst_path``. Nothing is written
    to local disk by this function itself.

    Args:
        src_path: Path of the source raster within ``src_store``.
        src_store: Store to read the source raster from.
        dst_path: Path to write the converted file to within ``dst_store``.
        dst_store: Store to write the converted file to.
    """
    # GDAL configuration options handed to cog_translate for the conversion.
    gdal_options = {
        "GDAL_NUM_THREADS": "ALL_CPUS",
        "GDAL_TIFF_INTERNAL_MASK": True,
        "GDAL_TIFF_OVR_BLOCKSIZE": "128",
    }
    deflate_profile = cog_profiles.get("deflate")

    # Pull the whole source object into memory and expose it as a file object.
    source_buffer = io.BytesIO(obs.get(src_store, src_path).bytes())

    with rasterio.open(source_buffer) as src:
        with MemoryFile() as cog_memfile:
            # cog_translate writes to the MemoryFile through its name.
            cog_translate(
                src,
                cog_memfile.name,
                deflate_profile,
                in_memory=True,
                quiet=True,
                config=gdal_options,
            )

            # Upload while the MemoryFile is still open.
            obs.put(dst_store, dst_path, cog_memfile)



In [4]:
s3_key_prefix = "shared/henrydevseed/hansen"

# Convert every source file that is missing from the workspace bucket or that
# is present but fails COG validation; files that already validate are skipped,
# so the cell can be re-run to resume an interrupted job.
for i, src_path in tqdm.notebook.tqdm_notebook(enumerate(orig_paths), total=len(orig_paths)):
    # S3 credentials need to be refreshed periodically. The boto3 session, the
    # S3 store and the rasterio AWS session are all derived from them, so they
    # are rebuilt only when the credentials are refreshed.
    # NOTE(review): the refresh is count-based, not time-based - confirm the
    # credential lifetime comfortably exceeds 20 conversions.
    if i % 20 == 0:
        aws_creds = maap.aws.workspace_bucket_credentials()
        aws_bucket_name = aws_creds.pop("aws_bucket_name")

        session = boto3.Session(
            aws_access_key_id=aws_creds["aws_access_key_id"],
            aws_secret_access_key=aws_creds["aws_secret_access_key"],
            aws_session_token=aws_creds["aws_session_token"],
            region_name="us-west-2",
        )
        s3_store = S3Store.from_session(session, aws_bucket_name, config={"AWS_REGION": "us-west-2"})
        aws_session = rasterio.session.AWSSession(session=session)

    dst_path = f"{s3_key_prefix}{src_path}"
    try:
        # head raises FileNotFoundError when the object does not exist yet.
        obs.head(s3_store, dst_path)
    except FileNotFoundError:
        needs_conversion = True
    else:
        # The object exists: keep it only if it is a valid COG.
        s3_key = f"s3://{aws_bucket_name}/{dst_path}"
        with rasterio.Env(aws_session):
            valid, _errors, _warnings = cog_validate(
                s3_key,
            )
        needs_conversion = not valid

    if needs_conversion:
        convert_to_cog(src_path, orig_store, dst_path, s3_store)

  0%|          | 0/1680 [00:00<?, ?it/s]

## copy COGs to nasa-maap-data-store bucket

You need to be logged into the SMCE MAAP AWS account first.

`aws s3 sync s3://maap-ops-workspace/shared/henrydevseed/hansen/GLCLU2000-2020/ s3://nasa-maap-data-store/file-staging/nasa-map/glad-glclu2020/`