# DOWNLOAD

In [None]:
!wget -O gpw_grassland_fscs.vi.vhr_grid.samples_20000101_20241231_go_epsg.4326_v2.gpkg "https://zenodo.org/records/15631655/files/gpw_grassland_fscs.vi.vhr_grid.samples_20000101_20241231_go_epsg.4326_v2.gpkg?download=1"

In [None]:
from pathlib import Path

OUTPUT_PATH = Path('../../data')
OUTPUT_PATH.mkdir(exist_ok=True)

In [None]:
import os
import requests

url = "https://zenodo.org/records/15631655/files/gpw_grassland_fscs.vi.vhr_grid.samples_20000101_20241231_go_epsg.4326_v2.gpkg?download=1"

filename = os.path.join(OUTPUT_PATH, "gpw_grassland_fscs.vi.vhr_grid.samples_20000101_20241231_go_epsg.4326_v2.gpkg")

response = requests.get(url, stream=True)
response.raise_for_status()

with open(filename, "wb") as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)

print(f"Download concluído: {filename}")

# EXTRACTION

In [1]:
from pathlib import Path

BATCH_SIZE = 200_000

input_fn = "gpw_grassland_fscs.vi.vhr_grid.samples_20000101_20241231_go_epsg.4326_v2.gpkg"
input_dir = Path("../../data")
output_dir = Path("../../data/raw")
output_dir.mkdir(exist_ok=True)

In [2]:
CLASS_DICT = {
    "Other land cover": 0,
    "Cultivated grassland": 1,
    "Natural/semi-natural grassland": 2,
    "Open shrubland": 3
}

In [7]:
import pandas as pd
import geopandas as gpd

from tqdm import tqdm

processed = 0

while True:
    gdf = gpd.read_file(input_dir / input_fn, rows=slice(processed, processed + BATCH_SIZE))

    if gdf.empty:
        break

    last_tile_id = None

    if gdf.shape[0] == BATCH_SIZE:
        last_tile_id = gdf.iloc[-1]["tile_id"]

    for tile_id in tqdm(gdf["tile_id"].unique()):
        if last_tile_id == tile_id:
            continue 

        tile_gdf = gdf[gdf["tile_id"] == tile_id].copy()

        #bing_end_date = tile_gdf['bing_image_end_date'].str[:4]
        #bing_end_date = pd.to_numeric(bing_end_date, errors='coerce')
        bing_start_date = tile_gdf['bing_image_start_date'].str[:4]
        bing_start_date = pd.to_numeric(bing_start_date, errors='coerce')

        #tile_gdf["bing_year"] = (bing_end_date + bing_start_date) // 2
        tile_gdf["bing_year"] = bing_start_date
        tile_gdf["bing_value"] = tile_gdf["bing_class"].map(CLASS_DICT)

        #google_end_date = tile_gdf['google_image_end_date'].str[:4]
        #google_end_date = pd.to_numeric(google_end_date, errors='coerce')
        google_start_date = tile_gdf['google_image_start_date'].str[:4]
        google_start_date = pd.to_numeric(google_start_date, errors='coerce')

        #tile_gdf["google_year"] = (google_end_date + google_start_date) // 2
        tile_gdf["google_year"] = google_start_date
        tile_gdf["google_value"] = tile_gdf["google_class"].map(CLASS_DICT)

        if (
            (pd.isna(tile_gdf.iloc[0]["bing_year"]) or tile_gdf.iloc[0]["bing_year"] < 2017)
            and (pd.isna(tile_gdf.iloc[0]["google_year"]) or tile_gdf.iloc[0]["google_year"] < 2017)
        ):
            continue
        
        tile_gdf.to_file(output_dir / f'{tile_id}.gpkg', driver="GPKG")

    processed += len(gdf)

100%|██████████| 20/20 [00:01<00:00, 13.77it/s]
100%|██████████| 20/20 [00:01<00:00, 18.49it/s]
100%|██████████| 20/20 [00:01<00:00, 16.42it/s]
100%|██████████| 20/20 [00:01<00:00, 19.47it/s]
100%|██████████| 20/20 [00:01<00:00, 14.68it/s]
100%|██████████| 20/20 [00:01<00:00, 15.10it/s]
100%|██████████| 20/20 [00:01<00:00, 15.90it/s]
100%|██████████| 21/21 [00:01<00:00, 17.13it/s]
100%|██████████| 20/20 [00:01<00:00, 15.07it/s]
100%|██████████| 20/20 [00:01<00:00, 19.62it/s]
100%|██████████| 20/20 [00:01<00:00, 14.33it/s]
100%|██████████| 20/20 [00:01<00:00, 17.08it/s]
100%|██████████| 20/20 [00:01<00:00, 15.10it/s]
100%|██████████| 20/20 [00:01<00:00, 17.94it/s]
100%|██████████| 20/20 [00:01<00:00, 14.64it/s]
100%|██████████| 20/20 [00:01<00:00, 14.82it/s]
100%|██████████| 21/21 [00:01<00:00, 18.81it/s]
100%|██████████| 20/20 [00:01<00:00, 18.29it/s]
100%|██████████| 20/20 [00:01<00:00, 17.41it/s]
100%|██████████| 20/20 [00:01<00:00, 17.49it/s]
100%|██████████| 20/20 [00:01<00:00, 16.