# Extrair

In [None]:
from pathlib import Path

BATCH_SIZE = 10000000

GPKG_PATH = Path("../data")
GPKG_FILENAME = "gpw_grassland_fscs.vi.vhr_grid.samples_20000101_20241231_go_epsg.4326_v2.gpkg"

output_dir = Path("../data/raw")
output_dir.mkdir(exist_ok=True)

In [None]:
import geopandas as gpd

processed = 0

while True:
    gdf = gpd.read_file(GPKG_PATH / GPKG_FILENAME, rows=slice(processed, processed + BATCH_SIZE))

    if gdf.empty:
        print("No more data to process.")
        break

    if gdf.shape[0] + 1 == BATCH_SIZE:
        last_tile_id = gdf.iloc[-1]["tile_id"]
    else:
        last_tile_id = None

    # Save each tile as a separate GeoPackage
    for tile_id in gdf["tile_id"].unique():
        if not "mvp" in tile_id or last_tile_id == tile_id:
            continue 

        tile_gdf = gdf[gdf["tile_id"] == tile_id]

        tile_gdf.to_file(output_dir / f'{tile_id}.gpkg', driver="GPKG")

    processed += len(gdf)

    print(f"Processed up to rowid: {processed}")

Processed up to rowid: 10000000
Processed up to rowid: 20000000
Processed up to rowid: 30000000
Processed up to rowid: 40000000
Processed up to rowid: 50000000
Processed up to rowid: 60000000
Processed up to rowid: 70000000
Processed up to rowid: 80000000
Processed up to rowid: 90000000
Processed up to rowid: 100000000
Processed up to rowid: 102654330
No more data to process.


# Transformar

In [2]:
from pathlib import Path

input_dir = Path("../data/raw")
output_dir = Path("../data/mapped")
output_dir.mkdir(exist_ok=True)

In [3]:
import geopandas as gpd

map_dict = {
    "Other land cover": 0,
    "Cultivated grassland": 1,
    "Natural/semi-natural grassland": 2
}

for gpkg_file in input_dir.glob("*.gpkg"):
    gdf = gpd.read_file(gpkg_file)
    
    if "google_class" in gdf.columns:
        gdf["google_class"] = gdf["google_class"].map(map_dict)
    else:
        print(f"⚠️ Coluna 'google_class' não encontrada em {gpkg_file.name}")
        continue
    
    gdf.to_file(output_dir / f'{gpkg_file.stem}.gpkg', driver="GPKG")

# Rasterizar

In [None]:
# gdal_rasterize -a google_class -l 1-mvp -tr 10 10 -te 3298060 9564778 3299080 9565798 -ot Int16 -a_nodata -1 1-mvp.gpkg 1-mvp.tif

In [1]:
import geopandas as gpd
from pathlib import Path

input_dir = Path("../data/mapped")
output_dir = Path("../data/raster")
output_dir.mkdir(exist_ok=True)

gpkg_files = list(input_dir.glob("*.gpkg"))

In [2]:
from osgeo import gdal

for gpkg in gpkg_files:
    layer = gpkg.stem
    gdf = gpd.read_file(gpkg, layer=layer)
    xmin, ymin, xmax, ymax = gdf.total_bounds

    options = gdal.RasterizeOptions(
        attribute="google_class",
        layers=[layer],
        outputType=gdal.GDT_Int16,
        xRes=10,
        yRes=10,
        noData=-1,
        outputBounds=(xmin, ymin, xmax, ymax)
    )

    gdal.Rasterize(str(output_dir / f"{layer}.tif"), str(gpkg), options=options)



# Warp

In [None]:
# gdalwarp -t_srs EPSG:32635 -te 624600 7192530 625060 7193000 -tr 10 10 -r mode -dstnodata -1 1-mvp.tif 1-mvp-warped.tif

In [2]:
from pathlib import Path

input_dir = Path("../data/raster")
output_dir = Path("../data/warp")
features_dir = Path("../data/feature")
output_dir.mkdir(exist_ok=True)

gpkg_files = list(input_dir.glob("*.tif"))

In [3]:
from osgeo import gdal, osr

def get_output_bounds(filename: str):
    ds = gdal.Open(str(features_dir / f'{filename}.tif'))

    gt = ds.GetGeoTransform()
    xmin = gt[0]
    ymin = gt[3]
    xres = gt[1]
    yres = gt[5]
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    xmax = xmin + xres * xsize
    ymax = ymin + yres * ysize

    return (xmin, ymin, xmax, ymax)

def get_epsg_from_gpkg(filename) -> int:
    ds = gdal.Open(features_dir / f'{filename}.tif')
    proj = ds.GetProjection()
    srs = osr.SpatialReference()
    srs.ImportFromWkt(proj)

    return srs.GetAttrValue("AUTHORITY", 1)

In [4]:
for gpkg in gpkg_files:
    filename = gpkg.stem

    espg = get_epsg_from_gpkg(filename)

    warp_options = gdal.WarpOptions(
        format="GTiff",
        dstSRS=f"EPSG:{espg}",
        outputBounds=get_output_bounds(filename),
        xRes=10, yRes=10,                  
        resampleAlg="mode",                
        dstNodata=-1                       
    )

    gdal.Warp(str(output_dir / f'{filename}.tif'), str(input_dir / f'{filename}.tif'), options=warp_options)



# Mask

In [1]:
from pathlib import Path

warped_dir = Path("../data/warp")
output_dir = Path("../data/mask")
features_dir = Path("../data/feature")
output_dir.mkdir(exist_ok=True)

files = list(warped_dir.glob("*.tif"))

In [2]:
import rasterio
import numpy as np

for file in files:
    with rasterio.open(file) as msrc:
        mask = msrc.read()

        mask = mask[:, ::-1, :]

    with rasterio.open(features_dir / f'{file.stem}.tif') as fsrc:
        data = fsrc.read()
        profile = fsrc.profile.copy()

    nodata = profile.get("nodata")

    masked_data = np.where(mask >= 0, data, nodata)

    data_concat = np.concatenate([masked_data, mask], axis=0)

    profile.update(count=data_concat.shape[0])

    with rasterio.open(output_dir / f'{file.stem}.tif', "w", **profile) as dst:
        dst.write(data_concat)

# Samples

In [3]:
from pathlib import Path

input_dir = Path("../data/mask")
output_dir = Path("../data/samples")
output_dir.mkdir(exist_ok=True)

files = list(input_dir.glob("*.tif"))

In [None]:
import rasterio
import geopandas as gpd
from shapely.geometry import Point

for file in files:
    with rasterio.open(file) as src:
        data = src.read()
        transform = src.transform
        crs = src.crs
        nodata = src.nodata

    bands, rows, cols = data.shape

    records = []
    geoms = []

    for i in range(rows):
        for j in range(cols):
            values = data[:, i, j]

            if any(values == nodata):
                continue

            x, y = transform * (j + 0.5, i + 0.5)
            geoms.append(Point(x, y))

            record = {f"B{b+1}": values[b] for b in range(bands)}
            record["class"] = record.pop("B65")
            record["tile_id"] = file.stem

            records.append(record)

    gdf = gpd.GeoDataFrame(records, geometry=geoms, crs=crs)

    gdf.to_file(output_dir / f'{file.stem}.gpkg', layer=file.stem, driver="GPKG")

# Concat Samples

In [5]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

input_dir = Path("../data/samples")
output_fn = "samples.gpkg"

gpkg_files = list(input_dir.glob("*.gpkg"))

In [6]:
gdfs = []

for file in gpkg_files:
    gdf = gpd.read_file(file)
    gdf = gdf.to_crs("EPSG:4326")
    gdfs.append(gdf)

gdf_all = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)

gdf_all.to_file(output_dir / output_fn, layer="all_points", driver="GPKG")

print(f"Todos os GPKGs concatenados em: {output_fn}")

Todos os GPKGs concatenados em: samples.gpkg


# View

In [2]:
import geopandas as gpd

gdf = gpd.read_file("../data/samples/samples.gpkg")

In [6]:
gdf['tile_id']

0          1-mvp
1          1-mvp
2          1-mvp
3          1-mvp
4          1-mvp
           ...  
8366501    1-mvp
8366502    1-mvp
8366503    1-mvp
8366504    1-mvp
8366505    1-mvp
Name: tile_id, Length: 8366506, dtype: object