Bi-monthly Landsat -> https://browser.stac.dataspace.copernicus.eu/collections/opengeohub-landsat-bimonthly-mosaic-v1.0.1?.language=pt

# Samples

In [1]:
from pathlib import Path

# Raster tiles are read from `input_dir`; derived sample files go to
# `output_dir`, which is created on first use.
input_dir = Path("../data/raster")
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

# glob() yields entries in filesystem-dependent order; sorting keeps the tile
# processing order (and thus the row order of the exported samples) stable
# across runs and machines.
raster_files = sorted(input_dir.glob("*.tif"))

In [2]:
import rasterio
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [3]:
def raster2gpkg(input_dir):
    """Turn every fully valid pixel of one raster tile into a point sample.

    Despite its name, ``input_dir`` is the path of a single raster file; it
    must expose ``.stem`` (e.g. a ``pathlib.Path``), which is stored as the
    ``tile_id`` of every sample.

    The first 64 bands become the feature columns ``B01`` .. ``B64`` (they
    keep the raster's dtype). Band index 64 holds the raw class code, which
    is remapped 0 -> 0, 1 -> 1, 64 -> 2 into the ``class`` column. A pixel is
    dropped when any of its bands equals the raster's nodata value.

    Returns:
        A GeoDataFrame with one point per kept pixel, located at the pixel
        centre and expressed in the raster's CRS, ordered row by row; or
        ``None`` when the tile has no valid pixel.

    Raises:
        KeyError: if the class band holds a value outside the mapping above.
    """
    # Local import: numpy is not imported at notebook level.
    import numpy as np

    n_features = 64
    class_band = 64
    class_map = {0: 0, 1: 1, 64: 2}

    with rasterio.open(input_dir) as src:
        data = src.read()
        transform = src.transform
        crs = src.crs
        nodata = src.nodata

    # Mask of pixels for which no band carries the nodata value.
    if nodata is None:
        valid = np.ones(data.shape[1:], dtype=bool)
    elif np.isnan(nodata):
        # NaN never compares equal to itself, so `== nodata` would miss it.
        valid = ~np.isnan(data).any(axis=0)
    else:
        valid = ~(data == nodata).any(axis=0)

    # nonzero() walks the mask in row-major order, i.e. the same order a
    # nested "for row / for col" loop would produce.
    rows, cols = np.nonzero(valid)
    if rows.size == 0:
        return None

    raw_classes = data[class_band, rows, cols]
    classes = np.full(raw_classes.shape, -1, dtype=np.int64)
    for raw_value, label in class_map.items():
        classes[raw_classes == raw_value] = label

    unmapped = classes < 0
    if unmapped.any():
        bad_values = np.unique(raw_classes[unmapped]).tolist()
        raise KeyError(
            f"unexpected class value(s) {bad_values} in {input_dir}"
        )

    columns = {
        f"B{band + 1:02}": data[band, rows, cols]
        for band in range(n_features)
    }
    columns["class"] = classes
    columns["tile_id"] = input_dir.stem

    # Affine transform applied to pixel centres (col + 0.5, row + 0.5).
    px = cols + 0.5
    py = rows + 0.5
    xs = px * transform.a + py * transform.b + transform.c
    ys = px * transform.d + py * transform.e + transform.f

    return gpd.GeoDataFrame(
        columns,
        geometry=gpd.points_from_xy(xs, ys),
        crs=crs,
    )

In [4]:
def reduce_samples(gdf: gpd.GeoDataFrame, frac: float) -> gpd.GeoDataFrame:
    """Randomly keep a fraction of the rows of each class.

    Classes 0, 1 and 2 are sampled independently, each with
    ``random_state=0``, so the result is reproducible and every class keeps
    roughly the same share of its rows. Rows whose ``class`` is not 0, 1 or 2
    are dropped.

    Args:
        gdf: Samples with a ``class`` column.
        frac: Fraction of each class to keep (e.g. ``0.20``).

    Returns:
        A new GeoDataFrame with a fresh 0..n-1 index and the CRS of ``gdf``;
        it is empty (same columns) when ``gdf`` has no row of class 0, 1 or 2.
    """
    sampled = []

    for value in (0, 1, 2):
        subset = gdf[gdf["class"] == value]

        if not subset.empty:
            sampled.append(subset.sample(frac=frac, random_state=0))

    if not sampled:
        # Nothing to concatenate: hand back an empty frame instead of failing.
        return gdf.iloc[:0].copy()

    return gpd.GeoDataFrame(pd.concat(sampled, ignore_index=True), crs=gdf.crs)

In [5]:
# Build the sample set: extract the valid pixels of every tile, keep 20 % of
# each class per tile, and merge all tiles into one EPSG:4326 point layer.
gdfs = []

for raster_file in raster_files:
    gdf = raster2gpkg(raster_file)

    # raster2gpkg returns None for tiles without any valid pixel.
    if gdf is None:
        continue

    # Subsample first, then reproject: the random draw does not depend on the
    # coordinates, so the result is the same but only the kept 20 % of the
    # points have to be transformed.
    gdf = reduce_samples(gdf, 0.20).to_crs("EPSG:4326")

    gdfs.append(gdf)

if not gdfs:
    raise ValueError("no valid samples were extracted from raster_files")

gdf_all = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs=gdfs[0].crs)

gdf_all.to_file(output_dir / 'samples.gpkg', layer="all_points", driver="GPKG")

gdf_all.to_parquet(output_dir / "samples.pq", index=False)

In [6]:
import pandas as pd

# Sanity check: reload the exported samples and show how many rows each
# class has.
df = pd.read_parquet('../data/samples.pq')

class_counts = df['class'].value_counts()
class_counts

class
0    4474164
2    1290158
1     519889
Name: count, dtype: int64

# Train/Valid

In [10]:
from pathlib import Path

# Folder holding the sample GeoPackage used for the train/validation split.
# NOTE: this rebinds `input_dir`, which the first cell set to "../data/raster".
input_dir = Path("../data/samples")
# File name of the GeoPackage written to `input_dir` once the split is flagged.
output_fn = "samples.gpkg"

In [None]:
import random
import geopandas as gpd

gdf = gpd.read_file("../data/samples/samples.gpkg")

# The split is done per tile, not per sample: whole tiles are drawn for
# validation. Sorting the ids makes the draw independent of the row order
# in the file.
lista = sorted(gdf['tile_id'].unique())

# 20 % of the tiles, but never fewer than one.
qtd_valid = max(1, int(len(lista) * 0.2))

# A dedicated, seeded generator keeps the split reproducible between runs
# without touching the global `random` state.
ids_valid = random.Random(0).sample(lista, qtd_valid)

print("ids_valid:", ids_valid)

ids_valid: ['536-mvp', '153-mvp', '1433-mvp', '796-mvp', '130-mvp', '397-mvp', '1515-mvp', '1447-mvp', '1550-mvp', '148-mvp', '1659-mvp', '1317-mvp', '1836-mvp', '501-mvp', '1519-mvp', '773-mvp', '191-mvp', '1034-mvp', '602-mvp', '1584-mvp', '261-mvp', '1202-mvp', '444-mvp', '282-mvp', '1543-mvp', '871-mvp', '1705-mvp', '757-mvp', '1504-mvp', '396-mvp', '1238-mvp', '392-mvp', '1537-mvp', '1290-mvp', '339-mvp', '213-mvp', '1263-mvp', '745-mvp', '346-mvp', '118-mvp', '1715-mvp', '200-mvp', '421-mvp', '1538-mvp', '589-mvp', '164-mvp', '1465-mvp', '1288-mvp', '1830-mvp', '1594-mvp', '1846-mvp', '333-mvp', '604-mvp', '1143-mvp', '1645-mvp', '883-mvp', '884-mvp', '304-mvp', '1721-mvp', '1380-mvp', '156-mvp', '1432-mvp', '1613-mvp', '293-mvp', '701-mvp', '1027-mvp', '19-mvp', '109-mvp', '1565-mvp', '476-mvp', '580-mvp', '1475-mvp', '1829-mvp', '13-mvp', '778-mvp', '847-mvp', '1451-mvp', '188-mvp', '1384-mvp', '1526-mvp', '1456-mvp', '1474-mvp', '1151-mvp', '927-mvp', '353-mvp', '1013-mvp', '1

In [6]:
# Flag each sample: True when its tile is one of the ids in `ids_valid`,
# False otherwise.
gdf["is_valid"] = gdf["tile_id"].isin(ids_valid)

In [12]:
# Save the flagged samples as layer "all_points" of input_dir / output_fn.
# NOTE(review): with the values assigned in the config cell this resolves to
# "../data/samples/samples.gpkg", the same path the samples were read from —
# confirm that writing back to the source file is intended.
gdf.to_file(input_dir / output_fn, layer="all_points", driver="GPKG")