Bi-monthly Landsat -> https://browser.stac.dataspace.copernicus.eu/collections/opengeohub-landsat-bimonthly-mosaic-v1.0.1?.language=pt

# Samples

In [1]:
from pathlib import Path

# Folder holding the source rasters and folder receiving the parquet output.
input_dir = Path("../data/raster")
output_dir = Path("../data/parquet")
output_dir.mkdir(exist_ok=True)

# Every GeoTIFF found under the "google" sub-folder, followed by every one
# found under "bing".
raster_files = [
    tif_path
    for source in ("google", "bing")
    for tif_path in (input_dir / source).glob("*.tif")
]

In [2]:
import rasterio
import pandas as pd
import geopandas as gpd

from tqdm import tqdm
from shapely.geometry import Point

In [3]:
def raster2gpkg(input_dir, n_features=64):
    """Convert every valid pixel of a raster file into a point feature.

    A pixel is valid when none of its bands equals the raster's nodata value.
    Each valid pixel becomes one row holding the pixel-centre point, the first
    ``n_features`` band values (columns ``B01`` ... ``B{n_features:02}``), the
    value of the band that follows them as ``class`` and the file stem as
    ``tile_id``.

    Parameters
    ----------
    input_dir : pathlib.Path
        Path of the raster *file* to read (the parameter name is kept for
        backward compatibility with existing callers).
    n_features : int, default 64
        Number of leading bands exported as feature columns; the band at
        index ``n_features`` is exported as ``class``.

    Returns
    -------
    geopandas.GeoDataFrame or None
        One row per valid pixel, in row-major pixel order and in the raster's
        CRS, or ``None`` when the raster contains no valid pixel.

    Raises
    ------
    IndexError
        If the raster has valid pixels but fewer than ``n_features + 1`` bands.
    """
    import numpy as np

    with rasterio.open(input_dir) as src:
        data = src.read()
        transform = src.transform
        crs = src.crs
        nodata = src.nodata

    # Flag a pixel as invalid as soon as any of its bands holds nodata.
    if nodata is None:
        invalid = np.zeros(data.shape[1:], dtype=bool)
    elif np.isnan(nodata):
        # NaN never compares equal to anything (itself included), so an
        # equality test would silently keep every nodata pixel.
        invalid = np.isnan(data).any(axis=0)
    else:
        invalid = (data == nodata).any(axis=0)

    # np.nonzero walks the mask in row-major order, i.e. the same order as a
    # nested ``for row: for col:`` loop.
    row_idx, col_idx = np.nonzero(~invalid)
    if row_idx.size == 0:
        return None

    pixels = data[:, row_idx, col_idx]  # shape: (bands, n_valid_pixels)

    # Pixel centres (+0.5) mapped through the affine transform:
    #   x = a * col + b * row + c,   y = d * col + e * row + f
    px = col_idx + 0.5
    py = row_idx + 0.5
    xs = px * transform.a + py * transform.b + transform.c
    ys = px * transform.d + py * transform.e + transform.f

    columns = {f"B{b+1:02}": pixels[b] for b in range(n_features)}
    columns["class"] = pixels[n_features]
    columns["tile_id"] = input_dir.stem  # scalar, broadcast to every row

    return gpd.GeoDataFrame(columns, geometry=gpd.points_from_xy(xs, ys), crs=crs)

In [4]:
def reduce_samples(gdf: gpd.GeoDataFrame, frac: float, classes=range(4), random_state: int = 0):
    """Randomly keep a fraction of the rows of each class.

    Rows are split by the value of the ``class`` column; every non-empty
    class listed in ``classes`` is sampled independently with the same
    ``frac`` and ``random_state``, and the samples are concatenated in the
    order given by ``classes``. Rows whose class is not listed are dropped.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Input samples; must contain a ``class`` column.
    frac : float
        Fraction of rows to keep per class, forwarded to ``DataFrame.sample``.
    classes : iterable, default ``range(4)``
        Class values to keep.
    random_state : int, default 0
        Seed forwarded to ``DataFrame.sample`` for every class.

    Returns
    -------
    geopandas.GeoDataFrame
        The sampled rows with a fresh 0..n-1 index and the CRS of ``gdf``.
        When no row belongs to any of ``classes`` an empty frame with the
        same columns and CRS is returned instead of raising.
    """
    sampled = []

    for value in classes:
        subset = gdf[gdf["class"] == value]

        if not subset.empty:
            # ``sample`` already returns a new object, no extra copy needed.
            sampled.append(subset.sample(frac=frac, random_state=random_state))

    if not sampled:
        # pd.concat([]) raises ValueError; hand back an empty frame that
        # still carries the input schema and CRS.
        return gdf.iloc[0:0].copy()

    return gpd.GeoDataFrame(pd.concat(sampled, ignore_index=True), crs=gdf.crs)

In [None]:
# Convert each raster tile to points, keep 20 % of the pixels of every class
# and write the result to its own parquet file. Output files are numbered
# consecutively; tiles without any valid pixel are skipped and do not consume
# a number.
counter = 0

for raster_file in tqdm(raster_files):
    gdf = raster2gpkg(raster_file)

    if gdf is None:
        continue

    # Sample first, reproject afterwards: the selection depends only on the
    # ``class`` column and row positions, so the output is unchanged while
    # only the kept rows have to be reprojected.
    gdf = reduce_samples(gdf, 0.2)
    gdf = gdf.to_crs("EPSG:4326")

    gdf.to_parquet(output_dir / f"{counter}.pq", index=False)

    counter += 1


100%|██████████| 8098/8098 [43:45<00:00,  3.08it/s]


# Concat

In [1]:
from pathlib import Path

# Folder with the per-tile parquet files and folder receiving the merged file.
input_dir = Path("../data/parquet")
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

# Every ``*.pq`` file directly inside the input folder.
parquet_files = [*input_dir.glob("*.pq")]

In [2]:
import pyarrow.parquet as pq
import pyarrow as pa

from tqdm import tqdm

# Stream every per-tile parquet file, batch by batch, into one
# ``samples.pq`` file so the whole dataset never has to fit in memory.
writer = None

try:
    for parquet_file in tqdm(parquet_files):
        reader = pq.ParquetFile(parquet_file)

        for batch in reader.iter_batches():
            table = pa.Table.from_batches([batch])

            # The writer is created lazily so that it can take its schema
            # from the first batch that is read.
            if writer is None:
                writer = pq.ParquetWriter(output_dir / "samples.pq", table.schema)

            writer.write_table(table)
finally:
    # Always close the writer, even when a read or write fails: the parquet
    # footer is only written on close.
    if writer is not None:
        writer.close()

100%|██████████| 8070/8070 [01:15<00:00, 106.27it/s]


# View

In [3]:
import pandas as pd

# Load the merged sample table and display the number of rows per class.
df = pd.read_parquet("../data/samples.pq")

df.loc[:, "class"].value_counts()

class
0.0    3463847
2.0    1130850
1.0     482438
3.0     249533
Name: count, dtype: int64