Bi-monthly Landsat mosaic (source collection): https://browser.stac.dataspace.copernicus.eu/collections/opengeohub-landsat-bimonthly-mosaic-v1.0.1?.language=pt

# Samples

In [2]:
from pathlib import Path

# Raster tiles are read from the `google` and `bing` sub-folders of the input
# directory; one parquet file per tile is written to the output directory.
input_dir = Path("../data/raster")
output_dir = Path("../data/parquet")
# parents=True so a fresh checkout without the intermediate folders still works.
output_dir.mkdir(parents=True, exist_ok=True)

# Path.glob yields entries in no guaranteed order. Sorting each listing keeps
# the tile order (and therefore the numbering of the files written from it)
# identical from one run to the next. `google` tiles still come before `bing`.
raster_files = sorted((input_dir / "google").glob("*.tif")) + sorted((input_dir / "bing").glob("*.tif"))

In [3]:
import rasterio
import pandas as pd

from tqdm import tqdm

In [None]:
def raster2gpkg(input_dir, n_features=64):
    """Turn one raster tile into a table with one row per valid pixel.

    Despite its name the function does not write a GeoPackage: it only builds
    and returns a DataFrame (the caller decides how to store it).

    A pixel is kept only when none of its bands equals the raster's nodata
    value. A NaN nodata value is matched with ``isnan`` because ``NaN == NaN``
    is always false; a raster without a declared nodata value keeps every
    pixel.

    Parameters
    ----------
    input_dir : pathlib.Path
        Path of the raster *file* to read (the parameter name is kept for
        backward compatibility). Its ``stem`` becomes the ``tile_id`` column.
    n_features : int, default 64
        Number of leading bands exported as ``B01`` ... ``B<n_features>``.
        The band immediately after them is exported as ``class``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``B01..``, ``x``, ``y``, ``class`` and ``tile_id``, with rows
        in row-major pixel order. ``x``/``y`` are the pixel-centre coordinates
        given by the raster's affine transform. ``None`` when the tile has no
        valid pixel.

    Raises
    ------
    IndexError
        If the tile has valid pixels but fewer than ``n_features + 1`` bands.
    """
    # Local import keeps this definition self-contained; numpy is not imported
    # at notebook level.
    import numpy as np

    with rasterio.open(input_dir) as src:
        data = src.read()
        nodata = src.nodata
        transform = src.transform

    # Per-pixel mask: True where at least one band holds the nodata value.
    if nodata is None:
        invalid = np.zeros(data.shape[1:], dtype=bool)
    elif np.isnan(nodata):
        invalid = np.isnan(data).any(axis=0)
    else:
        invalid = (data == nodata).any(axis=0)

    # nonzero walks the mask in row-major order, i.e. the same order as a
    # nested ``for row: for col:`` loop.
    rows, cols = np.nonzero(~invalid)

    if rows.size == 0:
        return None

    # Shape (bands, n_valid_pixels).
    pixels = data[:, rows, cols]

    # Keep 64-bit band columns, as the earlier record-by-record construction
    # produced, so the parquet schema of the output does not change.
    if pixels.dtype.kind == "f":
        pixels = pixels.astype(np.float64)
    elif pixels.dtype.kind in "iu" and pixels.dtype.itemsize < 8:
        pixels = pixels.astype(np.int64)

    # +0.5 moves from the pixel's upper-left corner to its centre.
    x, y = transform * (cols + 0.5, rows + 0.5)

    columns = {f"B{b + 1:02}": pixels[b] for b in range(n_features)}
    columns["x"] = x
    columns["y"] = y
    columns["class"] = pixels[n_features]
    columns["tile_id"] = input_dir.stem

    return pd.DataFrame(columns)

In [5]:
# Running number used to name the per-tile parquet files; it only advances
# when a file is actually written.
counter = 0

for raster_file in tqdm(raster_files):
    df = raster2gpkg(raster_file)

    # raster2gpkg returns None for tiles without any valid pixel: skip them.
    if df is None:
        continue

    df.to_parquet(output_dir / f"{counter}.pq", index=False)
    counter += 1

100%|██████████| 8098/8098 [36:05<00:00,  3.74it/s]


# Concat

In [6]:
from pathlib import Path

# Per-tile parquet files are read from input_dir; the concatenated table is
# written to output_dir.
input_dir = Path("../data/parquet")
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

# Path.glob yields entries in no guaranteed order; sorting makes the row order
# of the concatenated file reproducible. The sort is lexicographic on the
# path, so "10.pq" comes before "2.pq"; only stability matters here.
parquet_files = sorted(input_dir.glob("*.pq"))

In [7]:
import pyarrow.parquet as pq
import pyarrow as pa

from tqdm import tqdm

# Stream every per-tile parquet file, batch by batch, into a single
# "samples.pq" so the whole dataset never has to be held in memory at once.
# The writer is created lazily because its schema is taken from the first
# batch that is read.
writer = None

try:
    for parquet_file in tqdm(parquet_files):
        reader = pq.ParquetFile(parquet_file)

        for batch in reader.iter_batches():
            table = pa.Table.from_batches([batch])

            if writer is None:
                writer = pq.ParquetWriter(output_dir / "samples.pq", table.schema)

            writer.write_table(table)
finally:
    # Always release the output file, even when a read or write fails
    # part-way through. `writer` stays None when there was nothing to read.
    if writer is not None:
        writer.close()

100%|██████████| 8070/8070 [02:22<00:00, 56.72it/s]


# GroupKFold

In [None]:
from pathlib import Path

# The fold files are written to the same folder the concatenated samples are
# read from.
input_dir = output_dir = Path("../data")

In [None]:
import joblib
import pandas as pd

from sklearn.model_selection import GroupKFold

N_SPLITS = 5
# One seed for both the fold assignment and the training subsample, so the
# written files are reproducible from run to run.
SEED = 0
# Fraction of each fold's training rows that is kept.
TRAIN_FRACTION = 0.2

samples = pd.read_parquet(input_dir / "samples.pq")

covariates = [f"B{b:02}" for b in range(1, 65)]

groupKFold = GroupKFold(N_SPLITS, shuffle=True, random_state=SEED)

# Grouping by tile_id keeps all pixels of one tile on the same side of a split.
folds = groupKFold.split(samples[covariates], samples["class"], samples["tile_id"])

# Folds are numbered from 1 in the output file names.
for fold, (train_idx, test_idx) in enumerate(folds, start=1):
    train_samples = samples.iloc[train_idx].sample(frac=TRAIN_FRACTION, random_state=SEED)
    train_samples.to_parquet(output_dir / f"{fold:02}_samples_train.pq", index=False)

    test_samples = samples.iloc[test_idx]
    test_samples.to_parquet(output_dir / f"{fold:02}_samples_test.pq", index=False)

class
0.0    3463847
2.0    1130850
1.0     482438
3.0     249533
Name: count, dtype: int64

In [1]:
# Scratch check of the zero-padded two-digit format spec (`:02`), presumably
# the one used for the fold numbers in the output file names.
f"{1:02}"

'01'