In [4]:
import rasterio
import geopandas as gpd
from shapely.geometry import box
from rasterio.features import rasterize

# Reference orthomosaic. `raster_file` is kept as a second name for the same path.
raster_path = raster_file = r"/content/drive/MyDrive/Colab Notebooks/mastarbeit/in_data/2024350_Mosaik_RGB.tif"

# Read only the georeferencing metadata of the mosaic; no pixel data is loaded here.
try:
    with rasterio.open(raster_path) as src:
        aoi_bounds_orig = src.bounds
        crs_orig = src.crs
        raster_meta_final = src.meta.copy()  # For the final label raster
        transform_orig = src.transform
        width_orig = src.width
        height_orig = src.height
except Exception as e:
    print(f"Error loading raster: {e}. Exiting.")
    # Re-raise so the cell really stops here. Swallowing the error would let the
    # code below fail with an unrelated NameError, or silently reuse stale
    # variables left in the kernel by an earlier run.
    raise

# Area of interest = full extent of the mosaic, in the mosaic's own CRS.
aoi_polygon_orig = box(*aoi_bounds_orig)
aoi_gdf_orig = gpd.GeoDataFrame(geometry=[aoi_polygon_orig], crs=crs_orig)

# We will clip all data to aoi_polygon_orig AT THE END of vector processing.
# Reproject the AOI to EPSG:31287 and EPSG:31256 and keep the bounding box of each.
# NOTE(review): the original comments described this as a WGS84 conversion whose
# bounds feed ox.features_from_bbox, but neither target CRS is WGS84 (EPSG:4326).
# Confirm which CRS the consumers of these bounding boxes actually expect.
aoi_31287 = aoi_gdf_orig.to_crs(epsg=31287)
aoi_bbox_31287 = aoi_31287.total_bounds
aoi_31256 = aoi_gdf_orig.to_crs(epsg=31256)
aoi_bbox_31256 = aoi_31256.total_bounds


In [5]:
# Load the merged training patches (vector features with a 'class_id' column).
final_gdf = gpd.read_file(
    filename=r"/content/drive/MyDrive/Colab Notebooks/mastarbeit/in_data/merged_patches.gpkg"
)

In [6]:
import numpy as np
# Feature count per class: returns (sorted unique class_ids, matching counts).
np.unique(final_gdf["class_id"].to_numpy(), return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39]),
 array([   515,  67364,   1885,   1103, 293773,   7952,  29845,  37652,
         13670,   1356,  32634, 184033,  13133,    224,    147,    133,
             6,   8309,  30399,  10498, 193051,  17765,  22331,    194,
         12645,   3210,   2366,    659,     13,    225,    622,   1536,
          1998,  50190,  35783,  27974,   7341,    106]))

In [9]:
# Shift every class ID down by one.
# The shift is derived from a preserved copy of the loaded values, so re-running
# this cell does not subtract 1 again (the in-place form `x = x - 1` drifts by one
# on every execution). `class_id_raw` is created once per loaded frame.
if "class_id_raw" not in final_gdf.columns:
    final_gdf["class_id_raw"] = final_gdf["class_id"]
final_gdf["class_id"] = final_gdf["class_id_raw"] - 1
np.unique(final_gdf["class_id"], return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34,
        35, 36, 37, 38], dtype=int64),
 array([   515,  67364,   1885,   1103, 293773,   7952,  29845,  37652,
         13670,   1356,  32634, 184033,  13133,    224,    147,    133,
             6,   8309,  30399,  10498, 193051,  17765,  22331,    194,
         12645,   3210,   2366,    659,     13,    225,    622,   1536,
          1998,  50190,  35783,  27974,   7341,    106]))

In [2]:
!pip install rasterio

Collecting rasterio
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: cligj, click-plugins, affine, rasterio
Successfully installed affine-2.4.0 click-plugins-1.1.1 cligj-0.7.2 rasterio-1.4.3


In [17]:
# Map every distinct class_id (in ascending order) to a consecutive label 1..N.
unique_classes = sorted(final_gdf['class_id'].unique())
class_to_id = {
    int(class_value): label
    for label, class_value in enumerate(unique_classes, start=1)  # labels start at 1
}

In [20]:
# Show the remapped label assigned to each distinct class_id, in ascending class order.
[
    class_to_id[class_value]
    for class_value in np.unique(final_gdf['class_id'])
]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38]

In [10]:
# Lazy stream of (geometry, remapped label) pairs, one per feature.
# This is a one-shot generator: once it has been consumed it is empty, so
# re-create it before every use. `class_to_id` is looked up when the pairs are
# pulled, not when this cell runs.
shapes = (
    (geometry, class_to_id[class_value])
    for geometry, class_value in zip(final_gdf.geometry, final_gdf['class_id'])
)


In [29]:
import rasterio
from rasterio.windows import Window
import geopandas as gpd
import numpy as np
import pandas as pd
from rasterio.features import rasterize
from shapely.geometry import box
from typing import Union # Import Union for type hinting

def vector_to_raster_int8(
    vector_data: Union[gpd.GeoDataFrame, str],
    reference_raster_path: str,
    output_raster_path: str,
    verbose: bool = True,
) -> None:
    """
    Burn the 'class_id' of vector features into a single-band int8 GeoTIFF that
    shares the grid (size, transform, CRS) of a reference raster.

    The output is produced tile by tile (2048 x 2048 pixels) so that only one
    tile is held in memory at a time. Pixels not covered by any feature are
    written as -1, which is also declared as the file's nodata value. A feature
    whose 'class_id' is -1 is therefore indistinguishable from nodata.

    The input GeoDataFrame is never modified: if 'class_id' has to be coerced
    to numeric, the coercion happens on a copy.

    NOTE(review): when 'class_id' is NOT numeric it is converted with
    ``pd.to_numeric`` (unparseable rows are dropped) and then shifted by -1.
    Numeric input is used unshifted. This asymmetry is inherited from the
    original implementation; confirm that it is intended.

    Args:
        vector_data: A GeoDataFrame, or a path (str) readable by
                     ``geopandas.read_file`` (e.g. a GeoPackage).
        reference_raster_path: Path to the raster whose width, height,
                               transform, CRS and profile define the output grid.
        output_raster_path: Destination GeoTIFF. Written as tiled, LZW-compressed
                            int8 with one band.
        verbose: If True (default), print one progress line per tile. Set to
                 False to keep only the summary messages.

    Returns:
        None. The raster is written to ``output_raster_path``.

    Raises:
        TypeError: If vector_data is neither a GeoDataFrame nor a string path.
        ValueError: If the vector data has no 'class_id' column.
        IOError: If the reference raster cannot be opened or read.
    """
    # Function-scope import: these module-level helpers work from a plain affine
    # transform, so no open dataset handle is needed inside the tile loop.
    from rasterio.windows import bounds as compute_window_bounds
    from rasterio.windows import transform as compute_window_transform

    if isinstance(vector_data, str):
        gdf = gpd.read_file(vector_data)
    elif isinstance(vector_data, gpd.GeoDataFrame):
        gdf = vector_data
    else:
        raise TypeError("Input vector_data must be a GeoDataFrame or a file path (str).")

    if 'class_id' not in gdf.columns:
        raise ValueError("Input GeoDataFrame must contain a 'class_id' column.")

    # Coerce non-numeric class ids. `assign` returns a new frame, so a
    # GeoDataFrame handed in by the caller is left untouched.
    if not pd.api.types.is_numeric_dtype(gdf['class_id']):
        numeric_ids = pd.to_numeric(gdf['class_id'], errors='coerce')
        # Rows that failed conversion are NaN (NaN - 1 stays NaN) and are dropped.
        gdf = gdf.assign(class_id=numeric_ids - 1).dropna(subset=['class_id'])

    int8_limits = np.iinfo(np.int8)
    if not (gdf['class_id'] >= int8_limits.min).all() or \
       not (gdf['class_id'] <= int8_limits.max).all():
        print("Warning: 'class_id' values are outside the range of int8. Values will be clamped by rasterio during rasterization.")

    # 1. Read the grid definition from the reference raster. The handle is
    #    closed when the `with` block ends; only the copied values are used later.
    try:
        with rasterio.open(reference_raster_path) as src:
            width = src.width
            height = src.height
            crs = src.crs
            transform = src.transform
            profile = src.profile
    except Exception as e:
        raise IOError(f"Error opening or reading reference raster: {e}") from e

    # 2. Bring the vector data into the raster CRS.
    if gdf.crs != crs:
        print(f"Reprojecting vector data from {gdf.crs} to raster CRS {crs}")
        gdf = gdf.to_crs(crs)
    else:
        print("Vector data already in the correct CRS.")

    # 3. Tiling layout (ceil division so partial edge tiles are included).
    tile_width = 2048
    tile_height = 2048
    num_tiles_x = (width + tile_width - 1) // tile_width
    num_tiles_y = (height + tile_height - 1) // tile_height

    print(f"Processing raster as {num_tiles_y}x{num_tiles_x} tiles of size {tile_width}x{tile_height}")

    # Start from the reference profile, then override what differs for a
    # single-band int8 classification raster.
    out_meta = profile.copy()
    # Drop these two keys if the reference profile carries them, so the
    # reference raster's layout options are not applied to the one-band output.
    out_meta.pop('photometric', None)
    out_meta.pop('interleave', None)
    out_meta.update({
        "driver": "GTiff",
        "dtype": "int8",
        "tiled": True,
        "blockxsize": tile_width,
        "blockysize": tile_height,
        "compress": "LZW",
        "predictor": 2,
        "nodata": -1,  # also used as the fill value for uncovered pixels
        "count": 1
    })
    nodata_value = out_meta['nodata']

    with rasterio.open(output_raster_path, "w", **out_meta) as out_src:
        for row in range(num_tiles_y):
            for col in range(num_tiles_x):
                col_off = col * tile_width
                row_off = row * tile_height
                # Edge tiles are truncated to the raster extent.
                width_tile = min(tile_width, width - col_off)
                height_tile = min(tile_height, height - row_off)

                window = Window(col_off, row_off, width_tile, height_tile)
                if verbose:
                    print(f"    Processing tile ({row}, {col}) with window: {window}")

                # 3.1 Georeferencing of this tile, derived from the reference
                #     transform captured above (the reference dataset itself is
                #     already closed at this point).
                window_transform = compute_window_transform(window, transform)
                window_bbox = box(*compute_window_bounds(window, transform))

                # 3.2 Pre-select candidates via the spatial index, then clip them
                #     to the tile's bounding box.
                candidate_rows = list(gdf.sindex.intersection(window_bbox.bounds))
                clipped_gdf = gdf.iloc[candidate_rows].clip(window_bbox, keep_geom_type=True)

                # 3.3 Discard missing, empty or invalid geometries left after clipping.
                usable = (
                    clipped_gdf.geometry.notna()
                    & ~clipped_gdf.geometry.is_empty
                    & clipped_gdf.geometry.is_valid
                )
                clipped_gdf = clipped_gdf[usable]

                geometries_and_values = [
                    (geom, int(value))
                    for geom, value in zip(clipped_gdf.geometry, clipped_gdf['class_id'])
                ]

                if not geometries_and_values:
                    # Nothing to burn: the whole tile is nodata.
                    rasterized_tile = np.full((height_tile, width_tile), nodata_value, dtype='int8')
                else:
                    # 3.4 Burn the class ids; uncovered pixels receive the nodata value.
                    rasterized_tile = rasterize(
                        geometries_and_values,
                        out_shape=(height_tile, width_tile),
                        transform=window_transform,
                        dtype='int8',
                        fill=nodata_value
                    )

                # 4. Write the tile into band 1 at its window position.
                out_src.write(rasterized_tile, 1, window=window)

    print("Finished processing all tiles and writing to output raster.")

In [30]:
# Rasterize the merged training patches onto the grid of the RGB mosaic.
vector_to_raster_int8(
    vector_data=r"/content/drive/MyDrive/Colab Notebooks/mastarbeit/in_data/merged_patches.gpkg",
    reference_raster_path=r"/content/drive/MyDrive/Colab Notebooks/mastarbeit/in_data/2024350_Mosaik_RGB.tif",
    output_raster_path=r"/content/drive/MyDrive/Colab Notebooks/mastarbeit/out_data/final_lulc_train_remapped_2.tif",
)

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
    Processing tile (128, 97) with window: Window(col_off=198656, row_off=262144, width=2048, height=2048)
    Processing tile (128, 98) with window: Window(col_off=200704, row_off=262144, width=2048, height=2048)
    Processing tile (128, 99) with window: Window(col_off=202752, row_off=262144, width=2048, height=2048)
    Processing tile (128, 100) with window: Window(col_off=204800, row_off=262144, width=2048, height=2048)
    Processing tile (128, 101) with window: Window(col_off=206848, row_off=262144, width=2048, height=2048)
    Processing tile (128, 102) with window: Window(col_off=208896, row_off=262144, width=2048, height=2048)
    Processing tile (128, 103) with window: Window(col_off=210944, row_off=262144, width=2048, height=2048)
    Processing tile (128, 104) with window: Window(col_off=212992, row_off=262144, width=2048, height=2048)
    Processing tile (128, 105) with window: Window(col_off

In [27]:
# List the output directory in long format, sorted by modification time with the
# newest entry last (-l long, -t by time, -r reversed).
!ls -lrt "/content/drive/MyDrive/Colab Notebooks/mastarbeit/out_data/"

total 2063839
-rw------- 1 root root   58314752 Jun  9 20:21 final_lulc_data_subregion_01.gpkg
-rw------- 1 root root   87146496 Jun  9 22:19 final_lulc_data_subregion_02.gpkg
-rw------- 1 root root   64901120 Jun  9 23:23 final_lulc_data_subregion_03.gpkg
-rw------- 1 root root   41951232 Jun  9 23:41 final_lulc_data_subregion_04.gpkg
-rw------- 1 root root   60850176 Jun 10 00:24 final_lulc_data_subregion_05.gpkg
-rw------- 1 root root   86405120 Jun 10 00:55 final_lulc_data_subregion_06.gpkg
-rw------- 1 root root   39956480 Jun 10 01:10 final_lulc_data_subregion_07.gpkg
-rw------- 1 root root   54071296 Jun 10 01:38 final_lulc_data_subregion_08.gpkg
-rw------- 1 root root   60436480 Jun 10 01:55 final_lulc_data_subregion_09.gpkg
-rw------- 1 root root 1559337937 Jun 10 11:58 final_lulc_train.tif
