<a href="https://colab.research.google.com/github/GeoKauko/TheNavySeals/blob/main/1_Preprocessing_integrated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Import necessary libraries

## Import libraries
import os
from osgeo import gdal
!pip install rasterio
import rasterio
from rasterio import windows
from rasterio.windows import Window

from rasterio.plot import reshape_as_image
import rasterio.mask
from rasterio.features import rasterize

import pandas as pd
import geopandas as gpd
from shapely.geometry import mapping, Point, box, Polygon
from shapely.ops import cascaded_union

import numpy as np

from rasterio.features import geometry_mask
from shapely.geometry import box
from rasterio.features import geometry_mask

import shutil
from sklearn.model_selection import train_test_split

from glob import glob



In [24]:
## GOOGLE COLAB USERS ONLY
## Mount Google Drive for data retrieval.
## Outside Colab the `google.colab` import raises ModuleNotFoundError; skip this cell and run the local cell instead.

from google.colab import drive
drive.mount('/content/drive')

import os

# Prefix that the path-definition cell joins onto the data directories
project_path = '/content/drive/My Drive/TheNavySeals/'

ModuleNotFoundError: No module named 'google'

In [2]:
## LOCAL USERS ONLY
## Change the path to your project directory

# Raw string: '\E' and '\S' are not valid escape sequences, so a plain literal
# only works by accident and triggers an invalid-escape warning on newer Pythons.
os.chdir(r'D:\E_2024_P6\SEAL')

# Empty prefix: the data paths defined below are relative to the working directory set above
project_path = ''

In [56]:
## Define paths and create directories
## Every directory below is created on the spot (exist_ok=True), so re-running this cell is harmless.

# Root input/output folders of the preprocessing step
input_path = os.path.join(project_path, 'data/1_preprocessing/input')
os.makedirs(input_path, exist_ok=True)
output_path =  os.path.join(project_path, 'data/1_preprocessing/output')
os.makedirs(output_path, exist_ok=True)

# Panchromatic scene and its 8-bit ("reduced") copy
panchromatic_path = os.path.join(project_path, 'data/1_preprocessing/input/panchromatic')
os.makedirs(panchromatic_path, exist_ok=True)
panchromatic = os.path.join(panchromatic_path, '22MAR25134903-P3DS-014983717010_01_P001.tif')
panchromatic_reduced_path = os.path.join(project_path, 'data/1_preprocessing/input/panchromatic_reduced')
os.makedirs(panchromatic_reduced_path, exist_ok=True)
panchromatic_reduced = os.path.join(panchromatic_reduced_path, 'panchromatic_reduced.tif')

# Pansharpened rasters, their 8-bit copies and the mosaic file written next to those copies
pansharpened_path = os.path.join(project_path, 'data/1_preprocessing/input/pansharpened')
os.makedirs(pansharpened_path, exist_ok=True)
pansharpened_reduced_path = os.path.join(project_path, 'data/1_preprocessing/input/pansharpened_reduced')
os.makedirs(pansharpened_reduced_path, exist_ok=True)
pansharpened_reduced = os.path.join(pansharpened_reduced_path, 'pansharpened_reduced.tif')

# NOTE: this is a directory, not a single .shp file; it is passed as-is to gpd.read_file later on
shapefile_path  = os.path.join(project_path, 'data/1_preprocessing/input/shapefiles')
os.makedirs(shapefile_path, exist_ok=True)

# Tile ("parts") folders filled by split_and_save_raster
panchromatic_parts_path = os.path.join(project_path, 'data/1_preprocessing/input/panchromatic_parts')
os.makedirs(panchromatic_parts_path, exist_ok=True)
pansharpened_parts_path = os.path.join(project_path, 'data/1_preprocessing/input/pansharpened_parts')
os.makedirs(pansharpened_parts_path, exist_ok=True)
mask_parts_path = os.path.join(project_path, 'data/1_preprocessing/input/mask_parts')
os.makedirs(mask_parts_path, exist_ok=True)

# Per-tile statistics CSV and the full-size mask raster
csv_path = os.path.join(input_path, 'points_within_images.csv')
mask_path = os.path.join(input_path, 'raster_mask.tif')

# Output folders that organize_images fills with class subfolders
output_panchromatic= os.path.join(output_path, 'panchromatic')
os.makedirs(output_panchromatic, exist_ok=True)
output_pansharpened = os.path.join(output_path, 'pansharpened')
os.makedirs(output_pansharpened, exist_ok=True)
output_mask = os.path.join(output_path, 'mask')
os.makedirs(output_mask, exist_ok=True)

# Define paths for deep learning input.
# Anchored at project_path like the preprocessing directories, so that a
# non-empty project_path (e.g. the Colab Drive mount) is honoured here too.
input_path_deeplearning = os.path.join(project_path, 'data/2_deep_learning')
panchromatic_path_dl = os.path.join(input_path_deeplearning, 'panchromatic')
pansharpened_path_dl = os.path.join(input_path_deeplearning, 'pansharpened')

# Create the <dataset>/<train|val|test>/<images|masks> directory tree
for path in [panchromatic_path_dl, pansharpened_path_dl]:
    for subset in ('train', 'val', 'test'):
        for content in ('images', 'masks'):
            os.makedirs(os.path.join(path, subset, content), exist_ok=True)

In [4]:
## Reduce the radiometric resolution of a raster to 8 bits

def reduce_radiometric_resolution(input_path, output_path, input_res=11):
    '''
    Reduce the radiometric resolution of the input raster to 8 bits and save the output raster.

    Each band is rescaled linearly from [0, 2**input_res - 1] to [0, 255] and
    truncated to uint8. Values above the nominal input range are clipped to 255
    instead of wrapping around, and masked (nodata) pixels are written as 0.

    Args:
    - input_path (string): Path to the input raster.
    - output_path (string): Path to the output raster.
    - input_res (int): Radiometric resolution of the input raster in bits.

    Raises:
    - ValueError: If input_res is smaller than 1.
    '''
    if input_res < 1:
        raise ValueError("input_res must be a positive number of bits.")

    # Ensure the output directory exists; dirname is '' when output_path is a
    # bare file name, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    max_input_value = 2**input_res - 1

    with rasterio.open(input_path) as src:
        # Create a new raster profile with 8-bit pixel depth
        profile = src.profile
        profile.update(dtype=rasterio.uint8, count=src.count)
        if src.nodata is not None:
            # The source nodata value may not fit into uint8; masked pixels are
            # written as 0 below, so declare 0 as the nodata value.
            profile.update(nodata=0)

        with rasterio.open(output_path, 'w', **profile) as dst:
            # Process and write one band at a time to keep peak memory low
            for band in range(1, src.count + 1):
                image_array = src.read(band, masked=True)

                # Rescale the pixel values to fit within 8-bit range (0-255)
                scaled = image_array.astype(np.float64) / max_input_value * 255

                # Fill masked pixels and clip out-of-range values before the cast,
                # otherwise the uint8 conversion silently wraps around
                scaled = np.clip(scaled.filled(0), 0, 255)

                dst.write(scaled.astype(np.uint8), band)

def resize_rasters_in_folder(input_folder, output_folder):
    '''
    Convert every file located directly inside input_folder to an 8-bit raster.

    Each regular file is handed to reduce_radiometric_resolution (using its default
    input resolution) and written to output_folder under the same file name.
    Subdirectories are skipped.

    Args:
    - input_folder (string): Path to the input folder.
    - output_folder (string): Path to the output folder.
    '''
    candidates = [(name, os.path.join(input_folder, name)) for name in os.listdir(input_folder)]

    for name, source in candidates:
        # Anything that is not a regular file (e.g. a subdirectory) is ignored
        if not os.path.isfile(source):
            continue
        reduce_radiometric_resolution(source, os.path.join(output_folder, name))

# Convert the inputs to 8 bits: first every file in the pansharpened folder, then the single panchromatic scene
resize_rasters_in_folder(pansharpened_path, pansharpened_reduced_path)
reduce_radiometric_resolution(panchromatic, panchromatic_reduced)

In [5]:
## Mosaic pansharpened images

def mosaic_rasters(input_folder, output_path):
    """
    Mosaic all GeoTIFF files of a folder into a single raster.

    Args:
    - input_folder (str): Folder containing the rasters to merge.
    - output_path (str): Path of the mosaic raster to write.

    Raises:
    - FileNotFoundError: If the folder holds no usable .tif/.tiff file, or none of them can be opened.
    """
    # The mosaic may be written into input_folder itself; a mosaic left over from
    # a previous run must never be fed back in as an input.
    output_key = os.path.normcase(os.path.abspath(output_path))

    # Collect the input rasters in a deterministic (sorted) order
    input_files = []
    for file_name in sorted(os.listdir(input_folder)):
        # Case-insensitive match: '.TIF' and '.tif' are the same format
        if not file_name.lower().endswith(('.tif', '.tiff')):
            continue
        file_path = os.path.join(input_folder, file_name)
        if os.path.normcase(os.path.abspath(file_path)) == output_key:
            continue
        print(file_name)
        input_files.append(file_path)

    # Check if we have any input files
    if not input_files:
        raise FileNotFoundError("No .tif files found in the specified folder.")

    # Open the input files
    src_files_to_mosaic = []
    for file in input_files:
        src = gdal.Open(file)
        if src:
            src_files_to_mosaic.append(src)
        else:
            print(f"Failed to open {file}")

    if not src_files_to_mosaic:
        raise FileNotFoundError("None of the .tif files in the specified folder could be opened.")

    # Build the virtual raster in memory (empty destination name) so that no
    # 'temporary.vrt' file is left behind in the working directory
    vrt = gdal.BuildVRT('', src_files_to_mosaic)

    # Write the virtual raster to a new file
    mosaic = gdal.Translate(output_path, vrt)

    # Cleanup: GDAL closes (and flushes) a dataset once its last reference is
    # dropped, so release the output first, then the VRT, then the inputs.
    mosaic = None
    vrt = None
    src = None
    src_files_to_mosaic.clear()

    print(f"Mosaic raster saved as {output_path}")

# Merge the rasters of the 8-bit pansharpened folder into one mosaic file (written into that same folder)
mosaic_rasters(pansharpened_reduced_path, pansharpened_reduced)

22MAR25134903-S3DS_R1C1-014983717040_01_P001.TIF
22MAR25134903-S3DS_R1C2-014983717040_01_P001.TIF
22MAR25134903-S3DS_R1C3-014983717040_01_P001.TIF
22MAR25134903-S3DS_R2C1-014983717040_01_P001.TIF
22MAR25134903-S3DS_R2C2-014983717040_01_P001.TIF
22MAR25134903-S3DS_R2C3-014983717040_01_P001.TIF
22MAR25134903-S3DS_R3C1-014983717040_01_P001.TIF
22MAR25134903-S3DS_R3C2-014983717040_01_P001.TIF
22MAR25134903-S3DS_R3C3-014983717040_01_P001.TIF
Mosaic raster saved as data/input/pansharpened_reduced\pansharpened_reduced.tif


In [19]:
## Mask raster

def create_polygon_from_pixels(row, col, transform):
    """
    Build the square footprint of the 5x5 pixel block centred on pixel (row, col).

    Args:
    - row (int): Row index of the centre pixel.
    - col (int): Column index of the centre pixel.
    - transform (sequence): Geotransform whose first six elements are indexed as
      (origin x, x step per column, x step per row, origin y, y step per column, y step per row).

    Returns:
    - Polygon: Closed ring starting at the top-left corner of the block's top-left pixel.
      The sides are drawn axis-aligned using the absolute pixel width and height.
    """
    x0, dx_col, dx_row, y0, dy_col, dy_row = (transform[i] for i in range(6))

    # The block starts two pixels up and two pixels left of the centre pixel
    first_col = col - 2
    first_row = row - 2

    # Top-left corner of the top-left pixel
    left = x0 + first_col * dx_col + first_row * dx_row
    top = y0 + first_col * dy_col + first_row * dy_row

    # Opposite corner, five pixel sizes to the right and five down
    right = left + 5 * abs(dx_col)
    bottom = top - 5 * abs(dy_row)

    ring = [(left, top), (right, top), (right, bottom), (left, bottom), (left, top)]
    return Polygon(ring)

def raster_points_to_polygons(raster_path, shapefile_path):
    """
    Turn every point of a shapefile into the 5x5 pixel square of the raster cell it falls in.

    The point coordinates are used as-is, i.e. they are expected to be expressed in
    the raster's coordinate system; the result carries the shapefile's CRS.

    Args:
    - raster_path (str): Path to the raster whose pixel grid is used.
    - shapefile_path (str): Path to the point shapefile.

    Returns:
    - GeoDataFrame: One square polygon per non-empty point geometry.

    Raises:
    - FileNotFoundError: If the raster cannot be opened.
    """
    # Read raster data
    raster_dataset = gdal.Open(raster_path)
    if raster_dataset is None:
        raise FileNotFoundError(f"Could not open raster: {raster_path}")
    raster_geotransform = raster_dataset.GetGeoTransform()
    # Only the geotransform is needed; release the GDAL handle right away
    raster_dataset = None

    # Read shapefile
    shapefile_gdf = gpd.read_file(shapefile_path)

    polygons = []
    for point in shapefile_gdf.geometry:
        # Null/empty geometries have no coordinates to convert
        if point is None or point.is_empty:
            continue

        # Convert point coordinates to raster coordinates. floor (not int
        # truncation) keeps points left of / above the origin out of pixel 0.
        col = int(np.floor((point.x - raster_geotransform[0]) / raster_geotransform[1]))
        row = int(np.floor((point.y - raster_geotransform[3]) / raster_geotransform[5]))

        # Create polygon around the pixel and its 24 surrounding pixels
        polygons.append(create_polygon_from_pixels(row, col, raster_geotransform))

    return gpd.GeoDataFrame(geometry=polygons, crs=shapefile_gdf.crs)

def mask_raster_with_polygon(input_raster_path, polygons, output_raster_path, value=1):
    """
    Write a single-band mask raster on the grid of the input raster: 0 everywhere,
    and a specified value for all pixels underneath the polygons.

    Args:
    - input_raster_path (str): Path to the input raster (only its grid and metadata are used).
    - polygons (gdf): GeoDataFrame with the polygons of the mask.
    - output_raster_path (str): Path to save the masked raster.
    - value (int, optional): Value to set for pixels underneath polygons. Defaults to 1.
    """
    # Collect everything needed from the source while it is still open
    with rasterio.open(input_raster_path) as src:
        meta = src.meta.copy()
        out_shape = (src.height, src.width)
        transform = src.transform

    # Only one band is written below, whatever the band count of the source
    meta.update(count=1)

    # Start from an all-zero raster; the source pixel values are never needed
    raster_data = np.zeros(out_shape, dtype=meta['dtype'])

    # geometry_mask rejects an empty geometry list, so only burn when there is something to burn
    geometries = [geom for geom in polygons.geometry if geom is not None and not geom.is_empty]
    if geometries:
        # invert=True marks pixels inside the polygons as True
        mask = geometry_mask(geometries, out_shape=out_shape, transform=transform, invert=True)

        # Set pixels underneath polygons to the specified value
        raster_data[mask] = value

    # Save the masked raster
    with rasterio.open(output_raster_path, 'w', **meta) as dst:
        dst.write(raster_data, 1)

# Build the mask raster: one 5x5 pixel square per shapefile point, burned onto the grid of the 8-bit panchromatic raster
mask_raster_with_polygon(panchromatic_reduced, raster_points_to_polygons(panchromatic_reduced, shapefile_path), mask_path)

In [23]:
## Tile raster and mask to 224x224 px

def split_and_save_raster(input_raster_path, part_width, part_height, output_folder):
    '''
    Split a raster into multiple tiles of length part_width and height part_height, and save them in output_folder.

    Tiles are named part_<i>_<j>.tif, with i the tile column and j the tile row.
    Only complete tiles are written: leftover pixels at the right and bottom edges are dropped.
    Each tile keeps the pixel data type of the input raster.

    Args:
    - input_raster_path: path to the input raster.
    - part_width (int): Width of each tile.
    - part_height (int): Height of each tile.
    - output_folder (str): Directory to save the rasters.

    Raises:
    - FileNotFoundError: If the input raster cannot be opened.
    '''
    # Open the raster
    dataset = gdal.Open(input_raster_path)
    if dataset is None:
        raise FileNotFoundError(f"Could not open raster: {input_raster_path}")

    os.makedirs(output_folder, exist_ok=True)

    # Calculate the number of complete parts along each axis
    num_parts_x = dataset.RasterXSize // part_width
    num_parts_y = dataset.RasterYSize // part_height

    # Get the number of bands
    bands = dataset.RasterCount

    # Values that are identical for every tile are looked up once
    driver = gdal.GetDriverByName('GTiff')
    projection = dataset.GetProjection()
    base_transform = dataset.GetGeoTransform()
    # Write tiles with the source pixel type instead of a hard-coded one, so
    # 8-bit input stays 8-bit and wider types are not truncated
    data_type = dataset.GetRasterBand(1).DataType

    # Split the raster and save
    for i in range(num_parts_x):
        for j in range(num_parts_y):
            x_offset = i * part_width
            y_offset = j * part_height

            # Read the split region
            part = dataset.ReadAsArray(x_offset, y_offset, part_width, part_height)

            # Single-band reads come back 2D; add a band axis so indexing is uniform
            if bands == 1:
                part = np.expand_dims(part, axis=0)

            # Create a new GDAL dataset to save the split part
            output_path = os.path.join(output_folder, f'part_{i}_{j}.tif')
            out_dataset = driver.Create(output_path, part_width, part_height, bands, data_type)

            # Write data to the new dataset
            for band in range(bands):
                out_dataset.GetRasterBand(band + 1).WriteArray(part[band])

            # Shift the origin to the tile's top-left corner (rotation terms
            # included; they are zero for north-up rasters)
            geo_transform = list(base_transform)
            geo_transform[0] += x_offset * base_transform[1] + y_offset * base_transform[2]
            geo_transform[3] += x_offset * base_transform[4] + y_offset * base_transform[5]
            out_dataset.SetGeoTransform(tuple(geo_transform))
            out_dataset.SetProjection(projection)

            # Save and close
            out_dataset.FlushCache()
            del out_dataset

    # Close the original dataset
    del dataset

# Tile size in pixels
width = 224
height = 224

# Tile the panchromatic raster, the pansharpened mosaic and the mask raster with the same tile size and part_<i>_<j>.tif naming
split_and_save_raster(panchromatic_reduced, width, height, panchromatic_parts_path)
split_and_save_raster(pansharpened_reduced, width, height, pansharpened_parts_path)
split_and_save_raster(mask_path, width, height, mask_parts_path)

In [24]:
## Remove images and masks with no data values

def contains_zero(input_raster_path):
    """
    Report whether the first band of a raster holds at least one pixel equal to zero.
    Args:
    - input_raster_path (str): Path to the input raster.
    Returns:
    - bool: True if band 1 contains a zero value pixel, False otherwise.
    """
    with rasterio.open(input_raster_path) as src:
        first_band = src.read(1)

    # Element-wise comparison, reduced to a single truth value
    zero_pixels = first_band == 0
    return zero_pixels.any()
 
def remove_images_with_zero_panchromatic(directory, panchromatic_directory, mask_directory):
    """
    Removes all images from a directory that contain a zero value pixel in their corresponding panchromatic image.
    Also removes the images with the same name from the mask directory and from the panchromatic directory.
    Images without a panchromatic counterpart cannot be checked and are left untouched.
    Args:
    - directory (str): Path to the directory containing the images to validate (pansharpened).
    - panchromatic_directory (str): Path to the directory containing the panchromatic images with the same name.
    - mask_directory (str): Path to the directory containing the mask images with the same name.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(('.tif', '.tiff')):
            continue

        file_path = os.path.join(directory, filename)
        panchromatic_path = os.path.join(panchromatic_directory, filename)

        # Without the panchromatic tile there is nothing to test against
        if not os.path.exists(panchromatic_path):
            print(f"Skipped image without panchromatic counterpart: {file_path}")
            continue

        if not contains_zero(panchromatic_path):
            continue

        os.remove(file_path)
        print(f"Removed image: {file_path}")

        # Remove the mask image with the same name
        mask_path = os.path.join(mask_directory, filename)
        if os.path.exists(mask_path):
            os.remove(mask_path)
            print(f"Removed mask: {mask_path}")

        # Remove the panchromatic image with the same name (the existence check
        # covers the case where it is the very file removed above)
        if os.path.exists(panchromatic_path):
            os.remove(panchromatic_path)
            print(f"Removed panchromatic image: {panchromatic_path}")
 
# Remove every tile triplet (pansharpened, mask, panchromatic) whose panchromatic tile contains a zero-valued pixel
remove_images_with_zero_panchromatic(pansharpened_parts_path, panchromatic_parts_path, mask_parts_path)

Removed image: data/1_preprocessing/input/pansharpened_parts\part_0_0.tif
Removed mask: data/1_preprocessing/input/mask_parts\part_0_0.tif
Removed mask: data/1_preprocessing/input/panchromatic_parts\part_0_0.tif
Removed image: data/1_preprocessing/input/pansharpened_parts\part_0_1.tif
Removed mask: data/1_preprocessing/input/mask_parts\part_0_1.tif
Removed mask: data/1_preprocessing/input/panchromatic_parts\part_0_1.tif
Removed image: data/1_preprocessing/input/pansharpened_parts\part_0_10.tif
Removed mask: data/1_preprocessing/input/mask_parts\part_0_10.tif
Removed mask: data/1_preprocessing/input/panchromatic_parts\part_0_10.tif
Removed image: data/1_preprocessing/input/pansharpened_parts\part_0_100.tif
Removed mask: data/1_preprocessing/input/mask_parts\part_0_100.tif
Removed mask: data/1_preprocessing/input/panchromatic_parts\part_0_100.tif
Removed image: data/1_preprocessing/input/pansharpened_parts\part_0_101.tif
Removed mask: data/1_preprocessing/input/mask_parts\part_0_101.tif


In [9]:
# ## Remove images and masks with no data values
# 
# def contains_zero(input_raster_path):
#     """
#     Check if a single band raster has a zero value pixel. 
#     
#     Args:
#     - input_raster_path (str): Path to the input raster.
#     
#     Returns:
#     - bool: True if the raster contains at least one zero value pixel, False otherwise.
#     """
#     with rasterio.open(input_raster_path) as src:
#         # Read the image as a numpy array
#         raster_array = src.read(1)
#         # Check if the array contains any zero values
#         return (raster_array == 0).any()
# 
# # Potentially this one can be used for multiple bands, but I have yet to test it
# def contains_zero_multiband(image_path):
#     """
#     Check if an image has a zero value pixel in any of its bands.
#     """
#     with rasterio.open(image_path) as src:
#         # Iterate through each band
#         for band in range(1, src.count + 1):
#             # Read the current band as a numpy array
#             image_array = src.read(band)
#             # Check if the array contains any zero values
#             if (image_array == 0).any():
#                 return True
#     return False
# 
# def remove_mulraster_with_zero_values(directory):
#     """
#     Removes all rasters from a directory that contain a zero value pixel.
# 
#     Args:
#     - directory (str): Path to the directory containing the images to validate.
#     """
#     for filename in os.listdir(directory):
#         if filename.endswith(('.tif', '.tiff')):  
#             file_path = os.path.join(directory, filename)
#             if contains_zero_multiband(file_path):
#                 os.remove(file_path)
#                 #print(f"Removed raster: {file_path}")
#                 
# def remove_zero_raster_mask(directory, mask_directory):
#     """
#     Removes all rasters from a directory that contain a zero value pixel.
#     Also removes rasters with the same name from the mask directory.
#     
#     Args:
#     - directory (str): Path to the directory containing the rasters to validate.
#     - mask_directory (str): Name of the folder containing the mask rasters with the same name.
#     """
# 
#     for filename in os.listdir(directory):
#         if filename.endswith(('.tif', '.tiff')):
#             file_path = os.path.join(directory, filename)
#             if contains_zero(file_path):
#                 os.remove(file_path)
#                 # print(f"Removed image: {file_path}")
# 
#                 # Remove the image with the same name from the similar folder
#                 mask_path = os.path.join(mask_directory, filename)
#                 if os.path.exists(mask_path):
#                     os.remove(mask_path)
#                     # print(f"Removed similar image: {mask_path}")
#                     
# remove_zero_raster_mask(panchromatic_parts_path, mask_parts_path)
# remove_mulraster_with_zero_values(pansharpened_parts_path)

In [25]:
## Clip the raster masks to remove pixels above a certain threshold to more accurately represent the seal shape (Pan only)

def update_masks(panchromatic_parts_path, mask_parts_path, threshold_value):
    """
    Rewrite each mask tile in place, clearing mask pixels that lie on bright panchromatic pixels.

    For every '.tif' file in the mask directory, the panchromatic tile with the same
    file name is read; mask pixels equal to 1 whose panchromatic value is above the
    threshold are set to 0, all other mask pixels keep their value.

    Args:
    - panchromatic_parts_path (str): The directory path containing the panchromatic images.
    - mask_parts_path (str): The directory path containing the mask images.
    - threshold_value (int): Panchromatic pixel value above which a mask pixel of 1 is reset to 0.

    Returns:
    None
    """
    for tile_name in os.listdir(mask_parts_path):
        if not tile_name.endswith('.tif'):
            continue

        # Mask and panchromatic tiles share the same file name
        mask_tile = os.path.join(mask_parts_path, tile_name)
        pan_tile = os.path.join(panchromatic_parts_path, tile_name)

        with rasterio.open(pan_tile) as pan_src:
            pan_pixels = pan_src.read(1)

        with rasterio.open(mask_tile) as mask_src:
            mask_pixels = mask_src.read(1)
            profile = mask_src.meta

        # Pixels currently flagged in the mask but brighter than the threshold
        too_bright = (mask_pixels == 1) & (pan_pixels > threshold_value)
        cleaned = np.where(too_bright, 0, mask_pixels)

        # Overwrite the old mask with the cleaned one
        with rasterio.open(mask_tile, 'w', **profile) as dst:
            dst.write(cleaned, 1)

# Usage example
# Mask pixels of value 1 whose panchromatic pixel value is above this threshold are reset to 0
threshold_value = 70
update_masks(panchromatic_parts_path, mask_parts_path, threshold_value)

In [32]:
## Don't Run
def obtain_statistic(image_path, shapefile_path):
    '''
    Count the points that are within the bounds of an image and calculate the average pixel value.

    The point coordinates are compared with the image bounds as-is, i.e. the shapefile
    is expected to use the same coordinate system as the image.

    Args:
    - image_path (str): Path to the input raster.
    - shapefile_path (str): Path to the shapefile containing points.
    Returns:
    - num_points (int): Number of points strictly inside the image bounds.
    - avg_pixel_value (float): Average pixel value of the first band of the image.
    '''
    with rasterio.open(image_path) as src:
        image_bounds = src.bounds
        image_data = src.read(1)  # Read the first band

    image_box = box(image_bounds.left, image_bounds.bottom, image_bounds.right, image_bounds.top)
    avg_pixel_value = float(np.mean(image_data))

    shapefile = gpd.read_file(shapefile_path)

    # Vectorised point-in-box test: same result as box.contains(point) for each
    # row, without a Python-level loop, and safe for an empty shapefile
    within_image = shapefile.geometry.within(image_box)

    return int(within_image.sum()), avg_pixel_value
 
def obtain_statistics(image_dir, shapefile_path, csv_path):
    '''
    Compute the point count and average pixel value of every '.tif' image in a directory and save them as CSV.

    The CSV always has the columns image_name, num_points and avg_pixel_value
    (in that order), even when the directory holds no image.

    Args:
    - image_dir (str): Directory containing the images.
    - shapefile_path (str): Path to the shapefile containing points.
    - csv_path (str): Path of the CSV file to write.
    '''
    columns = ['image_name', 'num_points', 'avg_pixel_value']

    # List to store results
    results = []

    # Iterate through all images in the directory, in a reproducible order
    for image_name in sorted(os.listdir(image_dir)):
        if not image_name.endswith('.tif'):
            continue
        image_path = os.path.join(image_dir, image_name)
        num_points, avg_pixel_value = obtain_statistic(image_path, shapefile_path)
        results.append({'image_name': image_name, 'num_points': num_points, 'avg_pixel_value': avg_pixel_value})
        print(f'appended image {image_name}')

    # Explicit columns keep the header in place when there are no results, so
    # the file stays readable for the steps that consume it
    results_df = pd.DataFrame(results, columns=columns)
    results_df.to_csv(csv_path, index=False)

    print(f"Results saved to {csv_path}")
 
# Compute the per-tile statistics of the panchromatic tiles and write them to csv_path
obtain_statistics(panchromatic_parts_path, shapefile_path, csv_path)

appended image part_100_10.tif
appended image part_100_100.tif
appended image part_100_101.tif
appended image part_100_103.tif
appended image part_100_104.tif
appended image part_100_105.tif
appended image part_100_106.tif
appended image part_100_107.tif
appended image part_100_108.tif
appended image part_100_109.tif
appended image part_100_11.tif
appended image part_100_110.tif
appended image part_100_111.tif
appended image part_100_112.tif
appended image part_100_113.tif
appended image part_100_114.tif
appended image part_100_115.tif
appended image part_100_116.tif
appended image part_100_117.tif
appended image part_100_118.tif
appended image part_100_119.tif
appended image part_100_12.tif
appended image part_100_120.tif
appended image part_100_121.tif
appended image part_100_122.tif
appended image part_100_123.tif
appended image part_100_124.tif
appended image part_100_126.tif
appended image part_100_127.tif
appended image part_100_128.tif
appended image part_100_129.tif
appended im

In [57]:
## Split into validation, test and training data sets
def organize_images(csv_file, image_folder, output_folder, ice_threshold=20):
    """
    Organize images into subfolders based on the number of seals and average pixel values.

    Images with at least one seal are copied to 'seals'. Images without seals are copied
    to 'no_seals/ice' when their average pixel value is above ice_threshold, and to
    'no_seals/water' otherwise.

    Args:
    - csv_file (str): Path to the CSV file containing image data. The first three columns
      are read by position as image name, seal count and average pixel value.
    - image_folder (str): Directory containing the images.
    - output_folder (str): Directory to save the organized subfolders.
    - ice_threshold (float, optional): Average pixel value above which a seal-free image
      counts as ice. Defaults to 20.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Create the output directories ('no_seals' is created as the parent of its subfolders)
    seals_folder = os.path.join(output_folder, 'seals')
    ice_folder = os.path.join(output_folder, 'no_seals', 'ice')
    water_folder = os.path.join(output_folder, 'no_seals', 'water')
    for folder in (seals_folder, ice_folder, water_folder):
        os.makedirs(folder, exist_ok=True)

    # Process each row in the CSV. Plain tuples give true positional access;
    # integer keys on a label-indexed row (row[0]) are deprecated in pandas.
    rows = df.iloc[:, :3].itertuples(index=False, name=None)
    for image_name, seal_count, avg_pixel_value in rows:
        # Determine the destination folder based on seal count and pixel value
        if seal_count != 0:
            dst_folder = seals_folder
        elif avg_pixel_value > ice_threshold:
            dst_folder = ice_folder
        else:
            dst_folder = water_folder

        # Copy the image to the appropriate folder
        shutil.copy(os.path.join(image_folder, image_name), os.path.join(dst_folder, image_name))

    print("Images have been organized into subfolders.")

# Sort the panchromatic, mask and pansharpened tiles into the same class subfolders, all driven by the one statistics CSV
organize_images(csv_path, panchromatic_parts_path, output_panchromatic)
organize_images(csv_path, mask_parts_path, output_mask)
organize_images(csv_path, pansharpened_parts_path, output_pansharpened)

def split_data(input_panchromatic, input_mask, output_path, train_ratio=0.8, val_ratio=0.15, test_ratio=0.05, seed=None):
    """
    Copy organized images and their masks into train/val/test folders.

    The seal images are split according to the ratios. Water and ice images are each
    capped at half the number of seal images and split with half the seal counts, so
    every subset holds roughly as many seal-free images as seal images.

    Args:
    - input_panchromatic (str): Folder with the organized images ('seals', 'no_seals/water', 'no_seals/ice').
    - input_mask (str): Folder with the organized masks, mirroring the image folder layout.
    - output_path (str): Folder receiving '<subset>/images' and '<subset>/masks'.
    - train_ratio (float): Share of the seal images used for training.
    - val_ratio (float): Share of the seal images used for validation.
    - test_ratio (float): Share used for testing; only checked in the sum, the test set is whatever remains.
    - seed (int, optional): If given, each image group is shuffled reproducibly before splitting.
      By default the groups are split in sorted file name order.

    Raises:
    - AssertionError: If the ratios do not sum to 1.
    """
    # Tolerant comparison: e.g. 0.7 + 0.2 + 0.1 is 0.9999999999999999 in floating point
    assert abs(train_ratio + val_ratio + test_ratio - 1) < 1e-9, "The ratios must sum to 1."

    def copy_pairs(files, output_subfolder):
        # Copy each image and its mask into '<output_subfolder>/images' and '.../masks'
        images_dir = os.path.join(output_subfolder, 'images')
        masks_dir = os.path.join(output_subfolder, 'masks')
        os.makedirs(images_dir, exist_ok=True)
        os.makedirs(masks_dir, exist_ok=True)
        for file in files:
            filename = os.path.basename(file)
            shutil.copy(file, os.path.join(images_dir, filename))
            # The mask lives at the same relative location below input_mask; this is
            # robust where a textual replace on the full path is not
            mask_file = os.path.join(input_mask, os.path.relpath(file, input_panchromatic))
            shutil.copy(mask_file, os.path.join(masks_dir, filename))

    # Collect and sort images
    seals_images = sorted(glob(os.path.join(input_panchromatic, 'seals', '*.tif')))
    water_images = sorted(glob(os.path.join(input_panchromatic, 'no_seals', 'water', '*.tif')))
    ice_images = sorted(glob(os.path.join(input_panchromatic, 'no_seals', 'ice', '*.tif')))

    if seed is not None:
        import random
        rng = random.Random(seed)
        for group in (seals_images, water_images, ice_images):
            rng.shuffle(group)

    total_seals = len(seals_images)
    half_seals = total_seals // 2

    # Cap water and ice at half the number of seal images each
    water_images = water_images[:half_seals]
    ice_images = ice_images[:half_seals]

    train_count = int(train_ratio * total_seals)
    val_count = int(val_ratio * total_seals)

    # Slice boundaries: seal images use the full counts, water/ice half of them
    seals_val_end = train_count + val_count
    background_train_end = train_count // 2
    background_val_end = background_train_end + val_count // 2

    # Split images
    train_images = (seals_images[:train_count]
                    + water_images[:background_train_end]
                    + ice_images[:background_train_end])
    val_images = (seals_images[train_count:seals_val_end]
                  + water_images[background_train_end:background_val_end]
                  + ice_images[background_train_end:background_val_end])
    test_images = (seals_images[seals_val_end:]
                   + water_images[background_val_end:]
                   + ice_images[background_val_end:])

    copy_pairs(train_images, os.path.join(output_path, 'train'))
    copy_pairs(val_images, os.path.join(output_path, 'val'))
    copy_pairs(test_images, os.path.join(output_path, 'test'))

# Build the deep learning datasets; both image types are paired with the same organized mask folder
split_data(output_panchromatic, output_mask, panchromatic_path_dl)
split_data(output_pansharpened, output_mask, pansharpened_path_dl)