In [None]:
# Standard Library Imports
import os
import random
import shutil
from pathlib import Path
from typing import List, Optional
from datetime import datetime

# Data Handling
import numpy as np
import pandas as pd
from pandas_path import path
from PIL import Image

# PyTorch Core and Utilities
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

# Model Training and Evaluation
from transformers import (
    AutoConfig,
    AutoModelForSemanticSegmentation,
    AutoImageProcessor,
    SegformerImageProcessor,
    TrainingArguments,
    Trainer
)
import evaluate
from sklearn.metrics import accuracy_score

# Image Transformations and Augmentations
from torchvision import transforms
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Visualization
import matplotlib.pyplot as plt


In [None]:
# Root of the public DrivenData cloud-cover files, located under the parent of
# the current working directory.
DATA_DIR = Path.cwd().parent.resolve().joinpath("data", "cloud_drivendata", "final", "public")
TRAIN_FEATURES = DATA_DIR.joinpath("train_features")
TRAIN_LABELS = DATA_DIR.joinpath("train_labels")

# Fail fast when the feature directory is missing.
assert TRAIN_FEATURES.exists()

In [None]:
# Band identifiers used to build the per-band "{band}_path" columns below.
BANDS = ["B02", "B03", "B04", "B08"]
# Chip-level metadata; `add_paths` below reads its "chip_id" column.
train_meta = pd.read_csv(DATA_DIR / "train_metadata.csv")

def add_paths(df, feature_dir, label_dir=None, bands=BANDS):
    """
    Attach TIF file paths to a chip dataframe and return it.

    For every entry of ``bands`` a "{band}_path" column (e.g. "B02_path") is
    written, pointing at ``feature_dir / chip_id / "{band}.tif"``. When
    ``label_dir`` is given, a "label_path" column pointing at
    ``label_dir / "{chip_id}.tif"`` is written as well.

    The dataframe is modified in place; the same object is returned.
    """
    chip_ids = df["chip_id"]

    for band_name in bands:
        band_column = f"{band_name}_path"
        df[band_column] = feature_dir / chip_ids / f"{band_name}.tif"
        # Optional existence check (disabled): df[band_column].path.exists().all()

    if label_dir is None:
        return df

    df["label_path"] = label_dir / (chip_ids + ".tif")
    # Optional existence check (disabled): df["label_path"].path.exists().all()
    return df

# Show the resolved directories, then add the band/label path columns.
print(f"Feature directory : {TRAIN_FEATURES}")
print(f"Label directory : {TRAIN_LABELS}")
# `add_paths` writes the columns onto the frame it is given and returns that same frame.
train_meta = add_paths(train_meta, TRAIN_FEATURES, TRAIN_LABELS)

In [None]:
random.seed(42)  # set a seed for reproducibility

# put 40% of chips into the validation set (the sampled fraction below is 0.4)
chip_ids = train_meta.chip_id.unique().tolist()
val_chip_ids = random.sample(chip_ids, round(len(chip_ids) * 0.4))

# Split on chip_id membership; both frames get a fresh 0..n-1 index.
val_mask = train_meta.chip_id.isin(val_chip_ids)
val = train_meta[val_mask].copy().reset_index(drop=True)
# NOTE: the name `train` is rebound to the training function defined in a later
# cell; only `train_x` / `train_y` below are used after that point.
train = train_meta[~val_mask].copy().reset_index(drop=True)

# separate features from labels
feature_cols = ["chip_id"] + [f"{band}_path" for band in BANDS]

val_x = val[feature_cols].copy()
val_y = val[["chip_id", "label_path"]].copy()

train_x = train[feature_cols].copy()
train_y = train[["chip_id", "label_path"]].copy()

In [None]:
class SemanticSegmentationDataset(Dataset):
    """Map-style dataset that loads multi-band chips and encodes them with a processor.

    Each item is the output of ``processor(images=..., segmentation_maps=...)``
    with the leading batch dimension removed.

    Args:
        x_paths: Dataframe with one row per chip and a "{band}_path" column for
            every entry of ``bands``.
        bands: Band names to load, in the channel order the model should see.
        y_paths: Optional dataframe, row-aligned with ``x_paths``, holding a
            "label_path" column. When omitted, items carry no labels.
        transform: Optional callable invoked as ``transform(image=..., mask=...)``
            (albumentations convention) returning a dict with "image" and, when
            a mask was passed, "mask".
        mask_transform: Stored on the instance but not applied by this class.
        processor: Callable producing the encoded inputs; required by
            ``__getitem__``.
    """

    def __init__(
        self,
        x_paths: pd.DataFrame,
        bands: List[str],
        y_paths: Optional[pd.DataFrame] = None,
        transform = None,
        mask_transform = None,
        processor=None
    ):
        super().__init__()
        self.data = x_paths
        self.label = y_paths
        self.bands = bands
        self.transform = transform
        # Previously accepted and silently discarded; kept so callers can read it back.
        self.mask_transform = mask_transform
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def load_channel_pil(self, filepath: str):
        """Read one image file into a NumPy array, closing the file afterwards."""
        with Image.open(filepath) as img:
            return np.array(img)

    def open_mask(self, filepath: str):
        """Load the label raster as-is (already a 0-1 mask, no binarisation needed)."""
        mask = self.load_channel_pil(filepath)
        return mask

    def open_as_array(self, idx: int, invert=False):
        """Stack the configured bands of row ``idx`` into an (H, W, bands) array in [0, 1].

        ``idx`` is positional (what a DataLoader sampler supplies), so ``iloc``
        is used. ``invert`` is accepted for interface compatibility and unused.
        Scaling divides by the maximum of the raster's integer dtype, so the
        rasters must be integer-typed (``np.iinfo`` rejects float dtypes).
        """
        row = self.data.iloc[idx]
        band_arrs = [self.load_channel_pil(row[f"{band}_path"]) for band in self.bands]
        x_arr = np.stack(band_arrs, axis=-1)
        # Normalize
        return (x_arr / np.iinfo(x_arr.dtype).max)

    def preprocess_batch(self, example_batch, transforms: "A.Compose"):
        """Apply ``transforms`` to every (image, mask) pair of a batch and stack the results.

        ``example_batch`` must hold *sequences* under "image" and "label"; the
        transform outputs must be tensors because they are stacked with
        ``torch.stack``. Returns ``(pixel_values, labels)`` as float / long tensors.
        """
        pixel_values = []
        labels = []
        for image, target in zip(example_batch["image"], example_batch["label"]):
            transformed = transforms(image=image, mask=target)
            pixel_values.append(transformed["image"])
            labels.append(transformed["mask"])

        pixel_values = torch.stack(pixel_values).to(torch.float)
        labels = torch.stack(labels).to(torch.long)

        return pixel_values, labels

    def __getitem__(self, idx: int):
        """Return the processor-encoded sample at position ``idx``."""
        image = self.open_as_array(idx)
        mask = None

        if self.label is not None:
            mask = self.open_mask(self.label.iloc[idx]['label_path'])

        # Apply the augmentation to this single sample. Routing one sample
        # through `preprocess_batch` would iterate over image rows, not samples.
        if self.transform is not None:
            if mask is None:
                image = self.transform(image=image)["image"]
            else:
                augmented = self.transform(image=image, mask=mask)
                image, mask = augmented["image"], augmented["mask"]

        encoded_inputs = self.processor(images=image, segmentation_maps=mask, do_reduce_labels=False, return_tensors='pt')

        # Drop only the leading batch axis; a bare squeeze_() would also remove
        # any other axis of length 1 (e.g. a 1-pixel-high chip).
        for tensor in encoded_inputs.values():
            tensor.squeeze_(0)

        return encoded_inputs

    def __repr__(self):
        # Return a string representation of the dataset
        s = 'Dataset class with {} files'.format(self.__len__())
        return s

In [None]:
# utils functions
def load_channel_pil(filepath):
    """Read a single image file into a NumPy array.

    The file is opened in a ``with`` block so its handle is released as soon as
    the pixel data has been copied, instead of lingering until garbage collection.
    """
    with Image.open(filepath) as img:
        return np.array(img)

def true_color_img(chip_id, data_dir=TRAIN_FEATURES, load_channel_f=load_channel_pil):
    """Build a true-colour (R, G, B) image for one chip, scaled to the 0-1 range.

    Args:
        chip_id: Name of the chip sub-directory inside ``data_dir``.
        data_dir: Directory holding one sub-directory per chip.
        load_channel_f: Callable that loads one band file into a 2-D array.

    Returns:
        Float array of shape (H, W, 3) with bands B04, B03, B02 (red, green,
        blue), divided by the chip's own maximum value.
    """
    chip_dir = data_dir / chip_id

    raw_rgb = np.stack(
        [load_channel_f(chip_dir / f"{band}.tif") for band in ("B04", "B03", "B02")],
        axis=-1,
    )

    # An all-zero chip would otherwise divide by zero and yield NaNs.
    peak = raw_rgb.max()
    if peak == 0:
        return raw_rgb.astype(float)

    # Normalize pixel values to 0-1
    return raw_rgb / peak

def display_random_prediction(model, processor, metadata, chip_id = None, data_dir='TRAIN_FEATURES', alpha=0.5):
    """
    Displays the ground truth and predicted segmentation masks overlaid on the original image.

    Args:
        model (torch.nn.Module): The segmentation model.
        processor (Callable): The processor to preprocess the images for the model.
        metadata (DataFrame): Metadata containing chip IDs and label paths.
        chip_id (int | str | None): ``None`` picks a random chip; an ``int`` is a
            position in the sorted listing of ``data_dir``; a ``str`` is used
            directly as the chip identifier.
        data_dir (str | Path): Directory path containing image data. The default
            literal ``'TRAIN_FEATURES'`` resolves to the module-level
            ``TRAIN_FEATURES`` directory unless a directory with that exact
            name exists.
        alpha (float): Transparency level for overlaying the mask on the image.

    Raises:
        IndexError: If the selected chip is absent from ``metadata`` or the
            integer ``chip_id`` is out of range.
    """
    # Keep the historical string default usable: map it onto the real constant.
    if isinstance(data_dir, str) and data_dir == 'TRAIN_FEATURES' and not os.path.isdir(data_dir):
        data_dir = TRAIN_FEATURES
    data_dir = Path(data_dir)

    # os.listdir order is arbitrary; sorting makes an integer chip_id reproducible.
    list_images = sorted(os.listdir(path=data_dir))
    if chip_id is None:
        random_chip_id = np.random.choice(list_images)
    elif isinstance(chip_id, str):
        random_chip_id = chip_id
    else:
        random_chip_id = list_images[chip_id]

    # Load image and ground truth from metadata and data directory
    matches = metadata[metadata["chip_id"] == random_chip_id]
    if matches.empty:
        raise IndexError(f"chip_id {random_chip_id!r} not found in metadata")
    random_chip = matches.iloc[0]
    image_np = true_color_img(random_chip.chip_id, data_dir=data_dir)  # RGB image from the same directory
    with Image.open(random_chip.label_path) as gt:
        gt_mask = np.array(gt)
    # image_np holds 0-1 values: rescale to 0-255 and convert to uint8 for PIL
    image = Image.fromarray(np.uint8(image_np*255))

    # Preprocess image and move the tensors to wherever the model lives
    inputs = processor(images=image, return_tensors="pt")
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Predict in eval mode, then restore the mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    # Accept either an output object exposing `.logits` or a raw logits tensor.
    logits = outputs.logits if hasattr(outputs, "logits") else outputs

    # PIL reports (width, height); interpolate expects (height, width).
    target_size = image.size[::-1]
    if tuple(logits.shape[-2:]) != tuple(target_size):
        # align_corners=False matches the upsampling used in train/validate.
        logits = nn.functional.interpolate(logits, size=target_size, mode="bilinear", align_corners=False)

    # Apply argmax to get the predicted segmentation mask
    pred_mask = logits.argmax(dim=1)[0].cpu().numpy() # (height, width)

    # Define a colormap for displaying the masks
    colormap = plt.get_cmap("jet")

    # Generate overlays; the max(..., 1) guard avoids 0/0 on an all-background mask
    gt_overlay = colormap(gt_mask / max(gt_mask.max(), 1))[:, :, :3] * 255
    pred_overlay = colormap(pred_mask / max(pred_mask.max(), 1))[:, :, :3] * 255

    # Convert overlays to PIL images
    gt_overlay_img = Image.fromarray(gt_overlay.astype(np.uint8)).convert("RGBA")
    pred_overlay_img = Image.fromarray(pred_overlay.astype(np.uint8)).convert("RGBA")

    # Original image as RGBA for transparency blending
    image_rgba = image.convert("RGBA")

    # Blend original image with ground truth and prediction overlays
    gt_display = Image.blend(image_rgba, gt_overlay_img, alpha=alpha)
    pred_display = Image.blend(image_rgba, pred_overlay_img, alpha=alpha)

    # Display the original, ground truth, and prediction images side by side
    fig, axs = plt.subplots(1, 3, figsize=(18, 6))
    axs[0].imshow(image)
    axs[0].set_title("Original Image")
    axs[0].axis("off")

    axs[1].imshow(gt_display)
    axs[1].set_title("Ground Truth Overlay")
    axs[1].axis("off")

    axs[2].imshow(pred_display)
    axs[2].set_title("Predicted Overlay")
    axs[2].axis("off")

    plt.tight_layout()
    plt.show()
    

## Fine Tune Segformer on DrivenData Cloud Segmentation Challenge

In [None]:
model_name = "nvidia/segformer-b0-finetuned-ade-512-512"
label2id = {"cloud": 1, "no_cloud": 0 }
id2label = {v: k for k,v in label2id.items()}
num_labels = len(label2id)

# Give the label maps to the config so the model reports "cloud"/"no_cloud"
# rather than generic LABEL_i names.
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint repository — confirm it is actually required for this model.
config = AutoConfig.from_pretrained(
        model_name,
        trust_remote_code=True,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )
# ignore_mismatched_sizes=True lets the checkpoint load even though the
# requested number of labels differs from the one it was saved with.
model = AutoModelForSemanticSegmentation.from_pretrained(
    model_name,
    config=config,
    trust_remote_code=True,
    ignore_mismatched_sizes=True
)
# do_rescale=False: the dataset already scales pixel values to 0-1.
image_processor = AutoImageProcessor.from_pretrained(
    model_name,
    do_reduce_labels=False,
    trust_remote_code=True,
    do_rescale=False
)

# logits shape : (batch_size, num_labels, h/4, w/4)
# last layer (classifier): Conv2d(256, num_labels, kernel_size=(1, 1), stride=(1, 1))
# output : SemanticSegmenterOutput with loss (if labels are given as model input), logits, hidden states and attentions
# In the Transformers SegFormer implementation, when labels are provided the cross-entropy (or BCE) loss is computed on upsampled logits (the returned logits are NOT upsampled)

In [None]:
# Prediction of the freshly loaded model (before the training loop below) for the chip at listing index 1.
display_random_prediction(model, image_processor, metadata=train_meta, data_dir=TRAIN_FEATURES, alpha=0.5, chip_id=1)

In [None]:
# Dice calculation helper function
def calculate_dice(pred_labels, labels, num_classes):
    """Return one Dice score per class id in ``range(num_classes)``.

    For each class the score is ``2 * |pred ∩ label| / (|pred| + |label| + 1e-6)``,
    computed over all elements of the two arrays. A class absent from both
    arrays therefore scores 0.
    """
    def _dice_for(class_id):
        pred_hits = (pred_labels == class_id).astype(int).ravel()
        label_hits = (labels == class_id).astype(int).ravel()
        overlap = np.sum(pred_hits * label_hits)
        return (2.0 * overlap) / (np.sum(pred_hits) + np.sum(label_hits) + 1e-6)

    return [_dice_for(class_id) for class_id in range(num_classes)]

In [None]:
from accelerate import Accelerator
import torch.nn.functional as F
# Training function
def train(model, dataloader, optimizer, scheduler, metric, epoch, writer, id2label, accelerator=None, device='cuda'):
    """Run one training epoch and log loss, mean IoU, mean accuracy and Dice.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output exposes ``.loss`` and ``.logits``.
        dataloader: Sized iterable of dicts with "pixel_values" and "labels" tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        metric: Object with ``add_batch(predictions=, references=)`` and
            ``compute(...)`` returning "mean_iou" and "mean_accuracy".
        epoch: Epoch index used as the TensorBoard step for epoch-level scalars.
        writer: Object with ``add_scalar(tag, value, step)``.
        id2label: Mapping class id -> class name; its length is the class count.
        accelerator: Optional object whose ``backward(loss)`` replaces ``loss.backward()``.
        device: Device the batch tensors are moved to.

    Raises:
        ValueError: If ``dataloader`` has no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader has no batches; nothing to train on")

    model.train()
    running_loss = 0.0
    num_classes = len(id2label)

    # Per-class pixel counts accumulated over the whole epoch, so the Dice
    # score reflects every batch rather than only the last one.
    intersections = [0] * num_classes
    pred_totals = [0] * num_classes
    label_totals = [0] * num_classes

    for idx, batch in enumerate(dataloader):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss, logits = outputs.loss, outputs.logits
        running_loss += loss.item()

        # Backward pass and optimization
        if accelerator is None:
            loss.backward()
        else:
            accelerator.backward(loss)

        optimizer.step()
        scheduler.step()

        # Metric calculation for evaluation
        with torch.no_grad():
            # The returned logits are lower resolution than the labels.
            upsampled_logits = F.interpolate(logits, size=labels.shape[-2:], mode="bilinear", align_corners=False)
            predicted = upsampled_logits.argmax(dim=1)
            metric.add_batch(predictions=predicted.cpu().numpy(), references=labels.cpu().numpy())

            for class_id in range(num_classes):
                pred_hits = predicted == class_id
                label_hits = labels == class_id
                intersections[class_id] += int((pred_hits & label_hits).sum())
                pred_totals[class_id] += int(pred_hits.sum())
                label_totals[class_id] += int(label_hits.sum())

        # Log loss to TensorBoard every 50 batches
        if idx % 50 == 0:
            writer.add_scalar("Training Loss", loss.item(), epoch * num_batches + idx)

    # End of epoch. `ignore_index` is a required argument of the mean_iou
    # metric; 255 is not a class id here, so no labelled pixel is dropped.
    metrics = metric.compute(num_labels=num_classes, ignore_index=255, reduce_labels=False)
    mean_iou = metrics["mean_iou"]
    mean_accuracy = metrics["mean_accuracy"]

    # Per-class Dice over the whole epoch (same formula as calculate_dice)
    dice_scores = [
        (2.0 * inter) / (pred_count + label_count + 1e-6)
        for inter, pred_count, label_count in zip(intersections, pred_totals, label_totals)
    ]
    for i, dice in enumerate(dice_scores):
        writer.add_scalar(f"Training Dice_{id2label[i]}", dice, epoch)

    writer.add_scalar("Training Mean IoU", mean_iou, epoch)
    writer.add_scalar("Training Mean Accuracy", mean_accuracy, epoch)

    print(f"Epoch [{epoch}], Loss: {running_loss / num_batches:.4f}, Mean IoU: {mean_iou:.4f}, Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Epoch [{epoch}] Dice scores: {[f'{id2label[i]}: {dice:.4f}' for i, dice in enumerate(dice_scores)]}")

# Validation function
def validate(model, dataloader, metric, epoch, writer, id2label, device='cuda'):
    """Evaluate the model on ``dataloader`` and log loss, mean IoU, mean accuracy and Dice.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output exposes ``.loss`` and ``.logits``.
        dataloader: Sized iterable of dicts with "pixel_values" and "labels" tensors.
        metric: Object with ``add_batch(predictions=, references=)`` and
            ``compute(...)`` returning "mean_iou" and "mean_accuracy".
        epoch: Epoch index used as the TensorBoard step.
        writer: Object with ``add_scalar(tag, value, step)``.
        id2label: Mapping class id -> class name; its length is the class count.
        device: Device the batch tensors are moved to. Pass the device the
            model lives on; the default assumes a CUDA model.

    Raises:
        ValueError: If ``dataloader`` has no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader has no batches; nothing to validate on")

    model.eval()
    running_loss = 0.0
    num_classes = len(id2label)

    # Per-class pixel counts accumulated over the whole validation set, so the
    # Dice score reflects every batch rather than only the last one.
    intersections = [0] * num_classes
    pred_totals = [0] * num_classes
    label_totals = [0] * num_classes

    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss, logits = outputs.loss, outputs.logits
            running_loss += loss.item()

            # Metric calculation for evaluation
            upsampled_logits = F.interpolate(logits, size=labels.shape[-2:], mode="bilinear", align_corners=False)
            predicted = upsampled_logits.argmax(dim=1)
            metric.add_batch(predictions=predicted.cpu().numpy(), references=labels.cpu().numpy())

            for class_id in range(num_classes):
                pred_hits = predicted == class_id
                label_hits = labels == class_id
                intersections[class_id] += int((pred_hits & label_hits).sum())
                pred_totals[class_id] += int(pred_hits.sum())
                label_totals[class_id] += int(label_hits.sum())

    # `ignore_index` is a required argument of the mean_iou metric; 255 is not
    # a class id here, so no labelled pixel is dropped.
    metrics = metric.compute(num_labels=num_classes, ignore_index=255, reduce_labels=False)
    mean_iou = metrics["mean_iou"]
    mean_accuracy = metrics["mean_accuracy"]

    # Per-class Dice over the whole validation set (same formula as calculate_dice)
    dice_scores = [
        (2.0 * inter) / (pred_count + label_count + 1e-6)
        for inter, pred_count, label_count in zip(intersections, pred_totals, label_totals)
    ]
    for i, dice in enumerate(dice_scores):
        writer.add_scalar(f"Validation Dice_{id2label[i]}", dice, epoch)

    writer.add_scalar("Validation Loss", running_loss / num_batches, epoch)
    writer.add_scalar("Validation Mean IoU", mean_iou, epoch)
    writer.add_scalar("Validation Mean Accuracy", mean_accuracy, epoch)

    print(f"Validation - Epoch [{epoch}], Loss: {running_loss / num_batches:.4f}, Mean IoU: {mean_iou:.4f}, Mean Accuracy: {mean_accuracy:.4f}")
    print(f"Validation - Epoch [{epoch}] Dice scores: {[f'{id2label[i]}: {dice:.4f}' for i, dice in enumerate(dice_scores)]}")


In [None]:
# Datasets read the red, green and blue bands (in that order) and rely on the
# image processor alone; no augmentation pipeline is attached.
train_ds = SemanticSegmentationDataset(
    train_x,
    ["B04", "B03", "B02"],
    y_paths=train_y,
    transform=None,
    processor=image_processor,
)
valid_ds = SemanticSegmentationDataset(
    val_x,
    ["B04", "B03", "B02"],
    y_paths=val_y,
    transform=None,
    processor=image_processor,
)

# Only the training loader shuffles; both use 8 workers and pinned memory.
train_dl = DataLoader(dataset=train_ds, batch_size=64, num_workers=8, pin_memory=True, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=64, num_workers=8, pin_memory=True, shuffle=False)

In [None]:
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR

# Hyper-parameters for fine-tuning; `model` and `train_dl` come from earlier cells.
learning_rate = 6e-5
weight_decay = 0.01
num_epochs = 50
# The scheduler is stepped once per batch, so count optimizer steps, not epochs.
total_steps = num_epochs * len(train_dl)

# AdamW over every model parameter
optimizer = AdamW(params=model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Linear warm-up over the first 1% of steps, cosine decay afterwards
warmup_steps = int(0.01 * total_steps)

# Lambda function for warm-up
def lr_lambda(current_step):
    """Learning-rate multiplier: linear warm-up followed by cosine decay to zero.

    Reads the module-level ``warmup_steps`` and ``total_steps``. Returns a plain
    float in [0, 1]: ``current_step / warmup_steps`` during warm-up, then
    ``0.5 * (1 + cos(pi * progress))`` where ``progress`` runs from 0 to 1 over
    the remaining steps and is clamped at 1 afterwards.
    """
    # math.cos, not torch.cos: the argument is a Python float, and torch.cos
    # only accepts tensors.
    import math

    if current_step < warmup_steps:
        return float(current_step) / float(max(1, warmup_steps))
    # max(1, ...) avoids dividing by zero when warm-up spans every step.
    decay_steps = max(1, total_steps - warmup_steps)
    # Clamp so the multiplier stays at 0 instead of rising again past total_steps.
    progress = min(1.0, (current_step - warmup_steps) / decay_steps)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

# Per-step scheduler driven by lr_lambda
scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)

# Mean-IoU metric from the `evaluate` library
metric = evaluate.load("mean_iou")

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# TensorBoard output goes to ../runs/<experiment>_<timestamp>
model_dir="../models"
experiment_name="Segformer_DrivenData"
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join("../runs", f"{experiment_name}_{timestamp}")
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir)

In [None]:
# Main training loop
accelerator = Accelerator()
try:
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        # Train and validate
        train(model=model,
            dataloader=train_dl,
            optimizer=optimizer,
            scheduler=scheduler,
            metric=metric,
            epoch=epoch,
            id2label=id2label,
            writer=writer,
            accelerator=accelerator,
            device=device
        )
        # Pass the same device as for training: validate() otherwise falls back
        # to its 'cuda' default, which fails on a CPU-only machine.
        validate(model=model,
            dataloader=valid_dl,
            metric=metric,
            id2label=id2label,
            epoch=epoch,
            writer=writer,
            device=device
        )
finally:
    # Close the TensorBoard writer even if training is interrupted or fails
    writer.close()

In [None]:
# Same call as before the training loop, for a before/after comparison.
display_random_prediction(model, image_processor, metadata=train_meta, data_dir=TRAIN_FEATURES, alpha=0.5, chip_id=1)

### Use HuggingFace Trainer (pytorch compatible) (Not working)

In [None]:
# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
from PIL import Image
import numpy as np

# Convert numpy array to PIL Image, maintaining the mode as grayscale for segmentation masks
def convert_to_pil(image_array):
    """Turn a mask array into a PIL image after casting it to uint8.

    A 3-D array whose first axis has length 1 is reduced to its single 2-D
    plane first; any other array is passed through unchanged.
    """
    has_singleton_channel = image_array.ndim == 3 and image_array.shape[0] == 1
    pixels = image_array[0] if has_singleton_channel else image_array
    return Image.fromarray(pixels.astype(np.uint8))

@torch.no_grad()
def compute_metrics(eval_pred):
    """Compute mean-IoU metrics for the Trainer from raw logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` is a NumPy array of
            shape (N, num_labels, h, w) and ``labels`` is array-like of shape
            (N, H, W).

    Returns:
        Dict of the metric's results in which the "per_category_accuracy" and
        "per_category_iou" arrays are replaced by one "accuracy_<label>" /
        "iou_<label>" entry per class of the module-level ``id2label``.
    """
    logits, labels = eval_pred
    # Coerce first so `.shape` below is available even for list-like labels.
    true_labels = labels if isinstance(labels, np.ndarray) else np.array(labels)
    logits_tensor = torch.from_numpy(logits)

    # Scale logits to the size of the labels, then pick the best class per pixel
    pred_labels = nn.functional.interpolate(
        logits_tensor,
        size=true_labels.shape[-2:],  # Match label dimensions
        mode="bilinear",
        align_corners=False,
    ).argmax(dim=1).detach().cpu().numpy()

    # The metric takes integer arrays directly (no PIL conversion), and
    # `ignore_index` is one of its required arguments; 255 is not a class id
    # here, so no labelled pixel is dropped.
    metrics = metric.compute(
        predictions=pred_labels,
        references=true_labels,
        num_labels=len(id2label),
        ignore_index=255,
        reduce_labels=image_processor.do_reduce_labels,
    )

    # Extract and add per-category metrics
    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})

    return metrics

In [None]:
# Rebuild the RGB datasets for the Trainer-based run (same arguments as the
# datasets used by the manual training loop).
train_ds = SemanticSegmentationDataset(
    train_x,
    ["B04", "B03", "B02"],
    y_paths=train_y,
    transform=None,
    processor=image_processor,
)
valid_ds = SemanticSegmentationDataset(
    val_x,
    ["B04", "B03", "B02"],
    y_paths=val_y,
    transform=None,
    processor=image_processor,
)

In [None]:
metric = evaluate.load("mean_iou")
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

# Trainer configuration, grouped by concern.
training_args = TrainingArguments(
    # Output location and checkpoint retention
    output_dir="../outputs/segformer-b0-finetuned-cloud-detection",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Evaluation cadence (kept in step with checkpointing)
    eval_strategy="steps",
    eval_steps=500,
    eval_accumulation_steps=5,
    per_device_eval_batch_size=64,
    # Optimisation schedule
    num_train_epochs=50,
    per_device_train_batch_size=64,
    learning_rate=6e-5,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    optim="adamw_torch",  # PyTorch's AdamW implementation
    seed=42,
    # Optional speed-ups, currently disabled:
    #torch_compile=True,
    #fp16=True,
    # Logging (TensorBoard directory, one log entry per step)
    logging_dir=f"../runs/Segformer_DrivenData_{timestamp}",
    logging_strategy="steps",
    logging_steps=1,
    disable_tqdm=False,
    # Data loading
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    dataloader_persistent_workers=True,
    # Hub
    push_to_hub=False,
)

In [None]:
# Wire the model, the two datasets and the metric callback into the Hugging Face Trainer.
# NOTE(review): no `trainer.train()` call appears in the visible cells, and the
# section heading marks this path as not working — confirm before relying on it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Same call as in the earlier cells.
# NOTE(review): the Trainer above is constructed but not run in the visible cells,
# so this presumably shows whatever state `model` already had — confirm.
display_random_prediction(model, image_processor, metadata=train_meta, data_dir=TRAIN_FEATURES, alpha=0.5, chip_id=1)