# Cloud Cover Segmentation on the DrivenData Dataset

# Load Data from Source Coop API

Inspired by: How to Use Deep Learning, PyTorch Lightning, and the Planetary Computer to Predict Cloud Cover in Satellite Imagery (https://drivendata.co/blog/cloud-cover-benchmark/) \
Data source: https://source.coop/repositories/radiantearth/cloud-cover-detection-challenge/ \
The complete DrivenData Cloud Cover Detection Challenge dataset is also available through the Kaggle API: https://www.kaggle.com/datasets/hmendonca/cloud-cover-detection/data


In [None]:
import boto3
# Which part of the challenge data to fetch from the Source Cooperative bucket.
bucket_name = 'radiantearth'
# Options seen so far: 'train_features', 'train_labels', 'test_features', 'test_labels'.
online_folder = 'test_labels'
# Either 'public' or 'private'.
state = 'private'
prefix = f'cloud-cover-detection-challenge/final/{state}/{online_folder}'
local_dir = f'../data/cloud_data/final/{state}/{online_folder}'

# Credentials come from the environment: AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY must be set before this cell runs.
s3 = boto3.client('s3', endpoint_url='https://data.source.coop')

# A single list_objects_v2 call returns at most 1000 keys, so iterate over
# every result page to list all objects under the prefix.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    for obj in page.get('Contents', []):
        print(obj['Key'])

In [None]:
import os

# Ensure the local directory exists
os.makedirs(local_dir, exist_ok=True)

# list_objects_v2 caps each response at 1000 keys; paginate so that every
# object under the prefix is downloaded, not just the first page.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    for obj in page.get('Contents', []):
        file_key = obj['Key']

        # Keys ending in '/' are folder placeholders with no file content.
        if file_key.endswith('/'):
            continue

        # Mirror the key's path below the prefix inside local_dir.
        local_file_path = os.path.join(local_dir, os.path.relpath(file_key, prefix))

        # Create any necessary subdirectories
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Download file
        s3.download_file(bucket_name, file_key, local_file_path)
        print(f"Downloaded {file_key} to {local_file_path}")

# Download the data using Kaggle API 

- Run: `pip install kaggle`
- Authenticate with the Kaggle API by setting your username and API key as environment variables: `export KAGGLE_USERNAME=...` and `export KAGGLE_KEY=...`
- Run: `kaggle datasets download -d hmendonca/cloud-cover-detection` (about 27 GB)

# Load the data from S3 bucket 

In [None]:
#! mc cp s3/mbesnier/diffusion/damage_detection/cloud-segmentation-data/final/public/ ../data/Cloud_Driven/final/public

# Segmentation Model Training

In [None]:
# Enable IPython's autoreload extension so edits to imported project modules are
# picked up without restarting the kernel. These are IPython magics: the cell only
# works inside Jupyter/IPython and should run before the project imports below.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os
# Make the project's "src" directory (one level above the notebook) importable.
sys.path.append(os.path.abspath(os.path.join(os.pardir, 'src')))

In [None]:
from datasets import prepare_cloud_segmentation_data, Cloud_DrivenData_Dataset

In [None]:
# Root folder of the cloud-cover data on disk.
# NOTE(review): the download cells earlier in this notebook write to
# "../data/cloud_data/..." and "../data/Cloud_Driven/..." — confirm this path
# matches where the data actually lives.
folder_path = "../data/Cloud_DrivenData/final/public"
# Forwarded to prepare_cloud_segmentation_data; presumably the fraction of samples
# assigned to the training split — verify in src/datasets.
train_share = 0.7
# Returns four values, unpacked here as train/validation inputs and targets.
train_x, train_y, val_x, val_y = prepare_cloud_segmentation_data(folder_path, train_share)

In [None]:
from torch.utils.data import DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Geometric augmentations used for training samples only; each one is applied
# with probability 0.5.
_train_augmentations = [
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.RandomRotate90(p=0.5),
]

# Training pipeline: resize to 512x512, augment, then convert to tensors.
training_transform = A.Compose(
    [A.Resize(512, 512), *_train_augmentations, ToTensorV2()],
    is_check_shapes=True,
)

# Validation pipeline: same resize and tensor conversion, no augmentation.
val_transform = A.Compose(
    [A.Resize(512, 512), ToTensorV2()],
    is_check_shapes=True,
)

In [None]:
# Band identifiers requested from the dataset, in this order.
# Presumably the red/green/blue bands — confirm in src/datasets.
_bands = ("B04", "B03", "B02")

# Training split, with augmentation.
train_dataset = Cloud_DrivenData_Dataset(
    x_paths=train_x, y_paths=train_y, transform=training_transform, bands=list(_bands)
)

# Validation split, resize and tensor conversion only.
valid_dataset = Cloud_DrivenData_Dataset(
    x_paths=val_x, y_paths=val_y, transform=val_transform, bands=list(_bands)
)

In [None]:
from torch.utils.data import DataLoader
# Both loaders use batches of 32 and pinned host memory; only the training
# loader shuffles its samples.
_loader_options = {"batch_size": 32, "pin_memory": True}
train_dl = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_dl = DataLoader(valid_dataset, shuffle=False, **_loader_options)

In [None]:
# Smoke test: pull one batch from the training loader and report the tensor shapes.
inputs = next(iter(train_dl))
for caption, field in (("images shape : ", "image"), ("mask shape : ", "mask")):
    print(caption, inputs[field].shape)

In [None]:
# Load and Test Model
from models import ResNet_UNET
import torch 
# Instantiate the ResNet-based U-Net with 3 input channels and 2 output channels.
model = ResNet_UNET(
    in_channels=3,
    out_channels=2,
)

In [None]:
# Smoke-test the model on a random batch and print the output shapes.
#
# The check runs in eval mode: in train mode, normalisation layers such as
# BatchNorm update their running statistics on every forward pass (even under
# no_grad), so random noise would pollute them before training starts.
# The previous mode is restored afterwards.
# Assumes ResNet_UNET is a torch.nn.Module (needs .training/.eval()/.train()) — TODO confirm.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        inputs = torch.randn((32, 3, 512, 512))
        outputs = model.predict(inputs)
        print("Predicted output shape ", outputs.shape)
        # Call the module itself rather than .forward() so registered hooks run too.
        outputs = model(inputs)
        print("Forward output shape ", outputs.shape)
finally:
    model.train(was_training)

In [None]:
import torch.optim as optim
from losses import DiceLoss
from metrics import accuracy, f1_score, iou_score, recall, precision
# Training configuration for the ResNet-UNet run.
# The optimizer and scheduler are handed over as classes together with their
# keyword arguments (presumably instantiated inside `train` — verify in src/training).

# Optimizer: AdamW
lr = 1e-4
weight_decay = 1e-5
optimizer = optim.AdamW
params_opt = dict(lr=lr, weight_decay=weight_decay)

# Scheduler: StepLR multiplies the learning rate by gamma=0.1 every 10 scheduler steps
scheduler = optim.lr_scheduler.StepLR
params_sc = dict(step_size=10, gamma=0.1)

# Loss: multiclass Dice
criterion = DiceLoss(mode="multiclass")

# Metrics passed to the training routine
metrics = [accuracy, f1_score, iou_score, recall, precision]

# Early-stopping settings
early_stopping_params = dict(patience=5, trigger_times=0)

In [None]:
from training import train 

# Launch the ResNet-UNet training run.
train(
    # Model and data
    model=model,
    train_dl=train_dl,
    valid_dl=val_dl,
    # Optimisation: classes plus their keyword arguments
    optimizer=optimizer,
    params_opt=params_opt,
    scheduler=scheduler,
    params_sc=params_sc,
    # Objective and evaluation
    loss_fn=criterion,
    metrics=metrics,
    # Run length and stopping rule
    nb_epochs=50,
    early_stopping_params=early_stopping_params,
    # Bookkeeping
    experiment_name="ResNet_Unet",
    log_dir="../runs",
    model_dir="../models",
)

# Train Segformer on Cloud Dataset 

In [None]:
# Load and Test Model
from models import Segformer
from training import train 

# Checkpoint name and label mapping for the two-class (cloud / no cloud) task.
model_name = "nvidia/segformer-b0-finetuned-ade-512-512"
label2id = {"cloud": 1, "no_cloud": 0 }
# Reverse mapping (class id -> label name); not passed to the constructor below.
id2label = {class_id: label for label, class_id in label2id.items()}
num_labels = 2

# Build the Segformer wrapper with the encoder frozen.
segformer = Segformer(
    model_name=model_name,
    label2id=label2id,
    num_labels=num_labels,
    freeze_encoder=True,
)

In [None]:
# Grab a single batch from the training loader (its image dtype is inspected below).
batch = next(iter(train_dl))

In [None]:
# Print the dtype of the model weights and of the input images so the two can be compared.
model_dtype = next(segformer.parameters()).dtype
print("Model parameter dtype:", model_dtype)
image_dtype = batch["image"].dtype
print("Input tensor dtype:", image_dtype)

In [None]:
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
import torch.optim as optim
# Hyper-parameters for the Segformer run (the model must already be built).
lr, weight_decay = 6e-5, 0.01
num_epochs = 50
# Total number of batches over the whole run.
total_steps = len(train_dl) * num_epochs

# Optimizer: AdamW class plus its keyword arguments
optimizer = optim.AdamW
params_opt = dict(lr=lr, weight_decay=weight_decay)

# Warm-up length: 1% of the total number of steps
warmup_steps = int(total_steps * 0.01)

# Lambda function for warm-up
def lr_lambda(current_step):
    """Return the learning-rate multiplier for ``current_step``.

    The multiplier rises linearly from 0 to 1 over the first ``warmup_steps``
    steps, then follows a half-cosine from 1 down to 0 until ``total_steps``.
    Both limits are read from the module-level ``warmup_steps`` and
    ``total_steps`` variables.

    Args:
        current_step: Step counter supplied by the scheduler (``LambdaLR``
            passes its internal integer counter).

    Returns:
        A plain Python float in ``[0.0, 1.0]``.
    """
    # Use math rather than torch.cos: torch.cos only accepts tensors, while the
    # argument here is an ordinary Python number.
    import math

    if current_step < warmup_steps:
        return float(current_step) / float(max(1, warmup_steps))

    # Avoid a division by zero when warm-up spans the whole schedule.
    decay_steps = max(1, total_steps - warmup_steps)
    # Clamp so the multiplier stays at 0 instead of rising again past total_steps.
    # NOTE(review): total_steps counts batches, so this assumes the scheduler is
    # stepped once per batch — confirm against training.train.
    progress = min(1.0, (current_step - warmup_steps) / decay_steps)
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))

# Scheduler: LambdaLR class, driven by the custom lr_lambda multiplier
scheduler = LambdaLR
params_sc = dict(lr_lambda=lr_lambda)

# Loss: multiclass Dice
criterion = DiceLoss(mode="multiclass")

# Metrics passed to the training routine
metrics = [accuracy, f1_score, iou_score, recall, precision]

# Early-stopping settings
early_stopping_params = dict(patience=5, trigger_times=0)

In [None]:
# Launch the Segformer training run.
train(
    # Model and data
    model=segformer,
    train_dl=train_dl,
    valid_dl=val_dl,
    # Optimisation: classes plus their keyword arguments
    optimizer=optimizer,
    params_opt=params_opt,
    scheduler=scheduler,
    params_sc=params_sc,
    # Objective and evaluation
    loss_fn=criterion,
    metrics=metrics,
    # Run length and stopping rule
    nb_epochs=50,
    early_stopping_params=early_stopping_params,
    # Bookkeeping
    experiment_name="Segformer_DrivenData",
    log_dir="../runs",
    model_dir="../models",
)

# Use Cloud Segmentation Model on Puerto Rico Data 

In [None]:
from datasets import Puerto_Rico_Building_Dataset

# Build the Puerto Rico dataset from the TIFF tile folders under base_dir.
data_puerto = Puerto_Rico_Building_Dataset(
    # Folder layout
    base_dir="../data/Puerto_Rico_dataset/tiff_tiles",
    pre_disaster_dir="Pre_Event_Grids_In_TIFF",
    post_disaster_dir="Post_Event_Grids_In_TIFF",
    mask_dir="Post_Event_Grids_In_TIFF_mask",
    extension="tif",
    # No transform, cloud filter, or pre-filtered file list is supplied
    transform=None,
    cloud_filter_params=None,
    filtered_list_path=None,
    # NOTE(review): this is the string "None", not the None object — confirm
    # that is what the dataset expects.
    preprocessing_mode="None",
)

In [None]:
# Sequential (unshuffled) loader over the Puerto Rico tiles, 32 samples per batch.
puerto_loader = DataLoader(
    data_puerto,
    batch_size=32,
    pin_memory=True,
    shuffle=False,
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch

def display_predictions_batch(images, mask_predictions, mask_labels):
    """
    Plot every sample of a batch as a row of three panels: the input image,
    the predicted mask and the ground-truth mask.

    Args:
        images (torch.Tensor or numpy.ndarray): Batch of input images, shape (N, C, H, W) or (N, H, W, C).
        mask_predictions (torch.Tensor or numpy.ndarray): Batch of predicted masks, shape (N, H, W) or (N, H, W, C).
        mask_labels (torch.Tensor or numpy.ndarray): Batch of ground truth masks, shape (N, H, W) or (N, H, W, C).

    Returns:
        None. One matplotlib figure is shown per sample.
    """

    def _as_numpy(batch):
        # Tensors are detached and moved to host memory; anything else passes through.
        if isinstance(batch, torch.Tensor):
            return batch.detach().cpu().numpy()
        return batch

    images = _as_numpy(images)
    mask_predictions = _as_numpy(mask_predictions)
    mask_labels = _as_numpy(mask_labels)

    mask_style = {"cmap": "jet", "interpolation": "none"}

    # One figure per sample in the batch.
    for sample_idx in range(images.shape[0]):
        picture = images[sample_idx]
        predicted = mask_predictions[sample_idx]
        expected = mask_labels[sample_idx]

        # Channel-first images with 1 or 3 channels are moved to (H, W, C).
        if picture.ndim == 3 and picture.shape[0] in (1, 3):
            picture = np.transpose(picture, (1, 2, 0))

        # Values above 1 are treated as being on a [0, 255] scale.
        if picture.max() > 1:
            picture = picture / 255.0

        panels = (
            (picture, "Input Image", {}),
            (predicted, "Predicted Mask", mask_style),
            (expected, "Ground Truth Mask", mask_style),
        )

        plt.figure(figsize=(12, 4))
        for column, (content, title, imshow_options) in enumerate(panels, start=1):
            plt.subplot(1, 3, column)
            plt.imshow(content, **imshow_options)
            plt.axis('off')
            plt.title(title)

        plt.tight_layout()
        plt.show()


In [None]:
# Run the Segformer model on the "pre_image" entries of one Puerto Rico batch.
inputs = next(iter(puerto_loader))
# Send the images to whichever device the model weights live on instead of
# hard-coding "cuda": inputs and weights then always share a device, and the
# cell also works on CPU-only machines.
device = next(segformer.parameters()).device
images = inputs["pre_image"].to(device)
outputs = segformer.predict(images)

In [None]:
# Plot the batch: input images, predicted masks, and the dataset's "mask" entries.
display_predictions_batch(
    images=images,
    mask_predictions=outputs,
    mask_labels=inputs["mask"],
)

In [None]:
# Persist the Segformer model under ../models via the wrapper's own save() method.
segformer.save(path="../models/Segformer_cloud_seg")