## Creating / importing and training the model

In [3]:
%%capture

# Clone the hackathon repository; it provides the `baseline` package imported below.
! git clone https://github.com/GuiguiV/hackathon-mines-invent-2024.git
# Fetch a pre-coded standard UNet architecture (imported later as `unet.unet_model`).
! git clone https://github.com/milesial/Pytorch-UNet.git
    

    
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.optim, torch.nn as nn
import keras.losses

sys.path.append("/kaggle/working/hackathon-mines-invent-2024")


from baseline.dataset import BaselineDataset, TestDataset
from baseline.collate import pad_collate


# Locations of the challenge data (train and test splits) on Kaggle.
PATH_TO_DATA_TRAIN = "/kaggle/input/data-challenge-invent-mines-2024/DATA/DATA/TRAIN/"
PATH_TO_DATA_TEST = "/kaggle/input/data-challenge-invent-mines-2024/DATA/DATA/TEST/"

# Labeled dataset built from the TRAIN folder.
train_root = Path(PATH_TO_DATA_TRAIN)
dt = BaselineDataset(train_root)



In [5]:
# Seed both random number generators (PyTorch and NumPy) with the same value.
SEED = 1234
for seed_fn in (torch.manual_seed, np.random.seed):
    seed_fn(SEED)

In [6]:
# Make the cloned Pytorch-UNet repository importable through sys.path instead
# of temporarily changing the working directory with `%cd`: if the import
# failed between the two `%cd` calls, the kernel would stay in the wrong
# directory and later relative paths (e.g. saved weights) would be misplaced.
# Inserted at position 0 so that `unet` resolves with the same priority it had
# when it was imported from the current directory.
PATH_TO_UNET_REPO = "/kaggle/working/Pytorch-UNet"
if PATH_TO_UNET_REPO not in sys.path:
    sys.path.insert(0, PATH_TO_UNET_REPO)

from unet.unet_model import UNet

# Arguments passed to the UNet constructor: presumably the number of input
# channels and the number of output classes (see the Pytorch-UNet repository).
N_CH = 10
N_CAT = 20
model_unet = UNet(N_CH, N_CAT)


/kaggle/working/Pytorch-UNet
/kaggle/working


In [7]:
from baseline.train import print_iou_per_class, print_mean_iou, train_model

# Data loader over the labeled dataset. It is not shuffled and is reused as-is
# by the evaluation cell further down.
BATCH_SIZE = 2
dl = torch.utils.data.DataLoader(
    dt, batch_size=BATCH_SIZE, collate_fn=pad_collate, shuffle=False, num_workers=3
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


load_existing_weights = True
if load_existing_weights:
    # use pretrained weights that we uploaded on kaggle under this name :
    PATH_WEIGHTS = "/kaggle/input/unet_segment/pytorch/default/1/model_well_trained.pt"
    # map_location remaps the stored tensors onto the device that is actually
    # available, so a checkpoint saved from a GPU session also loads on a
    # CPU-only kernel instead of raising a CUDA deserialization error.
    state_dict = torch.load(PATH_WEIGHTS, map_location=device, weights_only=True)
    model_unet.load_state_dict(state_dict)
    m = model_unet.to(device)
else:
    # Train from scratch with the baseline training loop.
    m = train_model(model_unet, dl, num_epochs=10)


In [63]:
#clean up cuda memory if needed

# m = m.cpu()
# import gc
# gc.collect()
# torch.cuda.empty_cache()

# t = torch.cuda.get_device_properties(0).total_memory
# r = torch.cuda.memory_reserved(0)
# a = torch.cuda.memory_allocated(0)
# print(t,r,a)

17059545088 3525312512 915934720


In [None]:
# Optional extra training pass; the resulting weights are saved to disk.
train_now = True
if train_now:
    m = train_model(model_unet, dl, num_epochs=1)
    trained_weights = m.state_dict()
    torch.save(trained_weights, "./model_trained.pt")

## Testing the iou on the labeled data

In [None]:
# IoU testing on the labeled data.
#
# Intersections and unions are accumulated per class over the WHOLE dataset
# and turned into IoU values once at the end. Averaging per-batch IoUs (with
# absent classes counted as 0) strongly underestimates the score when batches
# are small, and does not match the macro Jaccard score used for the
# leaderboard, which is computed over all pixels at once.
from sklearn.metrics import jaccard_score  # kept: used by the sanity-check cells further down
from torchvision.transforms import Resize, InterpolationMode

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# The model is fed 256x256 inputs; its output is resized back to 128x128
# before being compared with the targets.
resize_up = Resize((256, 256), InterpolationMode.NEAREST_EXACT)
resize_down = Resize((128, 128), InterpolationMode.NEAREST_EXACT)

model = m
model.eval()

count = 0  # number of processed batches (kept as a global: a later cell increments it)
total_loss = 0.0
intersection_per_class = torch.zeros(N_CAT, dtype=torch.long)
union_per_class = torch.zeros(N_CAT, dtype=torch.long)
iou_per_class = N_CAT * [0.0]
# Loop-invariant, so it is built once instead of once per batch.
criterion = nn.CrossEntropyLoss()

evaluate_on_train = True
if evaluate_on_train:
    with torch.no_grad():
        for inputs_with_t, targets_ in dl:
            count += 1

            inputs_with_t = inputs_with_t["S2"].to(device)
            # Collapse dim 1 (presumably the temporal axis - TODO confirm) with a median.
            inputs = torch.median(inputs_with_t, dim=1)[0]

            # Forward pass
            outputs = resize_down(model(resize_up(inputs)))
            targets = targets_.flatten()
            # (B, C, H, W) -> (B*H*W, C) so that each row holds one pixel's class scores.
            outputs = outputs.permute(0, 2, 3, 1).flatten(end_dim=-2)

            loss = criterion(outputs, targets.to(device))
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1).cpu()

            for class_id in range(N_CAT):
                is_target = targets == class_id
                is_pred = preds == class_id
                intersection_per_class[class_id] += (is_target & is_pred).sum()
                union_per_class[class_id] += (is_target | is_pred).sum()

    # A class that never appears in the targets nor in the predictions has an
    # undefined IoU; it is reported as 0.0 in `iou_per_class` but left out of
    # the mean, like the macro average over the labels present in the data.
    seen_classes = union_per_class > 0
    iou_tensor = intersection_per_class / union_per_class.clamp(min=1)
    iou_per_class = iou_tensor.tolist()
    mean_iou = iou_tensor[seen_classes].mean().item() if seen_classes.any() else 0.0

    print(total_loss / max(count, 1))
    print(mean_iou)

## Generating a submission

In [None]:
# Dataset and (unshuffled) loader over the TEST folder, batched like the training loader.
test_root = Path(PATH_TO_DATA_TEST)
dt_test = TestDataset(test_root)

dl_test = torch.utils.data.DataLoader(
    dataset=dt_test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=3,
    collate_fn=pad_collate,
)


In [66]:
from pathlib import Path
import geopandas as gpd


# Read the test-set metadata file; its "ID" column is used when building the submission.
METADATA = Path(PATH_TO_DATA_TEST).joinpath("metadata.geojson")
mtd = gpd.read_file(METADATA)


In [None]:
# Generating a submission: predict one class mask per test sample and encode
# each mask as a string with `masks_to_str`.
from baseline.submission_tools import masks_to_str


masks = []

# Inference must run in eval mode. This cell does not rely on the (optional)
# evaluation cell having switched the model to eval mode beforehand.
m.eval()
with torch.no_grad():
    for inputs_with_t in dl_test:
        inputs_with_t = inputs_with_t["S2"].to(device)
        # Collapse dim 1 (presumably the temporal axis - TODO confirm) with a median.
        inputs = torch.median(inputs_with_t, dim=1)[0]

        # Forward pass: upscale the input, predict, resize the scores back down.
        outputs = resize_down(m(resize_up(inputs)))

        # Most likely class per pixel: (B, C, H, W) -> (B, H, W).
        preds = torch.argmax(outputs, dim=1).cpu().numpy()

        masks.extend(masks_to_str(preds))


In [69]:
# Pair each test ID with its encoded mask, show the table and write the CSV.
# NOTE(review): assumes `masks` is in the same order as `mtd["ID"]` - confirm
# that the test dataset iterates in metadata order.
df = pd.DataFrame({"ID": mtd["ID"], "MASKS": masks})
print(df)
df.to_csv("submission_vanilla4.csv", index=False)

        ID                                              MASKS
0    20000  0 0 0 0 0 0 0 19 19 19 19 19 0 0 0 0 0 0 0 0 0...
1    20001  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 19...
2    20002  19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 1...
3    20003  19 19 19 19 0 0 19 19 19 19 19 19 19 19 19 19 ...
4    20004  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
..     ...                                                ...
469  20469  0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 19 19 19 19 19 0...
470  20470  19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 0...
471  20471  19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 1...
472  20472  19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 1...
473  20473  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...

[474 rows x 2 columns]


## Submission

This section shows you how to submit predictions on Kaggle.

Your submission must be in the CSV format. It should have two columns:

- **ID**: the ID of the image
- **MASKS**: contains the 1D-flattened string conversion of the 2D segmentation masks

To generate the `MASKS` column, we provide you with a `masks_to_str` function. We also provide the decoding script so that you have a clear understanding of how we will process your submission.


In [None]:
from baseline.submission_tools import decode_masks

- Generate a random submission (and solution)

> Note: 474 is the size of the test set


In [None]:
# Set the random seed
np.random.seed(1234)

# Number of segmentation classes. Defined here so the cell runs on a fresh
# kernel; 20 is the class count used for the UNet (N_CAT) in this notebook.
NUM_CLASSES = 20

# Random class index for every pixel of 474 masks of size 128x128.
X = np.random.randint(0, NUM_CLASSES, size=(474, 128, 128), dtype=np.uint8)
# Stored under its own name so the model predictions held in `masks` are not
# overwritten by this random demo.
masks_random = masks_to_str(X)
print(X.shape)

submission = pd.DataFrame.from_dict({"ID": range(len(X)), "MASKS": masks_random})
# Shift the IDs so that they start at 20000.
submission["ID"] = submission["ID"] + 20000

# Note that the index=False argument is important.
submission.to_csv("submission_random.csv", index=False)

Finally, note that the test set is split into two subsets: a public set (50%) and a private set (50%).

During the competition, your scores will be computed **only on the public set**.

At the end of the competition, your scores will be updated to take your score on the private set into account. The goal is to avoid indirect overfitting on the test set via repeated submissions. This will also add some suspense at the end of the challenge 😇.


## Evaluation pipeline

(Useless for predictions but may be useful if you have issues when submitting)

Here is a short review of how our automated pipeline evaluates your predictions against the test ground truths:


In [None]:
# Read the random submission back from disk.
df = pd.read_csv("submission_random.csv")

# Decode the stored strings and verify the shape of the restored array
encoded_masks = df["MASKS"].to_list()
X_restored = decode_masks(encoded_masks)
print(X_restored.shape)

Let's check that the restored submission batch is the same as the one you submitted.


In [None]:
# Reconstruction test - Should be True (every restored value equals the original one)
np.all(X == X_restored)

Let's compute the mIOU between the original prediction batch tensor and its restored version.

If everything went well, the cell should return `1.0`.


In [None]:
# Macro-averaged Jaccard score between the original and the restored masks, pixel by pixel.
y_true_flat = X.flatten()
y_pred_flat = X_restored.flatten()
miou = jaccard_score(y_true_flat, y_pred_flat, average="macro")
print("miou:", miou)

Finally, here is the evaluation pipeline in Kaggle. It incorporates all the aforementioned elements. Note that Kaggle opens your CSV with Pandas before the `score` function. It expects the `ID` column to be the first one of your CSV.


In [None]:
"""
TODO: Enter any documentation that only people updating the metric should read here.

All columns of the solution and submission dataframes are passed to your metric, except for the Usage column.

Your metric must satisfy the following constraints:
- You must have a function named score. Kaggle's evaluation system will call that function.
- You can add your own arguments to score, but you cannot change the first three (solution, submission, and row_id_column_name).
- All arguments for score must have type annotations.
- score must return a single, finite, non-null float.
"""

import numpy as np
import pandas as pd
import pandas.api.types
from sklearn.metrics import jaccard_score


class ParticipantVisibleError(Exception):
    """Error whose message is shown to participants.

    If you want an error message to be shown to participants, you must raise
    the error as a ParticipantVisibleError. All other errors will only be shown
    to the competition host. This helps prevent unintentional leakage of
    solution data.
    """


def decode_masks(
    masks: list[str],
    target_shape: tuple[int, int] = (128, 128),
) -> np.ndarray:
    """
    Rebuild a batch of 2D masks from their string encoding.

    Each string holds space-separated integers; it is parsed as uint8 values
    and reshaped to ``target_shape``.

    Args:
        masks (list[str]): list of stringified masks
        target_shape (tuple[int, int]): shape given to every decoded mask

    Returns:
        np.ndarray: reconstructed batch of masks
    """
    decoded = []
    for encoded in masks:
        flat_values = np.fromstring(encoded, sep=" ", dtype=np.uint8)
        decoded.append(flat_values.reshape(target_shape))
    return np.array(decoded)


def score(
    solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str
) -> float:
    """
    Scoring function. Takes the solution and submission and returns the mIOU.

    Args:
        solution (pd.DataFrame): ground truth; its "MASKS" column holds the
            stringified masks.
        submission (pd.DataFrame): participant predictions; must contain the
            "ID" and "MASKS" columns, with "MASKS" being a string column.
        row_id_column_name (str): part of the signature required by Kaggle;
            not used by this metric.

    Returns:
        float: macro-averaged Jaccard score computed over all pixels of all masks.

    Raises:
        ParticipantVisibleError: if a required column is missing, if the
            "MASKS" column is not a string column, or if the decoded submission
            does not have the same shape as the decoded solution.
    """
    # Check submission files
    COL_MASK = "MASKS"
    expected_columns = ["ID", COL_MASK]
    for col in expected_columns:
        if col not in submission.columns:
            raise ParticipantVisibleError(
                f"Required column: {col} not found in the submission dataframe. Check your column names."
            )

    if not pandas.api.types.is_string_dtype(submission[COL_MASK]):
        # Name the checked column explicitly instead of relying on the loop
        # variable left over from the column check above.
        raise ParticipantVisibleError(
            f"Submission column {COL_MASK} must be an object (str) column."
        )

    # Parse and decode the masks into tensors
    masks_submission = decode_masks(submission[COL_MASK].to_list())
    masks_solution = decode_masks(solution[COL_MASK].to_list())

    if masks_submission.shape != masks_solution.shape:
        raise ParticipantVisibleError(
            f"Submission should be of shape {masks_solution.shape} after decoding. Got submission.shape: {masks_submission.shape}."
        )

    # Coerce to a builtin float, as the metric contract asks for a plain float.
    return float(
        jaccard_score(
            y_true=masks_solution.flatten(),
            y_pred=masks_submission.flatten(),
            average="macro",
        )
    )

In [None]:
#Final score : 0.152 