#### Imports

In [1]:
import torch
import random
import numpy as np

from functions.download_data import (
    get_patchs_labels,
    normalization_params,
    get_golden_paths,
    pooled_std_dev,
)

from functions.filter import filter_indices_from_labels

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

from functions.instanciators import get_dataset, get_lightning_module, get_trainer
from torch.utils.data import random_split, DataLoader
from torch import Generator

import mlflow

import gc

In [2]:
seed = 12345 
torch.manual_seed(seed)
random.seed(0)
np.random.seed(0)

#### Variables

In [3]:
# 
# experiment_name = "test-dev"
# run_name =  "kikito_stagios"
# 
# 
# 
# augment_size = 512
# 
# 
# logits = 1
# freeze_encoder = 0
# epochs = 10
# 
# test_batch_size = 8
# num_sanity_val_steps = 1
# accumulate_batch = 8
# module_name = "segformer-b5"
# loss_name =  "cross_entropy_weighted"
# building_class_weight = 1
# label_smoothing = 0.0
# lr = 0.00005
# momentum = float
# scheduler_name = "one_cycle"
# scheduler_patience = 3
# patience = 200
# 
# 
# cuda = 0
# cuda = cuda and torch.cuda.is_available()


In [4]:
# Import train
from_s3 = 0
task = "segmentation"
source = "PLEIADES"
dep, year  = "MARTINIQUE", "2022"
tiles_size = 250
type_labeler = "BDTOPO"

patches, labels = get_patchs_labels(
        from_s3, task, source, dep, year, tiles_size, type_labeler, train=True
    )

train_patches = []
train_labels = []
test_patches = []
test_labels = []
normalization_means = []
normalization_stds = []
weights = []

patches.sort()
labels.sort()
indices = filter_indices_from_labels(labels, -1.0, 2.0)
train_patches += [patches[idx] for idx in indices]
train_labels += [labels[idx] for idx in indices]

In [5]:
# Import test
patches, labels = get_patchs_labels(
    from_s3, task, source, dep, year, tiles_size, type_labeler, train=False
)

patches.sort()
labels.sort()
test_patches += list(patches)
test_labels += list(labels)

# Normalisation
normalization_mean, normalization_std = normalization_params(
    task, source, dep, year, tiles_size, type_labeler
)
normalization_means.append(normalization_mean)
normalization_stds.append(normalization_std)
weights.append(len(indices))


In [6]:
# Golden test
golden_patches, golden_labels = get_golden_paths(
    from_s3, task, source, "MAYOTTE_CLEAN", "2022", tiles_size
)

golden_patches.sort()
golden_labels.sort()

`s3/projet-slums-detection/golden-test/patchs/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0026.jp2` -> `data/data-preprocessed/golden-test/patchs/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0026.jp2`
`s3/projet-slums-detection/golden-test/patchs/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0512_8592_U38S_8Bits_0005.jp2` -> `data/data-preprocessed/golden-test/patchs/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0512_8592_U38S_8Bits_0005.jp2`
`s3/projet-slums-detection/golden-test/patchs/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0025.jp2` -> `data/data-preprocessed/golden-test/patchs/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0025.jp2`
`s3/projet-slums-detection/golden-test/patchs/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8593_U38S_8Bits_0031.jp2` -> `data/data-preprocessed/golden-test/patchs/segmentat

`s3/projet-slums-detection/golden-test/labels/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0026.npy` -> `data/data-preprocessed/golden-test/labels/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0026.npy`
`s3/projet-slums-detection/golden-test/labels/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0514_8595_U38S_8Bits_0003.npy` -> `data/data-preprocessed/golden-test/labels/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0514_8595_U38S_8Bits_0003.npy`
`s3/projet-slums-detection/golden-test/labels/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0025.npy` -> `data/data-preprocessed/golden-test/labels/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0513_8568_U38S_8Bits_0025.npy`
`s3/projet-slums-detection/golden-test/labels/segmentation/PLEIADES/MAYOTTE_CLEAN/2022/250/ORT_976_2022_0524_8587_U38S_8Bits_0011.npy` -> `data/data-preprocessed/golden-test/labels/segmentat

In [7]:
# Transformations

n_bands = 3
normalization_mean = np.average(
    [mean[:n_bands] for mean in normalization_means], weights=weights, axis=0
)
normalization_std = [
    pooled_std_dev(
        weights,
        [mean[i] for mean in normalization_means],
        [std[i] for std in normalization_stds],
    )
    for i in range(n_bands)
]

In [8]:
transform_list = [
    A.HorizontalFlip(),
    A.VerticalFlip(),
    A.Normalize(
        max_pixel_value=1.0,
        mean=normalization_mean,
        std=normalization_std,
    ),
    ToTensorV2(),
]

augment_size = 250
if augment_size != tiles_size:
    transform_list.insert(0, A.Resize(augment_size, augment_size))
transform = A.Compose(transform_list)

test_transform_list = [
    A.Normalize(
        max_pixel_value=1.0,
        mean=normalization_mean,
        std=normalization_std,
    ),
    ToTensorV2(),
]
if augment_size != tiles_size:
    test_transform_list.insert(0, A.Resize(augment_size, augment_size))
test_transform = A.Compose(test_transform_list)

### Dataset

In [9]:
dataset = get_dataset(task, train_patches, train_labels, n_bands, from_s3, transform)
dataset = get_dataset(task, train_patches[:40], train_labels[:40], n_bands, from_s3, transform)
test_dataset = get_dataset(task, test_patches, test_labels, n_bands, from_s3, test_transform)
golden_dataset = get_dataset(
    task, golden_patches, golden_labels, n_bands, from_s3, test_transform
)

In [10]:
train_dataset, val_dataset = random_split(dataset, [0.8, 0.2], generator=Generator())

In [11]:
batch_size = 8
test_batch_size = 8
cuda = 0
kwargs = {"num_workers": os.cpu_count(), "pin_memory": True} if cuda else {}

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, **kwargs
)
val_loader = DataLoader(
    val_dataset, batch_size=test_batch_size, shuffle=False, drop_last=True, **kwargs
)
test_loader = DataLoader(
    test_dataset, batch_size=test_batch_size, shuffle=False, drop_last=True, **kwargs
)
golden_loader = DataLoader(
    golden_dataset, batch_size=test_batch_size, shuffle=False, drop_last=True, **kwargs
)

### Trainer

In [15]:
%load_ext autoreload
%autoreload 2

from modeles_gaetan import UNet
from config.module import module_dict
from functions.instanciators import get_model

print(module_dict)
module_name = "UNetGaetan"

model = get_model(module_name,3,True,False)

# test data
batch = next(iter(train_loader))
labels = batch["labels"]
images = batch["pixel_values"]

output = model(images)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
{'deeplabv3': <class 'models.components.segmentation_models.DeepLabv3Module'>, 'single_class_deeplabv3': <class 'models.components.segmentation_models.SingleClassDeepLabv3Module'>, 'segformer-b0': <class 'models.components.segmentation_models.SegformerB0'>, 'segformer-b1': <class 'models.components.segmentation_models.SegformerB1'>, 'segformer-b2': <class 'models.components.segmentation_models.SegformerB2'>, 'segformer-b3': <class 'models.components.segmentation_models.SegformerB3'>, 'segformer-b4': <class 'models.components.segmentation_models.SegformerB4'>, 'segformer-b5': <class 'models.components.segmentation_models.SegformerB5'>, 'UNetGaetan': <class 'modeles_gaetan.UNet'>}




In [16]:
from functions.instanciators import get_loss
loss_name =  "cross_entropy_weighted"
building_class_weight = 1
label_smoothing = 0.0

loss = get_loss(
        loss_name, building_class_weight=building_class_weight, label_smoothing=label_smoothing
    )


In [24]:
patience = 200
earlystop = {"monitor": "validation_loss", "patience": patience, "mode": "min"}
checkpoints = [
    {
        "monitor": "validation_loss",
        "save_top_k": 1,
        "save_last": False,
        "mode": "min",
    }
]


label_smoothing = 0.0
epochs = 1
num_sanity_val_steps = 1
accumulate_batch = 8
logits = 1
freeze_encoder = 0
lr = 0.00005
momentum = float
scheduler_name = "one_cycle"
scheduler_patience = 3

trainer = get_trainer(earlystop, checkpoints, epochs, num_sanity_val_steps, accumulate_batch)

light_module = get_lightning_module(
    module_name=module_name,
    loss_name=loss_name,
    building_class_weight=building_class_weight,
    label_smoothing=label_smoothing,
    n_bands=n_bands,
    logits=bool(logits),
    freeze_encoder=bool(freeze_encoder),
    task=task,
    lr=lr,
    momentum=momentum,
    earlystop=earlystop,
    scheduler_name=scheduler_name,
    scheduler_patience=scheduler_patience,
    cuda=cuda,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/opt/mamba/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


### Entraînement

In [25]:
remote_server_uri = "https://projet-slums-detection-128833.user.lab.sspcloud.fr"
experiment_name = "test-dev"
run_name =  "kikito_stagios2"

mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/9', creation_time=1697531551081, experiment_id='9', last_update_time=1697531551081, lifecycle_stage='active', name='test-dev', tags={}>

In [26]:
with mlflow.start_run(run_name=run_name):
    mlflow.pytorch.autolog()
    # 7- Training the model on the training set
    torch.cuda.empty_cache()
    torch.set_float32_matmul_precision("medium")
    gc.collect()

    trainer.fit(light_module, train_loader, val_loader)

    best_model = type(light_module).load_from_checkpoint(
        checkpoint_path=trainer.checkpoint_callback.best_model_path,
        model=light_module.model,
        loss=light_module.loss,
        optimizer=light_module.optimizer,
        optimizer_params=light_module.optimizer_params,
        scheduler=light_module.scheduler,
        scheduler_params=light_module.scheduler_params,
        scheduler_interval=light_module.scheduler_interval,
    )

    # Logging the model with the associated code
    mlflow.pytorch.log_model(
        artifact_path="model",
        code_paths=[
            "src/models/",
            "src/optim/",
            "src/config/",
        ],
        pytorch_model=best_model.to("cpu"),
    )

    # Log normalization parameters
    mlflow.log_params(
        {
            "normalization_mean": normalization_mean.tolist(),
            "normalization_std": normalization_std,
        }
    )
    # TODO: Add signature for inference

    # 8- Test
    trainer.test(dataloaders=[test_loader, golden_loader], ckpt_path="best")



Loading `train_dataloader` to estimate number of stepping batches.
/opt/mamba/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.

  | Name  | Type             | Params
-------------------------------------------
0 | model | UNet             | 31.0 M
1 | loss  | CrossEntropyLoss | 0     
-------------------------------------------
31.0 M    Trainable params
0         Non-trainable params
31.0 M    Total params
124.127   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/mamba/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=71` in the `DataLoader` to improve performance.
/opt/mamba/lib/python3.11/site-packages/pytorch_lightning/utilities/data.py:77: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 8. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


MlflowException: ("Failed to copy the specified code path 'src/models/' into the model artifacts. It appears that your code path includes file(s) that cannot be copied. Please specify a code path that does not include such files and try again.",)

In [None]:
#for batch in  train_loader :
 #   print batch.keys()

iterateur  = iter(train_loader)

batch = next(iterateur)
batch.keys()

dict_keys(['pixel_values', 'labels', 'metadata'])

In [None]:
batch_image = batch["pixel_values"]
batch_image.shape

torch.Size([8, 3, 250, 250])

In [None]:
output = best_model(batch_image)
output.shape


torch.Size([8, 2, 63, 63])

In [None]:
batch["metadata"]