# **Segformer**

In [None]:
from datasets import load_dataset
from PIL import Image
from huggingface_hub import hf_hub_download
import json
import torchvision.transforms as t
from transformers import AutoImageProcessor
from torchvision.transforms import ColorJitter
from transformers import SegformerImageProcessor
from torchvision.transforms import v2
import torch
from torch import nn
import evaluate
from transformers import Trainer
from transformers import SegformerForSemanticSegmentation
from transformers import TrainingArguments




Access token to get access to the dataset. 

You just need a HuggingFace account to have access to it. 

If you don't have a HuggingFace account, use the temporary token shared in the report.

In [None]:
import os

# Prefer a token supplied through the HF_TOKEN environment variable so the
# secret does not have to be pasted into (and saved with) the notebook.
# When the variable is unset or empty, fall back to the placeholder below,
# which must then be replaced by hand with a real token.
access_token = os.environ.get("HF_TOKEN") or "place_token_in_this string"

Load the dataset:

In [None]:
# Hub identifier of the dataset; reused when fetching id2label.json.
hf_dataset_identifier = "segments/sidewalk-semantic"
# Load the dataset, authenticating with the access token defined earlier.
ds = load_dataset(hf_dataset_identifier, token=access_token)

Prepare labels:

In [None]:
# Fetch the id -> label-name mapping that is stored next to the dataset.
filename = "id2label.json"
id2label_path = hf_hub_download(
    hf_dataset_identifier,
    filename,
    repo_type="dataset",
    token=access_token,
)
# Read through a context manager so the file handle is always closed.
with open(id2label_path, "r", encoding="utf-8") as id2label_file:
    id2label = json.load(id2label_file)

# JSON object keys are always strings; convert them to integer class ids.
id2label = {int(k): v for k, v in id2label.items()}
# Reverse mapping: label name -> integer class id.
label2id = {v: k for k, v in id2label.items()}

num_labels = len(id2label)

In [None]:
# Show the number of classes together with the list of label names.
num_labels, list(label2id)

Prepare the preprocessing for the data

In [None]:
# Load the image processor that belongs to the pretrained checkpoint named
# below; the bare last expression makes the notebook display its settings.
pretrained_model_name = "nvidia/mit-b2" 
image_processor = AutoImageProcessor.from_pretrained(pretrained_model_name)
image_processor

In [None]:

def transforms(image):
    """Convert a PIL image into a channels-first tensor.

    The original code called ``t.ToTensor(image)``, which passes the image to
    the transform's constructor and raises ``TypeError``; it then permuted the
    axes, although torchvision's PIL-to-tensor conversions already return a
    channels-first (C, H, W) tensor.

    ``PILToTensor`` is used instead of ``ToTensor`` because it keeps the raw
    pixel values instead of rescaling them to [0, 1].
    NOTE(review): this assumes the image processor applied afterwards performs
    its own rescaling -- confirm against the processor configuration.

    Args:
        image: A PIL image (``preprocess`` passes RGB-converted images).

    Returns:
        A tensor in (C, H, W) layout.
    """
    # Instantiate the transform first, then call it on the image.
    return t.PILToTensor()(image)


def preprocess(example_batch):
    """Turn a raw batch into model inputs with ``image_processor``.

    Args:
        example_batch: Mapping with a ``"pixel_values"`` entry (images that
            support ``.convert("RGB")``) and a ``"label"`` entry.

    Returns:
        Whatever ``image_processor`` returns for the converted images and
        their labels.
    """
    rgb_tensors = [
        transforms(raw_image.convert("RGB"))
        for raw_image in example_batch["pixel_values"]
    ]
    targets = list(example_batch["label"])
    return image_processor(rgb_tensors, targets)

In [None]:

# Default-configured processor used by the train/val transforms and by
# compute_metrics (which reads its do_reduce_labels flag).
processor = SegformerImageProcessor()

# Colour augmentation: applied to the input image only.
image_only_transforms = v2.Compose(
    [ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)]
)

# Flip augmentation: applied to the image together with its label.
label_image_transforms = v2.Compose(
    [
        v2.RandomHorizontalFlip(p=0.5),
        v2.RandomVerticalFlip(p=0.5),
    ]
)

def train_transforms(example_batch):
    """Augment a training batch and convert it into model inputs.

    Each sample is augmented on its own. The original code passed the whole
    lists of images and labels to ``label_image_transforms`` in one call; v2
    transforms draw their random parameters once per call, so every sample in
    the batch received the same flip decision.

    Args:
        example_batch: Mapping with ``'pixel_values'`` (input images) and
            ``'label'`` (segmentation labels), index-aligned.
            NOTE(review): assumes both are PIL images, so that the v2 flips
            treat the label like an image -- confirm against the dataset.

    Returns:
        The output of ``processor`` for the augmented images and labels.
    """
    images = []
    labels = []
    for image, label in zip(example_batch['pixel_values'], example_batch['label']):
        # Colour jitter must not touch the label, so it sees the image alone.
        image = image_only_transforms(image)
        # A joint call keeps image and label flipped identically; calling it
        # per sample gives every sample its own random flip.
        image, label = label_image_transforms(image, label)
        images.append(image)
        labels.append(label)

    return processor(images, labels)


def val_transforms(example_batch):
    """Convert a validation batch into model inputs without augmentation.

    Args:
        example_batch: Mapping with ``'pixel_values'`` and ``'label'``.

    Returns:
        The output of ``processor`` for the unmodified images and labels.
    """
    raw_images = list(example_batch['pixel_values'])
    raw_labels = list(example_batch['label'])
    return processor(raw_images, raw_labels)




Prepare dataset:

In [None]:
# Split the "train" split into training (80%) and validation (20%) parts.
# A fixed seed makes the split reproducible: without it every run draws a
# different validation set, so the models evaluated later in this notebook
# would not be compared on one fixed hold-out set.
splits = ds["train"].train_test_split(test_size=0.2, seed=42)
train_ds = splits["train"]
val_ds = splits["test"]

In [None]:
# Attach the matching batch transform to each split.
for dataset_split, batch_transform in (
    (train_ds, train_transforms),
    (val_ds, val_transforms),
):
    dataset_split.set_transform(batch_transform)

Load our segformer model

In [None]:

# Use the mit-b2 encoder for our experiments.
pretrained_model_name = "nvidia/mit-b2"

# NOTE(review): long_decoder_depth and width_list are presumably consumed by
# a customised decode head -- confirm they are supported by the installed
# transformers build.
model = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name,
    id2label=id2label,
    label2id=label2id,
    long_decoder_depth=0,
    width_list=[1024, 1280, 1536, 1792],
)

In [None]:
# Print the architecture of the model's decode head.
print(model.decode_head)

In [None]:
# Print the value returned by the decode head's num_parameters().
print(model.decode_head.num_parameters())

Set the parameters for the training pipeline:

In [None]:

# Hyper-parameters for the training run.
epochs = 80
lr = 6e-5
batch_size = 4
checkpoints_path = "./training_logs"

# Evaluate and checkpoint every 20 steps; at the end of training reload the
# checkpoint with the highest eval_mean_iou.
training_args = TrainingArguments(
    output_dir=checkpoints_path,
    learning_rate=lr,
    lr_scheduler_type="polynomial",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="steps",
    eval_steps=20,
    eval_accumulation_steps=5,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=3,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_mean_iou",
    greater_is_better=True,
)


The metrics for training and evaluation (loss and mIoU):

In [None]:


# Load the "mean_iou" metric once; compute_metrics uses it on every evaluation.
metric = evaluate.load("mean_iou")

def compute_metrics(eval_pred):
    """Compute the evaluation metrics from raw model outputs.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` is a NumPy array
            (it is wrapped with ``torch.from_numpy``) and ``labels`` exposes
            ``.shape`` whose last two entries give the target size.

    Returns:
        The result of the ``mean_iou`` metric with the per-category and
        accuracy entries removed.
    """
    with torch.no_grad():
        logits, labels = eval_pred

        # Resize the logits to the label resolution, then take the arg-max
        # over dimension 1 to obtain the predicted class ids.
        upsampled_logits = nn.functional.interpolate(
            torch.from_numpy(logits),
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        )
        pred_labels = upsampled_logits.argmax(dim=1).detach().cpu().numpy()

        metrics = metric.compute(
            predictions=pred_labels,
            references=labels,
            num_labels=len(id2label),
            ignore_index=0,
            reduce_labels=processor.do_reduce_labels,
        )

        # Drop the entries that are not needed in the report.
        for unneeded_key in (
            "per_category_accuracy",
            "mean_accuracy",
            "overall_accuracy",
            "per_category_iou",
        ):
            metrics.pop(unneeded_key)
        return metrics


Build the trainer for training:

In [None]:

# Combine model, data splits, training arguments and metric function.
trainer = Trainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    args=training_args,
)


Launch the training:

In [None]:
# Run the training loop configured by training_args.
trainer.train()

Save the model:

In [None]:
# Save the model held by the trainer to a local directory.
# NOTE(review): the evaluation cells below load "./segformer_wide_3" and
# "./segformer_wide_4", so this name presumably needs the variant suffix
# appended before running -- confirm.
trainer.save_model("./segformer_wide_")

Evaluate the model

In [None]:
# Evaluate on val_ds (the trainer's eval_dataset) and display the metrics.
trainer.evaluate()

# Evaluation of all models

Here we evaluate each of our models.

## The vanilla model (just the segformer)

In [None]:
# Same values as in the training section, repeated so that the evaluation
# cells can be run without executing the training cells first.
epochs = 80
lr = 6e-5
batch_size = 4
checkpoints_path = "./training_logs"


In [None]:
# Load the vanilla SegFormer model from its local directory.
pretrained_model_name = "./segformer_vanilla"
model_eval = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)

In [None]:
# Arguments shared by every evaluation trainer below.
# NOTE(review): only evaluate() is called on trainers built from these
# arguments in this file, so the training-specific fields presumably have no
# effect here -- confirm.
training_args_eval = TrainingArguments(
    output_dir=checkpoints_path,
    learning_rate=lr,
    lr_scheduler_type="polynomial",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="steps",
    eval_steps=20,
    eval_accumulation_steps=5,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=3,
    logging_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_mean_iou",
    greater_is_better=True,
)

In [None]:
# Trainer wrapping the loaded model; used here to run the evaluation loop.
trainer_eval = Trainer(
    model=model_eval,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    args=training_args_eval,
)

In [None]:
# Report size, decoder architecture and validation metrics of the model.
model_tag = pretrained_model_name[2:]  # drop the first two characters ("./")
total_params = model_eval.num_parameters()
decoder_params = model_eval.decode_head.num_parameters()

print("Model " + model_tag + ":")
print(f"    total number of parameters :{total_params}")
print(f"    number of parameters on the decoder :{decoder_params}")
print("\n Decoder architecture:")
print(model_eval.decode_head)
print("\n ### Evaluation of the model " + model_tag + ":")
trainer_eval.evaluate()

# Here we test all the long decoders

First the segformer_long_3

In [None]:
# Load the model stored in ./segformer_long_3, then report its size, decoder
# architecture and validation metrics.
pretrained_model_name = "./segformer_long_3"
model_eval = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)
trainer_eval = Trainer(
    model=model_eval,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    args=training_args_eval,
)

model_tag = pretrained_model_name[2:]  # drop the first two characters ("./")
total_params = model_eval.num_parameters()
decoder_params = model_eval.decode_head.num_parameters()

print("Model " + model_tag + ":")
print(f"    total number of parameters :{total_params}")
print(f"    number of parameters on the decoder :{decoder_params}")
print("\n Decoder architecture:")
print(model_eval.decode_head)
print("\n ### Evaluation of the model " + model_tag + ":")
trainer_eval.evaluate()

Then the segformer_long_5

In [None]:
# Load the model stored in ./segformer_long_5, then report its size, decoder
# architecture and validation metrics.
pretrained_model_name = "./segformer_long_5"
model_eval = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)
trainer_eval = Trainer(
    model=model_eval,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    args=training_args_eval,
)

model_tag = pretrained_model_name[2:]  # drop the first two characters ("./")
total_params = model_eval.num_parameters()
decoder_params = model_eval.decode_head.num_parameters()

print("Model " + model_tag + ":")
print(f"    total number of parameters :{total_params}")
print(f"    number of parameters on the decoder :{decoder_params}")
print("\n Decoder architecture:")
print(model_eval.decode_head)
print("\n ### Evaluation of the model " + model_tag + ":")
trainer_eval.evaluate()

Finally the segformer_long_8

In [None]:
# Load the model stored in ./segformer_long_8, then report its size, decoder
# architecture and validation metrics.
pretrained_model_name = "./segformer_long_8"
model_eval = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)
trainer_eval = Trainer(
    model=model_eval,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    args=training_args_eval,
)

model_tag = pretrained_model_name[2:]  # drop the first two characters ("./")
total_params = model_eval.num_parameters()
decoder_params = model_eval.decode_head.num_parameters()

print("Model " + model_tag + ":")
print(f"    total number of parameters :{total_params}")
print(f"    number of parameters on the decoder :{decoder_params}")
print("\n Decoder architecture:")
print(model_eval.decode_head)
print("\n ### Evaluation of the model " + model_tag + ":")
trainer_eval.evaluate()

## Here we test all the wide decoders


First segformer_wide_3

In [None]:
# Load the model stored in ./segformer_wide_3, then report its size, decoder
# architecture and validation metrics.
pretrained_model_name = "./segformer_wide_3"
model_eval = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)
trainer_eval = Trainer(
    model=model_eval,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    args=training_args_eval,
)

model_tag = pretrained_model_name[2:]  # drop the first two characters ("./")
total_params = model_eval.num_parameters()
decoder_params = model_eval.decode_head.num_parameters()

print("Model " + model_tag + ":")
print(f"    total number of parameters :{total_params}")
print(f"    number of parameters on the decoder :{decoder_params}")
print("\n Decoder architecture:")
print(model_eval.decode_head)
print("\n ### Evaluation of the model " + model_tag + ":")
trainer_eval.evaluate()

Finally segformer_wide_4

In [None]:
# Load the model stored in ./segformer_wide_4, then report its size, decoder
# architecture and validation metrics.
pretrained_model_name = "./segformer_wide_4"
model_eval = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)
trainer_eval = Trainer(
    model=model_eval,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    args=training_args_eval,
)

model_tag = pretrained_model_name[2:]  # drop the first two characters ("./")
total_params = model_eval.num_parameters()
decoder_params = model_eval.decode_head.num_parameters()

print("Model " + model_tag + ":")
print(f"    total number of parameters :{total_params}")
print(f"    number of parameters on the decoder :{decoder_params}")
print("\n Decoder architecture:")
print(model_eval.decode_head)
print("\n ### Evaluation of the model " + model_tag + ":")
trainer_eval.evaluate()