# Fine-tune SegFormer for car piece segmentation

Following the Hugging Face tutorial: https://huggingface.co/blog/fine-tune-segformer

## Load Dataset

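TODO: try data augmentation with `ColorJitter`. It is currently commented out in the cell below, so the only transformations applied are tensor conversion and a normalisation with mean 0 and std 1.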

In [16]:
from torchvision import transforms
from src.data.DeloitteDataset import split_dataset

# from torchvision.transforms import ColorJitter
# jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) 

# Per-sample preprocessing: convert to a tensor, then normalise with
# mean 0.0 / std 1.0 (which leaves the values numerically unchanged).
transformations = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(0.0, 1.0)]
)

# Build the train / validation / test datasets from the cleaned data folder.
# NOTE(review): the ids file presumably lists the samples held out for
# testing -- confirm against split_dataset.
(train_dataset,
 valid_dataset,
 testing_dataset) = split_dataset(
    '../data/raw/carseg_data/clean_data',
    '../references/test_set_ids.txt',
    transform=transformations,
)

## Feature extraction

In [33]:
import numpy as np

In [19]:
from transformers import SegformerFeatureExtractor

# Instantiate the extractor with its library defaults (no arguments passed).
# NOTE(review): recent transformers releases deprecate SegformerFeatureExtractor
# in favour of SegformerImageProcessor -- confirm against the installed version.
feature_extractor = SegformerFeatureExtractor()

In [25]:
# Pull the first training sample as an (input, mask) pair.
# NOTE(review): `input` shadows the Python builtin of the same name; if it is
# renamed, the feature-extraction cell that reuses it must be updated too.
input, mask = train_dataset[0]
input.size()  # display the shape of the input tensor

torch.Size([3, 256, 256])

In [36]:
# Run one (input, mask) pair through the extractor. mask[0] takes the first
# entry along the mask's leading axis (presumably the channel axis -- confirm).
features = feature_extractor(images=input, segmentation_maps=mask[0])

# Sanity checks: which fields came back, and which class ids the labels hold.
print('Keys of features:', features.keys())
print('Unique elements in labels:', np.unique(features['labels']))

Keys of features: dict_keys(['pixel_values', 'labels'])
Unique elements in labels: [0 1 2 3 4 7 8]


In [48]:
# Shape of the first label map returned by the feature extractor.
features['labels'][0].shape

(512, 512)

## Load SegFormer pretrained model

In [14]:
from transformers import SegformerForSemanticSegmentation
from src.visualization.visualization_fct import get_mask_names

# Mapping returned by the project helper, plus its inverse (values -> keys).
id2label = get_mask_names()
label2id = dict(zip(id2label.values(), id2label.keys()))

# Load the checkpoint and attach the label mappings to the model config.
# The load warning printed below reports that the decode head weights are
# newly initialised rather than taken from the checkpoint.
pretrained_model_name = "nvidia/mit-b0"
model = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name, id2label=id2label, label2id=label2id
)

Downloading:   0%|          | 0.00/70.0k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at nvidia/mit-b0 were not used when initializing SegformerForSemanticSegmentation: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing SegformerForSemanticSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SegformerForSemanticSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b0 and are newly initialized: ['decode_head.linear_fuse.weight', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.linear_c.0.proj.weight', 'decode_head.batch_norm.running_var', 'decode_head.linear_c.3.proj.bias', 'decode

In [15]:
# Display the model's module tree to inspect the architecture.
model

SegformerForSemanticSegmentation(
  (segformer): SegformerModel(
    (encoder): SegformerEncoder(
      (patch_embeddings): ModuleList(
        (0): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(3, 32, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
          (layer_norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        )
        (1): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (2): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(64, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((160,), eps=1e-05, elementwise_affine=True)
        )
        (3): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(160, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  