Install pytorch! 

In [1]:
from torch.utils.data import Dataset
import os
from PIL import Image
import numpy as np
class SemanticSegmentationDataset(Dataset):
    """Image (semantic) segmentation dataset."""

    def __init__(self, root_dir, image_processor, train=True):
        """
        Args:
            root_dir (string): Root directory of the dataset containing the images + annotations.
            image_processor (SegFormerImageProcessor): image processor to prepare images + segmentation maps.
            train (bool): Whether to load "training" or "validation" images + annotations.
        """
        self.root_dir = root_dir
        self.image_processor = image_processor
        self.train = train

        sub_path = "training" if self.train else "validation"
        self.img_dir = os.path.join(self.root_dir, "images", sub_path)
        self.ann_dir = os.path.join(self.root_dir, "annotations", sub_path)

        # read images
        image_file_names = []
        for root, dirs, files in os.walk(self.img_dir):
          image_file_names.extend(files)
        self.images = sorted(image_file_names)

        # read annotations
        annotation_file_names = []
        for root, dirs, files in os.walk(self.ann_dir):
          annotation_file_names.extend(files)
        self.annotations = sorted(annotation_file_names)

        assert len(self.images) == len(self.annotations), "There must be as many images as there are segmentation maps"

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):

        image = np.array(Image.open(os.path.join(self.img_dir, self.images[idx])))
        segmentation_map = np.array(Image.open(os.path.join(self.ann_dir, self.annotations[idx])).convert("L"))
        
        # randomly crop + pad both image and segmentation map to same size
        encoded_inputs = self.image_processor(image, segmentation_map, return_tensors="pt")

        for k,v in encoded_inputs.items():
          encoded_inputs[k].squeeze_() # remove batch dimension

        return encoded_inputs


In [2]:
from transformers import SegformerImageProcessor

# my_root_dir = ".content/ADE20k_toy_dataset"
my_root_dir = ".content/customDatasetWithLabel"
my_image_processor = SegformerImageProcessor(do_reduce_labels=True) #changed reduce_labels to do_reduce_labels

train_dataset = SemanticSegmentationDataset(root_dir=my_root_dir, image_processor=my_image_processor)
valid_dataset = SemanticSegmentationDataset(root_dir=my_root_dir, image_processor=my_image_processor, train=False)

In [3]:
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(valid_dataset))


Number of training examples: 427
Number of validation examples: 0


In [4]:
my_encoded_inputs= train_dataset[0]

In [5]:
my_encoded_inputs["pixel_values"].shape
    

torch.Size([3, 512, 512])

In [6]:
my_encoded_inputs["labels"].shape


torch.Size([512, 512])

In [7]:
my_encoded_inputs["labels"]


tensor([[255, 255, 255,  ...,   7,   7,   7],
        [  1,   1,   1,  ...,   7,   7,   7],
        [  1,   1,   1,  ...,   7,   7,   7],
        ...,
        [  1,   1,   1,  ..., 255, 255, 255],
        [  1,   1,   1,  ..., 255, 255, 255],
        [  1,   1,   1,  ..., 255, 255, 255]])

In [8]:
my_encoded_inputs["labels"].squeeze().unique() #to see unique ids of labels in an image

tensor([  1,   2,   3,   7, 255])

In [9]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=2)


In [10]:
batch = next(iter(train_dataloader))


In [11]:
for k2,v2 in batch.items():
  print(k2, v2.shape)

pixel_values torch.Size([2, 3, 512, 512])
labels torch.Size([2, 512, 512])


In [12]:
batch["labels"].shape


torch.Size([2, 512, 512])

In [13]:
mask = (batch["labels"] != 255)
mask


tensor([[[False, False, False,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [ True,  True,  True,  ...,  True,  True,  True],
         [ True,  True,  True,  ...,  True,  True,  True],
         ...,
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]]])

In [14]:
batch["labels"][mask]


tensor([1, 1, 1,  ..., 3, 3, 3])

In [15]:
# #defining the model and labels. 
# 
from transformers import SegformerForSemanticSegmentation
import json

#from huggingface_hub import hf_hub_download
# 
# # load id2label mapping from a JSON on the hub
# repo_id = "huggingface/label-files"
# filename = "ade20k-id2label.json"
# id2label = json.load(open(hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset"), "r"))
id2label = {0: 'other', 1: 'Manhole Cover', 2: 'Concrete',3: 'Brick',4: 'Cane', 5: 'Subway grate', 6:'Dirt', 7:'Cellar door', 8:'Tactile pavement'}

with open('cats-and-dogs-id2label.json', 'w') as fp:
    json.dump(id2label, fp)


id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
 
#define model
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/mit-b0",
                                                         num_labels=9,
                                                         id2label=id2label,
                                                         label2id=label2id,
)

#load id2label mapping



Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b0 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import evaluate
metric = evaluate.load("mean_iou")


In [None]:
#I didn't touch the model. I mean it's a toy data set, obviously. I just wanted to see if it was working, and it did.

import torch
from torch import nn
from tqdm.notebook import tqdm

# define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006)
# move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train()
for epoch in range(1):  # loop over the dataset multiple times
   print("Epoch:", epoch)
   for idx2, batch in enumerate(tqdm(train_dataloader)):
        # get the inputs;
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        optimizer.step()

        # evaluate
        with torch.no_grad():
          upsampled_logits = nn.functional.interpolate(logits, size=labels.shape[-2:], mode="bilinear", align_corners=False)
          predicted = upsampled_logits.argmax(dim=1)

          # note that the metric expects predictions + labels as numpy arrays
          metric.add_batch(predictions=predicted.detach().cpu().numpy(), references=labels.detach().cpu().numpy())

        # let's print loss and metrics every 100 batches
        if idx2 % 1 == 0:
          # currently using _compute instead of compute
          # see this issue for more info: https://github.com/huggingface/evaluate/pull/328#issuecomment-1286866576
          
          #changed it to compute.
            metrics = metric.compute(
                  predictions=predicted.cpu(),
                  references=labels.cpu(),
                  num_labels=len(id2label),
                  ignore_index=255,
                  reduce_labels=False, # we've already reduced the labels ourselves
            )
            print("Loss:", loss.item())
            print("Mean_iou:", metrics["mean_iou"])
            print("Mean accuracy:", metrics["mean_accuracy"])

Epoch: 0


  0%|          | 0/214 [00:00<?, ?it/s]

  acc = total_area_intersect / total_area_label


Loss: 2.3217408657073975
Mean_iou: 0.005480565531787611
Mean accuracy: 0.1620940808928717
Loss: 2.2800979614257812
Mean_iou: 0.007475974514783758
Mean accuracy: 0.10990251853564666
Loss: 2.2291507720947266
Mean_iou: 0.011648753420787068
Mean accuracy: 0.3446529106156488
Loss: 2.2076005935668945
Mean_iou: 0.027507815086194123
Mean accuracy: 0.20186882313080126
Loss: 2.1431725025177
Mean_iou: 0.02659441648321248
Mean accuracy: 0.2718027504882842
Loss: 2.1715071201324463
Mean_iou: 0.022738004971065085
Mean accuracy: 0.22719936666736681
Loss: 2.1897592544555664
Mean_iou: 0.05119878526695362
Mean accuracy: 0.27024921390516066
Loss: 2.1101083755493164
Mean_iou: 0.04488615652556485
Mean accuracy: 0.1038423114961065
Loss: 2.2817602157592773
Mean_iou: 0.00813490654309268
Mean accuracy: 0.3258307175854754
Loss: 2.2183821201324463
Mean_iou: 0.01474015369948071
Mean accuracy: 0.4647211767583036
Loss: 2.1190192699432373
Mean_iou: 0.05020101810121687
Mean accuracy: 0.41132048725636466


In [None]:
#inference
firstImage = Image.open('.content/customDatasetWithLabel/images/training/0@0_2023-03-23_15-38-19-4410_Brick.jpg')
firstImage

In [None]:
# prepare the image for the model
pixel_values = my_image_processor(firstImage, return_tensors="pt").pixel_values.to(device)
print(pixel_values.shape)


In [None]:
import torch

# forward pass
with torch.no_grad():
  outputs = model(pixel_values=pixel_values)


In [None]:
# logits are of shape (batch_size, num_labels, height/4, width/4)
logits = outputs.logits.cpu()
print(logits.shape)

In [None]:
def ade_palette():
    """ADE20K palette that maps each class to RGB values."""
    #Changed to my own palette cuz why not
    return [[1,1,1], [31, 31, 31], [62, 62, 62], [93, 93, 93],
            [123, 123, 123], [154, 154, 154], [185,185, 185], [216, 216, 216],
            [247, 247, 247]]


In [None]:
predicted_segmentation_map = my_image_processor.post_process_semantic_segmentation(outputs, target_sizes=[firstImage.size[::-1]])[0]
predicted_segmentation_map = predicted_segmentation_map.cpu().numpy()
print(predicted_segmentation_map)


In [None]:
import matplotlib.pyplot as plt

color_seg = np.zeros((predicted_segmentation_map.shape[0],
                      predicted_segmentation_map.shape[1], 3), dtype=np.uint8) # height, width, 3

palette = np.array(ade_palette())
for label, color in enumerate(palette):
    color_seg[predicted_segmentation_map == label, :] = color
# Convert to BGR
color_seg = color_seg[..., ::-1]

# Show image + mask
img = np.array(firstImage) * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)

plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.show()


In [None]:
firstAnnotation = Image.open('.content/customDatasetWithLabel/annotations/training/0@0_2023-03-23_15-38-19-4410_Brick.png')
firstAnnotation


In [None]:
# convert map to NumPy array
firstAnnotation = np.array(firstAnnotation)
firstAnnotation[firstAnnotation == 0] = 255 # background class is replaced by ignore_index
firstAnnotation = firstAnnotation - 1 # other classes are reduced by one
firstAnnotation[firstAnnotation == 254] = 255

classes_map = np.unique(firstAnnotation).tolist()
unique_classes = [model.config.id2label[idx2] if idx2!=255 else None for idx2 in classes_map]
print("Classes in this image:", unique_classes)

# create coloured map
color_seg = np.zeros((firstAnnotation.shape[0], firstAnnotation.shape[1], 3), dtype=np.uint8) # height, width, 3
palette = np.array(ade_palette())
for label, color in enumerate(palette):
    color_seg[map == label, :] = color
# Convert to BGR
color_seg = color_seg[..., ::-1]

# Show image + mask
img = np.array(firstImage) * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)

plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.show()


In [None]:
# metric expects a list of numpy arrays for both predictions and references
metrics = metric.compute(
                  predictions=[predicted_segmentation_map],
                  references=[firstAnnotation],
                  num_labels=len(id2label),
                  ignore_index=255,
                  reduce_labels=False, # we've already reduced the labels ourselves
              )


In [None]:
metrics.keys()

In [None]:
import pandas as pd

# print overall metrics
for key in list(metrics.keys())[:3]:
  print(key, metrics[key])

# pretty-print per category metrics as Pandas DataFrame
metric_table = dict()
for my_id, label in id2label.items():
    metric_table[label] = [
                           metrics["per_category_iou"][my_id],
                           metrics["per_category_accuracy"][my_id]
    ]

print("---------------------")
print("per-category metrics:")
pd.DataFrame.from_dict(metric_table, orient="index", columns=["IoU", "accuracy"])
