# Подготовка датасета

## Установка transformers

In [None]:
!pip install -q transformers datasets
!pip install wget

## Загрузка оригинального датасета с сайта Сбера


In [None]:
import wget
import sys
from pathlib import Path

def bar_progress(current, total, width=80):
    """Print a single-line, in-place download progress message to stdout.

    Passed as the ``bar`` callback to ``wget.download`` in this cell.

    Args:
        current: Number of bytes downloaded so far.
        total: Expected total size in bytes. Treated as unknown when it is
            zero, negative or ``None``.
        width: Unused; kept so the signature stays compatible with callers
            that pass it.
    """
    if total and total > 0:
        progress_message = "Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total)
    else:
        # Size unknown: a percentage cannot be computed (and dividing by zero
        # would raise), so report the raw byte count only.
        progress_message = "Downloading: %d bytes" % current
    # "\r" moves back to the start of the line so each update overwrites the last one.
    sys.stdout.write("\r" + progress_message)
    sys.stdout.flush()

print('Beginning download...')

url = 'https://dsworks.s3pd01.sbercloud.ru/aij2021/AITrain_train/AITrain_train.zip'
wget.download(url, bar=bar_progress)


## Загрузка оригинального датасета с Google Диска

Последующая его обработка


In [None]:
from google.colab import drive

drive.mount("/content/drive")

In [None]:
!unzip -q /content/drive/MyDrive/AITrain_train.zip -d /content/dataset/

In [None]:
import numpy as np
from PIL import Image
import os

In [None]:
# Reduce every mask file to a single channel, overwriting the file in place.
images_dir = "/content/dataset/train_data/masks/"

IMAGES_NAME = os.listdir(images_dir)

l = len(IMAGES_NAME)  # total number of masks, used for the progress print

for i, img in enumerate(IMAGES_NAME):
    if i % 100 == 0:
        print(f"{i} / {l}: {i / l * 100}%")
    mask_path = os.path.join(images_dir, img)
    # The context manager closes the source file before it is overwritten below.
    with Image.open(mask_path) as im:
        # Keep only the first band. Unlike unpacking `im.split()` into exactly
        # three names, this also works when the file already has one band, so
        # the cell can safely be run more than once.
        im = im.getchannel(0)
    im.save(mask_path)

In [None]:
# Rewrite class ids inside every mask file, overwriting the file in place.
images_dir = "/content/dataset/train_data/masks/"

IMAGES_NAME = os.listdir(images_dir)
# Computed here so the progress print does not rely on a value left over from
# an earlier cell.
l = len(IMAGES_NAME)

train = 10 # Original value of train
rail_add = 6 # Original value of additional railroad
rail = 7 # Original value of railroad
black = 0 # Value that we want to replace it with

for i, img in enumerate(IMAGES_NAME):
    if i % 100 == 0:
        print(f"{i} / {l}: {i / l * 100}%")
    mask_path = os.path.join(images_dir, img)
    # np.array() copies the pixels, so the file can be closed before it is rewritten.
    with Image.open(mask_path) as im:
        data = np.array(im)

    data[data == train] = black    # train pixels become 0
    data[data == rail_add] = rail  # additional railroad is merged into railroad

    im = Image.fromarray(data)
    im.save(mask_path)


In [None]:
# Pack the processed dataset into one archive and copy it to Google Drive.
# zip flags: -q quiet, -r recurse into the directory, -T test the finished
# archive, -m move files into the archive (the originals under
# /content/dataset/ are deleted once the archive is verified).
!zip -q -T -m -r /content/segmentation.zip /content/dataset/
!cp /content/segmentation.zip /content/drive/MyDrive
# `drive` is the google.colab module imported in an earlier cell.
drive.flush_and_unmount()

## Загрузка обработанного датасета

In [None]:
from google.colab import drive

drive.mount("/content/drive")

In [None]:
!unzip -q /content/drive/MyDrive/segmentation.zip -d /

## Создание PyTorch Dataset и DataLoader


In [None]:
from torch.utils.data import Dataset
import os
from PIL import Image

class SemanticSegmentationDataset(Dataset):
    """Image (semantic) segmentation dataset.

    File names found under ``<root_dir>/images`` and ``<root_dir>/masks`` are
    sorted and each list is cut at the same fractions of its own length:
    the first 70% form the training split, the next 20% the validation split
    and the remaining 10% the test split. Image ``i`` of a split is paired
    with annotation ``i`` of the same split.
    """

    @staticmethod
    def _collect_file_names(directory):
        """Return the sorted names of all files found under ``directory``."""
        names = []
        for _root, _dirs, files in os.walk(directory):
            names.extend(files)
        return sorted(names)

    @staticmethod
    def _select_split(names, train, test):
        """Return the part of ``names`` that belongs to the requested split.

        ``train`` takes precedence over ``test``; with both false the
        validation part is returned.
        """
        train_end = int(len(names) * 0.7)
        valid_end = int(len(names) * 0.9)
        if train:
            return names[:train_end]
        if not test:
            return names[train_end:valid_end]
        return names[valid_end:]

    def __init__(self, root_dir, feature_extractor, train=True, test=False):
        """
        Args:
            root_dir (string): Root directory of the dataset containing the images + annotations.
            feature_extractor (SegFormerFeatureExtractor): feature extractor to prepare images + segmentation maps.
            train (bool): Whether to load the "training" split. When true, `test` is ignored.
            test (bool): With `train=False`, selects the "test" split instead of the "validation" split.

        Raises:
            AssertionError: If the selected split has a different number of
                images and annotations.
        """
        self.root_dir = root_dir
        self.feature_extractor = feature_extractor
        self.train = train
        self.test = test

        self.img_dir = os.path.join(self.root_dir, "images")
        self.ann_dir = os.path.join(self.root_dir, "masks")

        # Both lists go through the same split helper, each sized from its own
        # length, so a mismatch between the two folders is caught by the
        # assertion below instead of being hidden by the slicing.
        self.images = self._select_split(self._collect_file_names(self.img_dir), train, test)
        self.annotations = self._select_split(self._collect_file_names(self.ann_dir), train, test)

        assert len(self.images) == len(self.annotations), "There must be as many images as there are segmentation maps"

    def __len__(self):
        """Return the number of image/annotation pairs in this split."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load pair ``idx`` and return the feature extractor's encoding of it.

        The returned mapping holds the tensors produced by the feature
        extractor with the leading batch dimension removed.
        """
        image_path = os.path.join(self.img_dir, self.images[idx])
        annotation_path = os.path.join(self.ann_dir, self.annotations[idx])

        # The context managers close both files once the extractor has read them.
        with Image.open(image_path) as image, Image.open(annotation_path) as segmentation_map:
            # the feature extractor prepares the image and its segmentation map together
            encoded_inputs = self.feature_extractor(image, segmentation_map, return_tensors="pt")

        for tensor in encoded_inputs.values():
            tensor.squeeze_(0)  # remove batch dimension (and only that one)

        return encoded_inputs

In [None]:
from transformers import SegformerFeatureExtractor
# Backbone variant and dataset location shared by the following cells.
model_type = "b2"
root_dir = '/content/dataset/train_data'

# Start from the preprocessing settings published with the checkpoint, then
# override label reduction and the target size.
feature_extractor = SegformerFeatureExtractor.from_pretrained(f"nvidia/mit-{model_type}")
feature_extractor.size = 512
feature_extractor.reduce_labels = True

# One dataset per split, built in the order train -> validation -> test; the
# (train, test) flag pairs select the split inside the dataset class.
train_dataset, valid_dataset, test_dataset = (
    SemanticSegmentationDataset(
        root_dir=root_dir,
        feature_extractor=feature_extractor,
        train=use_train,
        test=use_test,
    )
    for use_train, use_test in ((True, False), (False, False), (False, True))
)

Downloading:   0%|          | 0.00/272 [00:00<?, ?B/s]



In [None]:
print("Number of training examples:", len(train_dataset))
print("Number of validation examples:", len(valid_dataset))
print("Number of test examples:", len(test_dataset))

Next, we define corresponding dataloaders.

In [None]:
from torch.utils.data import DataLoader

# Batches of two samples per split. Only the training loader shuffles; the
# validation and test loaders iterate their datasets in index order.
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=2)
test_dataloader = DataLoader(test_dataset, batch_size=2)

In [None]:
batch = next(iter(train_dataloader))

## Define the model

Here we load the model and equip the encoder with weights pre-trained on ImageNet-1k. The variant is selected by `model_type` above (`nvidia/mit-b2` in this notebook; the [hub](https://huggingface.co/models?other=segformer) offers everything from the smallest `nvidia/mit-b0` to the largest `nvidia/mit-b5`). We also set the `id2label` and `label2id` mappings, which will be useful when performing inference.

In [None]:
from transformers import SegformerForSemanticSegmentation
import json
from huggingface_hub import cached_download, hf_hub_url

# Class-id -> class-name mapping. JSON object keys are strings, so they are
# cast back to int; label2id is the inverse mapping.
filename = "sber-id2label.json"
with open(filename, "r") as id2label_file:  # closed as soon as parsing is done
    id2label = json.load(id2label_file)
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}

# define model
# The number of labels is derived from the mapping so the two cannot disagree.
model = SegformerForSemanticSegmentation.from_pretrained(f"nvidia/mit-{model_type}",
                                                         num_labels=len(id2label),
                                                         id2label=id2label,
                                                         label2id=label2id,
)

## Fine-tune the model

Here we fine-tune the model in native PyTorch, using the AdamW optimizer. We use the same learning rate as the one reported in the [paper](https://arxiv.org/abs/2105.15203).

It's also very useful to track metrics during training. For semantic segmentation, typical metrics include the mean intersection-over-union (mIoU) and pixel-wise accuracy. These are available in the Datasets library. We can load it as follows:

In [None]:
from datasets import load_metric

metric = load_metric("mean_iou")

In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [None]:
from torch import nn
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
# Training-loop settings.
NUM_EPOCHS = 10      # total number of passes over the training set
REPORT_EVERY = 400   # batches between metric reports / resumable checkpoints
checkpoint_dir = f"/content/drive/MyDrive/rails/mit-{model_type}"
last_checkpoint_path = f"{checkpoint_dir}/last_checkpoint.pt"


def save_checkpoint(path, epoch, idx, loss):
    """Write a resumable training snapshot to ``path``.

    Args:
        path: Destination file.
        epoch: Epoch the snapshot belongs to.
        idx: Index of the last batch trained on, or 0 when ``epoch`` is complete.
        loss: Most recent loss (tensor or number); stored as a plain float so
            the file does not carry a tensor attached to the autograd graph.
    """
    torch.save({
        'epoch': epoch,
        'idx': idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': float(loss),
        }, path)


# move model to GPU
# This happens before the optimizer is created and restored so that the
# restored optimizer state ends up on the same device as the parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006)
load = True
if load:
    # map_location lets a checkpoint saved on GPU be opened on a CPU-only runtime.
    checkpoint = torch.load(last_checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_idx = checkpoint['idx']
    # idx == 0 marks a snapshot taken after an epoch finished: resume with the
    # next epoch. Any other idx means "continue this epoch after batch idx".
    if start_idx == 0:
        start_epoch = checkpoint['epoch'] + 1
    else:
        start_epoch = checkpoint['epoch']
    loss = checkpoint['loss']
else:
    start_epoch = 0
    start_idx = 0

model.train()
for epoch in range(start_epoch, NUM_EPOCHS):  # loop over the dataset multiple times
    print("Epoch:", epoch)
    for idx, batch in enumerate(tqdm(train_dataloader)):
        # When resuming mid-epoch, skip the batches up to and including the
        # one the checkpoint was taken after.
        if start_idx != 0 and idx <= start_idx:
            continue
        start_idx = 0

        # get the inputs;
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss, logits = outputs.loss, outputs.logits

        loss.backward()
        optimizer.step()

        # evaluate
        with torch.no_grad():
            # bring the logits to the label resolution before taking the arg-max
            upsampled_logits = nn.functional.interpolate(logits, size=labels.shape[-2:], mode="bilinear", align_corners=False)
            predicted = upsampled_logits.argmax(dim=1)

            metric.add_batch(predictions=predicted.detach().cpu().numpy(), references=labels.detach().cpu().numpy())

        if idx % REPORT_EVERY == 0:
            metrics = metric.compute(num_labels=len(id2label),
                                     ignore_index=255,
                                     reduce_labels=False)

            print(idx, "")
            print("Loss:", loss.item())
            print("Mean_iou:", metrics["mean_iou"])
            print("Mean accuracy:", metrics["mean_accuracy"])

            # No snapshot at batch 0: it would be stored with idx == 0 and read
            # back as "epoch finished", silently skipping the whole epoch.
            if idx > 0:
                save_checkpoint(last_checkpoint_path, epoch, idx, loss)

    # Epoch finished: keep a per-epoch copy and refresh the resume point.
    save_checkpoint(f"{checkpoint_dir}/{epoch}.pt", epoch, 0, loss)
    save_checkpoint(last_checkpoint_path, epoch, 0, loss)
    start_idx = 0
    

## Inference

Finally, let's check whether the model has really learned something. Let's test the trained model on an image:

In [None]:
# Input picture of the chosen test sample. It is read from the images folder
# (not the masks folder) because it is fed to the model as the input.
image = Image.open(os.path.join(test_dataset.img_dir, test_dataset.images[-235]))
image

In [1]:
encoding = feature_extractor(image, return_tensors="pt")
pixel_values = encoding.pixel_values.to(device)
print(pixel_values.shape)

NameError: ignored

In [None]:
# forward pass
# The training cell leaves the model in train mode; switch to eval mode and
# disable autograd for inference.
model.eval()
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)

In [None]:
# logits are of shape (batch_size, num_labels, height/4, width/4)
logits = outputs.logits.cpu()
print(logits.shape)

In [None]:
def ade_palette():
    """Return the colour table used to paint class ids in the overlays.

    Returns:
        A new list of 16 ``[r, g, b]`` integer lists; position ``i`` is the
        colour for class id ``i``.
    """
    rgb_triples = (
        (120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50),
        (4, 200, 3), (255, 255, 0), (140, 140, 140), (204, 5, 255),
        (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7),
        (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82),
    )
    return [list(rgb) for rgb in rgb_triples]

In [None]:
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

# First, rescale logits to original image size
# `image.size` is reversed below to give interpolate() a (height, width) target.
upsampled_logits = nn.functional.interpolate(logits,
                size=image.size[::-1], # (height, width)
                mode='bilinear',
                align_corners=False)

# Second, apply argmax on the class dimension
# [0] picks the only item of the batch; `seg` is also read by later cells.
seg = upsampled_logits.argmax(dim=1)[0]
color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3
palette = np.array(ade_palette())
# Paint every pixel predicted as class `label` with that class's palette colour.
for label, color in enumerate(palette):
    color_seg[seg == label, :] = color
# Convert to BGR
# NOTE(review): the palette colours get their first and last channels swapped
# here while `image` itself is left untouched -- confirm the displayed colours
# are the intended ones.
color_seg = color_seg[..., ::-1]

# Show image + mask
# 50/50 blend of the picture and the colour mask.
img = np.array(image) * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)

plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.show()

Compare this to the ground truth segmentation map:

In [None]:
# Ground-truth segmentation mask of the same test sample, read from the masks
# folder. The name `map` (which hides the builtin) is kept because the
# following cells read it.
map = Image.open(os.path.join(test_dataset.ann_dir, test_dataset.annotations[-235]))
map

In [None]:
# convert map to NumPy array
# The three steps below shift class ids down by one and send id 0 to 255
# (presumably mirroring `reduce_labels = True` on the feature extractor --
# TODO confirm).
map = np.array(map)
map[map == 0] = 255 # background class is replaced by ignore_index
map = map - 1 # other classes are reduced by one
# The former background pixels are now 254 (255 - 1); put them back to 255.
map[map == 254] = 255

classes_map = np.unique(map).tolist()
# Translate the ids present in this mask to class names; 255 has no name.
unique_classes = [model.config.id2label[idx] if idx!=255 else None for idx in classes_map]
print("Classes in this image:", unique_classes)

# create coloured map
color_seg = np.zeros((map.shape[0], map.shape[1], 3), dtype=np.uint8) # height, width, 3
palette = np.array(ade_palette())
# Pixels whose id has no palette entry (e.g. 255) stay black.
for label, color in enumerate(palette):
    color_seg[map == label, :] = color
# Convert to BGR
# NOTE(review): only the palette colours are channel-swapped, `image` is not
# -- confirm the displayed colours are the intended ones.
color_seg = color_seg[..., ::-1]

# Show image + mask
# 50/50 blend of the picture and the colour mask.
img = np.array(image) * 0.5 + color_seg * 0.5
img = img.astype(np.uint8)

plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.show()

In [None]:
seg.unique()

In [None]:
model.config.id2label[7]

In [None]:
np.unique(map)

In [None]:
seg

In [None]:
map

Let's compute the metrics:

In [None]:
# Start from a fresh metric object: the one used during training may still
# hold batches queued with add_batch() since its last compute(), and those
# would be mixed into this single-image result.
metric = load_metric("mean_iou")
# metric expects a list of numpy arrays for both predictions and references
metrics = metric.compute(predictions=[seg.numpy()], references=[map], num_labels=16, ignore_index=255)

In [None]:
metrics.keys()

In [None]:
import pandas as pd

# print overall metrics
# (the first three entries of the result; the per-category arrays are shown below)
for key in list(metrics.keys())[:3]:
  print(key, metrics[key])

# pretty-print per category metrics as Pandas DataFrame
# A comprehension keeps its loop variables local, so the builtin `id` is not
# shadowed and the notebook-level `label` is not overwritten.
metric_table = {
    class_name: [
        metrics["per_category_iou"][class_id],
        metrics["per_category_accuracy"][class_id],
    ]
    for class_id, class_name in id2label.items()
}

print("---------------------")
print("per-category metrics:")
pd.DataFrame.from_dict(metric_table, orient="index", columns=["IoU", "accuracy"])

In [None]:
import os
import sys
import cv2
import datetime
from torch import nn
import numpy as np
import gc
import torch


def video(video_path, output_video_path="/content/drive/MyDrive/result2.mp4"):
    """Segment every frame of a video and write the colour overlay to a new file.

    Uses the notebook-level ``model``, ``feature_extractor``, ``device`` and
    ``ade_palette``.

    Args:
        video_path: Path of the input video.
        output_video_path: Path of the annotated video to write.

    Raises:
        Exception: If ``video_path`` cannot be opened.
    """
    gc.collect()
    torch.cuda.empty_cache()

    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        raise Exception("failed to open {}".format(video_path))

    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the source frame rate; fall back to 30 when the container does not report one.
    fps = capture.get(cv2.CAP_PROP_FPS) or 30.0

    fourcc = cv2.VideoWriter_fourcc(*"MP4V")
    out_video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    # OpenCV frames are BGR, so the RGB palette is flipped once, up front.
    palette_bgr = np.array(ade_palette(), dtype=np.uint8)[:, ::-1]

    # Inference must not run in train mode; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()

    _total_ms = 0
    count_frame = 0
    try:
        with torch.no_grad():
            while True:
                ret, frame = capture.read()
                if not ret:
                    break
                count_frame += 1

                start = datetime.datetime.now()
                # The feature extractor is given RGB input, so convert the BGR frame first.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                encoding = feature_extractor(rgb_frame, return_tensors="pt")
                pixel_values = encoding.pixel_values.to(device)
                outputs = model(pixel_values=pixel_values)
                logits = outputs.logits.cpu()
                upsampled_logits = nn.functional.interpolate(logits,
                              size=(height, width), # (height, width)
                              mode='bilinear',
                              align_corners=False)

                seg = upsampled_logits.argmax(dim=1)[0].numpy()
                color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) # height, width, 3
                for label, color in enumerate(palette_bgr):
                    color_seg[seg == label, :] = color

                # 50/50 blend of the original (BGR) frame and the BGR colour mask.
                overlay = frame * 0.5 + color_seg * 0.5
                overlay = overlay.astype(np.uint8)

                _total_ms += (datetime.datetime.now() - start).total_seconds() * 1000

                out_video.write(overlay)
    finally:
        # Always release the handles so the output file is finalised.
        capture.release()
        out_video.release()
        cv2.destroyAllWindows()
        if was_training:
            model.train()

    if count_frame:
        print("processing time one frame {}[ms]".format(_total_ms / count_frame))
        print("processing total {} frames".format(count_frame))
        print("processing total {} seconds".format(count_frame / fps))

video("/content/drive/MyDrive/sakura.mp4")
