# **Computer Vision** - Podstawy działania i przykłady zastosowania sieci neuronowych do klasyfikacji obrazów i wykrywania obiektów

Organizator: Koło naukowe BioMedicalAI  
![biomedical.svg](biomedical.svg)

## Czym jest konwolucja (okno przesuwne)?

Konwolucja jest operacją matematyczną przekształcenia macierzowego danych wejściowych.
Macierz tego przekształcenia nazywamy filtrem/kernelem.

!["Przykład konwolucji"](./Convolution_arithmetic_-_Padding_strides.gif)  
*Przykład konwolucji*
*Źródło: Animation of a variation of the convolution operation. Blue maps are inputs, and cyan maps are outputs. From Vincent Dumoulin, Francesco Visin - A guide to convolution arithmetic for deep learning[1]*

[A guide to convolution arithmetic for deep learning https://arxiv.org/pdf/1603.07285](https://arxiv.org/pdf/1603.07285)

In [None]:
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt
# Use large figures so image details stay readable in the notebook.
plt.rcParams["figure.figsize"] = (15, 15)

# cv.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear message rather than inside cvtColor.
image_bgr = cv.imread("cv.png")
if image_bgr is None:
    raise FileNotFoundError("Could not read image file 'cv.png'")

# OpenCV loads colour images as BGR; convert to a single grayscale channel.
image = cv.cvtColor(image_bgr, cv.COLOR_BGR2GRAY)
# "gray" is the colormap name used elsewhere in this notebook; the "grey"
# alias is only available in newer matplotlib releases.
plt.imshow(image, cmap="gray")

In [None]:
# Simple convolution
# Identity kernel: only the centre weight is non-zero.
identity = np.array([
    [0, 0, 0],
    [0, 1, 0],
    [0, 0, 0],
], dtype=float)
# Sobel kernels
vertical_edge_detection = np.array([
    [-1, 0, 1],
    [-2, 0, 2],
    [-1, 0, 1],
], dtype=float)

horizontal_edge_detection = np.array([
    [-1, -2, -1],
    [0, 0, 0],
    [1, 2, 1],
], dtype=float)
kernel = vertical_edge_detection

# Slide the kernel over every position where it fits entirely inside the
# image (no padding, stride 1). The window size is taken from the kernel so
# that kernels other than 3x3 also work. The kernel is applied without
# flipping, and each response is stored at the window's top-left coordinate,
# so the trailing rows/columns of `new_image` stay zero.
kernel_h, kernel_w = kernel.shape
new_image = np.zeros_like(image, dtype=float)
for i in range(image.shape[0] - kernel_h + 1):
    for j in range(image.shape[1] - kernel_w + 1):
        window = image[i:i + kernel_h, j:j + kernel_w]
        new_image[i, j] = np.sum(kernel * window)
# "gray" is the colormap name used elsewhere in this notebook; the "grey"
# alias is only available in newer matplotlib releases.
plt.imshow(new_image, cmap="gray")

## Czym jest stride, padding i dylatacja?
- Stride – krok, o jaki przesuwa się okno
- Padding – wypełnienie pozwalające zachować wielkość obrazu
- Dylatacja – odstęp pomiędzy elementami macierzy filtra

!["Przykład konwolucji z dylatacją i paddingiem"](./image.png)  
*Przykład konwolucji z dylatacją i paddingiem*  
*Źródło: Animation of a variation of the convolution operation. Blue maps are inputs, and cyan maps are outputs. From Vincent Dumoulin, Francesco Visin - A guide to convolution arithmetic for deep learning*

## Pooling
Pooling pozwala na przetwarzanie danych (zmniejszenie ich rozmiaru) i wyodrębnienie z nich cech.
Wyróżniamy Min/Avg/Max pooling.

![max_pool.png](./max_pool.png)  
*Max pooling. Źródło: https://computersciencewiki.org/index.php?title=Max-pooling_/_Pooling*

## Klasyfikacja MNIST używając LeNet5
**https://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf**

In [None]:
from torchvision import datasets, transforms
import torch
from torch import optim, nn
import lightning as L
import torchmetrics as TM
import os

# Select the "medium" float32 matmul precision setting
# (see the torch documentation for the speed/accuracy trade-off).
torch.set_float32_matmul_precision("medium")

# MNIST train and test splits stored under ../data (downloaded if missing);
# every sample is converted to a tensor by the shared ToTensor transform.
mnist_root = "../data"
to_tensor = transforms.ToTensor()
train = datasets.MNIST(mnist_root, train=True, download=True, transform=to_tensor)
test = datasets.MNIST(mnist_root, train=False, download=True, transform=to_tensor)

class LightningClassification(L.LightningModule):
    """LeNet-5 style digit classifier wrapped as a Lightning module.

    Two convolution/pooling stages are followed by three fully connected
    layers that produce 10 class logits. For 1x28x28 inputs the feature map
    shrinks 28 -> 24 -> 12 -> 8 -> 4, so the flattened feature vector has
    16 * 4 * 4 = 256 elements, matching the input size of ``fc5``.
    """

    def __init__(self):
        super().__init__()
        # Max pooling is applied before the sigmoid; because the sigmoid is
        # monotonic this yields the same values as activating first, while
        # evaluating the activation on a 4x smaller tensor.
        self.layer_1 = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Sigmoid()
        )
        self.layer_2 = nn.Sequential(
            nn.Conv2d(6, 16, kernel_size=5),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Sigmoid()
        )
        self.fc5 = nn.Linear(256, 120)
        self.fc6 = nn.Linear(120, 84)
        self.fc7 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the raw class logits for a batch of images.

        Defining ``forward`` makes the module callable (``classifier(x)``),
        which is what ``nn.Module`` tooling expects.
        """
        x = self.layer_1(x)
        x = self.layer_2(x)
        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        x = torch.sigmoid(self.fc5(x))
        x = torch.sigmoid(self.fc6(x))
        # No activation on the last layer: cross_entropy expects logits.
        return self.fc7(x)

    def model(self, x):
        """Backward-compatible alias of :meth:`forward`."""
        return self.forward(x)

    def _log_metrics(self, stage, y_pred, y):
        """Log accuracy, precision and Matthews correlation as ``<stage>_*``."""
        metric_kwargs = {"task": "multiclass", "num_classes": 10}
        self.log(f"{stage}_accuracy", TM.functional.accuracy(y_pred, y, **metric_kwargs))
        self.log(f"{stage}_precision", TM.functional.precision(y_pred, y, **metric_kwargs), prog_bar=True)
        self.log(
            f"{stage}_matthews_corrcoef",
            TM.functional.matthews_corrcoef(y_pred, y, **metric_kwargs),
            prog_bar=True,
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        x, y = batch
        y_pred = self(x)
        loss = nn.functional.cross_entropy(y_pred, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch):
        """Log the validation metrics for one batch."""
        x, y = batch
        self._log_metrics("val", self(x), y)

    def test_step(self, batch):
        """Log the test metrics for one batch."""
        x, y = batch
        self._log_metrics("test", self(x), y)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 1e-3."""
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


classifier = LightningClassification()

# Leave one CPU core free for the main process. os.cpu_count() may return
# None when the count cannot be determined; fall back to 0 workers
# (loading in the main process) in that case.
num_workers = (os.cpu_count() or 1) - 1
train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True, num_workers=num_workers)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=32, num_workers=num_workers)

trainer = L.Trainer(max_epochs=10)
# The MNIST test split is also used as the validation set during fitting.
trainer.fit(model=classifier, train_dataloaders=train_dataloader, val_dataloaders=test_dataloader)
trainer.test(classifier, test_dataloader)

# Visualise the MNIST classifier results: a 3x3 grid of randomly drawn test
# digits, each titled "<true label> / <predicted label>".
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for cell in range(cols * rows):
    sample_idx = torch.randint(len(test), size=(1,)).item()
    img, label = test[sample_idx]
    # Add a leading batch dimension before running the network.
    logits = classifier.model(img.unsqueeze(0))
    predicted = logits.argmax(dim=1).item()
    axes = figure.add_subplot(rows, cols, cell + 1)
    axes.set_title(f"{label} / {predicted}")
    axes.axis("off")
    axes.imshow(img.squeeze(), cmap="gray")
plt.show()

## Architektury

### ResNet
Blok ResNet pozwala na znaczącą poprawę uczenia głębokich konwolucyjnych sieci neuronowych.  
![image-2.png](./resblock.png)  
[Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)

### UNet
Poprawa segmentacji poprzez wykorzystanie wyuczonej augmentacji danych wraz z kontekstem.  
![image.png](./unet.png)  
[U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597)


### R-CNN (Regions with CNN feature)
Rodzina architektur do wykrywania obiektów oraz segmentacji obrazów.  
![image-3.png](./rcnn.png)  
Składają się na nią:  
[R-CNN](https://arxiv.org/abs/1311.2524v5)  
[Fast R-CNN](https://arxiv.org/abs/1504.08083)  
[Faster R-CNN](https://arxiv.org/abs/1506.01497)  
[Mask R-CNN](https://arxiv.org/abs/1703.06870)

### YOLO
Rodzina architektur pozwalająca na detekcję obiektów w czasie rzeczywistym na sprzęcie niskiej klasy.  
![image-4.png](./yolo.png)  

[You Only Look Once: Unified, Real-Time Object Detection](https://arxiv.org/abs/1506.02640)


Zasoby:  
[https://towardsdatascience.com/r-cnn-fast-r-cnn-faster-r-cnn-yolo-object-detection-algorithms-36d53571365e](https://towardsdatascience.com/r-cnn-fast-r-cnn-faster-r-cnn-yolo-object-detection-algorithms-36d53571365e)  
[https://d2l.ai/chapter_computer-vision/rcnn.html](https://d2l.ai/chapter_computer-vision/rcnn.html)

## Metryki

### Segmentacja
Intersection over Union (IoU) - stosunek iloczynu (części wspólnej) zbiorów (ground truth i detection) do ich sumy  
![image.png](./iou.png)  
*Intersection over Union (IoU) in Object Detection & Segmentation  https://learnopencv.com/intersection-over-union-iou-in-object-detection-and-segmentation/*

Dice - $\frac{2 \cdot |A \cap B|}{|A| + |B|}$, gdzie $A$ to ground truth, a $B$ to detection

### Detekcja obiektów
Mean Average Precision (mAP) - metryka dająca informację nt. jakości detekcji obiektów dla wybranego IoU (np. 0.5). Na podstawie wyników detekcji dla obrazów przygotowuje się krzywą Precision-Recall dla klasy, pod którą pole to Average Precision. Następnie uśredniamy AP dla wszystkich klas, aby uzyskać mAP.  
![image-2.png](./precision-recall.png)  
*Precision Recall curve graph. Image: Ren Jie Tan https://builtin.com/articles/mean-average-precision*


### Generacja obrazów
SSIM - Structural similarity index measure  
Fréchet inception distance (FID) - różnica rozkładów pomiędzy aktywacjami Inception-v3 dla serii obrazów prawdziwych i wygenerowanych


## Wizualizacja filtrów

### Aktywacja
![image.png](./activations.png)  
*How to visualize convolutional features in 40 lines of code https://towardsdatascience.com/how-to-visualize-convolutional-features-in-40-lines-of-code-70b7d87b0030*  

### GradCam
![image-2.png](./gradcam.png)
*M, M.M., T. R, M., V, V.K. et al. Enhancing brain tumor detection in MRI images through explainable AI using Grad-CAM with Resnet 50. BMC Med Imaging 24, 107 (2024). https://doi.org/10.1186/s12880-024-01292-7*

## Wykorzystanie UNet w praktyce
W ramach praktyki wysegmentujemy gruczoły znajdujące się w tkance w barwieniu H&E (hematoksyliną i eozyną).  
Do tego zadania wykorzystamy dataset [GlaS MICCAI'2015](https://www.kaggle.com/datasets/sani84/glasmiccai2015-gland-segmentation)  
[Link do ZIPa z datasetem](https://drive.google.com/file/d/1KwCeBziUnnYM0rXWLWXLxAGiN1M3TpGo/view?usp=drive_link)

[Strona https://paperswithcode.com agregująca zarówno datasety oraz metody](https://paperswithcode.com/)

In [4]:
import os
import os.path as pp
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import OneHotEncoder
from torchvision.transforms.functional import normalize

# DataModule
class GlaSDataModule:
    """Prepare the GlaS gland-segmentation data as fixed-size tiles.

    On construction every image listed in ``Grade.csv`` (and its ``_anno``
    mask) is cut into ``tile_size`` x ``tile_size`` tiles that are saved as
    ``.pth`` tensors in ``<data_path>/tiles``. The ``train`` and ``test``
    properties expose the resulting tiles as torch datasets.

    Args:
        data_path: Directory containing ``Grade.csv`` and the ``.bmp`` files.
        tile_size: Edge length, in pixels, of the square tiles.

    Raises:
        FileNotFoundError: If an image or annotation file cannot be read.
    """

    class GlaSDataset(Dataset):
        """Dataset of pre-computed tiles.

        Each item is ``(normalised image, mask, one-hot grade)``.
        """

        def __init__(self, dt, y, data_path):
            self.dt = dt                # DataFrame with one row per tile ("name" column)
            self.y = y                  # encoded grade, indexable by tile position
            self.data_path = data_path  # dataset root that holds the "tiles" directory

        def __len__(self):
            return len(self.dt)

        def __getitem__(self, idx):
            name = self.dt.iloc[idx]["name"]
            tiles_dir = pp.join(self.data_path, "tiles")
            image = torch.load(pp.join(tiles_dir, f"{name}.pth"), weights_only=True)
            mask = torch.load(pp.join(tiles_dir, f"{name}_anno.pth"), weights_only=True)
            # Per-channel standardisation of the RGB tile.
            image = normalize(image, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            return image, mask, self.y[idx]

    def __init__(self, data_path, tile_size):
        dataset = pd.read_csv(pp.join(data_path, "Grade.csv"))
        # The train/test split is encoded in the file-name prefix.
        self.dt_train = dataset[dataset["name"].str.startswith("train")]
        self.dt_test = dataset[dataset["name"].str.startswith("test")]
        tiles_dir = pp.join(data_path, "tiles")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(tiles_dir, exist_ok=True)

        def convert_to_tiles(items):
            """Tile every image in *items*; return one metadata dict per saved tile."""
            results = []
            for _, item in items.iterrows():
                name = item["name"]
                image_path = pp.join(data_path, f"{name}.bmp")
                mask_path = pp.join(data_path, f"{name}_anno.bmp")
                image = cv.imread(image_path)
                mask = cv.imread(mask_path)
                # cv.imread returns None instead of raising on a missing or
                # unreadable file; report which file is the problem.
                if image is None:
                    raise FileNotFoundError(f"Could not read image file: {image_path}")
                if mask is None:
                    raise FileNotFoundError(f"Could not read annotation file: {mask_path}")

                image = cv.cvtColor(image, cv.COLOR_BGR2RGB)
                # Only the first channel of the loaded annotation is used;
                # any non-zero value is treated as foreground.
                mask = np.where(mask[:, :, 0] > 0, 1, 0)

                n_rows = int(np.ceil(image.shape[0] / tile_size))
                n_cols = int(np.ceil(image.shape[1] / tile_size))
                for i in range(n_rows):
                    for j in range(n_cols):
                        rows = slice(i * tile_size, (i + 1) * tile_size)
                        cols = slice(j * tile_size, (j + 1) * tile_size)
                        cut_image = image[rows, cols]
                        # Skip border tiles covering less than 30% of a full tile.
                        if cut_image.shape[0] * cut_image.shape[1] < 0.3 * tile_size * tile_size:
                            continue

                        # Partial border tiles are zero-padded to the full size.
                        tile_image = np.zeros((tile_size, tile_size, 3))
                        tile_image[:cut_image.shape[0], :cut_image.shape[1], :] = cut_image
                        cut_mask = mask[rows, cols]
                        tile_mask = np.zeros((tile_size, tile_size, 1))
                        tile_mask[:cut_mask.shape[0], :cut_mask.shape[1], 0] = cut_mask

                        tile_name = f"{name}_{i}_{j}"
                        # Stored channel-first; image values are scaled to [0, 1].
                        torch.save(
                            torch.FloatTensor(tile_image).permute(2, 0, 1) / 255,
                            pp.join(tiles_dir, f"{tile_name}.pth"),
                        )
                        torch.save(
                            torch.IntTensor(tile_mask).permute(2, 0, 1),
                            pp.join(tiles_dir, f"{tile_name}_anno.pth"),
                        )
                        # Each tile inherits the metadata of its source image.
                        tile_item = item.to_dict()
                        tile_item["name"] = tile_name
                        results.append(tile_item)
            return results

        self.dt_train = pd.DataFrame(convert_to_tiles(self.dt_train))
        self.dt_test = pd.DataFrame(convert_to_tiles(self.dt_test))

        # One-hot encode the grade; the encoder is fitted on the training
        # tiles only. The column name is used exactly as written here,
        # including its leading space.
        self.grade_encoder = OneHotEncoder()
        self.train_y = self.grade_encoder.fit_transform(self.dt_train[" grade (GlaS)"].values.reshape(-1, 1)).toarray()
        self.test_y = self.grade_encoder.transform(self.dt_test[" grade (GlaS)"].values.reshape(-1, 1)).toarray()
        self.data_path = data_path

    @property
    def train(self):
        """Dataset over the training tiles."""
        return GlaSDataModule.GlaSDataset(self.dt_train, self.train_y, self.data_path)

    @property
    def test(self):
        """Dataset over the test tiles."""
        return GlaSDataModule.GlaSDataset(self.dt_test, self.test_y, self.data_path)


# Cut the dataset into 128x128 tiles and build the train/test loaders.
datamodule = GlaSDataModule("./warwick_qu_dataset_released_2016_07_08/Warwick QU Dataset (Released 2016_07_08)/", 128)
# Leave one CPU core free for the main process. os.cpu_count() may return
# None when the count cannot be determined; fall back to 0 workers
# (loading in the main process) in that case.
num_workers = (os.cpu_count() or 1) - 1
train_dataloader = DataLoader(datamodule.train, batch_size=32, shuffle=True, num_workers=num_workers)
test_dataloader = DataLoader(datamodule.test, batch_size=32, num_workers=num_workers)

In [None]:
# Dataset visualisation: nine randomly chosen tiles from the first test
# batch, each shown next to its segmentation mask and titled with its grade.
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
images, masks, labels  = next(iter(test_dataloader))

# GlaSDataset.__getitem__ standardises the images with this mean/std. Undo it
# for display; otherwise imshow clips the out-of-range values and the colours
# come out wrong.
norm_mean = np.array([0.485, 0.456, 0.406])
norm_std = np.array([0.229, 0.224, 0.225])

for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(images), size=(1,)).item()
    # Channel-first tensors -> channel-last arrays for matplotlib.
    img = images[sample_idx].permute(1, 2, 0).numpy()
    img = np.clip(img * norm_std + norm_mean, 0.0, 1.0)
    mask = masks[sample_idx].permute(1, 2, 0).numpy()
    # Each sample takes two neighbouring cells: image on the left, mask on the right.
    figure.add_subplot(rows, cols * 2, i*2-1)
    plt.title(datamodule.grade_encoder.inverse_transform(labels[sample_idx].numpy().reshape(1, -1))[0][0])
    plt.axis("off")
    plt.imshow(img.squeeze())
    figure.add_subplot(rows, cols * 2, i*2)
    plt.axis("off")
    plt.imshow(mask.squeeze())
plt.show()

In [6]:
# From https://github.com/milesial/Pytorch-UNet
class DoubleConv(nn.Module):
    """Two consecutive (3x3 convolution -> batch norm -> ReLU) stages.

    ``mid_channels`` is the channel count between the two stages; when it is
    omitted (or falsy) ``out_channels`` is used. Padding of 1 keeps the
    spatial size unchanged.
    """

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        hidden = mid_channels if mid_channels else out_channels
        stages = []
        for c_in, c_out in ((in_channels, hidden), (hidden, out_channels)):
            stages.extend([
                # The convolutions are created without a bias term.
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(c_out),
                nn.ReLU(inplace=True),
            ])
        self.double_conv = nn.Sequential(*stages)

    def forward(self, x):
        features = self.double_conv(x)
        return features


class Down(nn.Module):
    """Encoder stage: 2x2 max pooling followed by a DoubleConv."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        pool = nn.MaxPool2d(2)
        convs = DoubleConv(in_channels, out_channels)
        self.maxpool_conv = nn.Sequential(pool, convs)

    def forward(self, x):
        downscaled = self.maxpool_conv(x)
        return downscaled


class Up(nn.Module):
    """Decoder stage: upsample, concatenate the skip connection, DoubleConv.

    Args:
        in_channels: Channels of the incoming feature map; the transposed
            convolution halves them before concatenation.
        out_channels: Channels produced by the stage.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
        self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        """Upsample ``x1`` and fuse it with the skip-connection tensor ``x2``."""
        x1 = self.up(x1)
        # If a spatial size was odd before pooling, the upsampled map is one
        # pixel smaller than the skip connection. Pad it so the concatenation
        # below also works for inputs whose size is not divisible by 16.
        diff_y = x2.size(-2) - x1.size(-2)
        diff_x = x2.size(-1) - x1.size(-1)
        if diff_y or diff_x:
            # Padding order is (left, right, top, bottom).
            x1 = nn.functional.pad(
                x1,
                [diff_x // 2, diff_x - diff_x // 2, diff_y // 2, diff_y - diff_y // 2],
            )
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class OutConv(nn.Module):
    """Final 1x1 convolution mapping feature channels to output channels."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        projection = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.conv = projection

    def forward(self, x):
        logits = self.conv(x)
        return logits

In [7]:
def dice_loss(input, target, eps=1e-6):
    """Return the soft Dice loss, averaged over the batch.

    For every sample the Dice score ``2 * sum(input * target) /
    (sum(input) + sum(target) + eps)`` is computed over all non-batch
    dimensions; the loss is the mean of ``1 - score``.

    Args:
        input: Predicted probabilities, batch dimension first.
        target: Ground-truth mask with the same shape as ``input``.
        eps: Small constant added to the denominator to avoid division by
            zero when both tensors are empty.

    Raises:
        ValueError: If ``input`` has fewer than two dimensions.
    """
    if input.dim() < 2:
        raise ValueError("dice_loss expects a batch dimension plus at least one data dimension")
    # Reduce over everything except the batch dimension so the loss works for
    # (N, H, W) as well as (N, C, H, W) tensors.
    dims = tuple(range(1, input.dim()))
    intersection = torch.sum(input * target, dims)
    cardinality = torch.sum(input + target, dims)

    dice_score = 2.0 * intersection / (cardinality + eps)
    return torch.mean(1.0 - dice_score)

In [None]:
from lightning.pytorch import loggers

class LightningUNet(L.LightningModule):
    """U-Net for binary segmentation, wrapped as a Lightning module.

    The encoder goes from 64 to 1024 channels over four ``Down`` stages; the
    decoder mirrors it with four ``Up`` stages that consume the encoder
    outputs as skip connections. The network returns one logit map per image.
    """

    def __init__(self):
        super().__init__()
        self.n_classes = 1

        self.inc = DoubleConv(3, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        self.down4 = Down(512, 1024)
        self.up4 = Up(1024, 512)
        self.up3 = Up(512, 256)
        self.up2 = Up(256, 128)
        self.up1 = Up(128, 64)
        self.outc = OutConv(64, self.n_classes)

    def forward(self, x):
        """Return the raw segmentation logits for a batch of images.

        Defining ``forward`` makes the module callable (``segmenter(x)``),
        which is what ``nn.Module`` tooling expects.
        """
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        # Decoder: each stage fuses with the encoder output of the same level.
        x = self.up4(x5, x4)
        x = self.up3(x, x3)
        x = self.up2(x, x2)
        x = self.up1(x, x1)
        return self.outc(x)

    def model(self, x):
        """Backward-compatible alias of :meth:`forward`."""
        return self.forward(x)

    def training_step(self, batch, batch_idx):
        """Return the sum of BCE-with-logits and soft Dice loss for one batch."""
        images, masks, _ = batch
        logits = self(images)
        bce = nn.functional.binary_cross_entropy_with_logits(logits.squeeze(1), masks.squeeze(1).float())
        # The Dice loss works on probabilities, so the logits go through a sigmoid.
        loss = bce + dice_loss(torch.sigmoid(logits), masks)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log Dice and IoU and, for the first batch, example images."""
        images, masks, _ = batch
        # Convert logits to probabilities before computing metrics: the
        # default threshold of 0.5 used by the metrics is meant for
        # probabilities, not for raw logits.
        probs = torch.sigmoid(self(images))
        self.log("test_dice", TM.functional.dice(probs, masks), prog_bar=True)
        self.log("test_iou", TM.functional.jaccard_index(probs, masks, task="binary"), prog_bar=True)
        if batch_idx == 0:
            add_image = self.logger.experiment.add_image
            add_image("input", images[0], self.current_epoch)
            # Single-channel maps are stacked three times to form an RGB image.
            add_image("mask", torch.vstack([masks[0], masks[0], masks[0]]), self.current_epoch)
            add_image("pred", torch.vstack([probs[0], probs[0], probs[0]]), self.current_epoch)

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 1e-3."""
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


# Build the U-Net and train it for 25 epochs. Metrics and example images are
# written to TensorBoard under "unet_logs"; the test loader is used for
# validation.
segmenter = LightningUNet()

tb_logger = loggers.TensorBoardLogger(save_dir="unet_logs")
trainer = L.Trainer(logger=tb_logger, max_epochs=25)
trainer.fit(
    segmenter,
    train_dataloaders=train_dataloader,
    val_dataloaders=test_dataloader,
)

## Ankieta
!["Ankieta"](./ankieta.png)  