**Проблема затухающих градиентов (vanishing grandients)**


Исследования показали, что если взять нейронную сеть с большим количеством слоев и применить к ней алгоритм обратного распространения ошибки, то обучение будет происходить неравномерно, последние слои будут обучать быстрее начальных. В результате, градиент на поздних слоях будет устремляться к нулю, а значит начальные слои перестают обучаться.

skip connection:

$H(x) = F(x) + x$ 

$\frac{dH(x)}{dx} = 1 + \frac{dF(x)}{dx}$

Из-за появления 1 градиенты не будут затухать от распространения сигналы от последних к начальным слоям


$F(x) = H(x) - x$

- В ResNet18, ResNet43 используется BasicBlock;
- В ResNet50, ResNet101 используется BottleneckBlock;

## ResNet18

Общая архитектура ResNet: 

1. Conv2d: 3, 64, 7x7, s=2, p=3, b=False
2. BatchNorm: 64
3. MaxPool2d: 3x3, s=2, p=1

Здесь в каждом слое присутствует 2 BasicBlock:
- Layer1: 64, H, W  - После этого слоя получаем tensor с такими же размерами  64, H, W
- Layer2: 128, H/2, W/2
- Layer3: 256, H/4, W/4
- Layer4: 512, H/8, W/8

4. AdaptiveAvgPool2d
5. Linear: 512*factor, out=100, b=True

In [8]:
from torchvision import models
from PIL import Image

model = models.resnet50()
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [9]:
resnet_weights = models.ResNet50_Weights.DEFAULT
cats = resnet_weights.meta['categories']
transforms = resnet_weights.transforms()
model = models.resnet50(weights=resnet_weights)

In [11]:
img = Image.open('style/danila_isaev.jpg').convert("RGB")
img = transforms(img).unsqueeze(0) # (1,3,224,224)

In [14]:
model.eval()
p = model(img).squeeze()
res = p.softmax(dim=0).sort(descending=True)

In [15]:
for s, i in zip(res[0][:5], res[1][:5]):
    print(f"{cats[i]}: {s:.4f}")

jean: 0.5660
Loafer: 0.0525
suit: 0.0243
sunglasses: 0.0199
sandal: 0.0164


## Transfer Learning

In [16]:
import os
import json
from PIL import Image

import torch
import torch.utils.data as data
import torchvision.transforms.v2 as tfs
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

class DogDataset(data.Dataset):
    def __init__(self, path, train=True, transform=None):
        self.path = os.path.join(path, "train" if train else "test")
        self.transform = transform

        with open(os.path.join(self.path, "format.json"), "r") as fp:
            self.format = json.load(fp)

        self.length = 0
        self.files = []
        self.targets = torch.eye(10)

        for _dir, _target in self.format.items():
            path = os.path.join(self.path, _dir)
            list_files = os.listdir(path)
            self.length += len(list_files)
            self.files.extend(map(lambda _x: (os.path.join(path,_x),_target),list_files))

    def __getitem__(self, item):
        path_file, target = self.files[item]
        t = self.targets[target]
        img = Image.open(path_file)

        if self.transform:
            img = self.transform(img)

        return img, t
    
    def __len__(self):
        return self.length

In [17]:
resnet_weights = models.ResNet50_Weights.DEFAULT
transforms = resnet_weights.transforms()

In [18]:
model = models.resnet50(weights=resnet_weights)
model.requires_grad_(False)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [20]:
# меняем последний слой и будем его обучать
model.fc = nn.Linear(512*4, 10)
model.fc.requires_grad_(True)

Linear(in_features=2048, out_features=10, bias=True)

In [22]:
d_train = DogDataset(r"dogs", transform=transforms)
train_data = data.DataLoader(d_train, batch_size=32, shuffle=True)

In [23]:
optimizer = optim.Adam(params=model.fc.parameters(), lr=0.001, weight_decay=0.001)
loss_function = nn.CrossEntropyLoss()
epochs = 3
model.train()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [24]:
epochs = 5
model.train()

for _e in range(epochs):
    loss_mean = 0
    lm_count = 0

    train_tqdm = tqdm(train_data, leave=True)
    for x_train, y_train in train_tqdm:
        predict = model(x_train)
        loss = loss_function(predict, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        lm_count += 1
        loss_mean = 1/lm_count * loss.item() + (1 - 1/lm_count) * loss_mean

        train_tqdm.set_description(f"Epoch {_e+1}/{epochs}, loss_mean={loss_mean:.3f}")

Epoch 1/5, loss_mean=1.382: 100%|██████████| 51/51 [02:56<00:00,  3.47s/it]
Epoch 2/5, loss_mean=0.532: 100%|██████████| 51/51 [02:58<00:00,  3.50s/it]
Epoch 3/5, loss_mean=0.351: 100%|██████████| 51/51 [02:48<00:00,  3.31s/it]
Epoch 4/5, loss_mean=0.259: 100%|██████████| 51/51 [02:49<00:00,  3.32s/it]
Epoch 5/5, loss_mean=0.218: 100%|██████████| 51/51 [02:43<00:00,  3.21s/it]


In [25]:
st = model.state_dict()
torch.save(st, 'model_transfer_resnet.tar')

In [26]:
d_test = DogDataset("dogs", train=False, transform=transforms)
test_data = data.DataLoader(d_test, batch_size=50, shuffle=False)

In [30]:
# тестирование обученной НС
Q = 0
P = 0
count = 0
model.eval()

test_tqdm = tqdm(test_data, leave=True)
for x_test, y_test in test_tqdm:
    with torch.no_grad():
        p = model(x_test)
        p2 = torch.argmax(p, dim=1)
        y = torch.argmax(y_test, dim=1)
        P += torch.sum(p2 == y).item()
        Q += loss_function(p, y_test).item()
        count += 1

Q /= count
P /= len(d_test)
print(Q)
print(P)

100%|██████████| 7/7 [00:33<00:00,  4.83s/it]

0.24200923102242605
0.943217665615142





## U-Net

Семантическая сегментация изображений