In [None]:
# pixel-wise classification

# Img Restoration : + > Concat     흐릿한 사진에서 노이즈 부분을 빼주면 선명해짐
# Img Classification : + < Concat

In [5]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import os
import os.path as osp
import PIL

# Dataloader

- ./resources/Kitti/data_road/training/image_2/um_000081.png <----- source
- ./resources/Kitti/data_road/training/gt_image_2/um_road_000081.png <----- target

![source](./resources/Kitti/data_road/training/image_2/um_000081.png "Variable")
![target](./resources/Kitti/data_road/training/gt_image_2/um_road_000081.png "Variable")


In [2]:
imgsets_file = osp.join('resources/Kitti', '%s.txt'% 'train')

for i, src_tar_name in enumerate(open(imgsets_file)):
    if i < 3:
        print(src_tar_name)
        src_tar_name = src_tar_name.strip() # 양옆의 공백을 제거
        print(src_tar_name)
        src_tar_name = src_tar_name.split() # 띄워쓰기를 나눠서 List로 만듬
        print(src_tar_name)

training/image_2/um_000000.png training/gt_image_2/um_road_000000.png

training/image_2/um_000000.png training/gt_image_2/um_road_000000.png
['training/image_2/um_000000.png', 'training/gt_image_2/um_road_000000.png']
training/image_2/um_000001.png training/gt_image_2/um_road_000001.png

training/image_2/um_000001.png training/gt_image_2/um_road_000001.png
['training/image_2/um_000001.png', 'training/gt_image_2/um_road_000001.png']
training/image_2/um_000002.png training/gt_image_2/um_road_000002.png

training/image_2/um_000002.png training/gt_image_2/um_road_000002.png
['training/image_2/um_000002.png', 'training/gt_image_2/um_road_000002.png']


In [10]:
class KITTIdataset(torch.utils.data.Dataset):
    class_names = np.array(['background', 'road'])
    mean_bgr = np.array([104.00698793, 116.66876762, 122.67891434]) # KITTIdataset의 Normalization

    def __init__(self, root, split='train', transform=False): # root폴더 => resources
        self._transform = transform
        dataset_dir = os.path.join(root, 'Kitti') #string을 /로 이어서 경로 만들어주는 것
        self.files = []  # dict 형태로 pair로 append({"img" : src_tar_name[0], "lbl" : src_tar_name[1]})
        # TODO
        # file names in 'self.files'
        imgsets_file = os.path.join(dataset_dir, '%s.txt'% split)
        
        for i, src_tar_name in enumerate(open(imgsets_file)):
            src_tar_name = src_tar_name.strip()
            src_tar_name = src_tar_name.split()
            self.files.append({"img" : os.path.join(dataset_dir, "data_road",  src_tar_name[0]), "lbl" : os.path.join(dataset_dir, "data_road", src_tar_name[1]) })
        
    def __len__(self):
        return len(self.files)

    def __getitem__(self, index): # Loader로 부터 들어온 데이터의 해당하는 이름을 찾아서 읽어서 내보내주는 역할
        # load image & label
        # TODO
        # image -> img, label -> lbl
        data_file = self.files[index]
        img_file = data_file["img"]
        lbl_file = data_file["lbl"]
        
        img = PIL.Image.open(img_file)       # PIL : Python Image Library
        img = np.array(img, dtype=np.uint8)
        lbl = PIL.Image.open(lbl_file)
        lbl = np.array(lbl, dtype=np.int32)
        lbl[lbl == 255] = 1 # background는 0이고 도로는 1로 주기 위해서
        
        img, lbl = self.transform(img, lbl)
        return img, lbl

        # numpy와 tensor의 배열 차원 순서가 다르다. 
        # numpy : (행, 열, 채널)
        # tensor : (채널, 행, 열)
        # 따라서 위 순서에 맞춰 transpose
    
    def transform(self, img, lbl):
        img = img[:, :, ::-1]  # RGB -> BGR
        img = img.astype(np.float64)
        img -= self.mean_bgr
        img = img.transpose(2, 0, 1) # H W C -> C H W
        img = torch.from_numpy(img).float()
        lbl = torch.from_numpy(lbl).long()
        return img, lbl

In [11]:
train_loader = torch.utils.data.DataLoader(KITTIdataset(root = 'resources', split = 'train', transform = True), 
                                           batch_size = 1, shuffle = True) # 이미지 크기가 제각각이라서 batch_size는 1

val_loader = torch.utils.data.DataLoader(KITTIdataset(root = 'resources', split = 'val', transform = True), 
                                         batch_size = 1, shuffle = False)

vgg16 = torchvision.models.vgg16(pretrained = True)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /home/piai/.cache/torch/hub/checkpoints/vgg16-397923af.pth


  0%|          | 0.00/528M [00:00<?, ?B/s]

# Define the Network
-VGG16

- FCN model
![convnet](./resources/fcn_upsampling.png "Variable")

In [5]:
# FCN에서는 Input 이미지의 사이즈를 고려하지 않는다. Convolution의 특징. 이미지 사이즈가 달라도 상관없다.
# Unet도 FCN이라서 동일함, in_channel 수, out_channel 수, kernel_size만 고려
# FCN : Sum, Unet : Concat, Convolution을 하면서 Kernel이 2개를 어떻게 합쳤는지 학습하게된다.

class FCN(nn.Module):
    def __init__(self, num_class = 21):
        super(FCN, self).__init__()
        
        ## Why padding 100?? https://github.com/shelhamer/fcn.berkeleyvision.org
        # The 100 pixel input padding guarantees that the network output can be aligned to the input for any input size in the given datasets
        self.features1 = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding = 100),
            nn.ReLU(inplace = True),
            nn.Conv2d(64, 64, 3, padding = 1),
            nn.ReLU(inplace = True))
    
        self.features2 = nn.Sequential(
            nn.Conv2d(64, 128, 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.Conv2d(128, 128, 3, padding = 1),
            nn.ReLU(inplace = True))
        
        self.features3 = nn.Sequential(
            nn.Conv2d(128, 256, 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.Conv2d(256, 256, 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.Conv2d(256, 256, 3, padding = 1))
        
        self.features4 = nn.Sequential(
            nn.Conv2d(256, 512, 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.Conv2d(512, 512, 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.Conv2d(512, 512, 3, padding = 1))
                
        self.features5 = nn.Sequential(
            nn.Conv2d(512, 512, 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.Conv2d(512, 512, 3, padding = 1),
            nn.ReLU(inplace = True),
            nn.Conv2d(512, 512, 3, padding = 1))
        
        self.maxpool = nn.MaxPool2d(2, stride = 2, ceil_mode = True)

        self.classifier = nn.Sequential(
            nn.Conv2d(512, 4096, 7),
            nn.ReLU(inplace = True),
            nn.Dropout2d(),
            nn.Conv2d(4096, 4096, 1),
            nn.ReLU(inplace = True),
            nn.Dropout2d(),
            nn.Conv2d(4096, num_class, 1))
        
        self.upscore2 = nn.ConvTranspose2d(num_class, num_class, kernel_size = 4, stride = 2, bias = False)
        self.upscore4 = nn.ConvTranspose2d(num_class, num_class, kernel_size = 4, stride = 2, bias = False)
        self.upscore8 = nn.ConvTranspose2d(num_class, num_class, kernel_size = 16, stride = 8, bias = False)
        
        self.score_pool4 = nn.Conv2d(512, num_class, 1)
        self.score_pool3 = nn.Conv2d(256, num_class, 1)
        
        self.params = [self.features1, self.features2, self.features3, 
                       self.features4, self.features5]
        
    def upsample(self, x, size):
        return nn.functional.upsample(x, size = size, mode = 'bilinear')
                             
    def forward(self, inputs):
        x = self.features1(inputs)
        pool1 = self.maxpool(x)
        x = self.features2(pool1)
        pool2 = self.maxpool(x)
        x = self.features3(pool2)
        pool3 = self.maxpool(x)
        x = self.features4(pool3)
        pool4 = self.maxpool(x)
        x = self.features5(pool4)
        pool5 = self.maxpool(x)
        x = self.classifier(pool5)
        
        # also use getattr with for loop ...
        x = self.upscore2(x)
        
        pool4 = self.score_pool4(pool4)
        pool4 = pool4[:, :, 5:5 + x.size()[2], 5:5 + x.size()[3]]
        x = torch.add(x, pool4)
        
        x = self.upscore4(x)
        
        pool3 = self.score_pool3(pool3)
        pool3 = pool3[:, :, 9:9 + x.size()[2], 9:9 + x.size()[3]]
        x = torch.add(x, pool3)
        
        x = self.upscore8(x)
        x = x[:, :, 33:33 + inputs.size()[2], 33:33 + inputs.size()[3]]
        return x
    
    def copy_params(self, vgg):  # 학습된 vgg weight를 가져와서 FCN과 같은 Structure를 짜놓고 Parameter weight를 덮어씌우는 방법
        for l1, l2 in zip(vgg.features, self.params):
            if (isinstance(l1, nn.Conv2d) and isinstance(l2, nn.Conv2d)):
                assert l1.weight.size() == l2.weight.size()
                assert l1.bias.size() == l2.bias.size()
                l2.weight.data = l1.weight.data
                l2.bias.data = l1.bias.data

# U-Net


-  U-Net model
![unet](./resources/unet.png "Variable")

EncoderBlock layers
    
    - conv2d in -> out (kernel = 3
    - batchnorm2d
    - relu
    - conv2d out -> out (kernel = 3
    - batchnorm2d
    - relu
    - dropout
    - maxpool (kernel = 2, sride = 2)

DecoderBlock layers
    
    - conv2d in -> middle (kernel = 3)
    - batchnorm2d
    - relu
    - conv2d middle -> middle (kernel = 3)
    - batchnorm2d
    - relu
    - convtranspose2d middle -> out (kernel = 3, stride = 2) #이미지 사이즈 2배가 된다

In [12]:
class _EncoderBlock(nn.Module):
    def __init__(self, in_channels, out_channels, dropout=False):
        super(_EncoderBlock, self).__init__()
        # TODO
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True), # inplace=True 기존거에서 작업하니깐 2중으로 메모리가 들어가지않는다.
            nn.Conv2d(out_channels, out_channels, kernel_size=3),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        ]
        if dropout:
            layers.append(nn.Dropout())
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.encode = nn.Sequential(*layers)

    def forward(self, x):
        return self.encode(x)

# Convolution은 pixel을 모은다면 DeConvolution은 1pixel을 다시 펼치는 작업
class _DecoderBlock(nn.Module):
    def __init__(self, in_channels, middle_channels, out_channels):
        super(_DecoderBlock, self).__init__()
        # TODO
        self.decode = nn.Sequential(
            nn.Conv2d(in_channels, middle_channels, kernel_size = 3),
            nn.BatchNorm2d(middle_channels),
            nn.ReLU(inplace = True),
            nn.Conv2d(middle_channels, middle_channels, kernel_size = 3),
            nn.BatchNorm2d(middle_channels),
            nn.ReLU(inplace = True),
            nn.ConvTranspose2d(middle_channels, out_channels, kernel_size = 2, stride = 2) # Deconvolution을 거치게되면 Img사이즈가 커짐.
        )

    def forward(self, x):
        return self.decode(x)


class UNet(nn.Module): # 네트워크가 U자로 생겼다
    def __init__(self, num_classes):
        super(UNet, self).__init__()
        self.enc1 = _EncoderBlock(3, 64)
        self.enc2 = _EncoderBlock(64, 128)
        self.enc3 = _EncoderBlock(128, 256)
        self.enc4 = _EncoderBlock(256, 512, dropout=True)
        self.center = _DecoderBlock(512, 1024, 512) 
        self.dec4 = _DecoderBlock(1024, 512, 256) # input채널 1024 => skip으로 넘어오는 채널 512 +  up-conv로 진행된 채널 512
        self.dec3 = _DecoderBlock(512, 256, 128)
        self.dec2 = _DecoderBlock(256, 128, 64)
        self.dec1 = nn.Sequential( # Convolution
            nn.Conv2d(128, 64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
        )
        self.final = nn.Conv2d(64, num_classes, kernel_size=1)

    def forward(self, x):
        enc1 = self.enc1(x)
        enc2 = self.enc2(enc1)
        enc3 = self.enc3(enc2)
        enc4 = self.enc4(enc3)
        center = self.center(enc4)
        dec4 = self.dec4(torch.cat([center, F.upsample(enc4, center.size()[2:], mode='bilinear')], 1)) # encoding feature를 Concat,     center.size => [1, 512, 30, 138], 3번째부터니깐 [30, 138]
        print(F.upsample(enc4, center.size()[2:]).size())
        dec3 = self.dec3(torch.cat([dec4, F.upsample(enc3, dec4.size()[2:], mode='bilinear')], 1))     # dec4.size() => [1, 256, 52, 268] 3번째부터니깐 [52, 268]
        dec2 = self.dec2(torch.cat([dec3, F.upsample(enc2, dec3.size()[2:], mode='bilinear')], 1))     # input에서 high-level feature로 가기 전에 low-level featrure를 함께써서 resolution이 올라가면서 특성을 살릴 수 있다.
        dec1 = self.dec1(torch.cat([dec2, F.upsample(enc1, dec2.size()[2:], mode='bilinear')], 1))
        final = self.final(dec1)
        return F.upsample(final, x.size()[2:], mode='bilinear')

## Measure accuracy and visualization

In [13]:
from PIL import Image # import 
def visualization(net, image, epoch, device):
    net.to('cpu')
    mean_bgr = np.array([104.00698793, 116.66876762, 122.67891434])
    img = image
    img = np.array(img, dtype = np.uint8)
    img = img[:, :, ::-1] # channel RGB -> BGR
    img = img.astype(np.float64)
    img -= mean_bgr
    img = img.transpose(2, 0, 1) # H W C -> C H W
    img = torch.from_numpy(img).float()
    img = img.unsqueeze(0)

    score = net(img) # 네트워크의 output(logits), B x C x H x W
    lbl_pred = score.data.max(1)[1].cpu().numpy() # [max value, idx]     (1) -> B x 1(max인 index 0 또는 1) x H x W
    lbl_pred = np.squeeze(lbl_pred)

    print(score.data.max(1))

    Image.fromarray((lbl_pred * 255).astype(np.uint8)).save('./resources/pred/mask_'+str(epoch+1)+'.png')
    
    input_img = image
    input_img = np.array(input_img, dtype = np.uint8)
    color = [0, 255, 0, 127] 
    color = np.array(color).reshape(1, 4)
    shape = input_img.shape
    segmentation = lbl_pred.reshape(shape[0], shape[1], 1)
    output = np.dot(segmentation, color)

    output = Image.fromarray(output.astype(np.uint8))
    background = Image.fromarray(input_img.astype(np.uint8))
    background.paste(output, box = None, mask = output)
    background.save('./resources/overlay/overlay_'+str(epoch+1)+'.png')

    net.to(device)

# Train

In [None]:
import torch.optim as optim

model_type = 'unet'

if model_type == 'unet':
    net = UNet(2)
elif model_type == 'FCN':
    net = FCN(num_class = 2)
    vgg16 = torchvision.models.vgg16(pretrained = True)
    net.copy_params(vgg16)
    del vgg16

    
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
net = net.to(device)

training_epochs = 10 

# TODO
# criterion : Cross entropy loss
# optimizer : Adam, learning weight=0.004
criterion = nn.CrossEntropyLoss(size_average=False) # size_average : 배치에 각 이미지의 loss를 average 하느냐 마느냐.
optimizer = optim.Adam(net.parameters(), lr=1e-4)

num_class = 2

for epoch in range(training_epochs):
    print ('current epoch : %d'%(epoch))
    # training
    net.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # TODO
        # data to gpu
        
        data, target = data.to(device), target.to(device)
        
        # forward
        score = net(data)
        
        # loss = ...
        loss = criterion(score, target)
        
        # update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
#         if batch_idx % 20 ==0:
#             print ('batch : %d, loss : %f'%(batch_idx, loss.item()))
        
    #validation
#     net.eval()
    
#     val_loss = 0

#     for batch_idx, (data, target) in enumerate(val_loader):
#         # TODO
#         # load data
#         data, target = data.to(device), target.to(device)
#         # forward
#         score = net(data)
        
#         loss = criterion(score, target)
#         val_loss += loss.item() / len(data)
            
#     val_loss /= len(val_loader)
#     print ('val loss : %f'%val_loss)
        
#     #visualization
#     img = PIL.Image.open('./resources/Kitti/data_road/testing/image_2/um_000081.png')
#     visualization(net, img, epoch, device)
    
        
print('Finished Training')

current epoch : 0
torch.Size([1, 512, 30, 138])




torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 136])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 136])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 136])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 136])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 136])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size

torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
torch.Size([1, 512, 30, 138])
