In [60]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
import torchvision
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import os
import cv2
from torchsummary import summary as summary_

In [61]:
class FCN_8s(nn.Module):
    def __init__(self):
        super(FCN_8s, self).__init__()
        #conv1
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size = (3,3), stride = (1,1), padding = 1)
        self.relu1_1 = nn.ReLU(inplace=True)
        self.conv1_2 = nn.Conv2d(64, 64, kernel_size = (3,3), stride = (1,1), padding = 1)
        self.relu1_2 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2), padding = 0)
        #conv2
        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu2_1 = nn.ReLU(inplace=True)
        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu2_2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2), padding = 0)
        #conv3
        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu3_1 = nn.ReLU(inplace=True)
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu3_2 = nn.ReLU(inplace=True)
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu3_3 = nn.ReLU(inplace=True)
        self.pool3 = nn.MaxPool2d(kernel_size = (2,2), stride = (2,2), padding = 0)
        #conv4
        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu4_1 = nn.ReLU(inplace=True)
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu4_2 = nn.ReLU(inplace=True)
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu4_3 = nn.ReLU(inplace=True)
        self.pool4 = nn.MaxPool2d(kernel_size=(2,2), stride = (2,2), padding = 0)
        #conv5
        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu5_1 = nn.ReLU(inplace=True)
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu5_2 = nn.ReLU(inplace=True)
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=(3,3), stride = (1,1), padding = 1)
        self.relu5_3 = nn.ReLU(inplace=True)
        self.pool5 = nn.MaxPool2d(kernel_size=(2,2), stride = (2,2), padding = 0)
        #fully conv
        self.conv6 = nn.Conv2d(512, 4096, kernel_size=(1,1), stride = (1,1), padding = 0)
        self.relu6 = nn.ReLU(inplace=True)
        self.drop6 = nn.Dropout2d()
        self.conv7 = nn.Conv2d(4096, 4096, kernel_size=(1,1), stride = (1,1), padding = 0)
        self.relu7 = nn.ReLU(inplace=True)
        self.drop7 = nn.Dropout2d()
        self.score1 = nn.Conv2d(4096, 21, kernel_size = (1,1), stride = (1,1), padding = 0)
        #upsampling
        self.x2upsamp = nn.ConvTranspose2d(21, 21, kernel_size = (4,4), stride = (2,2), padding = 1)
        self.x8upsamp = nn.ConvTranspose2d(21, 21, kernel_size = (16,16), stride = (8,8), padding = 4)
        #pool3,4 conv
        self.pool3conv = nn.Conv2d(256, 21, kernel_size = (1,1))
        self.pool4conv = nn.Conv2d(512, 21, kernel_size = (1,1), padding = 0)
        
    def forward(self, x):
        x = self.relu1_1(self.conv1_1(x))
        x = self.relu1_2(self.conv1_2(x))
        x = self.pool1(x)
        
        x = self.relu2_1(self.conv2_1(x))
        x = self.relu2_2(self.conv2_2(x))
        x = self.pool2(x)
        
        x = self.relu3_1(self.conv3_1(x))
        x = self.relu3_2(self.conv3_2(x))
        x = self.relu3_3(self.conv3_3(x))
        x = self.pool3(x)
        pool3 = x
        
        x = self.relu4_1(self.conv4_1(x))
        x = self.relu4_2(self.conv4_2(x))
        x = self.relu4_3(self.conv4_3(x))
        x = self.pool4(x)
        pool4 = x
        
        x = self.relu5_1(self.conv5_1(x))
        x = self.relu5_2(self.conv5_2(x))
        x = self.relu5_3(self.conv5_3(x))
        x = self.pool5(x)
        
        x = self.drop6(self.relu6(self.conv6(x)))
        x = self.drop7(self.relu7(self.conv7(x)))
        x = self.score1(x)
        
        x = self.x2upsamp(x)
        
        pool4 = self.pool4conv(pool4)

        x = x + pool4
        x = self.x2upsamp(x)
        
        pool3 = self.pool3conv(pool3)
        
        x = x + pool3
        x = self.x8upsamp(x)
        
        return x

In [62]:
model = FCN_8s()
summary_(model.to('cuda'), (3, 256, 256), batch_size = 1)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [1, 64, 256, 256]           1,792
              ReLU-2          [1, 64, 256, 256]               0
            Conv2d-3          [1, 64, 256, 256]          36,928
              ReLU-4          [1, 64, 256, 256]               0
         MaxPool2d-5          [1, 64, 128, 128]               0
            Conv2d-6         [1, 128, 128, 128]          73,856
              ReLU-7         [1, 128, 128, 128]               0
            Conv2d-8         [1, 128, 128, 128]         147,584
              ReLU-9         [1, 128, 128, 128]               0
        MaxPool2d-10           [1, 128, 64, 64]               0
           Conv2d-11           [1, 256, 64, 64]         295,168
             ReLU-12           [1, 256, 64, 64]               0
           Conv2d-13           [1, 256, 64, 64]         590,080
             ReLU-14           [1, 256,

In [63]:
def read_voc_images(train = True):
    file = 'VOCdevkit/VOC2012/ImageSets/Segmentation/' + ('train.txt' if train else 'val.txt')
    mode = torchvision.io.image.ImageReadMode.RGB
    with open(file, 'r') as f:
        imgs = f.read().split()
    datas, targets = [], []
    for i, imgname in enumerate(imgs):
        datas.append(torchvision.io.read_image(os.path.join('VOCdevkit/VOC2012/JPEGImages', f'{imgname}.jpg')))
        targets.append(torchvision.io.read_image(os.path.join('VOCdevkit/VOC2012/SegmentationClass', f'{imgname}.png'), mode))
    return datas, targets

Color_Map = [
               [0, 0, 0],  # background
               [128, 0, 0], # aeroplane
               [0, 128, 0], # bicycle
               [128, 128, 0], # bird
               [0, 0, 128], # boat
               [128, 0, 128], # bottle
               [0, 128, 128], # bus 
               [128, 128, 128], # car
               [64, 0, 0], # cat
               [192, 0, 0], # chair
               [64, 128, 0], # cow
               [192, 128, 0], # dining table
               [64, 0, 128], # dog
               [192, 0, 128], # horse
               [64, 128, 128], # motorbike
               [192, 128, 128], # person
               [0, 64, 0], # potted plant
               [128, 64, 0], # sheep
               [0, 192, 0], # sofa
               [128, 192, 0], # train
               [0, 64, 128] # tv/monitor
]

def voccolormap2label():
    colormap2label = torch.zeros(256**3, dtype=torch.long)
    for i, cm in enumerate(Color_Map):
        colormap2label[(cm[0] * 256 + cm[1]) * 256 +cm[2]] = i
    return colormap2label

def voclabel_indices(colormap, colormap2label):
    colormap = colormap.permute(1, 2, 0).numpy().astype('int32')
    idx = ((colormap[:,:,0] * 256 + colormap[:,:,1]) * 256 + colormap[:,:,2])
    return colormap2label[idx]

def vocrand_crop(data, target, h, w):
    rect = transforms.RandomCrop.get_params(data, (h,w))
    data = transforms.functional.crop(data, *rect)
    target = transforms.functional.crop(target, *rect)
    return data, target

class VOCSegDataset(data.Dataset):
    def __init__(self, train, img_size):
        self.transform = transforms.Normalize(mean = [0.485,0.456,0.406],
                                             std = [0.229,0.224,0.225])
        self.img_size = img_size
        datas, targets = read_voc_images(train = train)
        self.datas = [self.normalize_image(data) for data in self.filter(datas)]
        self.targets = self.filter(targets)
        self.colormap2label = voccolormap2label()
        
    def normalize_image(self, img):
        return self.transform(img.float())
    
    def filter(self, imgs):
        return [img for img in imgs if(img.shape[1] >= self.img_size[0] and img.shape[2] >= self.img_size[1])]
    
    def __getitem__(self, idx):
        data, target = vocrand_crop(self.datas[idx], self.targets[idx], *self.img_size)
        return (data, voclabel_indices(target, self.colormap2label))
    
    def __len__(self):
        return len(self.datas)
###

img_size = (256, 256)
train_data = VOCSegDataset(train = True, img_size = img_size)
valid_data = VOCSegDataset(train = False, img_size = img_size)

train_loader = data.DataLoader(train_data, batch_size=1, shuffle=True)
valid_loader = data.DataLoader(valid_data, batch_size=1)

In [64]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion = nn.NLLLoss()
test_criterion = nn.NLLLoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr = 0.0001, weight_decay = 0.0016, momentum = 0.9)

def train(device, mode, train_loader, optimizer,epoch):
    model.train()
    for batch_idx, (data,target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        inputs = Variable(data)
        targets = Variable(target)
        optimizer.zero_grad()
        output = model(inputs)
        output = F.log_softmax(output)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        
        if batch_idx % 200 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)\tloss:{:.4f}]'
                 .format(epoch, batch_idx * len(data), len(train_loader.dataset),
                        100 * batch_idx / len(train_loader),
                        loss.item()))
            
def evaluate(device, model, valid_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in valid_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            
            test_loss += test_criterion(output, target).item()
            
            predicted = torch.max(output.data, 1)
            correct += (predicted == target).sum().item()
    
    test_loss /= len(valid_loader.dataset)
    test_accuracy = 100. * correct / len(valid_loader.dataset)
    return test_loss, test_accuracy

epochs = 50
for epoch in range(1, epochs + 1):
    train(device, model, train_loader, optimizer, epoch)
    test_loss, test_accuracy = evaluate(device, model, valid_loader)
    
    print('[{}] Test Loss: {:.4f}, Accuracy: {:.2f}%'.format(epoch, test_loss, test_accuracy))

  output = F.log_softmax(output)


Train Epoch: 1 [0/1444 (0%)	loss:3.0547]
Train Epoch: 1 [200/1444 (14%)	loss:1.3495]
Train Epoch: 1 [400/1444 (28%)	loss:1.7294]
Train Epoch: 1 [600/1444 (42%)	loss:1.7199]
Train Epoch: 1 [800/1444 (55%)	loss:2.3142]
Train Epoch: 1 [1000/1444 (69%)	loss:2.5331]
Train Epoch: 1 [1200/1444 (83%)	loss:2.0111]
Train Epoch: 1 [1400/1444 (97%)	loss:1.5274]


AttributeError: 'bool' object has no attribute 'sum'