<a href="https://colab.research.google.com/github/KokiNiimura/study/blob/master/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%cd /content/drive/My Drive/study/PyTorch_Advanced/03

/content/drive/My Drive/study/PyTorch_Advanced/03


In [2]:
import random
import math
import time
import pandas as pd
import numpy as np

import torch
import torch.utils.data as data
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim

In [3]:
torch.manual_seed(1234)
np.random.seed(1234)
random.seed(1234)

In [4]:
from utils.dataloader import make_datapath_list, DataTransform, VOCDataset

rootpath = "./data/VOCdevkit/VOC2012/"
train_img_list, train_anno_list, val_img_list, val_anno_list = make_datapath_list(rootpath=rootpath)

color_mean = (0.485, 0.456, 0.406)
color_std = (0.229, 0.224, 0.225)

train_dataset = VOCDataset(train_img_list, train_anno_list, phase="train", 
                           transform=DataTransform(input_size=475, color_mean=color_mean, color_std=color_std))

val_dataset = VOCDataset(val_img_list, val_anno_list, phase="val", 
                           transform=DataTransform(input_size=475, color_mean=color_mean, color_std=color_std))

batch_size = 4

train_dataloader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False)

dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

In [5]:
from utils.pspnet import PSPNet

net = PSPNet(n_classes=21)

In [6]:
net

PSPNet(
  (feature_conv): FeatureMap_convolution(
    (cbnr_1): conv2DBatchNormRelu(
      (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (cbnr_2): conv2DBatchNormRelu(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (cbnr_3): conv2DBatchNormRelu(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (feature_res_1): ResidualBlockPSP(
    (block1): bottleNec

In [7]:
class PSPLoss(nn.Module):

    def __init__(self, aux_weight=0.4):
        super(PSPLoss, self).__init__()
        self.aux_weight = aux_weight


    def forward(self, outputs, targets):
        loss = F.cross_entropy(outputs[0], targets, reduction='mean')
        loss_aux = F.cross_entropy(outputs[1], targets, reduction='mean')

        return loss + self.aux_weight*loss_aux

criterion = PSPLoss(aux_weight=0.4)

In [8]:
optimizer = optim.SGD([
    {'params': net.feature_conv.parameters(), 'lr': 1e-3}, 
    {'params': net.feature_res_1.parameters(), 'lr': 1e-3}, 
    {'params': net.feature_res_2.parameters(), 'lr': 1e-3}, 
    {'params': net.feature_dilated_res_1.parameters(), 'lr': 1e-3}, 
    {'params': net.feature_dilated_res_2.parameters(), 'lr': 1e-3}, 
    {'params': net.pyramid_pooling.parameters(), 'lr': 1e-3}, 
    {'params': net.decode_feature.parameters(), 'lr': 1e-2}, 
    {'params': net.aux.parameters(), 'lr': 1e-2}, 
], momentum=0.9, weight_decay=0.0001)

def lambda_epoch(epoch):
    max_epoch = 30
    return math.pow((1-epoch/max_epoch), 0.9)

scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_epoch)

In [9]:
def train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device: {}".format(device))

    net.to(device)

    torch.backends.cudnn.benchmark = True

    num_train_imgs = len(dataloaders_dict["train"].dataset)
    num_val_imgs = len(dataloaders_dict["val"].dataset)
    batch_size = dataloaders_dict["train"].batch_size

    iteration = 1
    logs = []

    batch_multiplier = 6

    for epoch in range(num_epochs):
        
        t_epoch_start = time.time()
        t_iter_start = time.time()
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        print('--------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('--------------')
        
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                scheduler.step()
                optimizer.zero_grad()
                print('(train)')

            else:
                if ((epoch+1) % 5 == 0):
                    net.eval()
                    print('--------------')
                    print('(val)')
                else:
                    continue
        
            count = 0
            for images, anno_class_images in dataloaders_dict[phase]:
                if images.size()[0] == 1:
                    continue

                images = images.to(device)
                anno_class_images = anno_class_images.to(device)

                if (phase == 'train') and (count == 0):
                    optimizer.step()
                    optimizer.zero_grad()
                    count = batch_multiplier

                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)
                    loss = criterion(outputs, anno_class_images.long()) / batch_multiplier

                    if phase == 'train':
                        loss.backward()
                        count -= 1

                        if (iteration % 20 == 0):
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('iteration {} || Loss: {:.4f} || 20iter: {:.4f} sec.'.format(
                                iteration, loss.item()/batch_size*batch_multiplier, duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item() * batch_multiplier
                        iteration += 1

                    else:
                        epoch_val_loss += loss.item() * batch_multiplier

        t_epoch_finish = time.time()
        print('--------------')
        print('epoch {} || Epoch_TRAIN_Loss: {:.4f} || Epoch_VAL_Loss: {:.4f}'.format(
            epoch+1, epoch_train_loss/num_train_imgs, epoch_val_loss/num_val_imgs))
        print('timer: {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

        log_epoch = {'epoch': epoch+1, 'train_loss': epoch_train_loss/num_train_imgs, 
                        'val_loss': epoch_val_loss/num_val_imgs}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        torch.save(net.state_dict(), 'weights/pspnet50_' + str(epoch+1) + '.pth')

In [10]:
num_epochs = 30
train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs=num_epochs)

device: cuda:0
--------------
Epoch 1/30
--------------
(train)




iteration 20 || Loss: 0.8095 || 20iter: 34.8563 sec.
iteration 40 || Loss: 0.4639 || 20iter: 26.5321 sec.
iteration 60 || Loss: 0.2629 || 20iter: 25.9936 sec.
iteration 80 || Loss: 0.5196 || 20iter: 26.9041 sec.
iteration 100 || Loss: 0.1393 || 20iter: 26.6143 sec.
iteration 120 || Loss: 0.4731 || 20iter: 26.3384 sec.
iteration 140 || Loss: 0.3700 || 20iter: 26.5787 sec.
iteration 160 || Loss: 0.2124 || 20iter: 26.5514 sec.
iteration 180 || Loss: 0.3651 || 20iter: 26.4365 sec.
iteration 200 || Loss: 0.4846 || 20iter: 26.4562 sec.
iteration 220 || Loss: 0.6219 || 20iter: 26.4665 sec.
iteration 240 || Loss: 0.3101 || 20iter: 26.5571 sec.
iteration 260 || Loss: 0.2635 || 20iter: 26.4749 sec.
iteration 280 || Loss: 0.5002 || 20iter: 26.4806 sec.
iteration 300 || Loss: 0.3085 || 20iter: 26.4540 sec.
iteration 320 || Loss: 0.8929 || 20iter: 26.5060 sec.
iteration 340 || Loss: 0.6758 || 20iter: 26.5055 sec.
iteration 360 || Loss: 0.4334 || 20iter: 26.5071 sec.
--------------
epoch 1 || Epoch_