# 04_SSD_training.ipynb

Trains the SSD model. Training takes several hours, even with a GPU.

### This notebook uses utils/ssd_model.py

In [1]:
# import package
import os.path as osp
import random
import time

import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.utils.data as data

In [2]:
# Seed every random number generator used in this notebook (PyTorch, NumPy and
# Python's built-in `random`) with one shared value so reruns are repeatable.
SEED = 1234
for set_seed in (torch.manual_seed, np.random.seed, random.seed):
    set_seed(SEED)

In [3]:
# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device ", device)

使用デバイス： cuda:0


### create Dataset and DataLoader

In [4]:
from utils.ssd_model import make_datapath_list, VOCDataset, DataTransform, Anno_xml2list, od_collate_fn

# Collect the image and annotation file paths for the train and val splits.
rootpath = "./data/VOCdevkit/VOC2012/"
(train_img_list, train_anno_list,
 val_img_list, val_anno_list) = make_datapath_list(rootpath)

# Settings shared by both datasets.
voc_classes = [
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor',
]
color_mean = (104, 117, 123)  # per-channel colour mean, in BGR order
input_size = 300  # images are resized to 300x300

# One Dataset per split; each gets its own transform objects.
train_dataset = VOCDataset(
    train_img_list,
    train_anno_list,
    phase="train",
    transform=DataTransform(input_size, color_mean),
    transform_anno=Anno_xml2list(voc_classes),
)

val_dataset = VOCDataset(
    val_img_list,
    val_anno_list,
    phase="val",
    transform=DataTransform(input_size, color_mean),
    transform_anno=Anno_xml2list(voc_classes),
)

# Wrap the datasets in DataLoaders. Only the training loader shuffles.
# od_collate_fn is the project's custom batching function
# (presumably needed because images carry different numbers of boxes -- TODO confirm).
batch_size = 32

train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=od_collate_fn,
)

val_dataloader = data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=od_collate_fn,
)

# Keyed by phase name so the training loop can pick a loader with dict[phase].
dataloaders_dict = dict(train=train_dataloader, val=val_dataloader)


### Create SSD300 model

In [5]:
from utils.ssd_model import SSD

# Configuration of the SSD300 model. The six-element lists hold one entry per
# feature map listed in 'feature_maps'.
ssd_cfg = {
    'num_classes': 21,  # 20 VOC classes + 1 (presumably background -- confirm in ssd_model.py)
    'input_size': 300,  # must match the 300x300 input size used by the datasets
    'bbox_aspect_num': [4, 6, 6, 6, 4, 4],
    'feature_maps': [38, 19, 10, 5, 3, 1],
    'steps': [8, 16, 32, 64, 100, 300],
    'min_sizes': [30, 60, 111, 162, 213, 264],
    'max_sizes': [60, 111, 162, 213, 264, 315],
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
}

# Build the SSD network in training mode.
net = SSD(phase="train", cfg=ssd_cfg)

# Load weights into the VGG part of SSD.
# map_location='cpu' makes the load succeed on CPU-only machines even if the
# checkpoint was saved from CUDA tensors; the whole network is moved to the
# selected device later, inside train_model.
vgg_weights = torch.load('./weights/vgg16_reducedfc.pth', map_location='cpu')
net.vgg.load_state_dict(vgg_weights)

# initialize weights of SSD with He's method


def weights_init(m: nn.Module) -> None:
    """Re-initialise a single module in place; intended for ``Module.apply``.

    ``nn.Conv2d`` modules get He (Kaiming) normal weights and, when they have
    a bias, a bias filled with zeros. Any other module type is left untouched.
    """
    if not isinstance(m, nn.Conv2d):
        return

    init.kaiming_normal_(m.weight.data)

    bias = m.bias
    if bias is not None:  # Conv2d may be built with bias=False
        init.constant_(bias, 0.0)


# Run weights_init over the extras, loc and conf sub-networks of the SSD model.
for ssd_part in (net.extras, net.loc, net.conf):
    ssd_part.apply(weights_init)

# Report which device PyTorch will use (GPU when available, otherwise CPU).
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device in use：", device)

print('finish network configuration: loaded weight files')


使用デバイス： cuda:0
ネットワーク設定完了：学習済みの重みをロードしました


### Definition of loss function and optimizer

In [6]:
from utils.ssd_model import MultiBoxLoss

# Loss function: the project's MultiBoxLoss, created on the selected device.
# (jaccard_thresh / neg_pos are presumably the IoU matching threshold and the
# negative:positive mining ratio -- confirm in utils/ssd_model.py.)
criterion = MultiBoxLoss(jaccard_thresh=0.5, neg_pos=3, device=device)

# Optimizer: SGD with momentum and weight decay over all network parameters.
sgd_settings = dict(lr=1e-3, momentum=0.9, weight_decay=5e-4)
optimizer = optim.SGD(net.parameters(), **sgd_settings)


### Run the SSD training

In [7]:
# training function

def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net`` for exactly ``num_epochs`` epochs, validating every 10th epoch.

    Args:
        net: network to train; called as ``net(images)`` and moved to the
            selected device in place.
        dataloaders_dict: mapping with the keys ``'train'`` and ``'val'``; each
            value is an iterable yielding ``(images, targets)`` batches, where
            ``images`` supports ``.to(device)`` and ``targets`` is an iterable
            of objects supporting ``.to(device)``.
        criterion: callable ``criterion(outputs, targets)`` returning the pair
            ``(loss_l, loss_c)``; the optimised loss is their sum.
        optimizer: optimizer that updates the parameters of ``net``.
        num_epochs: number of epochs to run.

    Side effects:
        * Enables ``torch.backends.cudnn.benchmark``.
        * Rewrites ``log_output.csv`` after every epoch with the summed train
          and val losses. ``val_loss`` is 0.0 for epochs without validation.
        * Saves ``net.state_dict()`` to ``weights/ssd300_<epoch>.pth`` after
          every 10th epoch.
    """

    # confirm the availability of GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device in use:", device)

    # copy network to the selected device
    net.to(device)

    # let cuDNN benchmark and pick convolution algorithms; input size is fixed
    torch.backends.cudnn.benchmark = True

    # running counters
    iteration = 1  # global training-iteration counter (never reset)
    epoch_train_loss = 0.0  # sum of train losses in the current epoch
    epoch_val_loss = 0.0  # sum of val losses in the current epoch
    logs = []

    # epoch loop: range(num_epochs) so that exactly num_epochs epochs run
    # (the "Epoch i/num_epochs" banner below then ends at num_epochs/num_epochs)
    for epoch in range(num_epochs):

        # store start time
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        # train and val loop in the epoch
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # set the training mode
                print('（train）')
            else:
                if((epoch+1) % 10 == 0):
                    net.eval()   # set the evaluation mode
                    print('-------------')
                    print('（val）')
                else:
                    # validation is performed only once every 10 epochs
                    continue

            # loop for extracting mini-batches from the DataLoader
            for images, targets in dataloaders_dict[phase]:

                # send images and targets to the selected device
                images = images.to(device)
                targets = [ann.to(device)
                           for ann in targets]

                # reset gradients accumulated by the previous step
                optimizer.zero_grad()

                # track gradients only while training
                with torch.set_grad_enabled(phase == 'train'):
                    # forward
                    outputs = net(images)

                    # calc loss
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    # back-prop in the training mode
                    if phase == 'train':
                        loss.backward()  # calc gradient

                        # clamp every gradient element to [-2.0, 2.0] so a
                        # single bad batch cannot produce an exploding update
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()  # parameter update

                        if (iteration % 10 == 0):  # display loss once per 10 iter
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('iteration {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    # in validation
                    else:
                        epoch_val_loss += loss.item()

        # summed losses and elapsed time of the epoch
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

        # save log (the CSV is rewritten in full after each epoch)
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        # reset the per-epoch loss sums
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        # save network every 10 epochs
        if ((epoch+1) % 10 == 0):
            torch.save(net.state_dict(), 'weights/ssd300_' +
                       str(epoch+1) + '.pth')


In [8]:
# Launch training (validation runs inside train_model on every 10th epoch).
num_epochs = 50
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs)

使用デバイス： cuda:0
-------------
Epoch 1/50
-------------
（train）
イテレーション 10 || Loss: 16.7849 || 10iter: 52.2679 sec.
イテレーション 20 || Loss: 12.0788 || 10iter: 25.0179 sec.
イテレーション 30 || Loss: 10.9953 || 10iter: 25.4926 sec.
イテレーション 40 || Loss: 9.8858 || 10iter: 25.0565 sec.
イテレーション 50 || Loss: 8.6146 || 10iter: 24.8988 sec.
イテレーション 60 || Loss: 8.1224 || 10iter: 24.7498 sec.
イテレーション 70 || Loss: 8.5834 || 10iter: 25.5584 sec.
イテレーション 80 || Loss: 8.2935 || 10iter: 24.9817 sec.
イテレーション 90 || Loss: 8.2462 || 10iter: 25.1121 sec.
イテレーション 100 || Loss: 7.5155 || 10iter: 24.8603 sec.
イテレーション 110 || Loss: 7.7157 || 10iter: 25.1244 sec.
イテレーション 120 || Loss: 7.5915 || 10iter: 25.6062 sec.
イテレーション 130 || Loss: 7.7106 || 10iter: 24.9809 sec.
イテレーション 140 || Loss: 7.7460 || 10iter: 24.4395 sec.
イテレーション 150 || Loss: 7.8148 || 10iter: 24.9344 sec.
イテレーション 160 || Loss: 7.3453 || 10iter: 25.3215 sec.
イテレーション 170 || Loss: 7.1660 || 10iter: 24.7397 sec.
-------------
epoch 1 || Epoch_TRAIN_Loss:1642.0417 ||Epoch_