In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter

import numpy as np
from tqdm import tqdm
import argparse

import os
import shutil
import math
from time import time

from utils.post_process import cal_absolute_from_relative, plot_from_pose
from utils.misc import to_var, adjust_learning_rate, pre_create_file_train, pre_create_file_test, \
    display_loss_tb, display_loss_tb_val

##### load path

In [2]:
# Dataset and output locations.
# The KITTI image root can be overridden with the KITTI_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original hard-coded path is used.
dir_data = os.environ.get('KITTI_DATA_DIR', '/media/czy/DATA/Share/Kitti/color')
dir_label = './dataset'  # handed to KITTIDataSet as dir_label
model_dir = './model'  # root of the checkpoint restore path; also given to pre_create_file_train
log_dir = './log'  # given to pre_create_file_train together with model_dir


In [3]:

# Command-line style configuration for the whole notebook.
parser = argparse.ArgumentParser()
parser.add_argument('--server', default=None, type=int, help='[6099 / 6199 / 6499]')
parser.add_argument('--net_architecture', default='cnn', help='[cnn / cnn-tb / cnn-iks / cnn-lstm]')
parser.add_argument("--samples", default='i0', help='samples for train')
parser.add_argument('--phase', default='Train', help='[Train / Test]')
parser.add_argument('--resume', default='No', help='[Yes / No] for cnn, [cnn / lstm / No] for cnn-lstm')

# Parameters selecting the checkpoint to restore
parser.add_argument('--net_restore', default='cnn-vo', help='Restore net name')
parser.add_argument('--dir_restore', default='20220323', help='Restore file name')
parser.add_argument('--model_restore', default='model-4', help='Restore model-id')

# Training parameters. The trailing numbers are the original author's notes,
# presumably the values used for a full training run (TODO confirm).
parser.add_argument('--net_name', default='cnn-vo', help='[cnn-vo / cnn-vo-cons / cnn-lstm-vo / cnn-lstm-vo-cons]')
parser.add_argument('--dir0', default='20220323', help='Name it with date, such as 20220323')
parser.add_argument('--batch_size', default=2, type=int, help='Batch size')  # 32
parser.add_argument('--epoch_max', default=5, type=int, help='Max epoch')  # 200
parser.add_argument('--epoch_test', default=2, type=int, help='Test epoch during train process')  # 10
# The help text used to read 'Max epoch number', which described a different option.
parser.add_argument('--epoch_save', default=2, type=int, help='Save a checkpoint every # epochs')  # 10
parser.add_argument('--lr_base', default=1e-4, type=float, help='Base learning rate')
parser.add_argument('--lr_decay_rate', default=0.316, type=float, help='Decay rate of lr')
parser.add_argument('--epoch_lr_decay', default=2, type=int, help='Every # epoch, lr decay lr_decay_rate')  # 30
parser.add_argument('--beta', default=10, type=int, help='loss = loss_t + beta * loss_r')

# LSTM parameters
parser.add_argument('--img_pairs', default=10, type=int, help='Image pairs')
parser.add_argument('--si', default=3, type=int, help='Start interval')
parser.add_argument('--num_layer', default=2, type=int, help='Lstm layer number')
parser.add_argument('--hidden_size', default=1024, type=int, help='Lstm hidden units')

parser.add_argument("--gpu", default='0', help='GPU id list')
# Original note: use 0 workers on Windows; on Linux choose according to the machine.
parser.add_argument("--workers", default=2, type=int, help='Workers number')
# parse_known_args ignores unrecognised arguments instead of exiting
# (presumably so the extra arguments a notebook kernel is launched with do not break parsing).
args = parser.parse_known_args()[0]

In [4]:
from net.cnn import Net
from dataset.kitti import KITTIDataSet

In [5]:
# Make float32 the default floating-point dtype. This replaces the deprecated
# torch.set_default_tensor_type('torch.FloatTensor') call.
torch.set_default_dtype(torch.float32)
model = Net()
if torch.cuda.is_available():
    print("Using GPU Device!")
    # Move the weights to the GPU first, then wrap the network in DataParallel.
    model = nn.DataParallel(model.cuda())
else:
    print("Using CPU Device!")

  xavier_normal(m.weight.data)


Using GPU Device!


In [6]:
def run_batch(sample, model, loss_func=None, optimizer=None, phase=None):
    """
    Run a single batch through the model.

    Training / validation:
        run_batch(sample, model, loss_func, optimizer, phase='Train')
        run_batch(sample, model, loss_func, phase='Valid')
        Returns (loss, loss1, loss2, predicted pose), where loss1 is computed on
        output columns 0-2 (printed as t_loss), loss2 on columns 3-5 (printed as
        r_loss) and loss = loss1 + args.beta * loss2. The three losses are Python
        floats; the prediction is a detached tensor.
    Testing (any other phase):
        run_batch(sample, model, phase='Test')
        Returns the detached predicted pose only.

    Only phase='Train' puts the model in train mode and updates the weights.
    Gradient tracking is switched off for every other phase, so validation and
    test forward passes do not build an autograd graph.

    The progress line reads the notebook-level globals ``epoch``, ``step`` and
    ``step_per_epoch`` that the training cells define, so the Train / Valid
    phases must be run after those exist.
    """
    is_train = phase == 'Train'
    if is_train:
        model.train()
    else:
        model.eval()  # evaluation mode (disables dropout)

    with torch.set_grad_enabled(is_train):
        # Original note: cnn inputs are [bs, 6, H, W], cnn-lstm inputs are [N, T, 6, H, W].
        img1 = to_var(sample['img1'])
        img2 = to_var(sample['img2'])
        label_pre = model(img1, img2)  # [bs, 6] per the original note

        if phase != 'Train' and phase != 'Valid':
            return label_pre.detach()

        label = to_var(sample['label']).view(-1, 6)
        loss1 = loss_func(label_pre[:, :3], label[:, :3])
        loss2 = loss_func(label_pre[:, 3:], label[:, 3:])
        loss = loss1 + args.beta * loss2

    print("epoch: {:03d} \t step: {:d} \t 'step_per_epoch:' {:d} \t t_loss: {:.4f} \t r_loss: {:.4f} \t loss:{:.4f}"
          .format(epoch+1, step+1, step_per_epoch, loss1, loss2, loss))

    if is_train:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()  # back-propagate to compute gradients
        optimizer.step()  # apply the gradients

    return loss.item(), loss1.item(), loss2.item(), label_pre.detach()


In [7]:
def run_val(model, loss_func, loader):
    """
    Run validation over every batch yielded by ``loader`` and return the mean losses.

    Each batch is evaluated with run_batch(..., phase='Valid'); the returned
    prediction is discarded. The loop runs under torch.no_grad() because no
    backward pass is performed during validation.

    Returns:
        (loss_mean, loss1_mean, loss2_mean): np.mean of the per-batch total loss
        and of the two partial losses returned by run_batch.
    """
    loss_ret = []
    loss1_ret = []
    loss2_ret = []

    with torch.no_grad():
        for sample_v in loader:
            loss_v, loss1_v, loss2_v, _ = run_batch(sample=sample_v, model=model,
                                                    loss_func=loss_func, phase='Valid')
            loss_ret.append(loss_v)
            loss1_ret.append(loss1_v)
            loss2_ret.append(loss2_v)

    return np.mean(loss_ret), np.mean(loss1_ret), np.mean(loss2_ret)


In [8]:
def run_test(model, seq, dir_model=None, epoch=None, dir_time=None):
    """
    Evaluate the model on one complete sequence, either periodically during
    training or as the actual test run.

    Steps:
    1. Predict the relative pose for every sample of sequence ``seq``.
       This (cnn) version reads the data through a DataLoader. The original note
       says the cnn-lstm variant reads images by hand so that a tail shorter than
       one batch can be handled; that variant is not implemented here.
    2. Convert the relative poses to absolute poses and plot the trajectory.
       When args.phase == 'Test' the relative poses, the absolute poses and the
       plot are written to ``dir_time``; otherwise only the plot is written, to
       ``dir_model``, tagged with ``epoch``.

    Raises:
        ValueError: if the output directory needed for the current phase was not
            given. This is checked before inference so that a forgotten argument
            does not waste a full pass over the sequence.
    """
    testing = args.phase == 'Test'
    dir_save = dir_time if testing else dir_model
    if dir_save is None:
        raise ValueError("run_test needs {:s} when args.phase is '{:s}'".format(
            'dir_time' if testing else 'dir_model', str(args.phase)))

    print('\nTest sequence {:02d} >>>'.format(seq))
    data_set = KITTIDataSet(dir_data=dir_data, dir_label=dir_label, phase='Test', seq=seq)
    loader = DataLoader(data_set, batch_size=args.batch_size, shuffle=False, num_workers=args.workers)

    pose_ret = []
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for sample_batch in tqdm(loader):
            pose_pre = run_batch(sample=sample_batch, model=model, phase='Test')
            pose_ret.extend(pose_pre.cpu().numpy())
    pose_abs = cal_absolute_from_relative(pose_ret)

    if testing:
        np.savetxt(os.path.join(dir_save, 'pose_{:d}.txt'.format(seq)), pose_ret)
        np.savetxt(os.path.join(dir_save, '{:02d}.txt'.format(seq)), pose_abs)
        plot_from_pose(seq=seq, dir_save=dir_save, pose_abs=pose_abs, args=args)
        print('Save pose and trajectory in {:s}'.format(dir_save))
    else:
        plot_from_pose(seq=seq, dir_save=dir_save, pose_abs=pose_abs, epoch=epoch, args=args)
        print('Save trajectory in {:s}'.format(dir_save))

In [9]:
# Restore weights when resuming training or when testing; otherwise keep the
# freshly initialised model.
if args.resume == 'Yes' or args.phase == 'Test':
    # Checkpoint path: <model_dir>/<net_restore>/<dir_restore>/<model_restore>.pkl
    dir_restore = model_dir + '/' + args.net_restore + '/' + args.dir_restore + '/' + args.model_restore + '.pkl'
    print('Restore from CNN: {:s}'.format(dir_restore))
    # map_location='cpu' lets a checkpoint saved on a GPU be read on a CPU-only
    # machine; load_state_dict then copies the values into the model's own parameters.
    # NOTE(review): torch.load unpickles the file - only restore checkpoints you trust.
    # NOTE(review): a state dict saved from an nn.DataParallel model has 'module.'-prefixed
    # keys, so it still has to be loaded into a DataParallel-wrapped model - confirm.
    model.load_state_dict(torch.load(dir_restore, map_location='cpu'))
else:
    print('Initialize from scratch')

Initialize from scratch


In [10]:
# Objects needed only for training: output folders, TensorBoard writer, loss,
# optimizer, data loaders and the step bookkeeping used by the training loop.
if args.phase == 'Train':
    dir_model, dir_log = pre_create_file_train(model_dir, log_dir, args)
    writer = SummaryWriter(dir_log)
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_base)

    data_set_t = KITTIDataSet(dir_data=dir_data, dir_label=dir_label, samples=args.samples, phase='Train')
    data_set_v = KITTIDataSet(dir_data=dir_data, dir_label=dir_label, phase='Valid')
    loader_t = DataLoader(data_set_t, batch_size=args.batch_size, shuffle=True, num_workers=args.workers)
    loader_v = DataLoader(data_set_v, batch_size=args.batch_size, shuffle=False, num_workers=args.workers)

    # Number of batches the loader really yields per epoch. The loader keeps the
    # last partial batch, so flooring len(dataset) / batch_size would under-count
    # by one and make the global step of consecutive epochs overlap.
    step_per_epoch = len(loader_t)
    print(step_per_epoch)
    # Validate about three times per epoch; never 0, because the training loop
    # uses this value as a modulus.
    step_val = max(1, step_per_epoch // 3)

2270


In [11]:
# Main training loop: one pass over loader_t per epoch, with periodic logging,
# validation, trajectory tests and checkpointing.
display_every = 100  # log to the console / TensorBoard every this many training steps

for epoch in range(args.epoch_max):
    adjust_learning_rate(optimizer, epoch, args.lr_base,
                         args.lr_decay_rate, args.epoch_lr_decay)

    # Test complete sequences and plot their trajectories (skipped before the first epoch).
    if epoch != 0 and epoch % args.epoch_test == 0:
        run_test(model, seq=9, dir_model=dir_model, epoch=epoch)
        run_test(model, seq=5, dir_model=dir_model, epoch=epoch)

    # Per-epoch history of the losses returned by run_batch.
    loss_list = []
    loss1_list = []
    loss2_list = []
    for step, sample_t in enumerate(loader_t):
        step_global = epoch * step_per_epoch + step
        tic = time()
        loss, loss1, loss2, _ = \
            run_batch(sample=sample_t, model=model,
                      loss_func=loss_func, optimizer=optimizer, phase='Train')
        # Extrapolate the duration of this step to a whole epoch, in hours.
        hour_per_epoch = step_per_epoch * ((time() - tic) / 3600)
        loss_list.append(loss)
        loss1_list.append(loss1)
        loss2_list.append(loss2)

        # Display and add to TensorBoard.
        if (step + 1) % display_every == 0:
            display_loss_tb(hour_per_epoch, epoch, args, step, step_per_epoch, optimizer, loss, loss1,
                            loss2, loss_list, loss1_list, loss2_list, writer, step_global)

        # step_val can be 0 for a very small training set; skip validation then
        # instead of dividing by zero.
        if step_val and (step + 1) % step_val == 0:
            batch_v = int(math.ceil(len(data_set_v) / loader_v.batch_size))
            loss_v, loss1_v, loss2_v = run_val(model, loss_func, loader_v)
            display_loss_tb_val(batch_v, loss_v, loss1_v,
                                loss2_v, args, writer, step_global)

    # Save every epoch_save epochs and always after the final epoch. The old
    # final-epoch check compared epoch with epoch_max, which the loop never
    # reaches, so the last model was lost whenever epoch_max was not a multiple
    # of epoch_save.
    is_last_epoch = (epoch + 1) == args.epoch_max
    if (epoch + 1) % args.epoch_save == 0 or is_last_epoch:
        print(
            '\nSaving model: {:s}/model-{:d}.pkl'.format(dir_model, epoch + 1))
        torch.save(model.state_dict(), (dir_model +
                   '/model-{:d}.pkl'.format(epoch + 1)))


epoch: 001 	 step: 1 	 'step_per_epoch:' 2270 	 t_loss: 0.1072 	 r_loss: 0.2184 	 loss:2.2913
epoch: 001 	 step: 2 	 'step_per_epoch:' 2270 	 t_loss: 0.1546 	 r_loss: 0.1088 	 loss:1.2430

0.111 [001/005] [002/2270] lr 0.0001000: 1.2430(1.7672)=0.1546(0.1309)+10*0.1088(0.1636)
epoch: 001 	 step: 3 	 'step_per_epoch:' 2270 	 t_loss: 0.2827 	 r_loss: 0.0768 	 loss:1.0508
epoch: 001 	 step: 4 	 'step_per_epoch:' 2270 	 t_loss: 0.2005 	 r_loss: 0.9428 	 loss:9.6283

0.112 [001/005] [004/2270] lr 0.0001000: 9.6283(3.5534)=0.2005(0.1862)+10*0.9428(0.3367)
epoch: 001 	 step: 5 	 'step_per_epoch:' 2270 	 t_loss: 0.1411 	 r_loss: 0.5921 	 loss:6.0617
epoch: 001 	 step: 6 	 'step_per_epoch:' 2270 	 t_loss: 0.0442 	 r_loss: 0.0651 	 loss:0.6951

0.111 [001/005] [006/2270] lr 0.0001000: 0.6951(3.4950)=0.0442(0.1550)+10*0.0651(0.3340)
epoch: 001 	 step: 7 	 'step_per_epoch:' 2270 	 t_loss: 0.0636 	 r_loss: 0.0318 	 loss:0.3819
epoch: 001 	 step: 8 	 'step_per_epoch:' 2270 	 t_loss: 0.0734 	 r_loss:

100%|██████████| 795/795 [05:50<00:00,  2.27it/s]
100%|██████████| 1590/1590 [00:00<00:00, 2014.70it/s]


OSError: dataset/ground-truth/09.txt not found.