In [1]:
import argparse
import os
import pprint
import subprocess
import sys

In [2]:
import numpy as np
import torch
# import torch.multiprocessing
# torch.multiprocessing.set_sharing_strategy('file_system')
from torch.utils.data import DataLoader

In [3]:
from COTR.models import build_model
from COTR.utils import debug_utils, utils
# from COTR.datasets import cotr_dataset
from COTR.trainers.cotr_trainer_Ver1 import COTRTrainer
from COTR.global_configs import general_config
from COTR.options.options import *
from COTR.options.options_utils import *
from COTR_DatasetLidarCamera_Ver5 import DatasetLidarCameraKittiOdometry
from utils import (mat2xyzrpy, merge_inputs, overlay_imgs, quat2mat,
                   quaternion_from_matrix, rotate_back, rotate_forward,
                   tvector2mat)

In [4]:
# Seed value handed to the project's randomness helper so repeated runs use the
# same seed. NOTE(review): presumably this seeds python/numpy/torch RNGs --
# confirm against COTR.utils.utils.fix_randomness.
RANDOM_SEED = 0
utils.fix_randomness(RANDOM_SEED)
# np.set_printoptions(threshold=sys.maxsize)

In [5]:
def train(opt):
    """Build the COTR model and the KITTI odometry loaders, then run training.

    Prints diagnostics (environment variables, ``nvidia-smi`` output, visible
    GPUs), moves the model to the current CUDA device, wraps the train/val
    splits of ``DatasetLidarCameraKittiOdometry`` in ``DataLoader`` objects,
    creates an Adam optimizer with one parameter group per sub-module and
    hands everything to ``COTRTrainer``.

    Args:
        opt: Parsed options namespace. This function itself reads
            ``batch_size``, ``shuffle_data``, ``workers``, ``learning_rate``
            and ``lr_backbone``; ``opt`` is also forwarded unchanged to
            ``build_model`` and ``COTRTrainer``.

    Returns:
        None. Training runs for its side effects inside ``trainer.train()``.
    """
    # NOTE(review): this dumps the whole process environment into the cell
    # output; clear the output before sharing if it may contain secrets.
    pprint.pprint(dict(os.environ), width=1)

    # subprocess.run waits for the child and closes the pipe, unlike a bare
    # Popen whose stdout was read once and then abandoned.
    try:
        smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE)
    except FileNotFoundError:
        # The GPU status dump is purely informational; do not abort training
        # just because the binary is not on PATH.
        print("nvidia-smi not found; skipping GPU status dump")
    else:
        print(smi.stdout.decode())

    device = torch.cuda.current_device()
    print(f'can see {torch.cuda.device_count()} gpus')
    print(f'current using gpu at {device} -- {torch.cuda.get_device_name(device)}')
    torch.cuda.empty_cache()

    model = build_model(opt)
    model = model.to(device)

    # Both splits share the same root and constructor settings; only `split`
    # differs. NOTE(review): max_r / max_t presumably bound the rotation /
    # translation perturbation -- confirm against the dataset class.
    dataset_class = DatasetLidarCameraKittiOdometry
    dataset_root = "/mnt/data/kitti_odometry"
    dataset_kwargs = dict(max_r=10.0, max_t=0.2, use_reflectance=False,
                          val_sequence='00')
    dataset_train = dataset_class(dataset_root, split='train', **dataset_kwargs)
    dataset_val = dataset_class(dataset_root, split='val', **dataset_kwargs)

    train_dataset_size = len(dataset_train)
    val_dataset_size = len(dataset_val)
    print('Number of the train dataset: {}'.format(train_dataset_size))
    print('Number of the val dataset: {}'.format(val_dataset_size))

    # The two loaders are configured identically.
    # NOTE(review): the validation loader also follows opt.shuffle_data --
    # confirm that shuffling validation batches is intended.
    loader_kwargs = dict(batch_size=opt.batch_size,
                         shuffle=opt.shuffle_data,
                         num_workers=opt.workers,
                         worker_init_fn=utils.worker_init_fn,
                         collate_fn=merge_inputs,
                         drop_last=False,
                         pin_memory=True)
    train_loader = DataLoader(dataset_train, **loader_kwargs)
    val_loader = DataLoader(dataset_val, **loader_kwargs)

    # One parameter group per sub-module, all at the main learning rate.
    optim_list = [
        {"params": module.parameters(), "lr": opt.learning_rate}
        for module in (model.transformer, model.corr_embed,
                       model.query_proj, model.input_proj)
    ]
    # The backbone is only optimized when it is given a positive learning rate.
    if opt.lr_backbone > 0:
        optim_list.append({"params": model.backbone.parameters(), "lr": opt.lr_backbone})
    optim = torch.optim.Adam(optim_list)

    # The fourth positional argument is passed as None, as in the original
    # call. NOTE(review): presumably the LR scheduler slot -- confirm against
    # COTRTrainer's signature.
    trainer = COTRTrainer(opt, model, optim, None, train_loader, val_loader)
    trainer.train()
    print("train epoch end")

In [6]:
if __name__ == "__main__":
    # Entry point: declare the options, parse them with notebook-safe defaults,
    # fill in the derived/hard-coded run settings, then start training.
    parser = argparse.ArgumentParser()
    set_general_arguments(parser)
    set_dataset_arguments(parser)
    set_nn_arguments(parser)
    set_COTR_arguments(parser)
    parser.add_argument('--num_kp', type=int,
                        default=10000)
    parser.add_argument('--kp_pool', type=int,
                        default=100)
    parser.add_argument('--enable_zoom', type=str2bool,
                        default=False)
    parser.add_argument('--zoom_start', type=float,
                        default=1.0)
    parser.add_argument('--zoom_end', type=float,
                        default=0.1)
    parser.add_argument('--zoom_levels', type=int,
                        default=10)
    parser.add_argument('--zoom_jitter', type=float,
                        default=0.5)
    parser.add_argument('--out_dir', type=str, default=general_config['out'], help='out directory')
    parser.add_argument('--tb_dir', type=str, default=general_config['tb_out'], help='tensorboard runs directory')
    parser.add_argument('--learning_rate', type=float,
                        default=1e-5, help='learning rate')
    parser.add_argument('--lr_backbone', type=float,
                        default=1e-5, help='backbone learning rate')
    parser.add_argument('--batch_size', type=int,
                        default=1, help='batch size for training')
    parser.add_argument('--cycle_consis', type=str2bool, default=True,
                        help='cycle consistency')
    parser.add_argument('--bidirectional', type=str2bool, default=False,
                        help='left2right and right2left')
    parser.add_argument('--max_iter', type=int,
                        default=500000, help='total training iterations')
    parser.add_argument('--valid_iter', type=int,
                        default=1000, help='iterval of validation')
    parser.add_argument('--resume', type=str2bool, default=False,
                        help='resume training with same model name')
    parser.add_argument('--cc_resume', type=str2bool, default=False,
                        help='resume from last run if possible')
    parser.add_argument('--need_rotation', type=str2bool, default=False,
                        help='rotation augmentation')
    parser.add_argument('--max_rotation', type=float, default=0,
                        help='max rotation for data augmentation')
    parser.add_argument('--rotation_chance', type=float, default=0,
                        help='the probability of being rotated')
    parser.add_argument('--load_weights', type=str, default=None, help='load a pretrained set of weights, you need to provide the model id')
    parser.add_argument('--suffix', type=str, default='', help='model suffix')

    # Parse an empty argument list so the kernel's own argv (ipykernel's
    # "-f <connection file>") is not mistaken for options; every option
    # therefore takes its default.
    opt = parser.parse_args(args=[])
    opt.command = ' '.join(sys.argv)

    # Feed-forward width is chosen by the backbone layer named in opt.layer.
    layer_2_channels = {'layer1': 256,
                        'layer2': 512,
                        'layer3': 1024,
                        'layer4': 2048, }
    opt.dim_feedforward = layer_2_channels[opt.layer]
    opt.num_queries = opt.num_kp

    # Single source of truth for the run identifier; the output and
    # tensorboard directories are derived from it so they cannot drift apart.
    run_id = '20220608'
    opt.out_dir = '/root/work/COTR/out'
    opt.tb_dir = '/root/work/COTR/out/tb'
    # Kept with its leading slash exactly as before. Note that
    # os.path.join(base, opt.name) would discard `base` because of it.
    opt.name = '/' + run_id
    opt.out = os.path.join(opt.out_dir, 'model', run_id)
    opt.tb_out = os.path.join(opt.tb_dir, run_id)

    # cc_resume: resume only when a checkpoint from a previous run exists.
    if opt.cc_resume:
        if os.path.isfile(os.path.join(opt.out, 'checkpoint.pth.tar')):
            print('resuming from last run')
            opt.load_weights = None
            opt.resume = True
        else:
            opt.resume = False

    # Loading pretrained weights and resuming a run are mutually exclusive.
    assert not (opt.load_weights and opt.resume), \
        '--load_weights and --resume cannot be used together'

    if opt.load_weights:
        # NOTE(review): this resolves under opt.out_dir, while this script
        # stores checkpoints under opt.out_dir/model -- confirm the intended
        # location before relying on --load_weights.
        opt.load_weights_path = os.path.join(opt.out_dir, opt.load_weights, 'checkpoint.pth.tar')
    if opt.resume:
        opt.load_weights_path = os.path.join(opt.out, 'checkpoint.pth.tar')

    # Either ask for interactive confirmation of the options or just print them.
    if opt.confirm:
        confirm_opt(opt)
    else:
        print_opt(opt)
    train(opt)

---------------------- OPTIONS ----------------------

                 backbone  resnet50
               batch_size  1
            bidirectional  False
                cc_resume  False
                  command  /root/venv/lib/python3.8/site-packages/ipykernel_launcher.py -f /root/.local/share/jupyter/runtime/kernel-915b0257-4f60-45f3-9819-405e95914faa.json
                  confirm  True
                 crop_cam  crop_center_and_resize
             cycle_consis  True
             dataset_name  megadepth
               dec_layers  6
                 dilation  False
          dim_feedforward  1024
                  dropout  0.1
              enable_zoom  False
               enc_layers  6
               hidden_dim  256
               info_level  rgbd
                   k_size  1
                  kp_pool  100
                    layer  layer3
            learning_rate  1e-05
             load_weights  None
        load_weights_path  /root/work/COTR/out/model/20220324_1/checkpoint.pth.

OK to continue? [y/n]  y


{'CLICOLOR': '1',
 'CRC32C_SW_MODE': 'auto',
 'DISPLAY': '204a4a851509:10.0',
 'GIT_PAGER': 'cat',
 'HOME': '/root',
 'JPY_PARENT_PID': '82521',
 'KMP_DUPLICATE_LIB_OK': 'True',
 'KMP_INIT_AT_FORK': 'FALSE',
 'LANG': 'en.UTF-8',
 'LANGUAGE': 'en.UTF-8',
 'LC_ADDRESS': 'C.UTF-8',
 'LC_CTYPE': 'C.UTF-8',
 'LC_IDENTIFICATION': 'C.UTF-8',
 'LC_MEASUREMENT': 'C.UTF-8',
 'LC_MONETARY': 'C.UTF-8',
 'LC_NAME': 'C.UTF-8',
 'LC_NUMERIC': 'C.UTF-8',
 'LC_PAPER': 'C.UTF-8',
 'LC_TELEPHONE': 'C.UTF-8',
 'LC_TIME': 'C.UTF-8',
 'LESSCLOSE': '/usr/bin/lesspipe '
              '%s '
              '%s',
 'LESSOPEN': '| '
             '/usr/bin/lesspipe '
             '%s',
 'LOGNAME': 'root',
 'LS_COLORS': 'rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=0

Train:   0%|                                              | 0/3 [00:00<?, ?it/s]

weights safely loaded
---------------------- NOTIFICATION ----------------------

Loaded pretrained weights from /root/work/COTR/out/model/20220324_1/checkpoint.pth.tar

----------------------------------------------------



Train epoch=5:  13%|███▉                           | 4945/39011 [00:00<?, ?it/s][A
Train epoch=5:  13%|██▌                 | 4946/39011 [00:07<69:46:55,  7.37s/it][A
Train epoch=5:  13%|██▌                 | 4947/39011 [00:08<37:30:16,  3.96s/it][A
Train epoch=5:  13%|██▌                 | 4948/39011 [00:09<23:32:45,  2.49s/it][A
Train epoch=5:  13%|██▌                 | 4949/39011 [00:10<16:33:35,  1.75s/it][A
Train epoch=5:  13%|██▌                 | 4950/39011 [00:11<13:25:18,  1.42s/it][A
Train epoch=5:  13%|██▌                 | 4951/39011 [00:11<11:14:59,  1.19s/it][A
Train epoch=5:  13%|██▋                  | 4952/39011 [00:12<9:44:50,  1.03s/it][A
Train epoch=5:  13%|██▋                  | 4953/39011 [00:13<9:03:50,  1.04it/s][A
Train epoch=5:  13%|██▌                 | 4954/39011 [00:14<14:56:59,  1.58s/it][A
Train:   0%|                                              | 0/3 [00:18<?, ?it/s]


KeyboardInterrupt: 