<a href="https://colab.research.google.com/github/Ilvecoding0912/Robotic-3D-Detection-In-Surgery/blob/main/DETR_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DETR training process


Clone our fork of the DETR repository, which has been adapted to our dataset.

In [1]:
# Fetch the DETR fork and switch the working directory into it, so that the
# imports in the training cell (`main`, `util`, `datasets`, `engine`, `models`)
# are resolved relative to the cloned repository.
# NOTE: this cell is not idempotent. Re-running it in the same session makes
# `git clone` fail (target directory already exists) and `%cd` is then evaluated
# relative to the already-changed working directory. Restart the runtime first.
!git clone https://github.com/Ilvecoding0912/DETR_Robotic_Surgery.git
%cd DETR_Robotic_Surgery

Cloning into 'DETR_Robotic_Surgery'...
remote: Enumerating objects: 118, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 118 (delta 10), reused 0 (delta 0), pack-reused 80[K
Receiving objects: 100% (118/118), 2.46 MiB | 20.63 MiB/s, done.
Resolving deltas: 100% (28/28), done.
/content/DETR_Robotic_Surgery


Download DETR weights

In [2]:
import gdown

# Google Drive link to the DETR weights file used by this notebook.
url = 'https://drive.google.com/uc?id=1HV2Tit0CsVeYKHugjx8QxROPegQa3AV-'
# Save the file as 'detr_weights.pth' in the current working directory.
# quiet=True suppresses gdown's progress output; the value returned by the call
# is what the cell displays as its output.
gdown.download(url,'detr_weights.pth',quiet=True)

'detr_weights.pth'

The cell below reproduces the training-related part of `main.py`.
(The evaluation part is omitted.)

In [3]:
from main import get_args_parser
import argparse
import torch
import time
import random
import datetime
import json
from torch.utils.data import DataLoader, DistributedSampler
import util.misc as utils
from datasets import build_dataset, get_coco_api_from_dataset
from pathlib import Path
import numpy as np
from engine import evaluate, train_one_epoch
from models import build_model
from datasets.coco import *
import os

def main(epochs=5, batch_size=2, train_length=4,
         output_dir='./outputs', endovis_path='endovis17'):
    """Train DETR on our dataset (training-only portion of ``main.py``).

    Builds the argument namespace from ``get_args_parser()`` defaults, overrides
    the notebook-specific settings, builds the model/optimizer/scheduler and the
    training data loader, then runs ``train_one_epoch`` for every epoch. After
    each epoch a checkpoint is written and the training statistics are appended
    as one JSON line to ``<output_dir>/log.txt``.

    Args:
        epochs: Total number of epochs to train (stored in ``args.epochs``).
        batch_size: Batch size passed to the ``BatchSampler``.
        train_length: Value forwarded as ``length=`` to ``EnvidosDataset``.
            NOTE(review): presumably caps the number of training samples (the
            recorded run shows 2 iterations per epoch with ``length=4`` and
            ``batch_size=2``) -- confirm against ``datasets/coco.py``.
        output_dir: Directory for checkpoints and ``log.txt``. An empty string
            disables checkpointing and logging.
        endovis_path: Path to the dataset root handed to ``EnvidosDataset``.

    Returns:
        None. Results are written to ``output_dir`` and progress is printed.
    """
    parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
    # Parse an empty argv: inside a notebook sys.argv belongs to the kernel,
    # so we start from the parser defaults and override fields explicitly.
    args = parser.parse_args([])
    args.output_dir = output_dir  # Results dir
    args.endovis_path = endovis_path  # Path to our dataset
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # training parameters
    args.start_epoch = 0
    args.epochs = epochs  # total number of epochs
    args.batch_size = batch_size

    # Create the output directory if it does not exist. exist_ok avoids the
    # check-then-create race (e.g. several processes starting at once) and
    # parents=True allows nested output paths.
    out_dir = Path(args.output_dir)
    if args.output_dir:
        out_dir.mkdir(parents=True, exist_ok=True)

    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so that processes differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # create model
    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # Keep a handle on the bare model for parameter groups and checkpoints.
    # In distributed mode the model used for training must be wrapped in DDP,
    # otherwise every rank would train independently without gradient sync.
    model_without_ddp = model
    if args.distributed:
        # NOTE(review): assumes init_distributed_mode() sets args.gpu, as the
        # upstream DETR util/misc.py does -- confirm for this fork.
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Two parameter groups: everything except the backbone uses args.lr,
    # the backbone uses its own learning rate args.lr_backbone.
    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # Our dataset class (defined in datasets/coco.py)
    dataset_train = EnvidosDataset(args.endovis_path, transforms=make_coco_transforms('train'),
                                   mode='train', length=train_length)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    # drop_last=True: an incomplete final batch is discarded.
    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)

    #---------------------- Training Process ----------------------
    print("Start training")
    start_time = time.time()

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Makes the distributed sampler reshuffle differently each epoch.
            sampler_train.set_epoch(epoch)

        #----------------- main training function (can be seen in 'engine.py') -----------------
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)

        lr_scheduler.step()

        # Save trained models: one checkpoint per epoch, plus an extra copy
        # tagged with the learning rate at every lr drop and every 5th epoch.
        if args.output_dir:
            checkpoint_paths = [out_dir / f'checkpoint{epoch:04}.pth']
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(out_dir / f'checkpoint{epoch:04}_lr{args.lr}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        # Append one JSON line per epoch; only the main process writes the log.
        if args.output_dir and utils.is_main_process():
            with (out_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

# Run the training loop defined above; progress is printed below and
# checkpoints / log.txt are written to the output directory set inside main().
main()

Not using distributed mode
git:
  sha: 620f6db10fc39148252ca1b758d02b916d9cdce1, status: has uncommited changes, branch: main

Namespace(lr=0.0001, lr_backbone=1e-05, batch_size=2, weight_decay=0.0001, epochs=5, lr_drop=200, clip_max_norm=0.1, frozen_weights=None, model=True, backbone='resnet50', dilation=False, position_embedding='sine', enc_layers=6, dec_layers=6, dim_feedforward=2048, hidden_dim=256, dropout=0.1, nheads=8, num_queries=100, pre_norm=False, masks=False, aux_loss=True, set_cost_class=1, set_cost_bbox=5, set_cost_giou=2, mask_loss_coef=1, dice_loss_coef=1, bbox_loss_coef=5, giou_loss_coef=2, eos_coef=0.01, dataset_file='endovis17', coco_path=None, endovis_path='endovis17', coco_panoptic_path=None, remove_difficult=False, output_dir='./outputs', device='cuda', seed=42, resume='', start_epoch=0, eval=False, num_workers=2, world_size=1, dist_url='env://', distributed=False)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 284MB/s]


number of params: 41281037
Start training
Epoch: [0]  [0/2]  eta: 0:00:17  lr: 0.000100  class_error: 66.67  loss: 22.4941 (22.4941)  loss_ce: 2.3403 (2.3403)  loss_bbox: 0.5941 (0.5941)  loss_giou: 0.4833 (0.4833)  loss_ce_0: 2.3685 (2.3685)  loss_bbox_0: 1.0949 (1.0949)  loss_giou_0: 0.5298 (0.5298)  loss_ce_1: 2.3933 (2.3933)  loss_bbox_1: 0.8353 (0.8353)  loss_giou_1: 0.5557 (0.5557)  loss_ce_2: 2.3758 (2.3758)  loss_bbox_2: 0.8234 (0.8234)  loss_giou_2: 0.6742 (0.6742)  loss_ce_3: 2.4173 (2.4173)  loss_bbox_3: 0.8771 (0.8771)  loss_giou_3: 0.5837 (0.5837)  loss_ce_4: 2.3404 (2.3404)  loss_bbox_4: 0.7217 (0.7217)  loss_giou_4: 0.4855 (0.4855)  loss_ce_unscaled: 2.3403 (2.3403)  class_error_unscaled: 66.6667 (66.6667)  loss_bbox_unscaled: 0.1188 (0.1188)  loss_giou_unscaled: 0.2416 (0.2416)  cardinality_error_unscaled: 97.0000 (97.0000)  loss_ce_0_unscaled: 2.3685 (2.3685)  loss_bbox_0_unscaled: 0.2190 (0.2190)  loss_giou_0_unscaled: 0.2649 (0.2649)  cardinality_error_0_unscaled: 97