<a href="https://colab.research.google.com/github/Ilvecoding0912/DETR_Robotic_Surgery/blob/main/SWIN_DETR_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DETR training process


Clone DETR Repo after changing to our dataset

In [1]:
!git clone https://github.com/Ilvecoding0912/DETR_Robotic_Surgery.git
%cd DETR_Robotic_Surgery

Cloning into 'DETR_Robotic_Surgery'...
remote: Enumerating objects: 130, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 130 (delta 15), reused 0 (delta 0), pack-reused 80[K
Receiving objects: 100% (130/130), 2.52 MiB | 2.20 MiB/s, done.
Resolving deltas: 100% (33/33), done.
/content/DETR_Robotic_Surgery


Download DETR weights

In [2]:
import gdown

url = 'https://drive.google.com/uc?id=1HV2Tit0CsVeYKHugjx8QxROPegQa3AV-'
gdown.download(url,'detr_weights.pth',quiet=True)

'detr_weights.pth'

In [3]:
!pip -q install timm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
from timm import create_model
swintran = create_model("swin_base_patch4_window7_224", pretrained=True)
for param in swintran.parameters():
    param.requires_grad = False

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

In [5]:
def replace_text_in_file(search_path, replace_path, search_text=None, replace_text=None):
    with open(search_path, 'r') as file: content = file.read()
    modified_content = content.replace(search_text, replace_text)
    with open(replace_path, 'w') as file: file.write(modified_content)

file_path = '/content/DETR_Robotic_Surgery/datasets/coco.py'
replace_text_in_file(file_path, file_path, search_text='1024', replace_text='224')

In [8]:
%cd /content/DETR_Robotic_Surgery
from collections import OrderedDict
import torch
from torch import nn
from typing import Dict, List
from util.misc import NestedTensor
import torch.nn.functional as F
from models.position_encoding import build_position_encoding
from timm import create_model


class backboneSWIN(nn.Module):

    def __init__(self, ):
        super().__init__()
        self.swintran = create_model("swin_base_patch4_window7_224", pretrained=True)
        self.swintran.eval()
        for param in self.swintran.parameters():
            param.requires_grad = False



    def forward(self, tensor_list: NestedTensor):
        xs = OrderedDict()
        with torch.no_grad():
            x = self.swintran.patch_embed(tensor_list.tensors)
            x = self.swintran.layers(x)
            xs['0'] = self.swintran.norm(x).permute(0,3,1,2)

        # print('out swin features', xs['0'].shape)
        out: Dict[str, NestedTensor] = {}
        for name, x in xs.items():
            m = tensor_list.mask
            # print('m', m.shape)
            assert m is not None
            import torch.nn.functional as F
            mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0]
            # mask = F.upsample_nearest(m[None].float(),size=x.shape[-2:]).to(torch.bool)[0]
            # print('mobarak',name, x.shape, mask.shape)
            out[name] = NestedTensor(x, mask)
        return out

class Joiner(nn.Sequential):
    def __init__(self, backbone, position_embedding):
        super().__init__(backbone, position_embedding)

    def forward(self, tensor_list: NestedTensor):
        xs = self[0](tensor_list)
        out: List[NestedTensor] = []
        pos = []
        for name, x in xs.items():
            out.append(x)
            # position encoding
            pos.append(self[1](x).to(x.tensors.dtype))

        return out, pos

def build_backbone(args):
    position_embedding = build_position_encoding(args)
    train_backbone = args.lr_backbone > 0
    return_interm_layers = args.masks
    backbone = backboneSWIN()#Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
    model = Joiner(backbone, position_embedding)
    # model.num_channels = backbone.num_channels
    return model

/content/DETR_Robotic_Surgery


Part of content related to training in 'main.py'.
(We ignore the evaluation part.)

In [11]:
%cd /content/DETR_Robotic_Surgery
from main import get_args_parser
import argparse
import torch
import time
import random
import datetime
import json
from torch.utils.data import DataLoader, DistributedSampler
import util.misc as utils
from datasets import build_dataset, get_coco_api_from_dataset
from pathlib import Path
import numpy as np
from engine import evaluate, train_one_epoch
from models import build_model
from datasets.coco import *
import os

def main():

    parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
    args = parser.parse_args([])
    args.output_dir = './outputs' # Results dir
    args.endovis_path = 'endovis17' # Path to our dataset
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # training parameters
    args.start_epoch = 0
    args.epochs = 1 # total number of epoch
    args.batch_size = 1

    # create output directoty if not exist
    if os.path.exists(args.output_dir) is False:
        os.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # create model
    model, criterion, postprocessors = build_model(args)
    model.to(device)
    # print(model)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                    weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # Our dataset class (initialize in datasets->coco.py)
    dataset_train = EnvidosDataset(args.endovis_path, transforms=make_coco_transforms('train'), mode='train', length=4)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                    collate_fn=utils.collate_fn, num_workers=args.num_workers)

    #---------------------- Training Process ----------------------
    print("Start training")
    start_time = time.time()

    sam_backbone = build_backbone(args)#BackboneBase()
    sam_backbone.to(device)
    sam_backbone.eval()
    model_without_ddp.backbone = sam_backbone
    model_without_ddp.input_proj = nn.Conv2d(1024, model_without_ddp.transformer.d_model, kernel_size=1)
    model_without_ddp.to(device)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)

        #----------------- main training function (can be seen in 'engine.py') -----------------
        train_stats = train_one_epoch(
            model_without_ddp, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)

        lr_scheduler.step()

        # Save trained models
        output_dir = Path(args.output_dir)
        if args.output_dir:
            checkpoint_paths = [output_dir / f'checkpoint{epoch:04}.pth']
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}_lr{args.lr}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                        'epoch': epoch,
                        'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")


    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))


main()

/content/DETR_Robotic_Surgery
Not using distributed mode
git:
  sha: 51daa00dc19ce10ff76760f95f88499fa80b48c3, status: has uncommited changes, branch: main

Namespace(lr=0.0001, lr_backbone=1e-05, batch_size=1, weight_decay=0.0001, epochs=1, lr_drop=200, clip_max_norm=0.1, frozen_weights=None, model=True, backbone='resnet50', dilation=False, position_embedding='sine', enc_layers=6, dec_layers=6, dim_feedforward=2048, hidden_dim=256, dropout=0.1, nheads=8, num_queries=100, pre_norm=False, masks=False, aux_loss=True, set_cost_class=1, set_cost_bbox=5, set_cost_giou=2, mask_loss_coef=1, dice_loss_coef=1, bbox_loss_coef=5, giou_loss_coef=2, eos_coef=0.01, dataset_file='endovis17', coco_path=None, endovis_path='endovis17', coco_panoptic_path=None, remove_difficult=False, output_dir='./outputs', device='cuda', seed=42, resume='', start_epoch=0, eval=False, num_workers=2, world_size=1, dist_url='env://', distributed=False)




number of params: 41281037
Start training
out swin features torch.Size([1, 1024, 7, 7])
Epoch: [0]  [0/4]  eta: 0:00:01  lr: 0.000100  class_error: 33.33  loss: 40.7313 (40.7313)  loss_ce: 2.2733 (2.2733)  loss_bbox: 2.9468 (2.9468)  loss_giou: 1.7204 (1.7204)  loss_ce_0: 2.5862 (2.5862)  loss_bbox_0: 2.7820 (2.7820)  loss_giou_0: 1.3801 (1.3801)  loss_ce_1: 2.4352 (2.4352)  loss_bbox_1: 2.4938 (2.4938)  loss_giou_1: 1.2280 (1.2280)  loss_ce_2: 2.4121 (2.4121)  loss_bbox_2: 3.0030 (3.0030)  loss_giou_2: 1.4809 (1.4809)  loss_ce_3: 2.3199 (2.3199)  loss_bbox_3: 3.1853 (3.1853)  loss_giou_3: 1.4462 (1.4462)  loss_ce_4: 2.2419 (2.2419)  loss_bbox_4: 3.0485 (3.0485)  loss_giou_4: 1.7476 (1.7476)  loss_ce_unscaled: 2.2733 (2.2733)  class_error_unscaled: 33.3333 (33.3333)  loss_bbox_unscaled: 0.5894 (0.5894)  loss_giou_unscaled: 0.8602 (0.8602)  cardinality_error_unscaled: 97.0000 (97.0000)  loss_ce_0_unscaled: 2.5862 (2.5862)  loss_bbox_0_unscaled: 0.5564 (0.5564)  loss_giou_0_unscaled: 0.6

In [None]:
import torchvision.transforms as T
import numpy as np

invTrans = T.Compose([ T.Normalize(mean = [ 0., 0., 0. ],
                                                     std = [ 1/0.229, 1/0.224, 1/0.225 ]),
                                T.Normalize(mean = [ -0.485, -0.456, -0.406 ],
                                                     std = [ 1., 1., 1. ]),
                               ])

img1 = invTrans(samples.tensors)
print(img1.shape)
print(img1[0].permute(1,2,0).shape)
img = img1[0].permute(1,2,0).cpu()
# ([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

print(img.max(), img.min())
img_cv = np.array(img*255).astype(np.uint8)
im_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
cv2.imwrite('sample.png', im_rgb)