In [1]:
!python --version

Python 3.10.12


In [2]:
import sys

# Put the MonoDTR checkout from the Kaggle input dataset on the import path.
# NOTE(review): presumably this is what makes the `visualDet3D` package imported
# in later cells resolvable - confirm against the dataset layout.
sys.path.append("/kaggle/input/monodtr-library-ver2/MonoDTR")

In [3]:
# Install the requirements file shipped with the MonoDTR checkout, plus two
# extra packages that later cells import directly (`coloredlogs`, `ptflops`).
# NOTE(review): none of these installs pin a version here, so a re-run may
# resolve to different releases.
!pip install -qq -r /kaggle/input/monodtr-library-ver2/MonoDTR/requirement.txt
!pip install -qq coloredlogs
!pip install -qq ptflops

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Extend the import path with three more Kaggle input directories.
# NOTE(review): the scripts directory presumably provides `_path_init`, which the
# training cell star-imports; the other two look like weight/checkpoint datasets
# and may not need to be importable at all - confirm.
sys.path.append("/kaggle/input/monodtr-library-ver2/MonoDTR/scripts")
sys.path.append("/kaggle/input/pretrain-monodtr-base")
sys.path.append("/kaggle/input/checkpoint")

In [5]:
import os
import sys
import torch

# Directory holding the pre-built deformable-convolution extension (.so file).
DCN_OPS_DIR = '/kaggle/input/monodtr-library-ver2/MonoDTR/visualDet3D/networks/lib/ops/dcn'

# Make the extension importable by module name.
sys.path.append(DCN_OPS_DIR)

# List the directory so the compiled .so can be checked by eye.
print(os.listdir(DCN_OPS_DIR))

# The .so is built for one specific interpreter, so show which one is running.
print("Python version:", sys.version)

# Prepend the directory to LD_LIBRARY_PATH, keeping whatever was already set.
previous_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
os.environ['LD_LIBRARY_PATH'] = DCN_OPS_DIR + ':' + previous_ld_path

# Try the import and report the outcome instead of aborting the notebook.
try:
    import deform_conv_ext
except ImportError as e:
    print("Import failed:", e)
else:
    print("Import successful!")

# If needed, run the setup or make script
# !python setup.py build
# !python setup.py install
# or
# !bash make.sh


['deform_conv.py', 'deform_conv_ext.cpython-310-x86_64-linux-gnu.so', 'make.sh', 'src', '__init__.py', 'setup.py']
Python version: 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
Import successful!


In [6]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, which one is active.
if not torch.cuda.is_available():
    print("CUDA is not available. Check your settings.")
else:
    print("CUDA is available!")
    print("Number of GPUs available:", torch.cuda.device_count())
    print("Current GPU:", torch.cuda.current_device())
    active_gpu = torch.cuda.current_device()
    print("GPU Name:", torch.cuda.get_device_name(active_gpu))


CUDA is available!
Number of GPUs available: 1
Current GPU: 0
GPU Name: Tesla P100-PCIE-16GB


In [7]:
# Smoke-test GPU allocation: create a large tensor on the GPU, report the
# allocator statistics, then free it so it does not occupy device memory for
# the rest of the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    x = torch.randn(10000, 10000, device=device)  # 1e8 elements; ~400 MB as float32
    print("Tensor created on GPU.")
    print("Allocated GPU Memory:", torch.cuda.memory_allocated(device))
    print("Cached GPU Memory:", torch.cuda.memory_reserved(device))
    # Drop the probe tensor and hand the cached block back to the driver;
    # otherwise it stays resident while the model trains.
    del x
    torch.cuda.empty_cache()
else:
    # The statistics above are CUDA-only, and allocating the probe on the CPU
    # would say nothing about the GPU.
    print("CUDA is not available; skipping the GPU allocation check.")


Tensor created on GPU.
Allocated GPU Memory: 400556032
Cached GPU Memory: 400556032


In [8]:
import torch.nn as nn
import torch
import torch
from torch.nn import Module, Dropout
from einops import rearrange
import torch.nn.functional as F
from ptflops import get_model_complexity_info

In [9]:
# -------------------------- RMSNorm --------------------------
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension.

    Each feature vector is divided by ``sqrt(mean(x**2) + eps)`` and then
    multiplied by a learnable per-feature gain ``scale`` (initialised to ones).

    Args:
        dim: Size of the last (feature) dimension.
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Normalise ``x`` along its last axis; the output has the same shape as ``x``."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalised = x * inv_rms
        return self.scale * normalised

# -------------------------- FLIP Module --------------------------
def flip(x, dim):
    """Return ``x`` with its element order reversed along dimension ``dim``."""
    reversed_axes = (dim,)
    return torch.flip(x, dims=reversed_axes)

In [10]:
# -------------------------- BiMamba2 Block (Conv Fusion) --------------------------
class BiMamba2(nn.Module):
    """Bidirectional gated mixing block over a ``(B, L, D)`` sequence.

    The input is projected into a shared stream ``x``, a forward stream, a
    backward stream and a gate ``z``. Each direction is fused with ``x``, passed
    through kernel-size-1 convolutions with GELU activations, multiplied by the
    gate and RMS-normalised. The backward direction works on the sequence
    reversed along ``L`` and is reversed back before the two directions are
    summed and linearly projected.

    Note: ``conv1d_zf`` and ``conv1d_ab_z`` are registered but not used by
    ``forward``. They belong to a disabled additive-gating variant and are kept
    so the parameter set (and ``state_dict`` keys) stays the same.
    """

    def __init__(self, dim):
        super().__init__()
        # Input projections.
        self.linear_x = nn.Linear(dim, dim)
        self.linear_bf = nn.Linear(dim, dim)
        self.linear_bb = nn.Linear(dim, dim)
        self.linear_z = nn.Linear(dim, dim)

        # Fuse [direction stream, shared stream] (2D) back down to D.
        self.concat_proj_f = nn.Linear(2 * dim, dim)
        self.concat_proj_b = nn.Linear(2 * dim, dim)

        # Kernel-size-1 convolutions operating on (B, D, L).
        self.conv1d_f = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_b = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_zf = nn.Conv1d(dim, dim, kernel_size=1)    # unused in forward
        self.conv1d_ab_z = nn.Conv1d(dim, dim, kernel_size=1)  # unused in forward

        self.ssms_f = nn.Sequential(
            nn.Conv1d(dim, dim, 1),
            nn.GELU(),
            nn.Conv1d(dim, dim, 1),
        )
        self.ssms_b = nn.Sequential(
            nn.Conv1d(dim, dim, 1),
            nn.GELU(),
            nn.Conv1d(dim, dim, 1),
        )

        self.norm_f = RMSNorm(dim)
        self.norm_b = RMSNorm(dim)
        self.output_linear = nn.Linear(dim, dim)

    def forward(self, u):
        """Mix a ``(B, L, D)`` input and return a tensor of the same shape."""
        shared = self.linear_x(u)       # (B, L, D)
        fwd_stream = self.linear_bf(u)  # (B, L, D)
        bwd_stream = self.linear_bb(u)  # (B, L, D)
        gate = self.linear_z(u)         # (B, L, D)

        # Concatenate each direction with the shared stream, project 2D -> D.
        fwd_tokens = self.concat_proj_f(torch.cat([fwd_stream, shared], dim=-1))  # (B, L, D)
        bwd_tokens = self.concat_proj_b(torch.cat([bwd_stream, shared], dim=-1))  # (B, L, D)

        # ---- forward direction ----
        fwd = F.gelu(self.conv1d_f(fwd_tokens.transpose(1, 2)))  # (B, D, L)
        fwd = self.ssms_f(fwd).transpose(1, 2)                   # (B, L, D)
        fwd = self.norm_f(fwd * gate)                            # (B, L, D)

        # ---- backward direction (sequence reversed along L) ----
        bwd_reversed = flip(bwd_tokens, dim=1)                     # (B, L, D)
        bwd = F.gelu(self.conv1d_b(bwd_reversed.transpose(1, 2)))  # (B, D, L)
        gate_reversed = flip(gate, dim=1)                          # (B, L, D)
        bwd = self.ssms_b(bwd).transpose(1, 2)                     # (B, L, D)
        bwd = self.norm_b(bwd * gate_reversed)                     # (B, L, D)
        bwd = flip(bwd, dim=1)                                     # back to original order

        return self.output_linear(fwd + bwd)                       # (B, L, D)

In [11]:
# -------------------------- CrossMamba2 Module --------------------------
class CrossMamba2(nn.Module):
    """Two-input variant of the bidirectional gated mixing block.

    The shared stream ``x`` and the gate ``z`` are projected from ``u1``, while
    the forward/backward streams are projected from ``u2``. The rest of the
    pipeline (fuse, kernel-size-1 convolutions with GELU, gate, RMS-normalise,
    sum, project) follows the same steps in both directions. The backward
    direction works on the sequence reversed along ``L``.

    Both inputs must have the same ``(B, L, D)`` shape, because their
    projections are concatenated along the feature axis.

    Note: ``conv1d_zf`` and ``conv1d_ab_z`` are registered but not used by
    ``forward``. They belong to a disabled additive-gating variant and are kept
    so the parameter set (and ``state_dict`` keys) stays the same.
    """

    def __init__(self, dim):
        super().__init__()
        # Input projections: x and z read u1, bf and bb read u2.
        self.linear_x = nn.Linear(dim, dim)
        self.linear_z = nn.Linear(dim, dim)
        self.linear_bf = nn.Linear(dim, dim)
        self.linear_bb = nn.Linear(dim, dim)

        # Fuse [direction stream, shared stream] (2D) back down to D.
        self.concat_proj_f = nn.Linear(2 * dim, dim)
        self.concat_proj_b = nn.Linear(2 * dim, dim)

        # Kernel-size-1 convolutions operating on (B, D, L).
        self.conv1d_f = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_b = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_zf = nn.Conv1d(dim, dim, kernel_size=1)    # unused in forward
        self.conv1d_ab_z = nn.Conv1d(dim, dim, kernel_size=1)  # unused in forward

        self.ssms_f = nn.Sequential(
            nn.Conv1d(dim, dim, 1),
            nn.GELU(),
            nn.Conv1d(dim, dim, 1),
        )
        self.ssms_b = nn.Sequential(
            nn.Conv1d(dim, dim, 1),
            nn.GELU(),
            nn.Conv1d(dim, dim, 1),
        )

        self.norm_f = RMSNorm(dim)
        self.norm_b = RMSNorm(dim)
        self.output_linear = nn.Linear(dim, dim)

    def forward(self, u1, u2):
        """Mix ``u1`` (context) with ``u2`` (depth); both and the result are ``(B, L, D)``."""
        shared = self.linear_x(u1)        # (B, L, D)
        gate = self.linear_z(u1)          # (B, L, D)
        fwd_stream = self.linear_bf(u2)   # (B, L, D)
        bwd_stream = self.linear_bb(u2)   # (B, L, D)

        # Concatenate each direction with the shared stream, project 2D -> D.
        fwd_tokens = self.concat_proj_f(torch.cat([fwd_stream, shared], dim=-1))  # (B, L, D)
        bwd_tokens = self.concat_proj_b(torch.cat([bwd_stream, shared], dim=-1))  # (B, L, D)

        # ---- forward direction ----
        fwd = F.gelu(self.conv1d_f(fwd_tokens.transpose(1, 2)))  # (B, D, L)
        fwd = self.ssms_f(fwd).transpose(1, 2)                   # (B, L, D)
        fwd = self.norm_f(fwd * gate)                            # (B, L, D)

        # ---- backward direction (sequence reversed along L) ----
        bwd_reversed = flip(bwd_tokens, dim=1)                     # (B, L, D)
        # The activation stays in (B, D, L) and goes straight into the next
        # convolution stack, so no intermediate transpose pair is needed.
        bwd = F.gelu(self.conv1d_b(bwd_reversed.transpose(1, 2)))  # (B, D, L)
        gate_reversed = flip(gate, dim=1)                          # (B, L, D)
        bwd = self.ssms_b(bwd).transpose(1, 2)                     # (B, L, D)
        bwd = self.norm_b(bwd * gate_reversed)                     # (B, L, D)
        bwd = flip(bwd, dim=1)                                     # back to original order

        return self.output_linear(fwd + bwd)                       # (B, L, D)

In [12]:
# -------------------------- FFN + Norm --------------------------
class FeedForward(nn.Module):
    """Two-layer MLP applied over the last dimension: Linear -> GELU -> Linear.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the intermediate layer.
    """

    def __init__(self, dim, hidden_dim):
        super().__init__()
        layers = [
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP; the output has the same shape as ``x``."""
        out = self.net(x)
        return out

In [13]:
# -------------------------- Depth Aware Mamba (DTR-Compatible Input) --------------------------
class DepthAwareMamba(nn.Module):
    """Encoder/decoder fusion of context and depth token sequences.

    The encoder refines ``context_feat`` (BiMamba2 + residual + LayerNorm, then
    FFN + residual + LayerNorm). The decoder refines ``depth_feat`` (BiMamba2 +
    residual + LayerNorm), mixes it with the encoded context through
    ``CrossMamba2``, and finishes with LayerNorm, FFN + residual and LayerNorm.

    Args:
        output_channel_num: Feature size ``D`` of every token.
    """

    def __init__(self, output_channel_num):
        super().__init__()
        self.output_channel_num = output_channel_num
        width = self.output_channel_num

        # Encoder (context branch).
        self.encoder_bimamba = BiMamba2(width)
        self.encoder_ffn = FeedForward(width, width * 2)
        self.encoder_norm1 = nn.LayerNorm(width)
        self.encoder_norm2 = nn.LayerNorm(width)

        # Decoder (depth branch + cross mixing).
        self.decoder_bimamba = BiMamba2(width)
        self.cross_bimamba = CrossMamba2(width)
        self.decoder_ffn = FeedForward(width, width * 2)
        self.decoder_norm1 = nn.LayerNorm(width)
        self.decoder_norm2 = nn.LayerNorm(width)
        self.decoder_norm3 = nn.LayerNorm(width)

    def forward(self, depth_feat, context_feat, depth_pos=None):
        """Fuse depth and context tokens.

        Args:
            depth_feat: Depth tokens, ``(B, L, D)``.
            context_feat: Context tokens, ``(B, L, D)``.
            depth_pos: Optional positional term added to ``context_feat``.

        Returns:
            Fused tokens, ``(B, L, D)``.
        """
        # Only the caller-supplied tensors can be non-contiguous; every
        # intermediate below is a freshly computed tensor.
        depth_feat = depth_feat.contiguous()
        context_feat = context_feat.contiguous()
        if depth_pos is not None:
            context_feat = context_feat + depth_pos.contiguous()

        # ---- encoder on the context tokens ----
        enc = self.encoder_norm1(self.encoder_bimamba(context_feat) + context_feat)
        enc = self.encoder_norm2(enc + self.encoder_ffn(enc))

        # ---- decoder on the depth tokens ----
        dec = self.decoder_norm1(self.decoder_bimamba(depth_feat) + depth_feat)

        # ---- cross mixing: u1 = encoded context, u2 = decoded depth ----
        fused = self.decoder_norm2(self.cross_bimamba(enc, dec))
        fused = self.decoder_norm3(fused + self.decoder_ffn(fused))
        return fused


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CSA(nn.Module):
    """
    Cross-Scale Attention (CSA) module.

    Projects both inputs to a common channel dimension ``c``, lets every
    finer-resolution position attend over all coarser-resolution positions
    (plain dot-product softmax attention, no scaling factor), and adds the
    attended values to the projected query as a residual.

    Key and value deliberately share one projection (``proj_coarser``).

    :param c1: Channel count of the finer-resolution input.
    :param c2: Channel count of the coarser-resolution input.
    :param c: Common channel count after projection (also the output channels).
    """
    def __init__(self, c1, c2, c):
        super(CSA, self).__init__()
        self.proj_query = nn.Linear(c1, c)
        self.proj_coarser = nn.Linear(c2, c)  # Shared for key and value

    def forward(self, x1, x2):
        """
        :param x1: Finer-resolution input (B, C1, H1, W1)
        :param x2: Coarser-resolution input (B, C2, H2, W2)
        :return: Augmented finer-resolution feature (B, c, H1, W1)
        """
        B, C1, H1, W1 = x1.shape
        _, C2, H2, W2 = x2.shape

        # Channels-last, then flatten the spatial grid into a token axis.
        flat1 = x1.permute(0, 2, 3, 1).reshape(B, H1 * W1, C1)  # (B, H1*W1, C1)
        flat2 = x2.permute(0, 2, 3, 1).reshape(B, H2 * W2, C2)  # (B, H2*W2, C2)

        query = self.proj_query(flat1)                           # (B, H1*W1, c)
        # Key and value come from the same projection of the same input, so
        # compute it once and reuse it instead of running the layer twice.
        key = value = self.proj_coarser(flat2)                   # (B, H2*W2, c)

        energy = torch.bmm(query, key.permute(0, 2, 1))          # (B, H1*W1, H2*W2)
        attention = F.softmax(energy, dim=-1)                    # rows sum to 1 over coarse positions

        out = torch.bmm(attention, value)                        # (B, H1*W1, c)

        out = out + query                                        # residual on the projected query
        out = out.reshape(B, H1, W1, -1).permute(0, 3, 1, 2)     # (B, c, H1, W1)

        return out

In [15]:
class MSR(nn.Module):
    """
    Multi-Scale Refinement (MSR) module without depth head for feature refinement only.

    Optionally upsamples a coarser feature map, merges it with a finer one by
    addition, and refines the result with two 1x1 convolution + ReLU stages.

    :param d: Channel count of the inputs and of the refined output.
    """
    def __init__(self, d=256):
        super(MSR, self).__init__()
        self.mlp1 = nn.Sequential(nn.Conv2d(d, d, kernel_size=1), nn.ReLU(inplace=True))
        self.mlp2 = nn.Sequential(nn.Conv2d(d, d, kernel_size=1), nn.ReLU(inplace=True))

    def forward(self, coarser=None, finer=None):
        """
        :param coarser: Optional lower-resolution feature (B, d, h, w).
        :param finer: Optional higher-resolution feature (B, d, H, W).
        :return: Refined feature. Its spatial size is that of ``finer`` when it
                 is given, otherwise twice the size of ``coarser``.
        :raises ValueError: If both inputs are ``None``.
        """
        if coarser is None and finer is None:
            # Without this check the failure surfaces inside the first
            # convolution with an unrelated-looking error.
            raise ValueError("MSR.forward requires at least one of 'coarser' or 'finer'")

        if coarser is not None:
            # Match the finer map when there is one; otherwise upsample by 2x.
            target_size = finer.shape[2:] if finer is not None else (coarser.shape[2] * 2, coarser.shape[3] * 2)
            up = F.interpolate(coarser, size=target_size, mode='bilinear', align_corners=False)
            if finer is not None:
                feature = finer + up
            else:
                feature = up
        else:
            feature = finer
        feature = self.mlp1(feature)
        feature = self.mlp2(feature)
        # Only the refined feature is returned (no depth prediction).
        return feature

In [16]:
from visualDet3D.networks.backbones.dla import dla102
from visualDet3D.networks.backbones.dlaup import DLAUp
from visualDet3D.networks.detectors.dfe import DepthAwareFE
from visualDet3D.networks.detectors.dpe import DepthAwarePosEnc
from visualDet3D.networks.detectors.dtr import DepthAwareTransformer
import math
import time
import csv

In [17]:
import timm

class MonoDTRCore(nn.Module):
    """Feature extractor: Swin-T backbone, cross-scale fusion, depth-aware transformer.

    The backbone's feature pyramid is fused top-down with two ``CSA`` + ``MSR``
    stages into one 256-channel map, which then feeds the depth-aware feature
    enhancement (``dfe``), positional encoding (``dpe``) and transformer
    (``dtr``) modules imported from ``visualDet3D``.

    Args:
        backbone_arguments: Accepted for interface compatibility; not read by
            this implementation.
    """

    # Input resolution the backbone is built for; ``forward`` rejects anything else.
    INPUT_SIZE = (288, 1280)

    def __init__(self, backbone_arguments=dict()):
        super(MonoDTRCore, self).__init__()

        # Swin-T backbone with features_only=True to ensure feature pyramid output.
        # pretrained=True makes timm fetch the pretrained weights.
        self.backbone = timm.create_model('swin_tiny_patch4_window7_224', pretrained=True,features_only=True, img_size=self.INPUT_SIZE)
        self.first_level = 1  # Kept for reference, but no longer used

        # Feature pyramid channels from Swin-T
        self.channels = [96, 192, 384, 768]
        self.output_channel_num = 256

        # The 1/8 stage attends over the already-fused 256-channel map.
        self.csa_1_8 = CSA(192, self.output_channel_num, self.output_channel_num)   # 1/8 with fused 1/16 output
        self.csa_1_16 = CSA(384, 768, self.output_channel_num)  # 1/16 with 1/32

        # MSR modules for upsampling and refinement
        self.msr_levels = nn.ModuleList([MSR(self.output_channel_num) for _ in range(2)])

        self.dpe = DepthAwarePosEnc(self.output_channel_num)
        self.depth_embed = nn.Embedding(100, self.output_channel_num)
        self.dtr = DepthAwareTransformer(self.output_channel_num)
        self.dfe = DepthAwareFE(self.output_channel_num)
        self.img_conv = nn.Conv2d(self.output_channel_num, self.output_channel_num, kernel_size=3, padding=1)

    def forward(self, x):
        """
        Forward pass of MonoDTRCore.

        Input:
            x: dict with key 'image' containing (N, 3, 288, 1280); other keys are ignored.

        Output:
            feat: (N, C, H', W') - transformer output reshaped to the fused map's
                  grid (1/8 of the input resolution per the backbone shape notes below).
            depth: depth prediction exactly as returned by ``self.dfe``.
                  NOTE(review): its resolution is decided inside DepthAwareFE and
                  is not visible here - confirm before relying on a shape.

        Raises:
            ValueError: missing 'image' key, non-4D image, or channel count != 3.
            AssertionError: spatial size differs from ``INPUT_SIZE``.
        """
        if 'image' not in x:
            raise ValueError("Input dictionary must contain 'image' key")
        img = x['image']  # (N, 3, H, W)

        # Rank must be validated before any shape index is read; otherwise a
        # malformed tensor fails on the indexing instead of this message.
        if img.dim() != 4:
            raise ValueError(f"Expected 4D input tensor (N, C, H, W), got shape {img.shape}")

        expected_h, expected_w = self.INPUT_SIZE
        assert img.shape[2] == expected_h and img.shape[3] == expected_w, f"Expected shape [N, 3, {expected_h}, {expected_w}], got {img.shape}"

        if img.shape[1] != 3:
            raise ValueError(
                f"Expected input image to have 3 channels (RGB), got {img.shape[1]} channels. "
                "Please check your data pipeline to ensure the input is correctly formatted as (N, 3, H, W). "
                "If the input has 7 channels (e.g., multi-spectral), preprocess it to 3 channels or modify the backbone."
            )

        # Extract features using standard forward method
        features = self.backbone(img)  # [(N, H/4, W/4, 96), (N, H/8, W/8, 192), (N, H/16, W/16, 384), (N, H/32, W/32, 768)]

        # Permute features from (N, H, W, C) to (N, C, H, W)
        features = [f.permute(0, 3, 1, 2) for f in features]  # [(N, 96, H/4, W/4), (N, 192, H/8, W/8), (N, 384, H/16, W/16), (N, 768, H/32, W/32)]

        # Refine features progressively with CSA and MSR.
        fused = self.csa_1_16(features[2], features[3])  # (N, 256, H/16, W/16)
        # With no finer map supplied, MSR upsamples by 2x: 1/16 -> 1/8.
        prev_feat = self.msr_levels[0](fused, None)

        fused = self.csa_1_8(features[1], prev_feat)     # (N, 256, H/8, W/8)
        prev_feat = self.msr_levels[1](fused, prev_feat)

        feat_map = prev_feat

        # Proceed with downstream processing
        N, C, H, W = feat_map.shape  # H, W are the reduced (1/8) resolution
        depth, depth_guide, depth_feat = self.dfe(feat_map)
        # Flatten every map to a (N, H*W, C) token sequence for the transformer.
        depth_feat = depth_feat.permute(0, 2, 3, 1).view(N, H * W, C)
        depth_guide = depth_guide.argmax(1)  # per-pixel index used to look up an embedding
        depth_emb = self.depth_embed(depth_guide).view(N, H * W, C)
        depth_emb = self.dpe(depth_emb, (H, W))
        img_feat = feat_map + self.img_conv(feat_map)
        img_feat = img_feat.permute(0, 2, 3, 1)
        img_feat = img_feat.view(N, H * W, C)
        feat = self.dtr(depth_feat, img_feat, depth_emb)
        feat = feat.permute(0, 2, 1).view(N, C, H, W)

        return feat, depth

In [18]:
from visualDet3D.networks.heads.detection_3d_head import AnchorBasedDetection3DHead
from visualDet3D.networks.heads.depth_losses import bin_depths, DepthFocalLoss
from visualDet3D.networks.utils.registry import DETECTOR_DICT

In [19]:
class MonoDTR(nn.Module):
    """Detector wrapper: ``MonoDTRCore`` features + anchor-based 3D detection head.

    ``forward`` dispatches to ``train_forward`` (returns losses) or
    ``test_forward`` (returns detections) depending on the input it receives.

    Args:
        network_cfg: Config object providing ``obj_types``, ``head`` (keyword
            arguments for the detection head) and ``mono_backbone``.
    """

    def __init__(self, network_cfg):
        super(MonoDTR, self).__init__()

        self.obj_types = network_cfg.obj_types

        self.build_head(network_cfg)

        self.build_core(network_cfg)

        self.network_cfg = network_cfg

    def build_core(self, network_cfg):
        """Create the feature extractor as ``self.mono_core``."""
        self.mono_core = MonoDTRCore(network_cfg.mono_backbone)

    def build_head(self, network_cfg):
        """Create the detection head and the depth loss."""
        self.bbox_head = AnchorBasedDetection3DHead(
            **(network_cfg.head)
        )
        self.depth_loss = DepthFocalLoss(96)

    def train_forward(self, left_images, annotations, P2, depth_gt=None):
        """Compute the training losses for one batch.

        Args:
            left_images: Input images.
            annotations: Ground-truth annotations passed to the head's loss.
            P2: Camera projection matrices.
            depth_gt: Optional ground-truth depth. When ``None`` the depth loss
                is reported as zeros and not added to the regression loss.

        Returns:
            ``(cls_loss, reg_loss, loss_dict)``. ``reg_loss`` includes the depth
            loss when it was computed; ``loss_dict['depth_loss']`` is always set.
        """
        features, depth_output = self.mono_core(dict(image=left_images, P2=P2))

        features = features.contiguous()
        P2 = P2.contiguous()
        left_images = left_images.contiguous()

        try:
            cls_preds, reg_preds = self.bbox_head(
                dict(
                    features=features,
                    P2=P2,
                    image=left_images
                )
            )
        except RuntimeError as e:
            # Echo the message before re-raising so it shows up in the cell output.
            print(f"RuntimeError: {e}")
            raise

        anchors = self.bbox_head.get_anchor(left_images, P2)

        cls_loss, reg_loss, loss_dict = self.bbox_head.loss(cls_preds, reg_preds, anchors, annotations, P2)

        # Bin the depth map only when one was supplied. Doing it unconditionally
        # hands None to bin_depths and makes the None check below unreachable.
        if depth_gt is not None:
            depth_gt = bin_depths(depth_gt, mode = "LID", depth_min=1, depth_max=80, num_bins=96, target=True)

        if reg_loss.mean() > 0 and depth_gt is not None and depth_output is not None:

            depth_gt = depth_gt.unsqueeze(1)
            depth_loss = 1.0 * self.depth_loss(depth_output, depth_gt)
            loss_dict['depth_loss'] = depth_loss
            reg_loss += depth_loss

            # Keep the latest depth prediction around, detached from the graph.
            self.depth_output = depth_output.detach()
        else:
            loss_dict['depth_loss'] = torch.zeros_like(reg_loss)
        return cls_loss, reg_loss, loss_dict

    def test_forward(self, left_images, P2):
        """Run inference on a single image.

        Returns:
            ``(scores, bboxes, cls_indexes)`` from the detection head.
        """
        assert left_images.shape[0] == 1 # we recommend image batch size = 1 for testing

        features, _ = self.mono_core(dict(image=left_images, P2=P2))

        cls_preds, reg_preds = self.bbox_head(
                dict(
                    features=features,
                    P2=P2,
                    image=left_images
                )
            )

        anchors = self.bbox_head.get_anchor(left_images, P2)

        scores, bboxes, cls_indexes = self.bbox_head.get_bboxes(cls_preds, reg_preds, anchors, P2, left_images)

        return scores, bboxes, cls_indexes

    def forward(self, inputs):
        """Dispatch on the input: a list of three or more items means training.

        Training expects ``[images, annotations, P2]`` with an optional fourth
        ``depth_gt``; anything else is unpacked into ``test_forward``.
        """
        if isinstance(inputs, list) and len(inputs) >= 3:
            return self.train_forward(*inputs)
        else:
            return self.test_forward(*inputs)

In [20]:
import os
import sys
import numpy as np
from easydict import EasyDict
from tqdm import tqdm
from fire import Fire
import coloredlogs
import logging
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from _path_init import *
from visualDet3D.networks.utils.registry import DETECTOR_DICT, DATASET_DICT, PIPELINE_DICT
from visualDet3D.networks.utils.utils import BackProjection, BBox3dProjector, get_num_parameters
from visualDet3D.evaluator.kitti.evaluate import evaluate
import visualDet3D.data.kitti.dataset
from visualDet3D.utils.timer import Timer
from visualDet3D.utils.utils import LossLogger, cfg_from_file
from visualDet3D.networks.optimizers import optimizers, schedulers

In [21]:
import torch
from torch.cuda.amp import autocast  # For AMP
import torch.optim as optim
from visualDet3D.utils.utils import LossLogger
from visualDet3D.utils.utils import compound_annotation

def My_train_mono_detection(data, module: nn.Module,
                         optimizer: optim.Optimizer,
                         writer: SummaryWriter = None,
                         loss_logger: LossLogger = None,
                         global_step: int = None,
                         epoch_num: int = None,
                         cfg: EasyDict = EasyDict(),
                         iter_num: int = None):
    """Run one training micro-batch with gradient accumulation.

    Gradients are cleared at the start of every accumulation window
    (``iter_num % accumulation_steps == 0``) and the optimizer steps on the
    window's last micro-batch. A micro-batch with no annotations contributes no
    gradient, but it no longer prevents the window from being stepped.

    Args:
        data: Tuple ``(images, P2, labels, bbox2d, bbox_3d, depth)``.
        module: Detector; called with ``[images, annotations, P2, depth]`` and
            expected to return ``(cls_loss, reg_loss, loss_dict)``.
        optimizer: Optimizer to step. When ``None`` only the forward pass and
            loss logging are performed.
        writer: Unused here; kept for signature compatibility.
        loss_logger: Optional logger whose ``update(loss_dict)`` is called.
        global_step: Unused here; kept for signature compatibility.
        epoch_num: Unused here; kept for signature compatibility.
        cfg: Config providing ``trainer.accumulation_steps``, ``obj_types`` and
            optionally ``optimizer.clipped_gradient_norm``.
        iter_num: Index of this micro-batch within the epoch (defaults to 0).

    Returns:
        None.
    """
    if iter_num is None:
        iter_num = 0  # Fallback, but manage externally
    steps = cfg.trainer.accumulation_steps
    if optimizer is not None and iter_num % steps == 0:
        optimizer.zero_grad(set_to_none=True)  # Faster than zero_grad()

    images, P2, labels, bbox2d, bbox_3d, depth = data

    # Longest annotation list in the batch; 0 means nothing to learn from.
    max_length = np.max([len(label) for label in labels]) if labels else 0

    if max_length > 0:
        annotation = compound_annotation(labels, max_length, bbox2d, bbox_3d, cfg.obj_types)
        annotation_tensor = images.new(annotation).cuda()  # same dtype as images, moved to the GPU

        cls_loss, reg_loss, loss_dict = module([
            images.cuda().float().contiguous(),
            annotation_tensor,
            P2.cuda(),
            depth.cuda().contiguous()
        ])
        cls_loss = cls_loss.mean()
        reg_loss = reg_loss.mean()
        total_loss = cls_loss + reg_loss

        # Log unscaled micro-batch losses
        if loss_logger is not None:
            loss_logger.update(loss_dict)
        del loss_dict

        if optimizer is not None:
            if torch.isnan(total_loss) or torch.isinf(total_loss):
                # A non-finite loss must not reach the weights; skip its backward pass.
                print(f"Warning: Invalid loss at iter {iter_num}: {total_loss.item()}")
            else:
                # Scale so the accumulated gradient is the mean over the window.
                (total_loss / steps).backward()

    if optimizer is None:
        # Evaluation-style call: there is nothing to step.
        return

    # Update every accumulation_steps
    if (iter_num + 1) % steps == 0:
        # Step only if this window produced gradients (an all-empty or
        # all-invalid window leaves every .grad as None).
        if any(p.grad is not None for p in module.parameters()):
            # Gradient clipping
            if hasattr(cfg.optimizer, 'clipped_gradient_norm'):
                torch.nn.utils.clip_grad_norm_(module.parameters(), cfg.optimizer.clipped_gradient_norm)
            else:
                print("Warning: No clipped_gradient_norm in cfg; skipping clipping.")

            optimizer.step()
        optimizer.zero_grad(set_to_none=True)

In [22]:
from collections import OrderedDict  # Add this import
def main(config="/kaggle/input/monodtr-library-ver2/MonoDTR/config/config.py", experiment_name="default", world_size=1, local_rank=-1):
    """Main function for the training script.

    Builds the datasets, the detector, optimizer and scheduler from the config,
    restores the resume checkpoint, loads shape-compatible pretrained weights
    (and freezes them), then trains for 40 epochs, saving checkpoints every epoch.

    KeywordArgs:
        config (str): Path to config file.
        experiment_name (str): Custom name for the experiment, only used in tensorboard.
        world_size (int): Number of total subprocesses in distributed training. 
        local_rank: Rank of the process. Should not be manually assigned. 0-N for ranks in distributed training (only process 0 will print info and perform testing). -1 for single training. 
    """
    import pprint
    import shutil

    ## Get config
    cfg = cfg_from_file(config)

    ## Collect distributed(or not) information
    cfg.dist = EasyDict()
    cfg.dist.world_size = world_size
    cfg.dist.local_rank = local_rank
    is_distributed = local_rank >= 0 # local_rank < 0 -> single training
    is_logging     = local_rank <= 0 # only log and test with main process
    is_evaluating  = local_rank <= 0

    ## Setup writer in the main process only
    recorder_dir = os.path.join(cfg.path.log_path, experiment_name + f"config={config}")
    if is_logging: # writer exists only if not distributed and local rank is smaller
        ## Clean up the dir if it exists before
        if os.path.isdir(recorder_dir):
            # Remove the tree directly rather than building a shell command
            # from a path; failures are ignored, as they were before.
            shutil.rmtree(recorder_dir, ignore_errors=True)
            print("clean up the recorder directory of {}".format(recorder_dir))
        writer = SummaryWriter(recorder_dir)

        ## Record config object using pprint
        formatted_cfg = pprint.pformat(cfg)
        writer.add_text("config.py", formatted_cfg.replace(' ', '&nbsp;').replace('\n', '  \n')) # add space for markdown style in tensorboard text
    else:
        writer = None

    ## Set up GPU and distribution process
    if is_distributed:
        cfg.trainer.gpu = local_rank # local_rank will overwrite the GPU in configure file
    gpu = min(cfg.trainer.gpu, torch.cuda.device_count() - 1)
    torch.backends.cudnn.benchmark = getattr(cfg.trainer, 'cudnn', False)
    torch.cuda.set_device(gpu)
    if is_distributed:
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    print(local_rank)
 
    ## define datasets and dataloader.
    dataset_train = DATASET_DICT[cfg.data.train_dataset](cfg)
    dataset_val = DATASET_DICT[cfg.data.val_dataset](cfg, "validation")

    dataloader_train = DataLoader(dataset_train, num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size, collate_fn=dataset_train.collate_fn, shuffle=local_rank<0, drop_last=True,
                                  sampler=torch.utils.data.DistributedSampler(dataset_train, num_replicas=world_size, rank=local_rank, shuffle=True) if local_rank >= 0 else None)
    dataloader_val = DataLoader(dataset_val, num_workers=cfg.data.num_workers,
                                batch_size=cfg.data.batch_size, collate_fn=dataset_val.collate_fn, shuffle=False, drop_last=True)

    detector = MonoDTR(cfg.detector)         
            
    ## Convert to cuda
    if is_distributed:
        detector = torch.nn.SyncBatchNorm.convert_sync_batchnorm(detector)
        detector = torch.nn.parallel.DistributedDataParallel(detector.cuda(), device_ids=[gpu], output_device=gpu)
    else:
        detector = detector.cuda()
    
    ## define optimizer and weight decay
    optimizer = optimizers.build_optimizer(cfg.optimizer, detector)

    ## define scheduler
    scheduler_config = getattr(cfg, 'scheduler', None)
    scheduler = schedulers.build_scheduler(scheduler_config, optimizer)
    is_iter_based = getattr(scheduler_config, "is_iter_based", False)

    ## define loss logger
    training_loss_logger =  LossLogger(writer, 'train') if is_logging else None

    ## training pipeline
    if 'training_func' in cfg.trainer:
        # The configured pipeline name is only checked for presence; the
        # notebook's own training step is always used.
        training_detection = My_train_mono_detection
    else:
        raise KeyError("cfg.trainer must define 'training_func'")

    ## Get evaluation pipeline
    if 'evaluate_func' in cfg.trainer:
        evaluate_detection = PIPELINE_DICT[cfg.trainer.evaluate_func]
        print("Found evaluate function {}".format(cfg.trainer.evaluate_func))
    else:
        evaluate_detection = None
        print("Evaluate function not found")


    ## timer is used to estimate eta
    timer = Timer()

    print('Num training images: {}'.format(len(dataset_train)))

    resume = True
    global_step = 0
    if resume == True:
        ckpt_path = os.path.join("/kaggle/input/monodtr-resume-checkpoint", "checkpoint_resume.pt")
        dict_checkpoint = torch.load(ckpt_path, map_location="cpu")
        detector.load_state_dict(dict_checkpoint["state_dict_backbone"])
        optimizer.load_state_dict(dict_checkpoint["state_optimizer"])
        scheduler.load_state_dict(dict_checkpoint["state_lr_scheduler"])
        epoch = dict_checkpoint["epoch"]
        # Continue the step counter too, so logging steps carry on from the
        # previous run instead of restarting at 0.
        global_step = dict_checkpoint.get("global_step", 0)
        print(f"Resuming training at epoch {epoch}")
    else:
        epoch = 0
        print("Start training at epoch 0")

    # this is for load pretrain checkpoint
    # NOTE(review): this runs after the resume load above, so any entry whose name
    # and shape match is overwritten with the pretrained value - confirm intended.
    state_dict_pretrain = torch.load("/kaggle/input/pretrain-monodtr-base/MonoDTR.pth", map_location='cpu')
    
    # Keep only the pretrained entries whose name and shape match the detector.
    # The detector's state dict is built once rather than once per key.
    detector_state = detector.state_dict()
    new_state_dict = OrderedDict()
    for k, v in state_dict_pretrain.items():
        if k in detector_state and v.shape == detector_state[k].shape:
            new_state_dict[k] = v
            print(f"Parameter {k} is loaded from pretrain")
    loaded_keys = set(new_state_dict)  # names loaded from the pretrained file

    detector.load_state_dict(new_state_dict, strict=False)
    
    # Freeze every parameter that was loaded from the pretrained file; all
    # other parameters stay trainable.
    for name, param in detector.named_parameters():
        param.requires_grad = name not in loaded_keys

    for name, param in detector.named_parameters():
        if param.requires_grad == True:
            print(f"Parameter {name} is trainable")
            
    ## Record basic information of the model
    if is_logging:
        string1 = str(detector).replace(' ', '&nbsp;').replace('\n', '  \n')
        writer.add_text("model structure", string1) # add space for markdown style in tensorboard text
        num_parameters = get_num_parameters(detector)
        print(f'number of trained parameters of the model: {num_parameters}')
        
    detector.train()
    
    # Train for 40 epochs from the resume point (cfg.trainer.max_epochs is only
    # used for the ETA estimate below).
    for epoch_num in range(epoch, (40+epoch)):
        ## Start training for one epoch
        torch.cuda.empty_cache()
        detector.train()
        if training_loss_logger:
            training_loss_logger.reset()
        for iter_num, data in enumerate(dataloader_train):
            training_detection(data, detector, optimizer, writer, training_loss_logger, global_step, epoch_num, cfg, iter_num)

            global_step += 1
            # Scheduler stepping and logging happen only on optimizer-step iterations.
            if (iter_num + 1) % cfg.trainer.accumulation_steps == 0:
                if is_iter_based:
                    scheduler.step()
    
                if is_logging and global_step % cfg.trainer.disp_iter == 0:
                    ## Log loss, print out and write to tensorboard in main process
                    if 'total_loss' not in training_loss_logger.loss_stats:
                        print(f"\nIn epoch {epoch_num}, iteration:{iter_num}, global_step:{global_step}, total_loss not found in logger.")
                    else:
                        log_str = 'Epoch: {} | Iteration: {}  | Running loss: {:1.5f} | eta:{}'.format(
                            epoch_num, iter_num, training_loss_logger.loss_stats['total_loss'].avg,
                            timer.compute_eta(global_step, len(dataloader_train) * cfg.trainer.max_epochs))
                        print(log_str, end='\r')
                        writer.add_text("training_log/train", log_str, global_step)
                        training_loss_logger.log(global_step)

        if not is_iter_based:
            scheduler.step()
        ## save model in main process if needed
        if is_logging:
            torch.save(detector.module.state_dict() if is_distributed else detector.state_dict(), os.path.join(
                cfg.path.checkpoint_path, '{}_latest.pth'.format(
                    cfg.detector.name)
                )
            )
        if is_logging and (epoch_num + 1) % cfg.trainer.save_iter == 0:
            torch.save(detector.module.state_dict() if is_distributed else detector.state_dict(), os.path.join(
                cfg.path.checkpoint_path, '{}_{}.pth'.format(
                    cfg.detector.name,epoch_num)
                )
            )
        ## save the resume checkpoint from the main process only, so ranks do
        ## not write the same file concurrently
        if is_logging:
            checkpoint = {
                    "epoch": epoch_num + 1,
                    "global_step": global_step,
                    "state_dict_backbone": detector.state_dict(),
                    "state_optimizer": optimizer.state_dict(),
                    "state_lr_scheduler": scheduler.state_dict()
                }
            torch.save(checkpoint, os.path.join(cfg.path.checkpoint_path, f"checkpoint_resume.pt"))
            print(f"Save checkpoint at epoch {epoch_num+1} successfully!")
        ## test model in main process if needed
        if is_evaluating and evaluate_detection is not None and cfg.trainer.test_iter > 0 and (epoch_num + 1) % cfg.trainer.test_iter == 0:
            print("\n/**** start testing after training epoch {} ******/".format(epoch_num))
            evaluate_detection(cfg, detector.module if is_distributed else detector, dataset_val, writer, epoch_num)
            print("/**** finish testing after training epoch {} ******/".format(epoch_num))

        if is_distributed:
            torch.distributed.barrier() # wait until all finish an epoch

        if is_logging:
            writer.flush()

In [23]:
%%writefile my_config.py

from easydict import EasyDict as edict
import os 
import numpy as np

# Root configuration object; each section below is attached to it as an attribute.
cfg = edict()
# Object classes to detect. The multi-class alternative kept by the author
# was ['Car', 'Pedestrian', 'Cyclist'].
cfg.obj_types = ['Car']

## trainer
trainer = edict()
trainer.gpu = 0                 # presumably the CUDA device index -- TODO confirm against the trainer
trainer.max_epochs = 240        # also reused below as T_max of the LR scheduler
trainer.disp_iter = 100         # presumably the logging interval in iterations -- TODO confirm
# The training loop writes an epoch-numbered checkpoint whenever
# (epoch_num + 1) % save_iter == 0.
trainer.save_iter = 5
# The training loop runs validation whenever test_iter > 0 and
# (epoch_num + 1) % test_iter == 0.
trainer.test_iter = 10
# Names of the train / test / evaluate routines; presumably resolved by name
# inside the library -- verify against the function registry.
trainer.training_func = "train_mono_detection"
trainer.test_func = "test_mono_detection"
trainer.evaluate_func = "evaluate_kitti_obj"
# Tagged "hoanbi1" by the author; presumably a local addition for gradient
# accumulation -- TODO confirm where it is consumed.
trainer.accumulation_steps = 1

cfg.trainer = trainer

## path
def _ensure_dir(dir_path):
    """Create ``dir_path`` (and any missing parents) if needed and return it.

    ``os.makedirs(..., exist_ok=True)`` replaces the repeated
    ``if not os.path.isdir(p): os.mkdir(p)`` pattern. It is safe to re-run and
    does not raise ``FileExistsError`` when another process creates the same
    directory between the check and the creation.
    """
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


path = edict()
path.data_path = "/kaggle/input/kitti-3d-object-detection-dataset/training" # used in visualDet3D/data/.../dataset
path.test_path = "/kaggle/input/kitti-3d-object-detection-dataset/testing" # used in visualDet3D/data/.../dataset
path.visualDet3D_path = "/kaggle/input/monodtr-library-ver2/MonoDTR/visualDet3D" # The path should point to the inner subfolder

# Writable root for pickle files, checkpoints, tensorboard logging and output files.
path.project_path = _ensure_dir(os.path.join("/kaggle/working/", "MonoDTR"))

# Every output directory lives under project_path and is created on import.
path.log_path = _ensure_dir(os.path.join(path.project_path, "log"))
# The training loop saves its .pth / checkpoint_resume.pt files here.
path.checkpoint_path = _ensure_dir(os.path.join(path.project_path, "checkpoint"))
path.preprocessed_path = _ensure_dir(os.path.join(path.project_path, "output"))
path.train_imdb_path = _ensure_dir(os.path.join(path.preprocessed_path, "training"))
path.val_imdb_path = _ensure_dir(os.path.join(path.preprocessed_path, "validation"))
path.test_imdb_path = _ensure_dir(os.path.join(path.preprocessed_path, "test"))
path.test_result_path = _ensure_dir(os.path.join(path.test_imdb_path, "data"))

cfg.path = path

## optimizer
optimizer = edict()
optimizer.type_name = 'adam'
# Keyword arguments for the optimizer selected by type_name; presumably
# forwarded to its constructor -- TODO confirm in the optimizer builder.
optimizer.keywords = edict()
optimizer.keywords.lr = 1e-4
optimizer.keywords.weight_decay = 0
# presumably the max-norm used for gradient clipping -- TODO confirm in the training step
optimizer.clipped_gradient_norm = 0.1
cfg.optimizer = optimizer

## scheduler
scheduler = edict()
scheduler.type_name = 'CosineAnnealingLR'
scheduler.keywords = edict()
# The annealing period is tied to the total epoch count; the training loop
# calls scheduler.step() once per epoch when the schedule is not iteration-based.
scheduler.keywords.T_max = cfg.trainer.max_epochs
scheduler.keywords.eta_min = 5e-6
cfg.scheduler = scheduler

## data
data = edict()
data.batch_size = 8
data.num_workers = 8
# presumably (height, width, channels) of the network input -- TODO confirm;
# the first two entries are reused below as the resize target.
data.rgb_shape = (288, 1280, 3)
data.train_dataset = "KittiMonoDataset"
data.val_dataset = "KittiMonoDataset"
data.test_dataset = "KittiMonoTestDataset"
# Train / val index lists shipped with the library ("chen_split" folder).
data.train_split_file = os.path.join(cfg.path.visualDet3D_path, 'data', 'kitti', 'chen_split', 'train.txt')
data.val_split_file = os.path.join(cfg.path.visualDet3D_path, 'data', 'kitti', 'chen_split', 'val.txt')

# Values shared by the train and test pipelines below.
data.augmentation = edict()
# These numbers match the commonly used ImageNet per-channel statistics.
data.augmentation.rgb_mean = np.array([0.485, 0.456, 0.406])
data.augmentation.rgb_std = np.array([0.229, 0.224, 0.225])
data.augmentation.cropSize = data.rgb_shape[:2]
data.augmentation.crop_top = 100

# Training pipeline: the test pipeline plus photometric distortion and random mirroring.
data.train_augmentation = [
    edict({'type_name': 'ConvertToFloat'}),
    edict({
        'type_name': 'PhotometricDistort',
        'keywords': {
            'distort_prob': 1.0,
            'contrast_lower': 0.5,
            'contrast_upper': 1.5,
            'saturation_lower': 0.5,
            'saturation_upper': 1.5,
            'hue_delta': 18.0,
            'brightness_delta': 32,
        },
    }),
    edict({
        'type_name': 'CropTop',
        'keywords': {'crop_top_index': data.augmentation.crop_top},
    }),
    edict({
        'type_name': 'Resize',
        'keywords': {'size': data.augmentation.cropSize},
    }),
    edict({
        'type_name': 'RandomMirror',
        'keywords': {'mirror_prob': 0.5},
    }),
    edict({
        'type_name': 'Normalize',
        'keywords': {'mean': data.augmentation.rgb_mean, 'stds': data.augmentation.rgb_std},
    }),
]

# Test pipeline: deterministic steps only (convert, crop, resize, normalize).
data.test_augmentation = [
    edict({'type_name': 'ConvertToFloat'}),
    edict({
        'type_name': 'CropTop',
        'keywords': {'crop_top_index': data.augmentation.crop_top},
    }),
    edict({
        'type_name': 'Resize',
        'keywords': {'size': data.augmentation.cropSize},
    }),
    edict({
        'type_name': 'Normalize',
        'keywords': {'mean': data.augmentation.rgb_mean, 'stds': data.augmentation.rgb_std},
    }),
]
cfg.data = data

## networks
# Loss settings; attached both to the head (loss_cfg) and to the detector (loss).
head_loss = edict()
head_loss.fg_iou_threshold = 0.5
head_loss.bg_iou_threshold = 0.4
head_loss.L1_regression_alpha = 5 ** 2
head_loss.focal_loss_gamma = 2.0
# Presumably one weight per entry of cfg.obj_types -- TODO confirm; the
# author's alternative for three classes was [20.0, 40, 40].
head_loss.balance_weight = [20.0]
# NOTE(review): there are 13 weights here (matching num_regression_loss_terms
# below), but the author's legend names only 12 terms:
# [x, y, w, h, cx, cy, z, sin2a, cos2a, w, h, l] -- confirm what the 13th is.
head_loss.regression_weight = [1, 1, 1, 1, 1, 1, 12, 1, 1, 0.5, 0.5, 0.5, 1]

# Inference-time settings for the head.
head_test = edict()
head_test.score_thr = 0.75
head_test.cls_agnostic = False
head_test.nms_iou_thr = 0.4
head_test.post_optimization = False

# Anchor settings; the same object is attached to the head (anchors_cfg)
# and to the detector (anchors).
anchors = edict(
    obj_types=cfg.obj_types,
    pyramid_levels=[3],
    strides=[2 ** 3],
    sizes=[24],
    ratios=np.array([0.5, 1, 2.0]),
    scales=np.array([2 ** (i / 4.0) for i in range(16)]),
)

# Layer sizes of the detection head.
head_layer = edict()
head_layer.num_features_in = 256
head_layer.num_cls_output = len(cfg.obj_types) + 1
head_layer.num_reg_output = 12
head_layer.cls_feature_size = 256
head_layer.reg_feature_size = 256

detector = edict()
detector.obj_types = cfg.obj_types
detector.name = 'MonoDTR'  # also used by the training loop to name saved .pth files
detector.mono_backbone = edict()
detector.head = edict()
detector.head.num_regression_loss_terms = 13
detector.head.preprocessed_path = path.preprocessed_path
detector.head.num_classes = len(cfg.obj_types)
detector.head.anchors_cfg = anchors
detector.head.layer_cfg = head_layer
detector.head.loss_cfg = head_loss
detector.head.test_cfg = head_test
detector.anchors = anchors
detector.loss = head_loss
cfg.detector = detector


Writing my_config.py


In [24]:
def run_training(config_path="/kaggle/working/my_config.py", experiment_name="EXP_NAME", world_size=1):
    """Launch training by calling ``main`` with the given settings.

    Args:
        config_path: Path to the Python config file. Defaults to the file
            written by the ``%%writefile my_config.py`` cell; the library's
            stock config
            (/kaggle/input/monodtr-library-ver2/MonoDTR/config/config.py)
            can be passed instead.
        experiment_name: Experiment name forwarded to ``main``.
        world_size: Forwarded to ``main``; 1 for single-GPU training.

    Only ``config``, ``experiment_name`` and ``world_size`` are passed to
    ``main``; any other argument keeps ``main``'s own default.
    """
    import os  # local import so this cell does not rely on an earlier cell's import

    # PyTorch CUDA caching-allocator option. Set through os.environ rather than
    # the IPython ``%env`` magic so the function is plain Python.
    # NOTE(review): this only takes effect if it is set before the CUDA
    # allocator is first used -- confirm no earlier cell allocates on the GPU.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    main(config=config_path, experiment_name=experiment_name, world_size=world_size)

In [25]:
# Start training with the settings defined in run_training(). The recorded
# output below shows this run resuming from a saved checkpoint at epoch 40.
run_training()

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-1




model.safetensors:   0%|          | 0.00/114M [00:00<?, ?B/s]

Found evaluate function evaluate_kitti_obj
Num training images: 3712


  dict_checkpoint = torch.load(ckpt_path, map_location="cpu")


Resuming training at epoch 40


  state_dict_pretrain = torch.load("/kaggle/input/pretrain-monodtr-base/MonoDTR.pth", map_location='cpu')


Parameter bbox_head.balance_weights is loaded from pretrain
Parameter bbox_head.regression_weight is loaded from pretrain
Parameter bbox_head.loss_cls.balance_weights is loaded from pretrain
Parameter bbox_head.cls_feature_extraction.0.weight is loaded from pretrain
Parameter bbox_head.cls_feature_extraction.0.bias is loaded from pretrain
Parameter bbox_head.cls_feature_extraction.3.weight is loaded from pretrain
Parameter bbox_head.cls_feature_extraction.3.bias is loaded from pretrain
Parameter bbox_head.cls_feature_extraction.6.weight is loaded from pretrain
Parameter bbox_head.cls_feature_extraction.6.bias is loaded from pretrain
Parameter bbox_head.reg_feature_extraction.0.weight is loaded from pretrain
Parameter bbox_head.reg_feature_extraction.0.bias is loaded from pretrain
Parameter bbox_head.reg_feature_extraction.0.conv_offset.weight is loaded from pretrain
Parameter bbox_head.reg_feature_extraction.0.conv_offset.bias is loaded from pretrain
Parameter bbox_head.reg_feature_ext

  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 41 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 42 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 43 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 44 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 45 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 46 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 47 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 48 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 49 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 50 successfully!

/**** start testing after training epoch 49 ******/
rebuild /kaggle/working/MonoDTR/output/validation/data


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds
  scores, bbox, obj_index = module([images.cuda().float().contiguous(), torch.tensor(P2).cuda().float()])
100%|██████████| 3769/3769 [06:21<00:00,  9.89it/s]
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
[1m
File "../input/monodtr-library-ver2/MonoDTR/visualDet3D/evaluator/kitti/eval.py", line 129:[0m
[1m@numba.jit(nopython=True, parallel=True)
[1mdef d3_box_overlap_kernel(boxes,
[0m[1m^[0m[0m
[0m


Car AP(Average Precision)@0.70, 0.70, 0.70:
bbox AP:96.42, 87.62, 75.98
bev  AP:22.36, 16.28, 13.62
3d   AP:15.19, 11.32, 9.26
aos  AP:88.21, 78.73, 67.51
Car AP(Average Precision)@0.70, 0.50, 0.50:
bbox AP:96.42, 87.62, 75.98
bev  AP:60.20, 44.16, 37.82
3d   AP:53.99, 38.64, 32.74
aos  AP:88.21, 78.73, 67.51

/**** finish testing after training epoch 49 ******/




Save checkpoint at epoch 51 successfully!
Save checkpoint at epoch 52 successfully!
Save checkpoint at epoch 53 successfully!
Save checkpoint at epoch 54 successfully!
Save checkpoint at epoch 55 successfully!
Save checkpoint at epoch 56 successfully!
Save checkpoint at epoch 57 successfully!
Save checkpoint at epoch 58 successfully!
Save checkpoint at epoch 59 successfully!
Save checkpoint at epoch 60 successfully!

/**** start testing after training epoch 59 ******/
clean up the recorder directory of /kaggle/working/MonoDTR/output/validation/data
rebuild /kaggle/working/MonoDTR/output/validation/data


  scores, bbox, obj_index = module([images.cuda().float().contiguous(), torch.tensor(P2).cuda().float()])
100%|██████████| 3769/3769 [04:49<00:00, 13.00it/s]


Car AP(Average Precision)@0.70, 0.70, 0.70:
bbox AP:94.72, 87.86, 76.31
bev  AP:22.02, 16.43, 13.89
3d   AP:14.71, 11.33, 9.46
aos  AP:90.35, 81.83, 70.27
Car AP(Average Precision)@0.70, 0.50, 0.50:
bbox AP:94.72, 87.86, 76.31
bev  AP:58.78, 43.48, 37.54
3d   AP:52.31, 38.53, 32.84
aos  AP:90.35, 81.83, 70.27

/**** finish testing after training epoch 59 ******/
Save checkpoint at epoch 61 successfully!
Save checkpoint at epoch 62 successfully!
Save checkpoint at epoch 63 successfully!
Save checkpoint at epoch 64 successfully!
Save checkpoint at epoch 65 successfully!
Save checkpoint at epoch 66 successfully!
Save checkpoint at epoch 67 successfully!
Save checkpoint at epoch 68 successfully!
Save checkpoint at epoch 69 successfully!
Save checkpoint at epoch 70 successfully!

/**** start testing after training epoch 69 ******/
clean up the recorder directory of /kaggle/working/MonoDTR/output/validation/data
rebuild /kaggle/working/MonoDTR/output/validation/data


100%|██████████| 3769/3769 [04:45<00:00, 13.21it/s]


Car AP(Average Precision)@0.70, 0.70, 0.70:
bbox AP:97.63, 88.19, 76.54
bev  AP:23.10, 17.83, 14.75
3d   AP:15.81, 11.81, 9.64
aos  AP:92.26, 81.69, 70.14
Car AP(Average Precision)@0.70, 0.50, 0.50:
bbox AP:97.63, 88.19, 76.54
bev  AP:60.20, 44.77, 38.18
3d   AP:53.57, 40.52, 34.43
aos  AP:92.26, 81.69, 70.14

/**** finish testing after training epoch 69 ******/
Save checkpoint at epoch 71 successfully!
Save checkpoint at epoch 72 successfully!
Save checkpoint at epoch 73 successfully!
Save checkpoint at epoch 74 successfully!
Save checkpoint at epoch 75 successfully!
Save checkpoint at epoch 76 successfully!
Save checkpoint at epoch 77 successfully!
Save checkpoint at epoch 78 successfully!
Save checkpoint at epoch 79 successfully!
Save checkpoint at epoch 80 successfully!

/**** start testing after training epoch 79 ******/
clean up the recorder directory of /kaggle/working/MonoDTR/output/validation/data
rebuild /kaggle/working/MonoDTR/output/validation/data


100%|██████████| 3769/3769 [04:58<00:00, 12.64it/s]


Car AP(Average Precision)@0.70, 0.70, 0.70:
bbox AP:95.36, 88.63, 76.90
bev  AP:23.89, 18.27, 15.44
3d   AP:16.87, 12.69, 10.35
aos  AP:90.01, 82.24, 70.54
Car AP(Average Precision)@0.70, 0.50, 0.50:
bbox AP:95.36, 88.63, 76.90
bev  AP:60.46, 44.70, 38.42
3d   AP:53.98, 39.66, 33.71
aos  AP:90.01, 82.24, 70.54

/**** finish testing after training epoch 79 ******/
