In [1]:
!python --version

Python 3.10.12


In [2]:
import sys

# Make the MonoDTR sources shipped in the Kaggle dataset importable.
_monodtr_root = "/kaggle/input/monodtr-library-ver2/MonoDTR"
sys.path.append(_monodtr_root)

In [3]:
!pip install -qq -r /kaggle/input/monodtr-library-ver2/MonoDTR/requirement.txt
!pip install -qq coloredlogs
!pip install -qq ptflops

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.9/115.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Expose the training scripts and the two checkpoint datasets on the import path.
for _extra_path in (
    "/kaggle/input/monodtr-library-ver2/MonoDTR/scripts",
    "/kaggle/input/pretrain-monodtr-base",
    "/kaggle/input/checkpoint",
):
    sys.path.append(_extra_path)

In [5]:
import os
import sys
import torch

# Directory that holds the prebuilt deformable-convolution extension (.so).
_dcn_dir = '/kaggle/input/monodtr-library-ver2/MonoDTR/visualDet3D/networks/lib/ops/dcn'

# Let Python locate the extension module by name.
sys.path.append(_dcn_dir)

# List the directory so a missing .so file is immediately visible.
print(os.listdir(_dcn_dir))

# Report the interpreter in use (the .so file name carries a CPython tag).
print("Python version:", sys.version)

# Prepend the directory to LD_LIBRARY_PATH, keeping whatever was set before.
_previous_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
os.environ['LD_LIBRARY_PATH'] = f"{_dcn_dir}:{_previous_ld_path}"

# Try the import and report the outcome instead of aborting the notebook.
try:
    import deform_conv_ext
except ImportError as import_error:
    print("Import failed:", import_error)
else:
    print("Import successful!")

# If needed, run the setup or make script
# !python setup.py build
# !python setup.py install
# or
# !bash make.sh


['deform_conv.py', 'deform_conv_ext.cpython-310-x86_64-linux-gnu.so', 'make.sh', 'src', '__init__.py', 'setup.py']
Python version: 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]
Import successful!


In [6]:
import torch

# Report which CUDA device PyTorch sees, or warn when there is none.
if not torch.cuda.is_available():
    print("CUDA is not available. Check your settings.")
else:
    print("CUDA is available!")
    print("Number of GPUs available:", torch.cuda.device_count())
    _current_gpu = torch.cuda.current_device()
    print("Current GPU:", _current_gpu)
    print("GPU Name:", torch.cuda.get_device_name(_current_gpu))


CUDA is available!
Number of GPUs available: 1
Current GPU: 0
GPU Name: Tesla P100-PCIE-16GB


In [7]:
# Sanity check: allocate a large tensor on the GPU and print the allocator
# statistics, then release it so the probe does not keep GPU memory away from
# the training run that follows.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    x = torch.randn(10000, 10000, device=device)  # 10^8 float32 values
    print("Tensor created on GPU.")
    print("Allocated GPU Memory:", torch.cuda.memory_allocated(device))
    print("Cached GPU Memory:", torch.cuda.memory_reserved(device))
    # Drop the probe tensor and hand the cached block back to the driver.
    del x
    torch.cuda.empty_cache()
else:
    # torch.cuda.memory_* only accept CUDA devices, so skip the probe entirely.
    print("CUDA is not available; skipping the GPU allocation check.")


Tensor created on GPU.
Allocated GPU Memory: 400556032
Cached GPU Memory: 400556032


In [8]:
import torch.nn as nn
import torch
import torch
from torch.nn import Module, Dropout
from einops import rearrange
import torch.nn.functional as F
from ptflops import get_model_complexity_info

In [9]:
# -------------------------- RMSNorm --------------------------
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last axis with a learnable gain.

    Args:
        dim: Size of the last (feature) axis of the inputs.
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, dim, eps=1e-8):
        super().__init__()
        self.eps = eps
        # Per-feature gain, initialised to one.
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Scale ``x`` by the inverse RMS of its last axis, then apply ``scale``."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalised = x * inv_rms
        return self.scale * normalised

# -------------------------- FLIP Module --------------------------
def flip(x, dim):
    """Return ``x`` with the order of its entries reversed along axis ``dim``."""
    reversed_axes = (dim,)
    return torch.flip(x, reversed_axes)

In [10]:
# -------------------------- BiMamba2 Block (Conv Fusion) --------------------------
class BiMamba2(nn.Module):
    """Two-branch ("forward" / "backward") gated mixing block for one sequence.

    The input ``u`` of shape (B, L, D) is projected four times (``x``, ``bf``,
    ``bb``, ``z``). Each branch fuses its own projection with ``x``, runs it
    through kernel-size-1 convolutions with GELU, gates the result with ``z``
    and RMS-normalises it. The backward branch works on the sequence reversed
    along L and is reversed back before both branches are summed and projected.

    NOTE(review): every operation inside a branch (kernel-size-1 Conv1d, GELU,
    element-wise gate, RMSNorm over D) acts on each position independently, so
    the two reversals do not mix information across positions.

    Args:
        dim: Feature size D of the input and output.
    """

    def __init__(self, dim):
        super().__init__()
        # Four parallel projections of the input sequence.
        self.linear_x = nn.Linear(dim, dim)
        self.linear_bf = nn.Linear(dim, dim)
        self.linear_bb = nn.Linear(dim, dim)
        self.linear_z = nn.Linear(dim, dim)

        # Fuse [branch projection, x] (2 * dim) back down to dim.
        self.concat_proj_f = nn.Linear(2 * dim, dim)
        self.concat_proj_b = nn.Linear(2 * dim, dim)

        self.conv1d_f = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_b = nn.Conv1d(dim, dim, kernel_size=1)
        # Not called by forward(); kept so the parameter set stays the same.
        self.conv1d_zf = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_ab_z = nn.Conv1d(dim, dim, kernel_size=1)

        self.ssms_f = self._pointwise_stack(dim)
        self.ssms_b = self._pointwise_stack(dim)

        self.norm_f = RMSNorm(dim)
        self.norm_b = RMSNorm(dim)
        self.output_linear = nn.Linear(dim, dim)

    @staticmethod
    def _pointwise_stack(dim):
        """Build Conv1d(k=1) -> GELU -> Conv1d(k=1) operating on (B, D, L)."""
        layers = [nn.Conv1d(dim, dim, 1), nn.GELU(), nn.Conv1d(dim, dim, 1)]
        return nn.Sequential(*layers)

    def forward(self, u):
        """Map ``u`` of shape (B, L, D) to an output of the same shape."""
        x = self.linear_x(u)
        bf = self.linear_bf(u)
        bb = self.linear_bb(u)
        z = self.linear_z(u)

        # (B, L, 2D) -> (B, L, D) for each branch.
        fwd_in = self.concat_proj_f(torch.cat((bf, x), dim=-1))
        bwd_in = self.concat_proj_b(torch.cat((bb, x), dim=-1))

        # Forward branch; Conv1d expects channels first, hence (B, D, L) inside.
        af = F.gelu(self.conv1d_f(fwd_in.transpose(1, 2)))
        af = self.ssms_f(af).transpose(1, 2)          # back to (B, L, D)
        af = self.norm_f(af * z)                      # gate with z, then normalise

        # Backward branch on the L-reversed sequence, gated with the reversed z.
        ab = F.gelu(self.conv1d_b(flip(bwd_in, dim=1).transpose(1, 2)))
        ab = self.ssms_b(ab).transpose(1, 2)          # (B, L, D), reversed order
        ab = self.norm_b(ab * flip(z, dim=1))
        ab = flip(ab, dim=1)                          # restore the original order

        return self.output_linear(af + ab)

In [11]:
# -------------------------- CrossMamba2 Module --------------------------
class CrossMamba2(nn.Module):
    """Two-input variant of the bidirectional mixing block.

    ``x`` and the gate ``z`` are projected from ``u1``; the branch inputs
    ``bf`` / ``bb`` are projected from ``u2``. Apart from that the data flow
    matches the single-input block: fuse with ``x``, kernel-size-1 convolutions
    with GELU, gate with ``z``, RMS-normalise, sum both branches, project.

    Args:
        dim: Feature size D shared by both inputs and the output.
    """

    def __init__(self, dim):
        super().__init__()
        # Projections of u1 (x, z) and of u2 (bf, bb).
        self.linear_x = nn.Linear(dim, dim)
        self.linear_z = nn.Linear(dim, dim)
        self.linear_bf = nn.Linear(dim, dim)
        self.linear_bb = nn.Linear(dim, dim)

        # Fuse [branch projection, x] (2 * dim) back down to dim.
        self.concat_proj_f = nn.Linear(2 * dim, dim)
        self.concat_proj_b = nn.Linear(2 * dim, dim)

        self.conv1d_f = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_b = nn.Conv1d(dim, dim, kernel_size=1)
        # Not called by forward(); kept so the parameter set stays the same.
        self.conv1d_zf = nn.Conv1d(dim, dim, kernel_size=1)
        self.conv1d_ab_z = nn.Conv1d(dim, dim, kernel_size=1)

        self.ssms_f = self._pointwise_stack(dim)
        self.ssms_b = self._pointwise_stack(dim)

        self.norm_f = RMSNorm(dim)
        self.norm_b = RMSNorm(dim)
        self.output_linear = nn.Linear(dim, dim)

    @staticmethod
    def _pointwise_stack(dim):
        """Build Conv1d(k=1) -> GELU -> Conv1d(k=1) operating on (B, D, L)."""
        layers = [nn.Conv1d(dim, dim, 1), nn.GELU(), nn.Conv1d(dim, dim, 1)]
        return nn.Sequential(*layers)

    def forward(self, u1, u2):
        """Combine ``u1`` and ``u2`` (both (B, L, D)) into one (B, L, D) output."""
        x = self.linear_x(u1)
        z = self.linear_z(u1)
        bf = self.linear_bf(u2)
        bb = self.linear_bb(u2)

        # (B, L, 2D) -> (B, L, D) for each branch.
        fwd_in = self.concat_proj_f(torch.cat((bf, x), dim=-1))
        bwd_in = self.concat_proj_b(torch.cat((bb, x), dim=-1))

        # Forward branch; Conv1d expects channels first, hence (B, D, L) inside.
        af = F.gelu(self.conv1d_f(fwd_in.transpose(1, 2)))
        af = self.ssms_f(af).transpose(1, 2)          # back to (B, L, D)
        af = self.norm_f(af * z)                      # gate with z, then normalise

        # Backward branch on the L-reversed sequence, gated with the reversed z.
        ab = F.gelu(self.conv1d_b(flip(bwd_in, dim=1).transpose(1, 2)))
        ab = self.ssms_b(ab).transpose(1, 2)          # (B, L, D), reversed order
        ab = self.norm_b(ab * flip(z, dim=1))
        ab = flip(ab, dim=1)                          # restore the original order

        return self.output_linear(af + ab)

In [12]:
# -------------------------- FFN + Norm --------------------------
class FeedForward(nn.Module):
    """Two-layer perceptron applied to the last axis: Linear -> GELU -> Linear.

    Args:
        dim: Input and output feature size.
        hidden_dim: Width of the intermediate layer.
    """

    def __init__(self, dim, hidden_dim):
        super().__init__()
        expand = nn.Linear(dim, hidden_dim)
        activation = nn.GELU()
        project = nn.Linear(hidden_dim, dim)
        self.net = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the stack; the last axis keeps size ``dim``."""
        out = self.net(x)
        return out

In [13]:
# -------------------------- Depth Aware Mamba (DTR-Compatible Input) --------------------------
class DepthAwareMamba(nn.Module):
    """Encoder/decoder stack built from BiMamba2, CrossMamba2 and FeedForward.

    The encoder refines the context features (self block + FFN, each followed
    by a residual LayerNorm). The decoder refines the depth features with a
    self block, combines them with the encoded context through the cross
    block, and finishes with an FFN.

    Args:
        output_channel_num: Feature size of every token.
    """

    def __init__(self, output_channel_num):
        super().__init__()
        self.output_channel_num = output_channel_num
        dim = self.output_channel_num

        # Encoder (context branch).
        self.encoder_bimamba = BiMamba2(dim)
        self.encoder_ffn = FeedForward(dim, dim * 2)
        self.encoder_norm1 = nn.LayerNorm(dim)
        self.encoder_norm2 = nn.LayerNorm(dim)

        # Decoder (depth branch + cross block).
        self.decoder_bimamba = BiMamba2(dim)
        self.cross_bimamba = CrossMamba2(dim)
        self.decoder_ffn = FeedForward(dim, dim * 2)
        self.decoder_norm1 = nn.LayerNorm(dim)
        self.decoder_norm2 = nn.LayerNorm(dim)
        self.decoder_norm3 = nn.LayerNorm(dim)

    def forward(self, depth_feat, context_feat, depth_pos=None):
        """Fuse depth and context token sequences.

        Args:
            depth_feat: Depth tokens; the sub-blocks expect (B, L, D).
            context_feat: Context tokens with the same layout.
            depth_pos: Optional tensor added to ``context_feat`` before encoding.

        Returns:
            Tensor with the layout of ``context_feat``.
        """
        depth_feat = depth_feat.contiguous()
        context_feat = context_feat.contiguous()
        if depth_pos is not None:
            context_feat = context_feat + depth_pos

        # Encoder: self block and FFN, each with a residual connection + norm.
        encoded = self.encoder_norm1(self.encoder_bimamba(context_feat) + context_feat)
        encoded = self.encoder_norm2(encoded + self.encoder_ffn(encoded))

        # Decoder, step 1: self block on the depth tokens (residual + norm).
        depth_tokens = self.decoder_norm1(self.decoder_bimamba(depth_feat) + depth_feat)

        # Decoder, step 2: cross block (no residual here), then FFN with residual.
        fused = self.decoder_norm2(self.cross_bimamba(encoded, depth_tokens))
        fused = self.decoder_norm3(fused + self.decoder_ffn(fused))
        return fused


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CSA(nn.Module):
    """
    Cross-Scale Attention (CSA) module.

    Every position of the finer map attends over all positions of the coarser
    map. Both inputs are first projected to a common channel size ``c``; the
    coarser map uses a single projection that serves as both key and value.
    The attention logits are plain dot products (no scaling factor).

    :param c1: Channels of the finer-resolution input
    :param c2: Channels of the coarser-resolution input
    :param c: Common channel size of queries, keys/values and the output
    """
    def __init__(self, c1, c2, c):
        super(CSA, self).__init__()
        self.proj_query = nn.Linear(c1, c)
        self.proj_coarser = nn.Linear(c2, c)  # Shared for key and value

    def forward(self, x1, x2):
        """
        :param x1: Finer-resolution input (B, C1, H1, W1)
        :param x2: Coarser-resolution input (B, C2, H2, W2)
        :return: Augmented finer-resolution feature (B, c, H1, W1)
        """
        B, C1, H1, W1 = x1.shape
        _, C2, H2, W2 = x2.shape

        # Channels-last, then one token per spatial position.
        flat1 = x1.permute(0, 2, 3, 1).reshape(B, H1 * W1, C1)
        flat2 = x2.permute(0, 2, 3, 1).reshape(B, H2 * W2, C2)

        query = self.proj_query(flat1)                 # (B, H1*W1, c)
        # Key and value come from the same projection of the same input, so it
        # is evaluated once and reused instead of being computed twice.
        key = value = self.proj_coarser(flat2)         # (B, H2*W2, c)

        energy = torch.bmm(query, key.permute(0, 2, 1))   # (B, H1*W1, H2*W2)
        attention = F.softmax(energy, dim=-1)

        # Attended coarse features plus a residual connection to the query.
        out = torch.bmm(attention, value) + query
        out = out.reshape(B, H1, W1, -1).permute(0, 3, 1, 2)

        return out

In [15]:
class MSR(nn.Module):
    """
    Multi-Scale Refinement (MSR): merge a coarser map into a finer one and
    refine every spatial position with a small two-layer MLP over channels.

    :param channels: Channel count C of the input and output maps
    :param hidden_dim: Width of the MLP's hidden layer
    """
    def __init__(self, channels=256, hidden_dim=64):
        super(MSR, self).__init__()
        self.linear1 = nn.Linear(channels, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, channels)

    def forward(self, coarser=None, finer=None):
        """
        :param coarser: Optional coarser-resolution feature (B, C, H_c, W_c);
                        it is bilinearly upsampled by a factor of 2
        :param finer: Optional finer-resolution feature (B, C, H_f, W_f);
                      added to the upsampled coarser map when both are given
        :return: refined feature (B, C, H, W)
        :raises ValueError: if both inputs are None, or if the upsampled
                            coarser map and the finer map differ in H/W
        """
        if coarser is None:
            if finer is None:
                raise ValueError("At least one input must be provided.")
            feature = finer
        else:
            feature = F.interpolate(coarser, scale_factor=2, mode='bilinear', align_corners=False)
            if finer is not None:
                if feature.shape[2:] != finer.shape[2:]:
                    raise ValueError("Spatial dimensions mismatch after upsampling.")
                feature = feature + finer

        B, C, H, W = feature.shape
        # One row per pixel so the Linear layers act on the channel axis.
        pixels = feature.permute(0, 2, 3, 1).reshape(-1, C)
        refined_pixels = self.linear2(F.relu(self.linear1(pixels)))

        # Back to channels-first layout.
        return refined_pixels.reshape(B, H, W, C).permute(0, 3, 1, 2)

In [16]:
from visualDet3D.networks.backbones.dla import dla102
from visualDet3D.networks.backbones.dlaup import DLAUp
from visualDet3D.networks.detectors.dfe import DepthAwareFE
from visualDet3D.networks.detectors.dpe import DepthAwarePosEnc
from visualDet3D.networks.detectors.dtr import DepthAwareTransformer
import math
import time
import csv

In [17]:
import timm

class MonoDTRCore(nn.Module):
    """Feature extractor of the detector.

    Pipeline: Swin-T feature pyramid -> cross-scale attention (CSA) from coarse
    to fine -> multi-scale refinement (MSR) -> stride-2 convolution -> depth
    feature head (``dfe``), depth positional encoding (``dpe``) and the
    depth-aware transformer (``dtr``).

    Args:
        backbone_arguments: Accepted for compatibility with callers; it is not
            read, the backbone configuration is fixed in ``__init__``.
    """

    def __init__(self, backbone_arguments=None):
        super(MonoDTRCore, self).__init__()

        # Swin-T backbone with features_only=True to ensure feature pyramid output
        self.backbone = timm.create_model('swin_tiny_patch4_window7_224', pretrained=True,features_only=True, img_size=(288, 1280))
        self.first_level = 1  # Kept for reference, but no longer used

        # Feature pyramid channels from Swin-T
        self.channels = [96, 192, 384, 768]

        # CSA modules for cross-scale attention
        self.csa_1_4 = CSA(96, 256, 256)    # 1/4 with 1/8
        self.csa_1_8 = CSA(192, 256, 256)   # 1/8 with 1/16
        self.csa_1_16 = CSA(384, 768, 256)  # 1/16 with 1/32

        # MSR modules for upsampling and refinement
        self.msr_1_8 = MSR(256)
        self.msr_1_4 = MSR(256)

        # Stride-2 convolution that halves the 1/4-scale map before the
        # transformer to manage memory usage
        self.downsample_for_transformer = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)

        # Downstream modules
        self.output_channel_num = 256
        self.dpe = DepthAwarePosEnc(self.output_channel_num)
        self.depth_embed = nn.Embedding(100, self.output_channel_num)
        self.dtr = DepthAwareTransformer(self.output_channel_num)
        self.dfe = DepthAwareFE(self.output_channel_num)
        self.img_conv = nn.Conv2d(self.output_channel_num, self.output_channel_num, kernel_size=3, padding=1)

    def forward(self, x):
        """
        Forward pass of MonoDTRCore.

        Input:
            x: dict with key 'image' containing (N, 3, 288, 1280)

        Output:
            feat: (N, C, H', W') - transformer output reshaped to the spatial
                  size of the stride-2 downsampled 1/4-scale map
            depth: first output of ``self.dfe`` for that downsampled map
                   (NOTE(review): presumably (N, D, H', W') - confirm against
                   DepthAwareFE)

        Raises:
            ValueError: if 'image' is missing, the tensor is not 4D, or it does
                not have 3 channels.
            AssertionError: if the spatial size is not 288 x 1280.
        """
        if 'image' not in x:
            raise ValueError("Input dictionary must contain 'image' key")
        img = x['image']  # (N, 3, H, W)

        # Check the rank first: the size assertion below indexes shape[2] and
        # shape[3], which only exist for a 4D tensor.
        if img.dim() != 4:
            raise ValueError(f"Expected 4D input tensor (N, C, H, W), got shape {img.shape}")

        assert img.shape[2] == 288 and img.shape[3] == 1280, f"Expected shape [N, 3, 288, 1280], got {img.shape}"

        if img.shape[1] != 3:
            raise ValueError(
                f"Expected input image to have 3 channels (RGB), got {img.shape[1]} channels. "
                "Please check your data pipeline to ensure the input is correctly formatted as (N, 3, H, W). "
                "If the input has 7 channels (e.g., multi-spectral), preprocess it to 3 channels or modify the backbone."
            )

        # Backbone pyramid, channels-last:
        # [(N, H/4, W/4, 96), (N, H/8, W/8, 192), (N, H/16, W/16, 384), (N, H/32, W/32, 768)]
        features = self.backbone(img)
        # Permute features from (N, H, W, C) to (N, C, H, W)
        features = [f.permute(0, 3, 1, 2) for f in features]

        # Refine features progressively with CSA (coarse -> fine)
        feat_1_16 = self.csa_1_16(features[2], features[3])  # (N, 256, H/16, W/16)
        feat_1_8 = self.csa_1_8(features[1], feat_1_16)      # (N, 256, H/8, W/8)
        feat_1_4 = self.csa_1_4(features[0], feat_1_8)       # (N, 256, H/4, W/4)

        # MSR upsamples the coarser map by 2 and merges it into the finer one
        refined_1_8 = self.msr_1_8(coarser=feat_1_16, finer=feat_1_8)
        refined_1_4 = self.msr_1_4(coarser=refined_1_8, finer=feat_1_4)

        x_full_res = refined_1_4  # (N, 256, H/4, W/4)

        # Stride-2 convolution: (N, 256, H/4, W/4) -> (N, 256, H/8, W/8)
        x = self.downsample_for_transformer(x_full_res)

        # From here on H and W are the downsampled sizes.
        N, C, H, W = x.shape
        depth, depth_guide, depth_feat = self.dfe(x)
        # reshape instead of view: the permuted tensor need not be viewable
        depth_feat = depth_feat.permute(0, 2, 3, 1).reshape(N, H * W, C)
        # Index of the highest-scoring channel per pixel -> embedding lookup
        depth_guide = depth_guide.argmax(1)
        depth_emb = self.depth_embed(depth_guide).reshape(N, H * W, C)
        depth_emb = self.dpe(depth_emb, (H, W))
        img_feat = x + self.img_conv(x)
        img_feat = img_feat.permute(0, 2, 3, 1).reshape(N, H * W, C)
        feat = self.dtr(depth_feat, img_feat, depth_emb)
        feat = feat.permute(0, 2, 1).reshape(N, C, H, W)

        return feat, depth

In [18]:
from visualDet3D.networks.heads.detection_3d_head import AnchorBasedDetection3DHead
from visualDet3D.networks.heads.depth_losses import bin_depths, DepthFocalLoss
from visualDet3D.networks.utils.registry import DETECTOR_DICT

In [19]:
class MonoDTR(nn.Module):
    """Monocular 3D detector: ``MonoDTRCore`` features + anchor-based 3D head.

    Args:
        network_cfg: Configuration object; this class reads ``obj_types``,
            ``head`` (keyword arguments of the detection head) and
            ``mono_backbone`` (forwarded to ``MonoDTRCore``).
    """

    def __init__(self, network_cfg):
        super(MonoDTR, self).__init__()

        self.obj_types = network_cfg.obj_types

        self.build_head(network_cfg)

        self.build_core(network_cfg)

        self.network_cfg = network_cfg

    def build_core(self, network_cfg):
        """Create the feature extractor."""
        self.mono_core = MonoDTRCore(network_cfg.mono_backbone)

    def build_head(self, network_cfg):
        """Create the detection head and the depth loss (96 depth bins)."""
        self.bbox_head = AnchorBasedDetection3DHead(
            **(network_cfg.head)
        )
        self.depth_loss = DepthFocalLoss(96)

    def train_forward(self, left_images, annotations, P2, depth_gt=None):
        """Compute the training losses for one batch.

        Args:
            left_images: Input images, passed to the core and the head.
            annotations: Ground-truth annotations, passed to the head's loss.
            P2: Camera projection matrices, passed to the core and the head.
            depth_gt: Optional ground-truth depth. When given, it is binned
                and a depth loss is added to the regression loss.

        Returns:
            ``(cls_loss, reg_loss, loss_dict)``; ``loss_dict['depth_loss']`` is
            zero when no depth supervision was applied.
        """
        features, depth = self.mono_core(dict(image=left_images, P2=P2))

        depth_output = depth

        features = features.contiguous()
        P2 = P2.contiguous()
        left_images = left_images.contiguous()

        try:
            cls_preds, reg_preds = self.bbox_head(
                dict(
                    features=features,
                    P2=P2,
                    image=left_images
                )
            )
        except RuntimeError as e:
            # Surface the message in the notebook output, then propagate.
            print(f"RuntimeError: {e}")
            raise

        anchors = self.bbox_head.get_anchor(left_images, P2)

        cls_loss, reg_loss, loss_dict = self.bbox_head.loss(cls_preds, reg_preds, anchors, annotations, P2)

        # Depth supervision only applies when a target and a prediction exist
        # and the batch produced a positive regression loss. The None checks
        # come first so bin_depths is never called without a target.
        if depth_gt is not None and depth_output is not None and reg_loss.mean() > 0:
            depth_target = bin_depths(depth_gt, mode = "LID", depth_min=1, depth_max=80, num_bins=96, target=True)
            depth_target = depth_target.unsqueeze(1)
            depth_loss = 1.0 * self.depth_loss(depth_output, depth_target)
            loss_dict['depth_loss'] = depth_loss
            reg_loss += depth_loss

            # Keep the latest depth prediction (detached) on the module.
            self.depth_output = depth_output.detach()
        else:
            loss_dict['depth_loss'] = torch.zeros_like(reg_loss)
        return cls_loss, reg_loss, loss_dict

    def test_forward(self, left_images, P2):
        """Run inference on a single image and return ``(scores, bboxes, cls_indexes)``."""
        assert left_images.shape[0] == 1 # we recommmend image batch size = 1 for testing

        features, _ = self.mono_core(dict(image=left_images, P2=P2))

        cls_preds, reg_preds = self.bbox_head(
                dict(
                    features=features,
                    P2=P2,
                    image=left_images
                )
            )

        anchors = self.bbox_head.get_anchor(left_images, P2)

        scores, bboxes, cls_indexes = self.bbox_head.get_bboxes(cls_preds, reg_preds, anchors, P2, left_images)

        return scores, bboxes, cls_indexes

    def forward(self, inputs):
        """Dispatch on the input container.

        A ``list`` with at least three entries is unpacked into
        ``train_forward``; anything else is unpacked into ``test_forward``.
        """
        if isinstance(inputs, list) and len(inputs) >= 3:
            return self.train_forward(*inputs)
        else:
            return self.test_forward(*inputs)

In [20]:
import os
import sys
import numpy as np
from easydict import EasyDict
from tqdm import tqdm
from fire import Fire
import coloredlogs
import logging
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from _path_init import *
from visualDet3D.networks.utils.registry import DETECTOR_DICT, DATASET_DICT, PIPELINE_DICT
from visualDet3D.networks.utils.utils import BackProjection, BBox3dProjector, get_num_parameters
from visualDet3D.evaluator.kitti.evaluate import evaluate
import visualDet3D.data.kitti.dataset
from visualDet3D.utils.timer import Timer
from visualDet3D.utils.utils import LossLogger, cfg_from_file
from visualDet3D.networks.optimizers import optimizers, schedulers

In [21]:
import torch
from torch.cuda.amp import autocast  # For AMP
import torch.optim as optim
from visualDet3D.utils.utils import LossLogger
from visualDet3D.utils.utils import compound_annotation

def My_train_mono_detection(data, module: nn.Module,
                         optimizer: optim.Optimizer,
                         writer: SummaryWriter = None,
                         loss_logger: LossLogger = None,
                         global_step: int = None,
                         epoch_num: int = None,
                         cfg: EasyDict = EasyDict(),
                         iter_num: int = None):
    """Run one micro-batch of training with gradient accumulation.

    Gradients are accumulated over ``cfg.trainer.accumulation_steps``
    consecutive values of ``iter_num``; the optimizer is stepped on the last
    iteration of each window.

    Args:
        data: Tuple ``(images, P2, labels, bbox2d, bbox_3d, depth)``.
        module: Detector; called with a list of four inputs and expected to
            return ``(cls_loss, reg_loss, loss_dict)``.
        optimizer: Optimizer to step; ``None`` disables backward and stepping.
        writer: Unused here; kept for signature compatibility.
        loss_logger: Optional logger that receives ``loss_dict``.
        global_step: Unused here; kept for signature compatibility.
        epoch_num: Unused here; kept for signature compatibility.
        cfg: Configuration; reads ``trainer.accumulation_steps``, ``obj_types``
            and, when present, ``optimizer.clipped_gradient_norm``.
        iter_num: Index of the micro-batch within the epoch (``None`` -> 0).
    """
    if iter_num is None:
        iter_num = 0  # Fallback, but manage externally
    steps = cfg.trainer.accumulation_steps
    opens_window = (iter_num % steps == 0)
    closes_window = ((iter_num + 1) % steps == 0)
    if opens_window and optimizer is not None:
        optimizer.zero_grad(set_to_none=True)  # Faster than zero_grad()

    images, P2, labels, bbox2d, bbox_3d, depth = data

    # A batch without any annotation contributes no loss, but it must not
    # prevent the optimizer step that closes the accumulation window:
    # otherwise the gradients gathered so far are cleared unused when the
    # next window opens.
    max_length = np.max([len(label) for label in labels]) if labels else 0
    if max_length > 0:
        annotation = compound_annotation(labels, max_length, bbox2d, bbox_3d, cfg.obj_types)
        annotation_tensor = images.new(annotation).cuda()  # Inherit dtype/device

        cls_loss, reg_loss, loss_dict = module([
            images.cuda().float().contiguous(),
            annotation_tensor,
            P2.cuda(),
            depth.cuda().contiguous()
        ])
        total_loss = cls_loss.mean() + reg_loss.mean()

        # Log unscaled micro-batch losses
        if loss_logger is not None:
            loss_logger.update(loss_dict)
        del loss_dict

        if optimizer is not None:
            if torch.isnan(total_loss) or torch.isinf(total_loss):
                # A non-finite loss would poison the accumulated gradients,
                # so this micro-batch contributes nothing.
                print(f"Warning: Invalid loss at iter {iter_num}: {total_loss.item()}")
            else:
                # Scale so the accumulated gradient averages over the window.
                (total_loss / steps).backward()

    # Update every accumulation_steps
    if optimizer is not None and closes_window:
        _step_accumulated_gradients(module, optimizer, cfg)


def _step_accumulated_gradients(module, optimizer, cfg):
    """Clip (when configured), apply the optimizer update and clear the gradients."""
    if hasattr(cfg.optimizer, 'clipped_gradient_norm'):
        torch.nn.utils.clip_grad_norm_(module.parameters(), cfg.optimizer.clipped_gradient_norm)
    else:
        print("Warning: No clipped_gradient_norm in cfg; skipping clipping.")

    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

In [22]:
from collections import OrderedDict  # Add this import
def main(config="/kaggle/input/monodtr-library-ver2/MonoDTR/config/config.py", experiment_name="default", world_size=1, local_rank=-1):
    """Main function for the training script.

    KeywordArgs:
        config (str): Path to config file.
        experiment_name (str): Custom name for the experitment, only used in tensorboard.
        world_size (int): Number of total subprocesses in distributed training. 
        local_rank: Rank of the process. Should not be manually assigned. 0-N for ranks in distributed training (only process 0 will print info and perform testing). -1 for single training. 
    """

    ## Get config
    cfg = cfg_from_file(config)

    ## Collect distributed(or not) information
    cfg.dist = EasyDict()
    cfg.dist.world_size = world_size
    cfg.dist.local_rank = local_rank
    is_distributed = local_rank >= 0 # local_rank < 0 -> single training
    is_logging     = local_rank <= 0 # only log and test with main process
    is_evaluating  = local_rank <= 0

    ## Setup writer if local_rank > 0
    recorder_dir = os.path.join(cfg.path.log_path, experiment_name + f"config={config}")
    if is_logging: # writer exists only if not distributed and local rank is smaller
        ## Clean up the dir if it exists before
        if os.path.isdir(recorder_dir):
            os.system("rm -r {}".format(recorder_dir))
            print("clean up the recorder directory of {}".format(recorder_dir))
        writer = SummaryWriter(recorder_dir)

        ## Record config object using pprint
        import pprint

        formatted_cfg = pprint.pformat(cfg)
        writer.add_text("config.py", formatted_cfg.replace(' ', '&nbsp;').replace('\n', '  \n')) # add space for markdown style in tensorboard text
    else:
        writer = None

    ## Set up GPU and distribution process
    if is_distributed:
        cfg.trainer.gpu = local_rank # local_rank will overwrite the GPU in configure file
    gpu = min(cfg.trainer.gpu, torch.cuda.device_count() - 1)
    torch.backends.cudnn.benchmark = getattr(cfg.trainer, 'cudnn', False)
    torch.cuda.set_device(gpu)
    if is_distributed:
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
    print(local_rank)
 
    ## define datasets and dataloader.
    dataset_train = DATASET_DICT[cfg.data.train_dataset](cfg)
    dataset_val = DATASET_DICT[cfg.data.val_dataset](cfg, "validation")

    dataloader_train = DataLoader(dataset_train, num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size, collate_fn=dataset_train.collate_fn, shuffle=local_rank<0, drop_last=True,
                                  sampler=torch.utils.data.DistributedSampler(dataset_train, num_replicas=world_size, rank=local_rank, shuffle=True) if local_rank >= 0 else None)
    dataloader_val = DataLoader(dataset_val, num_workers=cfg.data.num_workers,
                                batch_size=cfg.data.batch_size, collate_fn=dataset_val.collate_fn, shuffle=False, drop_last=True)

    ## Create the model
    # detector = DETECTOR_DICT[cfg.detector.name](cfg.detector)
    detector = MonoDTR(cfg.detector)

    # # this is for load training checkpoint
    state_dict_lasted = torch.load("/kaggle/input/checkpoint/MonoDTR_latest.pth", map_location='cpu')
    detector.load_state_dict(state_dict_lasted, strict=False)
    
    # this is for load pretrain checkpoint
    state_dict_pretrain = torch.load("/kaggle/input/pretrain-monodtr-base/MonoDTR.pth", map_location='cpu')
    detector.load_state_dict(state_dict_pretrain, strict=False)
    
    # Filter out incompatible keys
    new_state_dict = OrderedDict()
    loaded_keys = []  # Track which parameters are loaded
    for k, v in state_dict_pretrain.items():
        if k in detector.state_dict() and v.shape == detector.state_dict()[k].shape:
            new_state_dict[k] = v
            loaded_keys.append(k)  # Record loaded keys
    
    # detector.load_state_dict(new_state_dict, strict=False)
    
    # Freeze specific layers in the Swin-T backbone (first two stages)
    for name, param in detector.named_parameters():
        # print(name)
        if name in loaded_keys:
            param.requires_grad = False  # Freeze loaded weights
        # elif name.startswith('mono_core.backbone.layers_0'):
        # elif name.startswith('mono_core.backbone.layers_0') or name.startswith('mono_core.backbone.layers_1') or name.startswith('mono_core.backbone.layers_2'):
        #     param.requires_grad = False  # Freeze first two stages of Swin-T backbone
        else:
            param.requires_grad = True

    for name, param in detector.named_parameters():
        if param.requires_grad == True:
            print(f"Parameter {name} is trainable")
            
            
    ## Convert to cuda
    if is_distributed:
        detector = torch.nn.SyncBatchNorm.convert_sync_batchnorm(detector)
        detector = torch.nn.parallel.DistributedDataParallel(detector.cuda(), device_ids=[gpu], output_device=gpu)
    else:
        detector = detector.cuda()
    detector.train()

    ## Record basic information of the model
    if is_logging:
        string1 = detector.__str__().replace(' ', '&nbsp;').replace('\n', '  \n')
        writer.add_text("model structure", string1) # add space for markdown style in tensorboard text
        num_parameters = get_num_parameters(detector)
        print(f'number of trained parameters of the model: {num_parameters}')
    
    ## define optimizer and weight decay
    optimizer = optimizers.build_optimizer(cfg.optimizer, detector)

    ## define scheduler
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, cfg.trainer.max_epochs, cfg.optimizer.lr_target)
    scheduler_config = getattr(cfg, 'scheduler', None)
    scheduler = schedulers.build_scheduler(scheduler_config, optimizer)
    is_iter_based = getattr(scheduler_config, "is_iter_based", False)

    ## define loss logger
    training_loss_logger =  LossLogger(writer, 'train') if is_logging else None

    ## training pipeline
    if 'training_func' in cfg.trainer:
        # training_dection = PIPELINE_DICT[cfg.trainer.training_func] #hoanbi1
        training_dection = My_train_mono_detection
    else:
        raise KeyError

    ## Get evaluation pipeline
    if 'evaluate_func' in cfg.trainer:
        evaluate_detection = PIPELINE_DICT[cfg.trainer.evaluate_func]
        print("Found evaluate function {}".format(cfg.trainer.evaluate_func))
    else:
        evaluate_detection = None
        print("Evaluate function not found")


    ## timer is used to estimate eta
    timer = Timer()

    print('Num training images: {}'.format(len(dataset_train)))

    global_step = 0
    # for epoch_num in range(cfg.trainer.max_epochs):
    for epoch_num in range(15):
        ## Start training for one epoch
        torch.cuda.empty_cache()
        detector.train()
        if training_loss_logger:
            training_loss_logger.reset()
        for iter_num, data in enumerate(dataloader_train):
            training_dection(data, detector, optimizer, writer, training_loss_logger, global_step, epoch_num, cfg, iter_num)
            # training_dection(data, detector, optimizer, writer, training_loss_logger, global_step, epoch_num, cfg)

            global_step += 1
            if (iter_num + 1) % cfg.trainer.accumulation_steps == 0:
                if is_iter_based:
                    scheduler.step()
    
                if is_logging and global_step % cfg.trainer.disp_iter == 0:
                    ## Log loss, print out and write to tensorboard in main process
                    if 'total_loss' not in training_loss_logger.loss_stats:
                        print(f"\nIn epoch {epoch_num}, iteration:{iter_num}, global_step:{global_step}, total_loss not found in logger.")
                    else:
                        log_str = 'Epoch: {} | Iteration: {}  | Running loss: {:1.5f} | eta:{}'.format(
                            epoch_num, iter_num, training_loss_logger.loss_stats['total_loss'].avg,
                            timer.compute_eta(global_step, len(dataloader_train) * cfg.trainer.max_epochs))
                        print(log_str, end='\r')
                        writer.add_text("training_log/train", log_str, global_step)
                        training_loss_logger.log(global_step)

        if not is_iter_based:
            scheduler.step()
        ## save model in main process if needed
        if is_logging:
            torch.save(detector.module.state_dict() if is_distributed else detector.state_dict(), os.path.join(
                cfg.path.checkpoint_path, '{}_latest.pth'.format(
                    cfg.detector.name)
                )
            )
        if is_logging and (epoch_num + 1) % cfg.trainer.save_iter == 0:
            torch.save(detector.module.state_dict() if is_distributed else detector.state_dict(), os.path.join(
                cfg.path.checkpoint_path, '{}_{}.pth'.format(
                    cfg.detector.name,epoch_num)
                )
            )
        checkpoint = {
                "epoch": epoch_num + 1,
                "global_step": global_step,
                "state_dict_backbone": detector.state_dict(),
                "state_optimizer": optimizer.state_dict(),
                "state_lr_scheduler": scheduler.state_dict()
            }
        torch.save(checkpoint, os.path.join(cfg.path.checkpoint_path, f"checkpoint_resume.pt"))
        print(f"Save checkpoint at epoch {epoch_num+1} successfully!")
        ## test model in main process if needed
        # if is_evaluating and evaluate_detection is not None and cfg.trainer.test_iter > 0 and (epoch_num + 1) % cfg.trainer.test_iter == 0:
        if is_evaluating and evaluate_detection is not None and cfg.trainer.test_iter > 0 and (epoch_num + 1) % 5 == 0:
            print("\n/**** start testing after training epoch {} ******/".format(epoch_num))
            evaluate_detection(cfg, detector.module if is_distributed else detector, dataset_val, writer, epoch_num)
            print("/**** finish testing after training epoch {} ******/".format(epoch_num))

        if is_distributed:
            torch.distributed.barrier() # wait untill all finish a epoch

        if is_logging:
            writer.flush()

In [23]:
def run_training():
    """Launch a single-process training run with this notebook's fixed settings.

    Sets ``PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`` in the process
    environment, then calls ``main`` with the config file path, the
    experiment name and ``world_size=1``.

    Returns:
        None. Whatever ``main`` returns is discarded, as before.
    """
    # Function-scope import so the function does not rely on an earlier cell
    # having imported `os`.
    import os

    config_path = "/kaggle/input/monodtr-library-ver2/MonoDTR/config/config.py"  # Path to your config file
    experiment_name = "EXP_NAME"  # Use the defined experiment name
    world_size = 1  # For single GPU training

    # Plain-Python equivalent of the `%env` line magic: the magic is IPython-only
    # syntax, so the function could not be moved into a regular .py module.
    # The print reproduces the line `%env` echoes, keeping the cell output the same.
    allocator_conf = "expandable_segments:True"
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = allocator_conf
    print(f"env: PYTORCH_CUDA_ALLOC_CONF={allocator_conf}")

    main(config=config_path, experiment_name=experiment_name, world_size=world_size)

In [24]:
# Start the training run configured in run_training().
# NOTE(review): the output recorded below reports 0.00 AP in all three
# evaluation passes — investigate before relying on the saved checkpoints.
run_training()

env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
-1




model.safetensors:   0%|          | 0.00/114M [00:00<?, ?B/s]

  state_dict_lasted = torch.load("/kaggle/input/checkpoint/MonoDTR_latest.pth", map_location='cpu')
  state_dict_pretrain = torch.load("/kaggle/input/pretrain-monodtr-base/MonoDTR.pth", map_location='cpu')


Parameter mono_core.backbone.patch_embed.proj.weight is trainable
Parameter mono_core.backbone.patch_embed.proj.bias is trainable
Parameter mono_core.backbone.patch_embed.norm.weight is trainable
Parameter mono_core.backbone.patch_embed.norm.bias is trainable
Parameter mono_core.backbone.layers_0.blocks.0.norm1.weight is trainable
Parameter mono_core.backbone.layers_0.blocks.0.norm1.bias is trainable
Parameter mono_core.backbone.layers_0.blocks.0.attn.relative_position_bias_table is trainable
Parameter mono_core.backbone.layers_0.blocks.0.attn.qkv.weight is trainable
Parameter mono_core.backbone.layers_0.blocks.0.attn.qkv.bias is trainable
Parameter mono_core.backbone.layers_0.blocks.0.attn.proj.weight is trainable
Parameter mono_core.backbone.layers_0.blocks.0.attn.proj.bias is trainable
Parameter mono_core.backbone.layers_0.blocks.0.norm2.weight is trainable
Parameter mono_core.backbone.layers_0.blocks.0.norm2.bias is trainable
Parameter mono_core.backbone.layers_0.blocks.0.mlp.fc1.w

  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 1 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 2 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 3 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 4 successfully!


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds, torch.tensor(depths).float()
  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds

Save checkpoint at epoch 5 successfully!

/**** start testing after training epoch 4 ******/
rebuild /kaggle/working/MonoDTR/output/validation/data


  return torch.from_numpy(rgb_images).float(), torch.tensor(calib).float(), label, bbox2ds, bbox3ds
  scores, bbox, obj_index = module([images.cuda().float().contiguous(), torch.tensor(P2).cuda().float()])
100%|██████████| 3769/3769 [06:39<00:00,  9.44it/s]
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
[1m
File "../input/monodtr-library-ver2/MonoDTR/visualDet3D/evaluator/kitti/eval.py", line 129:[0m
[1m@numba.jit(nopython=True, parallel=True)
[1mdef d3_box_overlap_kernel(boxes,
[0m[1m^[0m[0m
[0m


Car AP(Average Precision)@0.70, 0.70, 0.70:
bbox AP:0.00, 0.00, 0.00
bev  AP:0.00, 0.00, 0.00
3d   AP:0.00, 0.00, 0.00
aos  AP:0.00, 0.00, 0.00
Car AP(Average Precision)@0.70, 0.50, 0.50:
bbox AP:0.00, 0.00, 0.00
bev  AP:0.00, 0.00, 0.00
3d   AP:0.00, 0.00, 0.00
aos  AP:0.00, 0.00, 0.00

/**** finish testing after training epoch 4 ******/




Save checkpoint at epoch 6 successfully!
Save checkpoint at epoch 7 successfully!
Save checkpoint at epoch 8 successfully!
Save checkpoint at epoch 9 successfully!
Save checkpoint at epoch 10 successfully!

/**** start testing after training epoch 9 ******/
clean up the recorder directory of /kaggle/working/MonoDTR/output/validation/data
rebuild /kaggle/working/MonoDTR/output/validation/data


  scores, bbox, obj_index = module([images.cuda().float().contiguous(), torch.tensor(P2).cuda().float()])
100%|██████████| 3769/3769 [05:49<00:00, 10.77it/s]


Car AP(Average Precision)@0.70, 0.70, 0.70:
bbox AP:0.00, 0.00, 0.00
bev  AP:0.00, 0.00, 0.00
3d   AP:0.00, 0.00, 0.00
Car AP(Average Precision)@0.70, 0.50, 0.50:
bbox AP:0.00, 0.00, 0.00
bev  AP:0.00, 0.00, 0.00
3d   AP:0.00, 0.00, 0.00

/**** finish testing after training epoch 9 ******/
Save checkpoint at epoch 11 successfully!
Save checkpoint at epoch 12 successfully!
Save checkpoint at epoch 13 successfully!
Save checkpoint at epoch 14 successfully!
Save checkpoint at epoch 15 successfully!

/**** start testing after training epoch 14 ******/
clean up the recorder directory of /kaggle/working/MonoDTR/output/validation/data
rebuild /kaggle/working/MonoDTR/output/validation/data


100%|██████████| 3769/3769 [05:56<00:00, 10.56it/s]


Car AP(Average Precision)@0.70, 0.70, 0.70:
bbox AP:0.00, 0.00, 0.00
bev  AP:0.00, 0.00, 0.00
3d   AP:0.00, 0.00, 0.00
Car AP(Average Precision)@0.70, 0.50, 0.50:
bbox AP:0.00, 0.00, 0.00
bev  AP:0.00, 0.00, 0.00
3d   AP:0.00, 0.00, 0.00

/**** finish testing after training epoch 14 ******/
