# Trainer Debug Notebook
A scratch notebook for interactively testing individual trainer functions without repeatedly reloading the data and models.


In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# modules before each cell is executed, so edits to project modules are picked
# up without restarting the kernel.
# Keep this cell at the top of the notebook so it applies to all subsequent imports.
%load_ext autoreload
%autoreload 2


In [2]:
from __future__ import absolute_import, division, print_function
import torch
import os
import sys
from trainer import Trainer
from options import MonodepthOptions
import time


## 1. Initialize Options
Edit parameters here as needed


In [3]:
# Option parser for the trainer.
options = MonodepthOptions()

# Command-line style arguments, grouped by purpose - edit as needed.
# The groups are concatenated below in this order.
schedule_args = [
    '--num_epochs', '50000',
    '--num_workers', '1',
    '--batch_size', '2',
    '--log_frequency', '10',
    '--save_frequency', '100000',
]
# Restrict training to a small fixed number of samples (overfitting mode).
overfit_args = [
    '--of_samples',
    '--of_samples_num', '50',
]
data_args = [
    '--frame_ids', '0', '-1', '1',
    '--dataset', 'endovis',
    '--data_path', '/mnt/nct-zfs/TCO-All/SharedDatasets/SCARED_Images_Resized/',
    '--log_dir', '/mnt/nct-zfs/TCO-Test/jinjingxu/exps/train/mvp3r/results/unisfm/iidsfm',
]
supervision_args = [
    '--compute_metrics',
    '--reproj_supervise_type', 'reprojection_color_warp',
    '--seed', '42',
]
args = schedule_args + overfit_args + data_args + supervision_args

# parse_notebook accepts the argument list directly instead of reading sys.argv.
opts = options.parse_notebook(args)

# For quick experiments, override parsed values here, e.g.:
#   opts.batch_size = 2
#   opts.num_workers = 1
#   opts.of_samples_num = 50
#   opts.compute_metrics = True

# Echo the settings that matter most for the cells below.
print("Options initialized:")
option_summary = (
    ("Dataset", opts.dataset),
    ("Batch size", opts.batch_size),
    ("Frame IDs", opts.frame_ids),
    ("Compute metrics", opts.compute_metrics),
)
for label, value in option_summary:
    print(f"  {label}: {value}")


Options initialized:
  Dataset: endovis
  Batch size: 2
  Frame IDs: [0, -1, 1]
  Compute metrics: True


In [4]:
# Build the trainer from the parsed options. Construction also prints its own
# summary (model name, log directory, device, split sizes).
trainer = Trainer(opts)

# Report the device and the sub-models the trainer created.
model_names = list(trainer.models.keys())
print("Trainer initialized successfully!")
print(f"  Device: {trainer.device}")
print(f"  Number of models: {len(trainer.models)}")
print(f"  Models: {model_names}")




Training model named:
   2025-12-05-11-34-22
Models and tensorboard events files are saved to:
   /mnt/nct-zfs/TCO-Test/jinjingxu/exps/train/mvp3r/results/unisfm/iidsfm
Training is using:
   cuda
Overfitting mode: using 50 samples
Using split:
   endovis
There are 50 training items and 1705 validation items

Trainer initialized successfully!
  Device: cuda
  Number of models: 7
  Models: ['encoder', 'depth', 'decompose_encoder', 'decompose', 'adjust_net', 'pose_encoder', 'pose']


## 3. Load Trained Models (Optional)
Load pretrained weights if available


In [5]:
# Optional: load pretrained weights into the trainer's models.
# Set `load_weights_folder` to a checkpoint directory to enable loading;
# leave it as None to keep the freshly initialized models.
load_weights_folder = None  # e.g. "/path/to/weights/folder"

if load_weights_folder:
    # NOTE(review): assumes trainer.load_model() reads these fields from the
    # same options object that was passed to Trainer(opts) - confirm in trainer.py.
    opts.load_weights_folder = load_weights_folder
    opts.models_to_load = [
        "encoder", "depth", "pose_encoder", "pose",
        "decompose_encoder", "decompose", "adjust_net",
    ]
    trainer.load_model()
    print(f"Models ready (pretrained weights loaded from {load_weights_folder})")
else:
    print("Models ready (no pretrained weights loaded)")


Models ready (no pretrained weights loaded)


## 4. Initialize Data Loaders
Get iterators for training and validation data


In [6]:
# The trainer already owns both loaders (built in Trainer.__init__);
# take references to them instead of constructing new ones.
train_loader, val_loader = trainer.train_loader, trainer.val_loader

# Number of batches per split.
for split_label, loader in (("Training", train_loader), ("Validation", val_loader)):
    print(f"{split_label} batches: {len(loader)}")

# Iterators used by the following cells to pull individual batches.
train_iter, val_iter = iter(train_loader), iter(val_loader)

print("Data loaders ready!")


Training batches: 25
Validation batches: 852
Data loaders ready!


## 5. Test Single Batch Processing
Test forward pass on a single batch


In [7]:
# Forward-pass smoke test on a single batch, with the models in training mode.
trainer.set_train()

# The batch is taken from the validation iterator because those batches carry
# ('depth_gt', 0, 0) (see the printed keys). Restart the iterator if an
# earlier run of this notebook has exhausted it.
try:
    sample_inputs = next(val_iter)
except StopIteration:
    val_iter = iter(val_loader)
    sample_inputs = next(val_iter)

# Move inputs to device (process_batch does this, but we do it explicitly here
# so the shapes/devices printed below reflect what the models will see).
for key, val in sample_inputs.items():
    if isinstance(val, torch.Tensor):
        sample_inputs[key] = val.to(trainer.device)

print("Sample batch keys:", list(sample_inputs.keys()))
for key, val in sample_inputs.items():
    if isinstance(val, torch.Tensor):
        print(f"  {key}: shape={val.shape}, dtype={val.dtype}, device={val.device}")

# Forward pass only: no gradients are needed for this check.
with torch.no_grad():
    outputs, losses = trainer.process_batch(sample_inputs)

print("\nOutput keys:", list(outputs.keys()))
print("\nLosses:")
for key, val in losses.items():
    # Tensor losses are printed as Python floats; anything else is printed as-is.
    if isinstance(val, torch.Tensor):
        print(f"  {key}: {val.item():.6f}")
    else:
        print(f"  {key}: {val}")


Sample batch keys: [('K', 0), ('inv_K', 0), ('K', 1), ('inv_K', 1), ('K', 2), ('inv_K', 2), ('K', 3), ('inv_K', 3), ('color', 0, 0), ('color', 0, 1), ('color', 0, 2), ('color', 0, 3), ('color', -1, 0), ('color', -1, 1), ('color', -1, 2), ('color', -1, 3), ('color', 1, 0), ('color', 1, 1), ('color', 1, 2), ('color', 1, 3), ('color_aug', 0, 0), ('color_aug', 0, 1), ('color_aug', 0, 2), ('color_aug', 0, 3), ('color_aug', -1, 0), ('color_aug', -1, 1), ('color_aug', -1, 2), ('color_aug', -1, 3), ('color_aug', 1, 0), ('color_aug', 1, 1), ('color_aug', 1, 2), ('color_aug', 1, 3), ('depth_gt', 0, 0)]
  ('K', 0): shape=torch.Size([2, 4, 4]), dtype=torch.float32, device=cuda:0
  ('inv_K', 0): shape=torch.Size([2, 4, 4]), dtype=torch.float32, device=cuda:0
  ('K', 1): shape=torch.Size([2, 4, 4]), dtype=torch.float32, device=cuda:0
  ('inv_K', 1): shape=torch.Size([2, 4, 4]), dtype=torch.float32, device=cuda:0
  ('K', 2): shape=torch.Size([2, 4, 4]), dtype=torch.float32, device=cuda:0
  ('inv_K', 

## 6. Test Validation with Metrics
Test validation forward pass and metrics computation


In [8]:
# Validation forward pass plus depth metrics on a single batch.
trainer.set_eval()

# Pull the next validation batch, restarting the iterator once it is exhausted.
try:
    val_inputs = next(val_iter)
except StopIteration:
    val_iter = iter(val_loader)
    val_inputs = next(val_iter)

# Move inputs to device
for key, val in val_inputs.items():
    if isinstance(val, torch.Tensor):
        val_inputs[key] = val.to(trainer.device)

# Ground-truth depth is only present in some batches; metrics need it.
depth_gt_key = ("depth_gt", 0, 0)
has_depth_gt = depth_gt_key in val_inputs

print("Validation batch keys:", list(val_inputs.keys()))
if has_depth_gt:
    print(f"  depth_gt: shape={val_inputs[depth_gt_key].shape}, device={val_inputs[depth_gt_key].device}")

# Process validation batch
with torch.no_grad():
    outputs, losses = trainer.process_batch(val_inputs)

    # Compute metrics only when they are enabled AND ground truth is available,
    # so the fallback message printed below is accurate.
    metrics = {}
    if opts.compute_metrics and has_depth_gt:
        from utils.metrics import compute_depth_metrics
        metrics = compute_depth_metrics(val_inputs, outputs)

print("\nValidation Losses:")
for key, val in losses.items():
    if isinstance(val, torch.Tensor):
        print(f"  {key}: {val.item():.6f}")

if metrics:
    print("\nDepth Metrics:")
    for key, val in metrics.items():
        print(f"  {key}: {val:.6f}")
else:
    print("\nNo metrics computed (GT depth not available or compute_metrics=False)")


Validation batch keys: [('K', 0), ('inv_K', 0), ('K', 1), ('inv_K', 1), ('K', 2), ('inv_K', 2), ('K', 3), ('inv_K', 3), ('color', 0, 0), ('color', 0, 1), ('color', 0, 2), ('color', 0, 3), ('color', -1, 0), ('color', -1, 1), ('color', -1, 2), ('color', -1, 3), ('color', 1, 0), ('color', 1, 1), ('color', 1, 2), ('color', 1, 3), ('color_aug', 0, 0), ('color_aug', 0, 1), ('color_aug', 0, 2), ('color_aug', 0, 3), ('color_aug', -1, 0), ('color_aug', -1, 1), ('color_aug', -1, 2), ('color_aug', -1, 3), ('color_aug', 1, 0), ('color_aug', 1, 1), ('color_aug', 1, 2), ('color_aug', 1, 3), ('depth_gt', 0, 0)]
  depth_gt: shape=torch.Size([2, 1, 1024, 1280]), device=cuda:0

Validation Losses:
  loss: 0.394922
  loss_reconstruction: 0.335299
  loss_reflec: 0.000678
  loss_reprojection: 0.327701
  loss_disp_smooth: 0.002639

Depth Metrics:
  abs_rel: 0.232254
  sq_rel: 3.867111
  rmse: 14.407352
  rmse_log: 0.283523
  a1: 0.579472
  a2: 0.838206
  a3: 0.999674
  median_scaling_ratio: 245.557007
  medi

## 7. Explicit Training Loop
Manual training loop for debugging (no tensorboard logging)


In [9]:
# Manual optimisation loop for debugging: forward, backward and optimizer step
# on a handful of batches, with periodic console logging and an image grid dump.

# Imported once per cell run rather than on every logged iteration.
from utils import img_gen
from utils.metrics import compute_depth_metrics

# Training loop parameters (edit as needed)
num_batches_to_train = 5  # Number of batches to process
log_every_n_batches = 2   # Log every N batches

# Single source of batches for the whole loop, including restarts.
# The validation loader is used because its batches carry ('depth_gt', 0, 0),
# which the metrics / image grid below rely on.
# NOTE: this updates model weights on validation data - debugging only.
# Switch to `train_loader` for a real training run (presumably without
# depth_gt - verify before enabling the metrics block).
loop_loader = val_loader

trainer.set_train()
train_iter = iter(loop_loader)

trainer.step = 0

for batch_idx in range(num_batches_to_train):
    try:
        inputs = next(train_iter)
    except StopIteration:
        # Restart from the SAME loader so the data source never changes mid-loop.
        train_iter = iter(loop_loader)
        inputs = next(train_iter)

    before_op_time = time.time()

    # Forward pass (process_batch handles device movement)
    outputs, losses = trainer.process_batch(inputs)

    # Backward pass
    trainer.model_optimizer.zero_grad()
    losses["loss"].backward()
    trainer.model_optimizer.step()

    duration = time.time() - before_op_time

    # Log periodically
    if batch_idx % log_every_n_batches == 0:
        print(f"Batch {batch_idx:4d} | Loss: {losses['loss'].item():.6f} | Time: {duration:.3f}s")
        print(f"  Sub-losses: reconstruction={losses.get('loss_reconstruction', 0):.6f}, "
              f"reprojection={losses.get('loss_reprojection', 0):.6f}, "
              f"reflec={losses.get('loss_reflec', 0):.6f}, "
              f"disp_smooth={losses.get('loss_disp_smooth', 0):.6f}")

        # Optional tensorboard logging / validation:
        # trainer.log_time(batch_idx, duration, losses["loss"].cpu().data)
        # trainer.log("train", inputs, outputs, losses, metrics=None)
        # trainer.val()

        # Detach every output from the autograd graph before using it for
        # metrics and visualisation.
        outputs = {key: value.detach() for key, value in outputs.items()}

        # Depth-error metrics; merged with inputs/outputs so img_gen can look
        # up every requested key (including "depth_err") in one dictionary.
        metrics = compute_depth_metrics(inputs, outputs)
        merged_dict = {**inputs, **outputs, **metrics}

        # Save one row of images for the first sample of the batch.
        # The file is overwritten on every logged iteration.
        img_gen(
            merged_dict=merged_dict,
            image_keys=[("color", 0, 0), ("disp", 0), ("depth", 0, 0), ("depth_gt", 0, 0), "depth_err"],
            save_path="output_grid.png",
            sample_idx=0
        )

    # trainer.step is intentionally left at 0; uncomment to advance it.
    # trainer.step += 1

print("\nTraining loop completed!")


Batch    0 | Loss: 0.400739 | Time: 1.579s
  Sub-losses: reconstruction=0.339050, reprojection=0.332640, reflec=0.001099, disp_smooth=0.006853
Saved image grid to output_grid.png
Batch    2 | Loss: 0.384031 | Time: 0.268s
  Sub-losses: reconstruction=0.326219, reprojection=0.318423, reflec=0.001525, disp_smooth=0.005830
Saved image grid to output_grid.png
Batch    4 | Loss: 0.371374 | Time: 0.196s
  Sub-losses: reconstruction=0.315245, reprojection=0.307922, reflec=0.001755, disp_smooth=0.005165
Saved image grid to output_grid.png

Training loop completed!


## 8. Test Specific Functions
Test individual components


In [10]:
# Exercise trainer.decompose() in isolation on one training batch.
trainer.set_train()
sample_inputs = next(iter(train_loader))

# Tensors go to the trainer's device; any other entries are kept unchanged.
sample_inputs = {
    name: (item.to(trainer.device) if isinstance(item, torch.Tensor) else item)
    for name, item in sample_inputs.items()
}

# Baseline outputs: depth network on the encoder features of the augmented
# target frame, plus the predicted poses.
features = trainer.models["encoder"](sample_inputs[("color_aug", 0, 0)])
outputs = trainer.models["depth"](features)
outputs.update(trainer.predict_poses(sample_inputs))

# The return value of decompose() is not used; its results are read back
# from `outputs` below.
trainer.decompose(sample_inputs, outputs)

# List the tensor outputs whose key mentions one of the decomposition tags.
print("Decompose outputs:")
wanted_tags = ("reflectance", "light", "reprojection")
for name, item in outputs.items():
    if any(tag in name for tag in wanted_tags) and isinstance(item, torch.Tensor):
        print(f"  {name}: shape={item.shape}")

print("\nDepth stored:", ("depth", 0, 0) in outputs)


Decompose outputs:
  ('reflectance', 0, 0): shape=torch.Size([2, 3, 256, 320])
  ('light', 0, 0): shape=torch.Size([2, 1, 256, 320])
  ('reflectance', 0, -1): shape=torch.Size([2, 3, 256, 320])
  ('light', 0, -1): shape=torch.Size([2, 1, 256, 320])
  ('reflectance', 0, 1): shape=torch.Size([2, 3, 256, 320])
  ('light', 0, 1): shape=torch.Size([2, 1, 256, 320])

Depth stored: True


## 9. Inspect Model Parameters
Check model states and parameters


In [11]:
# Per-model parameter counts, followed by the overall totals.
total_params = 0
for name, module in trainer.models.items():
    # Walk the parameters once, recording (size, is_trainable) for each.
    sizes = [(param.numel(), param.requires_grad) for param in module.parameters()]
    n_all = sum(size for size, _ in sizes)
    n_trainable = sum(size for size, is_trainable in sizes if is_trainable)
    total_params += n_all
    print(f"{name:20s}: {n_all:>10,} params ({n_trainable:>10,} trainable)")

# Trainable total comes from the parameter list the trainer keeps in parameters_to_train.
n_to_train = sum(param.numel() for param in trainer.parameters_to_train)
print(f"\nTotal parameters: {total_params:,}")
print(f"Total trainable: {n_to_train:,}")


encoder             : 11,689,512 params (11,689,512 trainable)
depth               :  3,152,724 params ( 3,152,724 trainable)
decompose_encoder   : 11,689,512 params (11,689,512 trainable)
decompose           :  3,273,732 params ( 3,273,732 trainable)
adjust_net          :     19,425 params (    19,425 trainable)
pose_encoder        : 11,698,920 params (11,698,920 trainable)
pose                :  1,314,572 params ( 1,314,572 trainable)

Total parameters: 42,838,397
Total trainable: 42,838,397


## 10. Quick Parameter Edits
Edit common parameters without reinitializing


In [12]:
# Scratch area for changing options on the live `opts` object.
# Uncomment and edit as needed:
#   opts.batch_size = 4
#   opts.learning_rate = 1e-5
#   opts.compute_metrics = True
#   opts.reproj_supervise_type = 'color_warp'
#
# If the learning rate was changed, rebuild the optimizer so it takes effect:
#   import torch.optim as optim
#   trainer.model_optimizer = optim.Adam(trainer.parameters_to_train, opts.learning_rate)

# Show the values currently held by `opts`.
print("Current settings:")
current_settings = (
    ("Batch size", opts.batch_size),
    ("Learning rate", opts.learning_rate),
    ("Compute metrics", opts.compute_metrics),
    ("Reproj supervise type", opts.reproj_supervise_type),
)
for label, value in current_settings:
    print(f"  {label}: {value}")


Current settings:
  Batch size: 2
  Learning rate: 0.0001
  Compute metrics: True
  Reproj supervise type: reprojection_color_warp
