# Model Architecture:

**Stage 1:** Contrastive Pretraining: CILP_model

**Goal:** align RGB and LiDAR in a shared 200-D embedding space, i.e. encode both modalities as vectors of the same dimensionality so that they can be compared directly.
```
RGB ----> Img Encoder ----\
                            ----> CLIP-style similarity
LiDAR -> Lidar Encoder ----/
```
**Outcome:** Shared embedding space where matching RGB/LiDAR pairs have high similarity and non-matches low similarity.

----------------------------

**Stage 2:** Projector Training: projector

**Goal:** learn a mapping from RGB CILP embeddings to LiDAR embeddings used by lidar_cnn:
ℝ²⁰⁰ (CILP RGB embedding) → ℝ³²⁰⁰ (LiDAR-CNN embedding)

projector knows how to “pretend” RGBs are LiDAR internally: projected_RGB_embedding ≈ “real” LiDAR embedding for each paired RGB/LiDAR sample.
```
RGB ----> Img Encoder ----> Projector ----> LiDAR embedding
                                     |
                                     v
                             MSE-loss to true LiDAR embedding

```
----------------------------

**Stage 3:** Final Classifier: RGB2LiDARClassifier

**Goal:** chaining all models together to classify spheres and cubes from images

The model makes RGB inputs look like LiDAR inputs in the internal feature space and then reuses the LiDAR classifier on them.
```
RGB (img) ----> (CILP Img Encoder) ----> 200-D CILP embedding ----> (Projector) ---> 3200-D LiDAR embedding
---> (LiDAR Classifier) ---> cube/sphere

```

# Setup

In [None]:
%%capture
# Install the pinned third-party dependencies for this notebook.
# %%capture suppresses the cell output, so pip warnings and errors are not shown.
# NOTE(review): torch==2.9.0 next to torchvision==0.20.0 and sympy==1.12 looks like
# pins taken from different release trains — confirm that pip can resolve this set.
%pip install fiftyone==1.10.0 sympy==1.12 torch==2.9.0 torchvision==0.20.0 numpy open-clip-torch

In [None]:
# %pip install --pre torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu118


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cu118


## Imports

In [None]:
import os
import random
import time
from pathlib import Path
from google.colab import userdata

import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision.transforms as transforms
#import torchvision.transforms.v2 as transforms   ##TODO Note:

import wandb
import cv2
import albumentations as A

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Project root on the mounted drive; the dataset path and the checkpoint path are built from it.
STORAGE_PATH = Path("/content/drive/MyDrive/Colab Notebooks/Applied Computer Vision/Applied-Computer-Vision-Projects/Multimodal_Learning_02/")

Mounted at /content/drive


## Constants

In [None]:
SEED = 51  # default seed for set_seeds, AssessmentDataset shuffling and create_data_splits
NUM_WORKERS = os.cpu_count() if os.cpu_count() is not None else 2  # Number of CPU cores (fallback: 2)

# Root folder of the assessment data; it holds one sub-folder per class.
DATASET_PATH = STORAGE_PATH / "multimodal_training_workshop/data/assessment"

BATCH_SIZE = 32
IMG_SIZE = 64  # size passed to transforms.Resize in img_transforms

# Class folder names and the integer label assigned to each of them.
CLASSES = ["cubes", "spheres"]
LABEL_MAP = {"cubes": 0, "spheres": 1}

# Utility Functions

## Reproducibility

In [None]:
def set_seeds(seed=SEED):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, PyTorch (CPU and, if available, CUDA),
    OpenCV and (when supported) albumentations, and switches PyTorch/cuDNN
    into deterministic mode. Prints a confirmation line when done.

    Args:
        seed (int): Random seed value
    """
    # Environment variables read by Python hashing and by cuBLAS.
    # NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    # start-up, so setting it here may only affect child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

    # Python random module
    random.seed(seed)

    # NumPy
    np.random.seed(seed)

    # PyTorch CPU
    torch.manual_seed(seed)

    # PyTorch GPU (all devices)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

        # cuDNN: request deterministic kernels and disable the auto-tuner
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # OpenCV
    cv2.setRNGSeed(seed)

    # Albumentations (for data augmentation)
    try:
        A.seed_everything(seed)
    except AttributeError:
        # Installed albumentations version has no seed_everything; skip it
        pass

    # PyTorch deterministic algorithms (may impact performance)
    # NOTE(review): this call only flips a global flag; a RuntimeError for a
    # non-deterministic op presumably surfaces later, when that op runs, so the
    # except branch below may never trigger — confirm against the PyTorch docs.
    try:
        torch.use_deterministic_algorithms(True)
    except RuntimeError:
        # Some operations don't have deterministic implementations
        print("Warning: Some operations may not be deterministic")

    print(f"All random seeds set to {seed} for reproducibility")



# Usage: Call this function at the beginning and before each training phase
set_seeds(SEED)

# Additional reproducibility considerations:

def create_deterministic_training_dataloader(dataset, batch_size, shuffle=True, *, seed=None, **kwargs):
    """
    Create a DataLoader whose shuffling order is reproducible.

    Args:
        dataset: PyTorch Dataset instance
        batch_size: Batch size
        shuffle: Whether to shuffle data
        seed: Seed for the shuffling generator (keyword-only). When None, the
            notebook-wide SEED constant is used, so the loader follows the
            same seed as the rest of the pipeline instead of a hard-coded value.
        **kwargs: Additional DataLoader arguments

    Returns:
        Training DataLoader with reproducible behavior
    """
    generator = None
    if shuffle:
        # A dedicated generator keeps the shuffle order independent of any
        # other consumer of the global torch RNG.
        generator = torch.Generator()
        generator.manual_seed(SEED if seed is None else seed)

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        generator=generator,
        **kwargs
    )



All random seeds set to 51 for reproducibility


In [None]:
# Use the GPU when one is available; models and batches are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.is_available()  # trailing expression: the notebook echoes whether CUDA is available

True

In [None]:
# Preprocessing pipeline passed as transform_rgb to the dataset.
img_transforms = transforms.Compose([
    #transforms.ToImage(),                           # transforms.v2 variant, currently disabled
    transforms.ToTensor(),                           # image -> tensor
    transforms.Resize(IMG_SIZE),                     # resize using IMG_SIZE
    #transforms.ToDType(torch.float32, scale=True),  # transforms.v2 variant, currently disabled
    ## transforms.Normalize((mean_intensity,), (std_intensity,))
])

In [None]:
def input_fn(batch):
    """
    Turn a dataloader batch into the positional inputs of a two-modality model.

    The batch is a 3-tuple (rgb, lidar, label). The label is dropped because
    the CILP model and the projector wrapper only consume (rgb, lidar); both
    tensors are moved to the global `device`.
    """
    rgb, lidar, _label = batch
    return tuple(tensor.to(device) for tensor in (rgb, lidar))

In [None]:
def rgb_only_input_fn(batch):
    """
    Turn a dataloader batch into the inputs of an RGB-only model.

    Only the first batch element (the RGB images) is used and moved to the
    global `device`. A 1-tuple is returned because the training loop calls
    model(*input_fn(batch)).
    """
    return (batch[0].to(device),)

# TODO: Integrate Wandb as in task 3

W&B logging, metrics, plots → reuse almost as-is



```
W&B init, wandb.log(...)

a metrics dict holding losses, epoch times, params, GPU memory

a plot_losses(losses) helper that plots train/valid curves

maybe a comparison DataFrame builder
```



In [None]:
# Load W&B API key from .env file and make it available as env variable
# from dotenv import load_dotenv
# load_dotenv()  # loads .env automatically

# os.environ["WANDB_API_KEY"]

KeyError: 'WANDB_API_KEY'

In [None]:
# Read the W&B API key from Colab Secrets and export it as an environment
# variable (presumably so that the wandb.login() call below can pick it up).
wandb_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_key

In [None]:
# Authenticate this session with Weights & Biases.
wandb.login()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mmichele-marschner[0m ([33mmichele-marschner-university-of-potsdam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
def init_wandb(model, fusion_name, num_params, opt_name, batch_size=BATCH_SIZE, epochs=15):
    """
    Start a Weights & Biases run for one training configuration.

    Args:
        model: model being trained; only its class name is recorded
        fusion_name: name of the fusion strategy, also used in the run name
        num_params: number of model parameters to record
        opt_name: name of the optimizer to record
        batch_size: batch size to record
        epochs: number of epochs to record

    Returns:
        The run object returned by wandb.init, so callers can finish or
        inspect the run (previously the handle was created and then dropped).
    """
    config = {
        # "embedding_size": embedding_size,   # TODO: does this vary? is it available for fusion models?
        "optimizer_type": opt_name,
        "fusion_strategy": fusion_name,
        "model_architecture": model.__class__.__name__,
        "batch_size": batch_size,
        "num_epochs": epochs,
        "num_parameters": num_params,
    }

    run = wandb.init(
        project="cilp-extended-assessment",
        name=f"{fusion_name}_run",
        config=config,
        reinit=True,                          # allows multiple runs in one script
    )

    return run

# The Model



```
ConvEncoder
```


```
CILP_EMB_SIZE = 200

img_embedder   = ConvEncoder(in_ch=4, emb_dim=CILP_EMB_SIZE).to(device)
lidar_embedder = ConvEncoder(in_ch=1, emb_dim=CILP_EMB_SIZE).to(device)
```
--> define `CILP_EMB_SIZE = 200` up front (the value used in the assessment)








**TODO:** Does it make sense to save the best model figured out in task 3 and 4 and re-train it here? (use it as "foundation")

The general question is this: since everything here revolves around this one dataset — about fusion performance, clearly, but not about position — the model identified in task 3 and task 4 could arguably already be the final model; its head should then already fit as well.

In [None]:
## move to models.py
class ConvEncoder(nn.Module):
    """
    Small CNN that maps one modality's image to a unit-length embedding vector.

    Three conv + ReLU + max-pool stages are followed by two fully connected
    layers; the result is L2-normalised so embeddings of different modalities
    can be compared by similarity.
    """

    def __init__(self, in_ch: int, emb_dim: int = 128):
        """
        Args:
            in_ch: number of input channels (this notebook uses 4 for the RGB
                encoder and 1 for the LiDAR encoder)
            emb_dim: dimensionality of the output embedding
        """
        super().__init__()
        kernel = 3

        # Convolution stack; every layer keeps the spatial size (padding=1).
        self.conv1 = nn.Conv2d(in_ch, 50, kernel, padding=1)
        self.conv2 = nn.Conv2d(50, 100, kernel, padding=1)
        self.conv3 = nn.Conv2d(100, 200, kernel, padding=1)
        # NOTE: conv4 is registered but never called in forward().
        # TODO: only needed for the CILP model?
        self.conv4 = nn.Conv2d(200, 200, kernel, padding=1)
        self.pool = nn.MaxPool2d(2)

        # Three poolings halve a 64x64 input down to 8x8 with 200 channels.
        # TODO: the CILP reference uses 4*4 here.
        flat_features = 200 * 8 * 8
        self.fc1 = nn.Linear(flat_features, 1000)
        self.fc2 = nn.Linear(1000, emb_dim)

        self.output_dim = emb_dim

    def forward(self, x):
        feats = x
        # conv -> ReLU -> pool; for 64x64 inputs the maps shrink 64 -> 32 -> 16 -> 8
        for conv in (self.conv1, self.conv2, self.conv3):
            feats = self.pool(F.relu(conv(feats)))

        # Flatten everything but the batch dimension, then project to emb_dim
        hidden = F.relu(self.fc1(torch.flatten(feats, 1)))   # (B,1000)
        emb = self.fc2(hidden)                               # (B,emb_dim)

        # TODO: normalise only for CILP (cosine similarity)? Harmful elsewhere?
        return F.normalize(emb)


In [None]:
CILP_EMB_SIZE = 200  # embedding size shared by both encoders

# One encoder per modality: 4 input channels for RGB images, 1 for LiDAR depth maps.
img_embedder   = ConvEncoder(in_ch=4, emb_dim=CILP_EMB_SIZE).to(device)
lidar_embedder = ConvEncoder(in_ch=1, emb_dim=CILP_EMB_SIZE).to(device)

## Stage 1: CILP contrastive pretraining

In [None]:
## move to models.py
class ContrastivePretraining(nn.Module):
    """
    CLIP-style model that scores every RGB/LiDAR pairing within a batch.

    Args:
        img_encoder: module embedding RGB images. Defaults to the notebook's
            global `img_embedder`.
        lidar_encoder: module embedding LiDAR depth maps. Defaults to the
            notebook's global `lidar_embedder`.

    Passing the encoders explicitly avoids the hidden dependency on notebook
    globals (the defaults share their parameters with every other model built
    from the same globals).
    """

    def __init__(self, img_encoder=None, lidar_encoder=None):
        super().__init__()
        self.img_embedder = img_embedder if img_encoder is None else img_encoder
        self.lidar_embedder = lidar_embedder if lidar_encoder is None else lidar_encoder
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, rgb_imgs, lidar_depths):
        """
        Returns:
            (logits_per_img, logits_per_lidar): entry [i, j] of the first
            matrix is the cosine similarity of image i and LiDAR sample j,
            rescaled from [-1, 1] to [0, 1]; the second is its transpose.
        """
        img_emb = self.img_embedder(rgb_imgs)
        lidar_emb = self.lidar_embedder(lidar_depths)

        # Broadcasting (B,1,D) against (1,B,D) yields the full pairwise matrix
        # without manually repeating both embedding tensors.
        similarity = self.cos(img_emb.unsqueeze(1), lidar_emb.unsqueeze(0))
        similarity = (similarity + 1) / 2

        logits_per_img = similarity
        logits_per_lidar = similarity.T
        return logits_per_img, logits_per_lidar

## Stage 2: Projector training

# The Dataset

**TODO:** I would use the code from the task 3

```
AssessmentDataset (Pattern A or your MyDataset variant)

create_assessment_splits(...)

make_dataloaders(...)
```



In [None]:
## move to datasets.py
class AssessmentDataset(Dataset):
    """
    Loads paired RGB/LiDAR samples from the assessment dataset.

    Expected layout: <root_dir>/<class_name>/rgb/*.png and
    <root_dir>/<class_name>/lidar/*.npy; files are paired by file stem.

    One sample: (rgb, lidar, label)
    - rgb: the opened image after transform_rgb (the PIL image itself when no
      transform is given)
    - lidar: float tensor; 2-D arrays get a leading channel axis, e.g. (1, H, W)
    - label: scalar long tensor taken from LABEL_MAP
    """

    def __init__(self, root_dir=DATASET_PATH, start_idx=0, end_idx=None,
                 transform_rgb=None, transform_lidar=None, shuffle=True):
        """
        Args:
            root_dir: dataset root (str or Path)
            start_idx: first sample index to include
            end_idx: one past last index to include (None = all)
            transform_rgb: optional transform for RGB image (PIL -> tensor)
            transform_lidar: optional transform for LiDAR tensor
            shuffle: if True, shuffle full sample list once before slicing
        """
        self.classes = CLASSES
        # Accept plain strings as well; the "/" joins below need a Path.
        self.root_dir = Path(root_dir)
        self.transform_rgb = transform_rgb
        self.transform_lidar = transform_lidar
        self.samples = []

        # 1) Build the full list of samples, class by class
        all_samples = []
        for class_name in self.classes:
            class_dir = self.root_dir / class_name
            rgb_dir = class_dir / "rgb"
            lidar_dir = class_dir / "lidar"

            # Keep only stems that exist in both modalities
            rgb_stems = {f.stem for f in rgb_dir.glob("*.png")}
            npy_stems = {f.stem for f in lidar_dir.glob("*.npy")}
            matching = rgb_stems & npy_stems

            print(f"{class_name}: {len(matching)} paired samples")

            label = LABEL_MAP[class_name]

            all_samples.extend(
                {
                    "rgb": rgb_dir / f"{stem}.png",
                    "lidar": lidar_dir / f"{stem}.npy",
                    "label": label,
                }
                for stem in sorted(matching)
            )

        # Optional: shuffle once, so contiguous splits are roughly balanced
        if shuffle:
            rng = random.Random(SEED)
            rng.shuffle(all_samples)

        # 2) Apply slice [start_idx:end_idx]
        if end_idx is None:
            end_idx = len(all_samples)

        self.samples = all_samples[start_idx:end_idx]

        print(
            f"AssessmentDataset: "
            f"slice [{start_idx}:{end_idx}] -> {len(self.samples)} samples"
        )

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # RGB: read the pixels eagerly so the file handle is released right
        # away instead of staying open until the image object is collected.
        with Image.open(sample["rgb"]) as img:
            img.load()
        rgb = img
        if self.transform_rgb is not None:
            rgb = self.transform_rgb(rgb)

        # LiDAR: add a channel axis to 2-D arrays
        lidar_np = np.load(sample["lidar"])
        if lidar_np.ndim == 2:
            lidar_np = lidar_np[None, :, :]

        lidar = torch.tensor(lidar_np).float()

        if self.transform_lidar is not None:
            lidar = self.transform_lidar(lidar)

        label = torch.tensor(sample["label"], dtype=torch.long)

        return rgb, lidar, label


In [None]:
## move to training.py or datasets.py?
def create_data_splits(
    root_dir,
    train_frac=0.7,
    val_frac=0.15,
    test_frac=0.15,
    transform_rgb=None,
    transform_lidar=None,
    seed=SEED,
):
    """
    Build a single AssessmentDataset and return train/val/test Subsets.

    The split is a seeded random permutation of all sample indices; the test
    split receives whatever remains after the train and val sizes are rounded
    down.

    Args:
        root_dir: dataset root passed to AssessmentDataset
        train_frac, val_frac, test_frac: split fractions, must sum to 1.0
        transform_rgb, transform_lidar: transforms forwarded to the dataset
        seed: seed for the index permutation

    Returns:
        (train_ds, val_ds, test_ds) as torch Subset objects over one dataset.

    Raises:
        ValueError: if no paired samples are found under root_dir.
    """
    total = train_frac + val_frac + test_frac
    assert abs(total - 1.0) < 1e-6, "Fractions must sum to 1.0"

    # One unified dataset over all samples (unshuffled; shuffling happens below)
    full_ds = AssessmentDataset(
        root_dir=root_dir,
        start_idx=0,
        end_idx=None,
        transform_rgb=transform_rgb,
        transform_lidar=transform_lidar,
        shuffle=False,
    )

    N = len(full_ds)
    if N == 0:
        # Fail with a clear message instead of an IndexError on full_ds[0]
        raise ValueError(f"No paired RGB/LiDAR samples found under {root_dir}")

    rgb0, lidar0, label0 = full_ds[0]
    print("One sample debug:")
    print(f"  RGB tensor shape:   {rgb0.shape}")
    print(f"  LiDAR tensor shape: {lidar0.shape}")
    print(f"  Label:              {label0.item()}")

    n_train = int(N * train_frac)
    n_val   = int(N * val_frac)

    # deterministic shuffle of indices; plain ints so the dataset is indexed
    # with Python integers rather than 0-d tensors
    g = torch.Generator().manual_seed(seed)
    perm = torch.randperm(N, generator=g).tolist()

    train_idx = perm[:n_train]
    val_idx   = perm[n_train:n_train+n_val]
    test_idx  = perm[n_train+n_val:]

    train_ds = Subset(full_ds, train_idx)
    val_ds   = Subset(full_ds, val_idx)
    test_ds  = Subset(full_ds, test_idx)

    print(f"Total: {N}, train: {len(train_ds)}, val: {len(val_ds)}, test: {len(test_ds)}")

    return train_ds, val_ds, test_ds


In [None]:
## move to training.py
def make_dataloaders(
    train_ds,
    val_ds,
    test_ds,
    batch_size,
    num_workers=2,
    drop_last=True,
    device="cpu",
):
    """
    Wrap the three dataset splits in DataLoaders.

    The training loader shuffles reproducibly and honours `drop_last`; the
    validation and test loaders keep the dataset order and never drop the
    last batch. `device` is accepted but not used by this function.

    Returns:
        (train_dataloader, val_dataloader, test_dataloader)
    """
    # Settings shared by all three loaders
    common = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)

    # TODO: confirm drop_last with Antonio — relevant for contrastive learning later
    train_dataloader = create_deterministic_training_dataloader(
        train_ds, shuffle=True, drop_last=drop_last, **common
    )

    val_dataloader, test_dataloader = (
        DataLoader(split, shuffle=False, drop_last=False, **common)
        for split in (val_ds, test_ds)
    )

    return train_dataloader, val_dataloader, test_dataloader


In [None]:
# 70/15/15 train/val/test split over the whole assessment dataset.
# Only the RGB images get a transform; no transform_lidar is passed.
train_ds, val_ds, test_ds = create_data_splits(
    root_dir=DATASET_PATH,
    train_frac=0.7,
    val_frac=0.15,
    test_frac=0.15,
    transform_rgb=img_transforms
  )

cubes: 2501 paired samples
spheres: 9999 paired samples
AssessmentDataset: slice [0:12500] -> 12500 samples
One sample debug:
  RGB tensor shape:   torch.Size([4, 64, 64])
  LiDAR tensor shape: torch.Size([1, 64, 64])
  Label:              0
Total: 12500, train: 8750, val: 1875, test: 1875


In [None]:
# Build the loaders with the function's defaults for everything but the batch size
# (num_workers stays at its default of 2; the NUM_WORKERS constant is not passed).
train_dataloader, val_dataloader, test_dataloader = make_dataloaders(
    train_ds,
    val_ds,
    test_ds,
    batch_size=BATCH_SIZE
)

# Training


```
train_model
```



In [None]:
def format_positions(positions):
    """Format each number with three decimals and a leading space for non-negatives."""
    return [format(value, " .3f") for value in positions]

In [None]:
def print_loss(epoch, loss, outputs, target, is_train=True, is_debug=False):
    """
    Print the loss of one epoch.

    With is_debug=True the first prediction and the first target of the batch
    are printed as well, formatted by format_positions.
    """
    if is_train:
        loss_type = "train loss:"
    else:
        loss_type = "valid loss:"
    print("epoch", str(epoch), loss_type, str(loss))

    if not is_debug:
        return
    for prefix, values in (("example pred:", outputs), ("example real:", target)):
        print(prefix, format_positions(values[0].tolist()))

In [None]:
## move to training.py
def train_model(model, optimizer, input_fn, epochs, loss_fn, train_dataloader, valid_dataloader, target_idx=-1, log_to_wandb=False, model_name=None):
    """
    Generic train/validate loop shared by all stages of the pipeline.

    Args:
        model: module called as model(*input_fn(batch))
        optimizer: optimizer stepping the trainable parameters
        input_fn: maps a dataloader batch to the tuple of model inputs
        epochs: number of epochs to run
        loss_fn: called as loss_fn(outputs, target)
        train_dataloader, valid_dataloader: batch iterables
        target_idx: position of the target inside a batch
        log_to_wandb: when True, log the per-epoch metrics with wandb.log
        model_name: label used in console output and W&B logs

    Returns:
        (train_losses, valid_losses, epoch_times, max_gpu_mem_mb): per-epoch
        mean losses, per-epoch wall-clock seconds, and the highest peak GPU
        memory (MiB) seen in any epoch (0.0 without CUDA).

    Raises:
        ValueError: if a dataloader yields no batches (e.g. drop_last=True on
            a split smaller than one batch).
    """
    train_losses = []
    valid_losses = []
    epoch_times = []

    # for GPU memory tracking
    max_gpu_mem_mb = 0.0
    use_cuda = torch.cuda.is_available()

    for epoch in range(epochs):
        if use_cuda:
            # Reset per epoch so the logged value really is this epoch's peak
            torch.cuda.reset_peak_memory_stats()

        start_time = time.time()                  # to track the train time per model
        print(f"Epoch and start time: {epoch} und {start_time}")

        # ----- training -----
        model.train()
        train_loss = 0.0
        n_train_batches = 0
        outputs = target = None

        for batch in train_dataloader:
            optimizer.zero_grad()
            target = batch[target_idx].to(device)
            outputs = model(*input_fn(batch))      # e.g. model(rgb, lidar)

            loss = loss_fn(outputs, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            n_train_batches += 1

        if n_train_batches == 0:
            raise ValueError("train_dataloader yielded no batches")

        train_loss = train_loss / n_train_batches
        train_losses.append(train_loss)
        print_loss(epoch, train_loss, outputs, target, is_train=True)

        # ----- validation -----
        model.eval()
        valid_loss = 0.0
        n_valid_batches = 0
        with torch.no_grad():
            for batch in valid_dataloader:
                target = batch[target_idx].to(device)
                outputs = model(*input_fn(batch))
                valid_loss += loss_fn(outputs, target).item()
                n_valid_batches += 1

        if n_valid_batches == 0:
            raise ValueError("valid_dataloader yielded no batches")

        valid_loss = valid_loss / n_valid_batches
        valid_losses.append(valid_loss)
        print_loss(epoch, valid_loss, outputs, target, is_train=False)

        # timing
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)

        # GPU memory: peak of this epoch, plus running maximum over all epochs
        gpu_mem_mb = 0.0
        if use_cuda:
            gpu_mem_mb = torch.cuda.max_memory_allocated() / (1024 ** 2)
            max_gpu_mem_mb = max(max_gpu_mem_mb, gpu_mem_mb)

        # console summary
        print(
            f"[{model_name or 'model'}] Epoch {epoch+1}/{epochs} "
            f"- train_loss: {train_loss:.4f}  valid_loss: {valid_loss:.4f}  "
            f"time: {epoch_time:.2f}s"
        )

        # wandb logging
        if log_to_wandb:
            wandb.log(
                {
                    "model": model_name or "model",
                    "epoch": epoch + 1,
                    "train_loss": train_loss,
                    "valid_loss": valid_loss,
                    "epoch_time_sec": epoch_time,
                    "max_gpu_mem_mb_epoch": gpu_mem_mb,
                }
            )

    return train_losses, valid_losses, epoch_times, max_gpu_mem_mb

## Stage 1: CILP contrastive pretraining

In [None]:
# Build the contrastive model (it wraps the global img_embedder / lidar_embedder)
CILP_model = ContrastivePretraining().to(device)

# One cross-entropy criterion per direction; both are used by cilp_loss_fn
loss_img = nn.CrossEntropyLoss()
loss_lidar = nn.CrossEntropyLoss()

In [None]:
def cilp_loss_fn(outputs, target_ignored):
    """
    Symmetric contrastive loss over the two (B, B) similarity matrices.

    outputs: (logits_per_img, logits_per_lidar)

    The matching pair of sample i sits on the diagonal, so the class index
    for row i is simply i in both directions. The second argument exists only
    to fit the training loop's loss_fn(outputs, target) call and is ignored.
    """
    img_logits, lidar_logits = outputs

    # Row i must pick column i: targets are 0..B-1 on the logits' device
    diagonal = torch.arange(
        img_logits.size(0), dtype=torch.long, device=img_logits.device
    )

    img_to_lidar = loss_img(img_logits, diagonal)
    lidar_to_img = loss_lidar(lidar_logits, diagonal)

    # Average of both directions
    return (img_to_lidar + lidar_to_img) * 0.5


In [None]:
EPOCHS = 15
LR_CILP = 1e-3    # 3e-4

# The encoders inside CILP_model are shared notebook globals, and later cells
# freeze them (requires_grad = False). Re-enable gradients here so this cell
# can be re-run after those cells without failing in loss.backward().
CILP_model.requires_grad_(True)

optimizer = torch.optim.Adam(CILP_model.parameters(), lr=LR_CILP)

cilp_train_losses, cilp_valid_losses, cilp_epoch_times, cilp_max_gpu_mb = train_model(
    model=CILP_model,
    optimizer=optimizer,
    input_fn=input_fn,
    epochs=EPOCHS,
    loss_fn=cilp_loss_fn,
    train_dataloader=train_dataloader,
    valid_dataloader=val_dataloader,
    target_idx=-1,
    log_to_wandb=False,
    model_name="CILP_contrastive",
)

Epoch and start time: 0 und 1764364985.301687


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Freeze the pretrained CILP encoders.
# TODO: ProjectorTrainingWrapper.__init__ freezes them again — is this cell still needed?
for param in CILP_model.parameters():
    param.requires_grad = False

## Stage 2: Cross-Modal Projection

In [None]:
## move to models.py
class Projector(nn.Module):
    """
    MLP that maps an RGB embedding to a vector of size `lidar_dim`.

    Architecture: img_dim -> 1000 -> 500 -> lidar_dim, with a ReLU after each
    hidden layer and no activation on the output.
    """
    def __init__(self, img_dim, lidar_dim):
        super().__init__()
        widths = (img_dim, 1000, 500, lidar_dim)

        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]

        # Drop the trailing ReLU so the output layer stays linear
        self.net = nn.Sequential(*layers[:-1])

    def forward(self, x):
        return self.net(x)

In [None]:
## move to models.py
class ProjectorTrainingWrapper(nn.Module):
    """
    Bundles the CILP encoders with a trainable projector for projector training.

    All parameters of `cilp_model` are frozen on construction; the projector
    is left untouched. forward() returns everything the projector loss needs.
    """
    def __init__(self, cilp_model, projector):
        super().__init__()
        self.cilp = cilp_model
        self.projector = projector

        # Freeze every CILP parameter in one call
        self.cilp.requires_grad_(False)

    def forward(self, rgb, lidar):
        """
        Returns:
            (rgb_emb, lidar_emb, pred_lidar_emb): the CILP embeddings of both
            modalities and the projector output computed from the RGB embedding.
        """
        rgb_emb = self.cilp.img_embedder(rgb)
        return (
            rgb_emb,
            self.cilp.lidar_embedder(lidar),
            self.projector(rgb_emb),
        )


In [None]:
# Both dimensions come from the CILP encoders, i.e. both equal CILP_EMB_SIZE.
# NOTE(review): the projector therefore outputs CILP-sized embeddings, while
# Classifier.classifier starts with Linear(200 * 4 * 4, 100) and
# RGB2LiDARClassifier feeds the projector output into it as data_embs —
# confirm whether lidar_dim should be the LiDAR-CNN embedding size instead.
img_dim = CILP_model.img_embedder.output_dim
lidar_dim = CILP_model.lidar_embedder.output_dim

projector = Projector(img_dim, lidar_dim).to(device)

# Wrapper used for training: frozen CILP encoders + trainable projector
projector_wrapper = ProjectorTrainingWrapper(CILP_model, projector).to(device)

In [None]:
def projector_loss_fn(outputs, _target_ignored):
    """
    Mean squared error between projected and actual LiDAR embeddings.

    outputs: tuple (rgb_embs, lidar_embs, pred_lidar_embs); the RGB embeddings
    and the second argument are not used.
    """
    _, lidar_embs, pred_lidar_embs = outputs
    return F.mse_loss(pred_lidar_embs, lidar_embs)

In [None]:
EPOCHS_PROJECTOR = 10
LR_PROJECTOR = 1e-4

# Only the projector's parameters are handed to the optimizer; the CILP
# encoders inside the wrapper are frozen.
optimizer_proj = torch.optim.Adam(projector_wrapper.projector.parameters(), lr=LR_PROJECTOR)

# target_idx=-1 selects the label, which projector_loss_fn ignores.
proj_train_losses, proj_valid_losses, proj_epoch_times, proj_max_gpu_mb = train_model(
    model=projector_wrapper,
    optimizer=optimizer_proj,
    input_fn=input_fn,
    epochs=EPOCHS_PROJECTOR,
    loss_fn=projector_loss_fn,
    train_dataloader=train_dataloader,
    valid_dataloader=val_dataloader,
    target_idx=-1,
    log_to_wandb=False,
    model_name="Projector_RGB_to_LiDAR",
)


Epoch and start time: 0 und 1764364150.9628556
epoch 0 train loss: 0.0019723757358245646
epoch 0 valid loss: 0.00017922741584091756
[Projector_RGB_to_LiDAR] Epoch 1/30 - train_loss: 0.0020  valid_loss: 0.0002  time: 71.83s
Epoch and start time: 1 und 1764364222.793578
epoch 1 train loss: 9.961871289894643e-05
epoch 1 valid loss: 9.070638213112639e-05
[Projector_RGB_to_LiDAR] Epoch 2/30 - train_loss: 0.0001  valid_loss: 0.0001  time: 67.72s
Epoch and start time: 2 und 1764364290.512395
epoch 2 train loss: 6.278865497305935e-05
epoch 2 valid loss: 7.795102307378935e-05
[Projector_RGB_to_LiDAR] Epoch 3/30 - train_loss: 0.0001  valid_loss: 0.0001  time: 67.38s
Epoch and start time: 3 und 1764364357.888088
epoch 3 train loss: 5.43285697377167e-05
epoch 3 valid loss: 7.360645455870933e-05
[Projector_RGB_to_LiDAR] Epoch 4/30 - train_loss: 0.0001  valid_loss: 0.0001  time: 66.75s
Epoch and start time: 4 und 1764364424.6400433
epoch 4 train loss: 5.148220647884848e-05
epoch 4 valid loss: 7.1228

KeyboardInterrupt: 

## Stage 3: RGB2LiDARClassifier

In [None]:
## move to models.py
class Classifier(nn.Module):
    """
    CNN classifier with a separable embedding stage.

    `embedder` is four conv + ReLU + max-pool stages followed by a flatten;
    `classifier` is a two-layer MLP producing a single logit. forward() can
    start either from raw inputs or from precomputed embeddings.
    """
    def __init__(self, in_ch):
        super().__init__()
        kernel_size = 3
        n_classes = 1

        # Channel progression of the four conv stages
        channels = (in_ch, 50, 100, 200, 200)
        stages = []
        for c_in, c_out in zip(channels, channels[1:]):
            stages.extend((
                nn.Conv2d(c_in, c_out, kernel_size, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ))
        self.embedder = nn.Sequential(*stages, nn.Flatten())

        # Head expects the flattened 200 x 4 x 4 feature map
        self.classifier = nn.Sequential(
            nn.Linear(200 * 4 * 4, 100),
            nn.ReLU(),
            nn.Linear(100, n_classes)
        )

    def get_embs(self, imgs):
        """Return the flattened feature embedding of `imgs`."""
        return self.embedder(imgs)

    def forward(self, raw_data=None, data_embs=None):
        """Classify raw inputs, or precomputed embeddings when raw_data is None."""
        assert (raw_data is not None or data_embs is not None), "No images or embeddings given."
        embs = data_embs if raw_data is None else self.get_embs(raw_data)
        return self.classifier(embs)

In [None]:
model_save_path = STORAGE_PATH / "models/lidar_cnn.pt"

# Pretrained LiDAR classifier. map_location makes the checkpoint load onto the
# current device even when it was saved from a different one (e.g. saved on a
# GPU, loaded in a CPU-only runtime).
lidar_cnn = Classifier(1).to(device)
lidar_cnn.load_state_dict(
    torch.load(model_save_path, map_location=device, weights_only=True)
)

# The LiDAR classifier stays fixed from here on
for param in lidar_cnn.parameters():
    param.requires_grad = False

lidar_cnn.eval()

Classifier(
  (embedder): Sequential(
    (0): Conv2d(1, 50, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(50, 100, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(100, 200, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(200, 200, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Flatten(start_dim=1, end_dim=-1)
  )
  (classifier): Sequential(
    (0): Linear(in_features=3200, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [None]:
## move to models.py
class RGB2LiDARClassifier(nn.Module):
    """
    Chains RGB encoder -> projector -> LiDAR classifier head.

    Args:
        rgb_encoder: module embedding RGB images. Defaults to the notebook's
            global `CILP_model.img_embedder`.
        rgb_projector: module mapping RGB embeddings into the LiDAR embedding
            space. Defaults to the notebook's global `projector`.
        lidar_classifier: module called as lidar_classifier(data_embs=...).
            Defaults to the notebook's global `lidar_cnn`.

    Passing the parts explicitly removes the hidden dependency on notebook
    globals; calling the constructor without arguments behaves as before.

    NOTE(review): the projector output is passed straight to the classifier
    head as `data_embs`, so its width has to match what that head expects —
    confirm the dimensions of the default components agree.
    """
    def __init__(self, rgb_encoder=None, rgb_projector=None, lidar_classifier=None):
        super().__init__()
        self.projector = projector if rgb_projector is None else rgb_projector
        self.img_embedder = CILP_model.img_embedder if rgb_encoder is None else rgb_encoder
        self.shape_classifier = lidar_cnn if lidar_classifier is None else lidar_classifier

    def forward(self, imgs):
        img_encodings = self.img_embedder(imgs)
        proj_lidar_embs = self.projector(img_encodings)
        # Skip the classifier's own embedder: the projection stands in for it
        return self.shape_classifier(data_embs=proj_lidar_embs)

In [None]:
# Assemble the final model from the notebook-level CILP encoder, projector and LiDAR classifier.
classifier = RGB2LiDARClassifier()

In [None]:
def get_correct(output, y):
    """
    Count correct binary predictions in a batch.

    A logit greater than 0 counts as class 1, otherwise class 0.

    Args:
        output: tensor of logits
        y: tensor of 0/1 labels with as many elements as `output`

    Returns:
        Number of predictions equal to their label, as a Python int.
    """
    # Compare against a scalar instead of a tensor created on the global
    # device, so the function works wherever `output` lives.
    pred = output > 0
    labels = y.to(output.device).view_as(pred)
    return pred.eq(labels).sum().item()

In [None]:
def get_valid_metrics(model=None, dataloader=None):
    """
    Evaluate a binary classifier and print mean loss and accuracy.

    Args:
        model: module called as model(rgb_batch) and returning logits.
            Defaults to the notebook's global `classifier`.
        dataloader: iterable of (rgb, lidar, label) batches. Defaults to the
            notebook's global `val_dataloader`.

    Returns:
        (mean_loss, accuracy) over all samples seen.

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model = classifier if model is None else model
    dataloader = val_dataloader if dataloader is None else dataloader

    model.eval()

    # Run on whatever device the model's weights live on
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Sum per-sample losses so the mean is exact even with a smaller last batch
    loss_fn = nn.BCEWithLogitsLoss(reduction="sum")
    total_loss = 0.0
    correct = 0
    n_samples = 0

    with torch.no_grad():
        for rgb_img, _, class_idx in dataloader:
            rgb_img = rgb_img.to(target_device)
            output = model(rgb_img)
            # BCE needs float targets with the same shape as the logits
            labels = class_idx.to(target_device).float().view_as(output)

            total_loss += loss_fn(output, labels).item()
            correct += (output > 0).eq(labels.bool()).sum().item()
            n_samples += labels.numel()

    if n_samples == 0:
        raise ValueError("dataloader yielded no samples")

    mean_loss = total_loss / n_samples
    accuracy = correct / n_samples
    print(f"Valid Loss: {mean_loss:2.4f} | Accuracy {accuracy:2.4f}")
    return mean_loss, accuracy

# Evaluate the assembled classifier on the validation split before any fine-tuning.
get_valid_metrics()

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [None]:
bce_logits_loss = nn.BCEWithLogitsLoss()

def rgb2lidar_loss_fn(outputs, target):
    """
    Binary cross-entropy on logits for the cube/sphere classifier.

    outputs: logits, (B,) or (B,1)
    target: 0/1 labels; cast to float, moved to the logits' device and
        reshaped to the logits' shape when the shapes differ
    """
    labels = target.float().to(outputs.device)
    labels = labels if labels.shape == outputs.shape else labels.view_as(outputs)
    return bce_logits_loss(outputs, labels)

In [None]:
# The optimizer receives all parameters of the chained model, including the
# ones whose requires_grad was set to False in earlier cells.
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)
epochs = 5  # or whatever you need

train_losses, valid_losses, epoch_times, max_gpu_mb = train_model(
    model=classifier,
    optimizer=optimizer,
    input_fn=rgb_only_input_fn,      # the model only takes the RGB images
    epochs=epochs,
    loss_fn=rgb2lidar_loss_fn,
    train_dataloader=train_dataloader,
    valid_dataloader=val_dataloader,
    target_idx=2,                    # class_idx is at position 2 in the batch
    log_to_wandb=False,               # if you want W&B logging
    model_name="RGB2LiDARClassifier"
)
