In [57]:
import numpy as np
from PIL import Image, ImageShow
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

In [58]:
def in_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    Detection is done by attempting to import ``google.colab``; any
    exception raised by that import is treated as "not in Colab".

    Returns:
        True when the import succeeds, False otherwise.
    """
    try:
        import google.colab  # noqa: F401
    except Exception:
        return False
    return True

if in_colab():
    !git clone https://github.com/MaxSpeer/applied-computer-vision-assignment2.git
    %cd applied-computer-vision-assignment2
    !pip install -r requirements.txt
    !pip install -e .
else:
    from pathlib import Path
    import sys
    project_root = Path("..").resolve()
    sys.path.append(str(project_root))

In [59]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
# Trailing expression so the cell displays whether CUDA was found.
cuda_available

False

In [60]:
# Notebook-wide configuration.
# IMG_SIZE is the size handed to Resize; BATCH_SIZE is used by the loaders
# and by the contrastive targets further down.
IMG_SIZE, BATCH_SIZE = 64, 32
# VALID_BATCHES and N are not referenced in the visible part of the notebook.
VALID_BATCHES, N = 10, 9999

# Preprocessing applied to every image: resize, then convert to a tensor
# (ToTensor scales the data into [0, 1]).
# NOTE(review): Resize with a single int is given only one edge length;
# the encoder below assumes square IMG_SIZE x IMG_SIZE inputs — confirm the
# source images are square.
img_transforms = transforms.Compose(
    [transforms.Resize(IMG_SIZE), transforms.ToTensor()]
)

In [61]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

# fo.delete_dataset("multimodal-shapes-subset")

# Load the dataset from the Hugging Face hub into a FiftyOne dataset
# registered under the name "multimodal-shapes-subset".
dataset_hf = load_from_hub(
    "maxspeer/assessment2_spheres_and_cube_2k_2",
    name="multimodal-shapes-subset",
    num_workers=4,
    batch_size=500,
    # max_samples=3000,
    overwrite=True,
)

# Start a FiftyOne app session for the dataset and print the URL where it is
# served (auto=False: presumably the app is not opened in the cell output
# automatically — the launch message suggests `session.show()` for that).
session = fo.launch_app(dataset_hf, auto=False)
print(session.url)

Downloading config file fiftyone.yml from maxspeer/assessment2_spheres_and_cube_2k_2
Loading dataset
Importing samples...
 100% |███████████████| 6000/6000 [52.8ms elapsed, 0s remaining, 113.6K samples/s]  
Session launched. Run `session.show()` to open the App in a cell output.
http://localhost:5151/


In [62]:
from src.datasets import MultimodalDataset

# `img_transforms` and `BATCH_SIZE` are defined once in the configuration cell
# near the top of the notebook; they are intentionally not re-declared here so
# there is a single source of truth.

# Training split: reshuffled every epoch. drop_last=True keeps every batch at
# exactly BATCH_SIZE samples, which the contrastive targets rely on.
train_dataset = MultimodalDataset(dataset_hf, "train", img_transforms)
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True
)

# Validation split: NOT shuffled. With shuffle=True and drop_last=True each
# epoch would evaluate a different random subset, so validation losses (and the
# "best checkpoint" decision based on them) would not be comparable.
val_dataset = MultimodalDataset(dataset_hf, "val", img_transforms)
valid_dataloader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=True
)

In [63]:
CILP_EMB_SIZE = 200

class Embedder(nn.Module):
    """Convolutional encoder producing an L2-normalised embedding per image.

    The network is four ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2)``
    stages (50, 100, 200, 200 channels), a flatten, and a two-layer MLP
    head that outputs ``emb_size`` features.

    Args:
        in_ch: Number of channels of the input images.
        emb_size: Dimensionality of the returned embedding.
        img_size: Height/width of the (square) input images. It determines
            the input width of the first linear layer; the default of 64
            yields the original ``200 * 4 * 4``.

    Raises:
        ValueError: If ``img_size`` is too small to survive the four
            pooling stages.
    """

    def __init__(self, in_ch, emb_size=CILP_EMB_SIZE, img_size=64):
        super().__init__()
        kernel_size = 3
        padding = 1

        # Each of the four MaxPool2d(2) stages halves the spatial size
        # (rounding down), so the final feature map is img_size // 16 wide.
        pooled = img_size // 2 ** 4
        if pooled < 1:
            raise ValueError(
                f"img_size must be at least 16 to pass four 2x poolings, got {img_size}"
            )

        # Convolution
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, 50, kernel_size, padding=padding),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(50, 100, kernel_size, padding=padding),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(100, 200, kernel_size, padding=padding),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(200, 200, kernel_size, padding=padding),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten()
        )

        # Embeddings
        self.dense_emb = nn.Sequential(
            nn.Linear(200 * pooled * pooled, 100),
            nn.ReLU(),
            nn.Linear(100, emb_size)
        )

    def forward(self, x):
        """Encode a batch ``x`` and return row-wise unit-norm embeddings."""
        conv = self.conv(x)
        emb = self.dense_emb(conv)
        # dim=1 is F.normalize's default; spelled out so the per-sample
        # normalisation is explicit.
        return F.normalize(emb, dim=1)

In [64]:

# Encoder for the image branch: 4 input channels
# (presumably RGBA — TODO confirm against what MultimodalDataset yields).
img_embedder = Embedder(4).to(device)
# Encoder for the lidar branch: a single input channel.
lidar_embedder = Embedder(1).to(device)


Could not connect session, trying again in 10 seconds



In [65]:
class ContrastivePretraining(nn.Module):
    """Contrastive image/lidar model producing pairwise similarity logits.

    Both inputs are encoded, every image embedding is compared with every
    lidar embedding via cosine similarity, the similarities are rescaled
    from [-1, 1] to [0, 1] and divided by a temperature.

    Args:
        img_embedder: Encoder for the image branch. When omitted, the
            module-level ``img_embedder`` is used (original behaviour).
        lidar_embedder: Encoder for the lidar branch. When omitted, the
            module-level ``lidar_embedder`` is used (original behaviour).
        temperature: Divisor applied to the rescaled similarities.
    """

    def __init__(self, img_embedder=None, lidar_embedder=None, temperature=0.07):
        super().__init__()
        # Calling ContrastivePretraining() with no arguments keeps working:
        # the encoders are then looked up in the defining module's globals.
        if img_embedder is None:
            img_embedder = globals()["img_embedder"]
        if lidar_embedder is None:
            lidar_embedder = globals()["lidar_embedder"]
        self.img_embedder = img_embedder
        self.lidar_embedder = lidar_embedder
        self.cos = nn.CosineSimilarity(dim=1)
        self.temperature = temperature

    def forward(self, rgb_imgs, lidar_depths):
        """Return ``(logits_per_img, logits_per_lidar)``.

        ``logits_per_img[i, j]`` scores image ``i`` against lidar sample
        ``j``; ``logits_per_lidar`` is its transpose. Both are square with
        one row per sample in the batch.
        """
        img_emb = self.img_embedder(rgb_imgs)
        lidar_emb = self.lidar_embedder(lidar_depths)

        # Use the size of the batch actually received rather than a global
        # constant, so partial or differently sized batches work too.
        n = len(img_emb)

        # Build all n*n (image, lidar) pairs: each image row repeated n
        # times in place, the lidar block tiled n times.
        repeated_img_emb = img_emb.repeat_interleave(n, dim=0)
        repeated_lidar_emb = lidar_emb.repeat(n, 1)

        similarity = self.cos(repeated_img_emb, repeated_lidar_emb)
        similarity = torch.unflatten(similarity, 0, (n, n))
        # Map cosine similarity from [-1, 1] to [0, 1].
        similarity = (similarity + 1) / 2

        logits_per_img = similarity / self.temperature
        logits_per_lidar = logits_per_img.T

        return logits_per_img, logits_per_lidar

In [66]:
# class ContrastivePretraining(nn.Module):
#     def __init__(self, img_embedder, lidar_embedder, init_temp=0.07):
#         super().__init__()
#         self.img_embedder = img_embedder
#         self.lidar_embedder = lidar_embedder
#         self.logit_scale = nn.Parameter(torch.tensor(np.log(1/init_temp), dtype=torch.float32))

#     def forward(self, rgb_imgs, lidar_depths):
#         img_emb = self.img_embedder(rgb_imgs)          # [B, D], normalized
#         lidar_emb = self.lidar_embedder(lidar_depths)  # [B, D], normalized

#         scale = self.logit_scale.exp().clamp(1, 100)
#         logits = scale * (img_emb @ lidar_emb.T)       # [B, B]
#         return logits, logits.T


In [67]:
# Contrastive model plus everything needed to optimise it.
CILP_model = ContrastivePretraining().to(device)
optimizer = Adam(CILP_model.parameters(), lr=1e-4)

# One cross-entropy criterion per direction (image->lidar, lidar->image).
loss_img, loss_lidar = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()

# Target for row i is column i (the matching pair); sized for full batches.
ground_truth = torch.arange(BATCH_SIZE, dtype=torch.long, device=device)
epochs = 3

In [68]:
def get_CILP_loss(batch):
    """Compute the symmetric contrastive loss of ``CILP_model`` on one batch.

    Args:
        batch: Indexable batch; element 0 is used as the image input and
            element 2 as the lidar depth input. Other elements are ignored.

    Returns:
        Tuple ``(total_loss, logits_per_img)`` where ``total_loss`` is the
        mean of the image->lidar and lidar->image cross-entropy losses.
    """
    rgb_img = batch[0].to(device)
    lidar_depth = batch[2].to(device)

    logits_per_img, logits_per_lidar = CILP_model(rgb_img, lidar_depth)

    # The matching pair for row i sits in column i. Targets are built from
    # the logits' own size so a batch smaller than BATCH_SIZE still works.
    targets = torch.arange(
        logits_per_img.size(0), dtype=torch.long, device=logits_per_img.device
    )

    total_loss = (
        loss_img(logits_per_img, targets) + loss_lidar(logits_per_lidar, targets)
    ) / 2

    return total_loss, logits_per_img

In [None]:
import copy
import torch
import wandb
from src.training import getWandbRun

# Make sure CILP is trainable (in case it was frozen during projector training)
for p in CILP_model.parameters():
    p.requires_grad = True
CILP_model.train()

# Hidden-state guard: a later cell rebinds `optimizer` to an AdamW over the
# projector. If this cell is re-run after that one, stepping `optimizer` would
# leave CILP_model untouched, so fail loudly instead of silently training nothing.
_cilp_param_ids = {id(p) for p in CILP_model.parameters()}
_optimized_param_ids = {
    id(p) for group in optimizer.param_groups for p in group["params"]
}
if not _cilp_param_ids <= _optimized_param_ids:
    raise RuntimeError(
        "`optimizer` does not hold the CILP_model parameters; "
        "re-run the cell that builds the CILP optimizer before training."
    )

best_val = float("inf")
best_epoch = -1
best_ckpt = None
save_path = "CILP_best.pt"

batch_size = BATCH_SIZE
epochs = 10

with getWandbRun(
    "CILP_Pretraining",
    "rgb/lidar",
    batch_size=batch_size,
    epochs=epochs,
    project_name="assessment2_notebook4"
) as run:

    for epoch in range(epochs):
        # ---------------- train ----------------
        CILP_model.train()
        train_loss = 0.0
        n_train = 0

        for batch in train_dataloader:
            optimizer.zero_grad(set_to_none=True)

            loss, logits_per_img = get_CILP_loss(batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            n_train += 1

        # max(..., 1) avoids a division by zero when the loader is empty.
        train_avg = train_loss / max(n_train, 1)
        print(f"Epoch {epoch} | Train loss: {train_avg:.6f}")

        # ---------------- valid ----------------
        CILP_model.eval()
        valid_loss = 0.0
        n_val = 0
        with torch.no_grad():
            for batch in valid_dataloader:
                loss, logits_per_img = get_CILP_loss(batch)
                valid_loss += loss.item()
                n_val += 1

        val_avg = valid_loss / max(n_val, 1)
        print(f"Epoch {epoch} | Valid loss: {val_avg:.6f}")

        # Log to W&B
        run.log(
            {
                "epoch": epoch,
                "train_loss": train_avg,
                "valid_loss": val_avg,
                "learning_rate": optimizer.param_groups[0]["lr"],
            },
            step=epoch,
        )

        # Track best checkpoint. Model weights are cloned to the CPU; the
        # optimizer state is deep-copied as-is.
        if val_avg < best_val:
            best_val = val_avg
            best_epoch = epoch
            best_ckpt = {
                "epoch": epoch,
                "best_val_loss": best_val,
                "model_state_dict": {k: v.detach().cpu().clone() for k, v in CILP_model.state_dict().items()},
                "optimizer_state_dict": copy.deepcopy(optimizer.state_dict()),
            }
            # Persist right away so an interrupted run still leaves the best
            # weights seen so far on disk.
            torch.save(best_ckpt, save_path)
            print(f"New best at epoch {epoch}: {best_val:.6f}")

    if best_ckpt is None:
        # No epoch ever beat +inf (e.g. NaN validation losses or epochs == 0):
        # there is nothing meaningful to save or upload.
        print("No checkpoint was recorded; nothing saved.")
    else:
        print(f"Saved best checkpoint from epoch {best_epoch} to {save_path} (val={best_val:.6f})")

        # Optional: upload checkpoint to W&B as artifact
        artifact = wandb.Artifact("CILP_best", type="model")
        artifact.add_file(save_path)
        run.log_artifact(artifact)

    run.finish()


wandb.init fertig

Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Epoch 0 | Train loss: 0.784947
Epoch 0 | Valid loss: 0.807195
New best at epoch 0: 0.807195

Could not connect session, trying again in 10 seconds

Epoch 1 | Train loss: 0.771363
Epoch 1 | Valid loss: 0.798061
New best at epoch 1: 0.798061

Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Epoch 2 | Train loss: 0.775447
Epoch 2 | Valid loss: 0.780357
New best at epoch 2: 0.780357

Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Epoch 3 | Train loss: 0.801186
Epoch 3 | Valid loss: 0.803570

Could not connect session, 

In [70]:
import torch.nn as nn

# MLP that maps a CILP image embedding to a 3200-dimensional vector.
# NOTE: the next cell assigns `projector` again, so this instance is replaced
# before it is ever trained.
projector = nn.Sequential(
    nn.Linear(in_features=CILP_EMB_SIZE, out_features=1000),
    nn.ReLU(),
    nn.Linear(in_features=1000, out_features=500),
    nn.ReLU(),
    nn.Linear(in_features=500, out_features=3200),
)
projector = projector.to(device)

In [71]:
# Projector actually used for training: maps a CILP image embedding to the
# 3200-dimensional feature space expected by the lidar classifier head
# (its first Linear layer has in_features=3200).
# The input width is tied to CILP_EMB_SIZE instead of a literal 200 so the
# projector stays in sync with the embedders if the embedding size changes.
projector = nn.Sequential(
    nn.Linear(CILP_EMB_SIZE, 512),
    nn.ReLU(),
    nn.Linear(512, 1024),
    nn.ReLU(),
    nn.Linear(1024, 3200),
).to(device)

In [72]:
from src.assesment_utils import Classifier

# Pretrained single-channel lidar classifier, used purely as a fixed model.
lidar_cnn = Classifier(1).to(device)
lidar_state = torch.load(
    "../lidar_cnn.pt", weights_only=True, map_location=torch.device('cpu')
)
lidar_cnn.load_state_dict(lidar_state)

# Do not unfreeze. Otherwise, it would be difficult to pass the assessment.
lidar_cnn.requires_grad_(False)

# Trailing expression: switches to eval mode and displays the architecture.
lidar_cnn.eval()

Classifier(
  (embedder): Sequential(
    (0): Conv2d(1, 50, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(50, 100, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(100, 200, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(200, 200, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Flatten(start_dim=1, end_dim=-1)
  )
  (classifier): Sequential(
    (0): Linear(in_features=3200, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [73]:
def get_projector_loss(model, batch):
    """MSE between projected image embeddings and lidar-classifier embeddings.

    Args:
        model: The projector; it receives the output of
            ``CILP_model.img_embedder`` and its output is compared with
            ``lidar_cnn.get_embs`` of the lidar input.
        batch: Indexable batch; element 0 is used as the image input and
            element 2 as the lidar depth input. Other elements are ignored.

    Returns:
        Scalar MSE loss tensor; gradients flow into ``model`` only.
    """
    rgb_img = batch[0].to(device)
    lidar_depth = batch[2].to(device)

    # Both encoders act as fixed feature extractors here. Running them under
    # no_grad guarantees no gradients reach them even if CILP_model was left
    # unfrozen by an earlier cell, and avoids building an unused graph.
    with torch.no_grad():
        img_embs = CILP_model.img_embedder(rgb_img)
        lidar_emb = lidar_cnn.get_embs(lidar_depth)

    pred_lidar_embs = model(img_embs)
    # Functional form: same default (mean) reduction as nn.MSELoss(), without
    # allocating a new module on every call.
    return F.mse_loss(pred_lidar_embs, lidar_emb)

In [None]:
# Optimizer over the projector's parameters only.
# NOTE: this rebinds the notebook-level name `optimizer`, which an earlier cell
# bound to the Adam instance over CILP_model. After this cell has run, any
# re-run of the CILP training cell would step this optimizer instead.
optimizer = torch.optim.AdamW(projector.parameters(), lr=3e-4, weight_decay=1e-4)


In [None]:
# from src.assesment_utils import print_loss
# def train_model(model, optimizer, loss_func, epochs, train_dataloader, valid_dataloader):
#     for epoch in range(epochs):
#         model.train()
#         train_loss = 0
#         for step, batch in enumerate(train_dataloader):
#             optimizer.zero_grad()
#             loss = loss_func(model, batch)
#             loss.backward()
#             optimizer.step()
#             train_loss += loss.item()

#         train_loss = train_loss / (step + 1)
#         print_loss(epoch, train_loss, is_train=True)
        
#         model.eval()
#         valid_loss = 0
#         for step, batch in enumerate(valid_dataloader):
#             loss = loss_func(model, batch)
#             valid_loss += loss.item()
#         valid_loss = valid_loss / (step + 1)
#         print_loss(epoch, valid_loss, is_train=False)
        

In [None]:
from src.assesment_utils import train_model as train_model_assessment

# Freeze both encoders so that only the projector is updated.
CILP_model.requires_grad_(False)
lidar_cnn.requires_grad_(False)

epochs = 100

# Train the projector with the loss defined in get_projector_loss.
train_model_assessment(
    projector,
    optimizer,
    get_projector_loss,
    epochs,
    train_dataloader,
    valid_dataloader,
)


Could not connect session, trying again in 10 seconds

Epoch   0 | Train Loss: 5.4660
Epoch   0 | Valid Loss: 4.7688
Epoch   1 | Train Loss: 4.7992
Epoch   1 | Valid Loss: 4.5415
Epoch   2 | Train Loss: 4.5046

Could not connect session, trying again in 10 seconds

Epoch   2 | Valid Loss: 4.1923

Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Epoch   3 | Train Loss: 3.9888
Epoch   3 | Valid Loss: 3.6321

Could not connect session, trying again in 10 seconds

Epoch   4 | Train Loss: 3.5619
Epoch   4 | Valid Loss: 3.3747
Epoch   5 | Train Loss: 3.3923
Epoch   5 | Valid Loss: 3.3372

Could not connect session, trying again in 10 seconds

Epoch   6 | Train Loss: 3.2529

Could not connect session, trying again in 10 seconds

Epoch   6 | Valid Loss: 3.2315

Could not connect session, trying again in 10 seconds

Epoch   7 | Train Loss: 3.1921
Epoch   7 | Valid Loss: 3.1760

Could not connect session, trying again in 10 seconds



KeyboardInterrupt: 


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 s

In [None]:
# Fetch a batch explicitly instead of relying on a `batch` variable left over
# from whichever loop happened to run last (fails on a fresh kernel otherwise).
batch = next(iter(valid_dataloader))

# Compare the scale of the two embedding spaces the projector has to bridge.
with torch.no_grad():
    rgb = batch[0].to(device)
    lidar = batch[2].to(device)
    e_rgb = CILP_model.img_embedder(rgb)
    e_lid = lidar_cnn.get_embs(lidar)

# Printed per space: overall mean, overall std, and mean per-sample L2 norm.
print("RGB mean/std/norm:", e_rgb.mean().item(), e_rgb.std().item(), e_rgb.norm(dim=1).mean().item())
print("LiDAR mean/std/norm:", e_lid.mean().item(), e_lid.std().item(), e_lid.norm(dim=1).mean().item())


RGB mean/std/norm: -0.00023113441420719028 0.07071582227945328 1.0
LiDAR mean/std/norm: 0.32104167342185974 2.2003915309906006 109.63412475585938



Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Could not connect session, trying again in 10 seconds




In [None]:
# Sanity check: repeatedly fit the projector on one fixed training batch and
# print the loss every 50 steps to see whether it can be driven down.
projector.train()
batch = next(iter(train_dataloader))

for step in range(500):
    optimizer.zero_grad()
    loss = get_projector_loss(projector, batch)
    loss.backward()
    optimizer.step()

    if not step % 50:
        print(step, loss.item())


0 3.278610944747925
50 2.4535748958587646
100 2.0725209712982178
150 1.7902824878692627

Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

200 1.55489182472229
250 1.3500958681106567
300 1.1765564680099487

Could not connect session, trying again in 10 seconds

350 1.025274395942688

Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

400 0.891940176486969
450 0.7780547142028809



Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Could not connect session, trying again in 10 seconds



Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Could not connect session, trying again in 10 seconds



Could not connect session, trying again in 10 seconds


Could not connect session, trying again in 10 seconds

Could not connect session, trying again in 10 seconds


