# Task 5: CILP Assessment Performance

## Setup

In [1]:
import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader

from assessment import assessment_utils

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.is_available()


import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mjosefpribbernow[0m ([33mjosefpribbernow-hasso-plattner-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Base Model

In [2]:
lidar_cnn = assessment_utils.Classifier(1).to(device)
lidar_cnn.load_state_dict(torch.load("assessment/lidar_cnn.pt", weights_only=True))
# Do not unfreeze. Otherwise, it would be difficult to pass the assessment.
for param in lidar_cnn.parameters():
    lidar_cnn.requires_grad = False
lidar_cnn.eval()

Classifier(
  (embedder): Sequential(
    (0): Conv2d(1, 50, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(50, 100, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(100, 200, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Conv2d(200, 200, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Flatten(start_dim=1, end_dim=-1)
  )
  (classifier): Sequential(
    (0): Linear(in_features=3200, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=1, bias=True)
  )
)

### Dataset

In [3]:
IMG_SIZE = 64
img_transforms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    # transforms.ToImage(),
    # transforms.ToDtype(torch.float32, scale=True),  # Scales data into [0,1]
    transforms.ToTensor(),  # Using ToTensor for compatibility, as ToImage is not available, in my fucked up environment
])

class MyDataset(Dataset):
    def __init__(self, root_dir, start_idx, stop_idx):
        self.classes = ["cubes", "spheres"]
        self.root_dir = root_dir
        self.rgb = []
        self.lidar = []
        self.class_idxs = []

        for class_idx, class_name in enumerate(self.classes):
            for idx in range(start_idx, stop_idx):
                file_number = "{:04d}".format(idx)
                rbg_img = Image.open(self.root_dir + class_name + "/rgb/" + file_number + ".png")
                rbg_img = img_transforms(rbg_img).to(device)
                self.rgb.append(rbg_img)
    
                lidar_depth = np.load(self.root_dir + class_name + "/lidar/" + file_number + ".npy")
                lidar_depth = torch.from_numpy(lidar_depth[None, :, :]).to(torch.float32).to(device)
                self.lidar.append(lidar_depth)

                self.class_idxs.append(torch.tensor(class_idx, dtype=torch.float32)[None].to(device))

    def __len__(self):
        return len(self.class_idxs)

    def __getitem__(self, idx):
        rbg_img = self.rgb[idx]
        lidar_depth = self.lidar[idx]
        class_idx = self.class_idxs[idx]
        return rbg_img, lidar_depth, class_idx

In [4]:
BATCH_SIZE = 32
VALID_BATCHES = 10
N = 9999

valid_N = VALID_BATCHES*BATCH_SIZE
train_N = N - valid_N

train_data = MyDataset("data/assessment/", 0, train_N)
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
valid_data = MyDataset("data/assessment/", train_N, N)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, drop_last=True)

N *= 2
valid_N *= 2
train_N *= 2

In [5]:
# W&B Configuration
TASK_NAME = "Final Assessment"
WANDB_TAGS = ["Final Assessment"]

## 5.1 Contrastive Pretraining

**Decisions:**
We use the provided Embedder archtecture from 05_Assessment.ipynb. Also we use MaxPool2d as downsampling method, as it performed best in the ablation study.

In [6]:
CILP_EMB_SIZE = 200

class Embedder(nn.Module):
    def __init__(self, in_ch, emb_size=CILP_EMB_SIZE):
        super().__init__()
        kernel_size = 3

        # Convolution
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, 50, kernel_size, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(50, 100, kernel_size, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(100, 200, kernel_size, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(200, 200, kernel_size, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten()
        )

        # Embeddings
        self.dense_emb = nn.Sequential(
            nn.Linear(200 * 4 * 4, 100),
            nn.ReLU(),
            nn.Linear(100, emb_size)
        )

    def forward(self, x):
        conv = self.conv(x)
        emb = self.dense_emb(conv)
        return F.normalize(emb)

In [7]:
img_embedder = Embedder(4).to(device)
lidar_embedder = Embedder(1).to(device)

In [8]:
class ContrastivePretraining(nn.Module):
    def __init__(self):
        super().__init__()
        self.img_embedder = img_embedder
        self.lidar_embedder = lidar_embedder
        self.cos = nn.CosineSimilarity()

    def forward(self, rgb_imgs, lidar_depths):
        img_emb = self.img_embedder(rgb_imgs)
        lidar_emb = self.lidar_embedder(lidar_depths)

        repeated_img_emb = img_emb.repeat_interleave(len(img_emb), dim=0)
        repeated_lidar_emb = lidar_emb.repeat(len(lidar_emb), 1)

        similarity = self.cos(repeated_img_emb, repeated_lidar_emb)
        similarity = torch.unflatten(similarity, 0, (BATCH_SIZE, BATCH_SIZE))
        similarity = (similarity + 1) / 2

        logits_per_img = similarity
        logits_per_lidar = similarity.T
        return logits_per_img, logits_per_lidar

In [9]:
CILP_LR = 0.0001
CILP_model = ContrastivePretraining().to(device)
optimizer = torch.optim.AdamW(CILP_model.parameters(), lr=CILP_LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)
loss_img = nn.CrossEntropyLoss()
loss_lidar = nn.CrossEntropyLoss()
ground_truth = torch.arange(BATCH_SIZE, dtype=torch.long).to(device)
epochs = 3

# Initialize W&B for CILP training
wandb.init(
    project="cilp-extended-assessment",
    group=TASK_NAME,
    name="04_contrastive_pretraining",
    tags=WANDB_TAGS + ["contrastive_pretraining"],
    config={
        "learning_rate": CILP_LR,
        "architecture": "CILP_Contrastive",
        "embedding_size": CILP_EMB_SIZE,
        "batch_size": BATCH_SIZE,
        "epochs": epochs,
        "optimizer": optimizer.__class__.__name__,
        "scheduler": "ReduceLROnPlateau",
        "fusion_strategy": "contrastive",
        "num_params": sum(p.numel() for p in CILP_model.parameters() if p.requires_grad),
    }
)

In [10]:
def get_CILP_loss(batch):
    rbg_img, lidar_depth, class_idx = batch
    logits_per_img, logits_per_lidar = CILP_model(rbg_img, lidar_depth)
    total_loss = (loss_img(logits_per_img, ground_truth) + loss_lidar(logits_per_lidar, ground_truth))/2
    return total_loss, logits_per_img

In [11]:
for epoch in range(epochs):
    CILP_model.train()
    train_loss = 0
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        loss, logits_per_img = get_CILP_loss(batch)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    
    avg_train_loss = train_loss/step
    assessment_utils.print_CILP_results(epoch, avg_train_loss, logits_per_img, is_train=True)

    CILP_model.eval()
    valid_loss = 0
    for step, batch in enumerate(valid_dataloader):
        loss, logits_per_img = get_CILP_loss(batch)
        valid_loss += loss.item()
    
    avg_valid_loss = valid_loss/step
    assessment_utils.print_CILP_results(epoch, avg_valid_loss, logits_per_img, is_train=False)
    
    # Step the scheduler based on validation loss
    scheduler.step(avg_valid_loss)
    current_lr = optimizer.param_groups[0]['lr']
    
    # Log metrics to W&B
    wandb.log({
        "cilp_train/loss": avg_train_loss,
        "cilp_valid/loss": avg_valid_loss,
        "learning_rate": current_lr,
        "epoch": epoch,
    })

# Log similarity matrix at end of training as an image
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(logits_per_img.detach().cpu().numpy(), cmap='viridis', aspect='auto')
ax.set_xlabel('LiDAR Index')
ax.set_ylabel('Image Index')
ax.set_title('Similarity Matrix')
plt.colorbar(im, ax=ax)

wandb.log({"similarity_matrix": wandb.Image(fig)})
wandb.finish()

plt.close(fig)

Epoch 0
Train Loss: 3.0853904412555853 
Similarity:
tensor([[0.9976, 0.6505, 0.4310,  ..., 0.8315, 0.4353, 0.3540],
        [0.6577, 0.9971, 0.0366,  ..., 0.9429, 0.1506, 0.8389],
        [0.4685, 0.0447, 0.9972,  ..., 0.1142, 0.8984, 0.1690],
        ...,
        [0.8833, 0.8906, 0.1490,  ..., 0.9892, 0.1719, 0.5702],
        [0.3745, 0.1390, 0.9145,  ..., 0.0976, 0.9934, 0.4095],
        [0.3605, 0.8644, 0.1730,  ..., 0.6582, 0.3792, 0.9982]],
       device='cuda:0', grad_fn=<DivBackward0>)
Valid Loss: 3.192580499147114 
Similarity:
tensor([[0.9942, 0.7943, 0.5059,  ..., 0.4865, 0.4968, 0.9824],
        [0.8727, 0.9938, 0.2315,  ..., 0.2860, 0.3188, 0.9130],
        [0.4441, 0.1624, 0.9937,  ..., 0.6094, 0.5524, 0.3754],
        ...,
        [0.4157, 0.2911, 0.5634,  ..., 0.9985, 0.9942, 0.3972],
        [0.4508, 0.3343, 0.5197,  ..., 0.9956, 0.9979, 0.4372],
        [0.9952, 0.8599, 0.4179,  ..., 0.4431, 0.4631, 0.9971]],
       device='cuda:0', grad_fn=<DivBackward0>)
Epoch 1
Train

0,1
cilp_train/loss,█▂▁
cilp_valid/loss,█▂▁
epoch,▁▅█
learning_rate,▁▁▁

0,1
cilp_train/loss,3.02667
cilp_valid/loss,3.18173
epoch,2.0
learning_rate,0.0001


In [12]:
# Save CILP model checkpoint
torch.save(CILP_model.state_dict(), "models/04_contrastive_pretraining.pth")
print("Saved CILP model to models/04_contrastive_pretraining.pth")

Saved CILP model to models/04_contrastive_pretraining.pth


In [13]:
for param in CILP_model.parameters():
    CILP_model.requires_grad = False

## 5.2 Cross-Modal Projector

In [14]:
projector = nn.Sequential(
    nn.Linear(CILP_EMB_SIZE, 1000),
    nn.ReLU(),
    nn.Linear(1000, 500),
    nn.ReLU(),
    nn.Linear(500, 3200)
).to(device)

In [15]:
def get_projector_loss(model, batch):
    rbg_img, lidar_depth, class_idx = batch
    imb_embs = CILP_model.img_embedder(rbg_img)
    lidar_emb = lidar_cnn.get_embs(lidar_depth)
    pred_lidar_embs = model(imb_embs)
    return nn.MSELoss()(pred_lidar_embs, lidar_emb)

In [16]:
epochs = 40
optimizer = torch.optim.AdamW(projector.parameters())
assessment_utils.train_model(
    projector, 
    optimizer, 
    get_projector_loss, 
    epochs, 
    train_dataloader, 
    valid_dataloader,
    wandb_project="cilp-extended-assessment",
    wandb_name="04_projector_training",
    wandb_config={
        "architecture": "Projector",
        "group": TASK_NAME,
        "tags": WANDB_TAGS + ["projector_training"]
    }
)

Epoch   0 | Train Loss: 3.4974
Epoch   0 | Valid Loss: 3.2700
Epoch   1 | Train Loss: 3.1812
Epoch   1 | Valid Loss: 3.2089
Epoch   2 | Train Loss: 3.1319
Epoch   2 | Valid Loss: 3.1251
Epoch   3 | Train Loss: 3.0906
Epoch   3 | Valid Loss: 3.1192
Epoch   4 | Train Loss: 3.0602
Epoch   4 | Valid Loss: 3.0859
Epoch   5 | Train Loss: 3.0167
Epoch   5 | Valid Loss: 3.0149
Epoch   6 | Train Loss: 2.9154
Epoch   6 | Valid Loss: 2.9643
Epoch   7 | Train Loss: 2.7895
Epoch   7 | Valid Loss: 2.7400
Epoch   8 | Train Loss: 2.6158
Epoch   8 | Valid Loss: 2.7514
Epoch   9 | Train Loss: 2.5252
Epoch   9 | Valid Loss: 2.7436
Epoch  10 | Train Loss: 2.3127
Epoch  10 | Valid Loss: 2.3582
Epoch  11 | Train Loss: 2.2281
Epoch  11 | Valid Loss: 2.3297
Epoch  12 | Train Loss: 2.1649
Epoch  12 | Valid Loss: 2.3392
Epoch  13 | Train Loss: 2.1342
Epoch  13 | Valid Loss: 2.3059
Epoch  14 | Train Loss: 2.0964
Epoch  14 | Valid Loss: 2.2836
Epoch  15 | Train Loss: 2.0746
Epoch  15 | Valid Loss: 2.2377
Epoch  1

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning_rate,█████████▄▄▄▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁
train/loss,█▇▇▆▆▆▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid/loss,██▇▇▇▇▆▅▅▅▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,39.0
learning_rate,2e-05
train/loss,1.75495
valid/loss,1.97287


In [17]:
# Save projector checkpoint
torch.save(projector.state_dict(), "models/04_projector_training.pth")
print("Saved projector to models/04_projector_training.pth")

Saved projector to models/04_projector_training.pth


## 5.3 Final Classifier (RGB-to-LiDAR classifier)

In [18]:
class RGB2LiDARClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.projector = projector
        self.img_embedder = CILP_model.img_embedder
        self.shape_classifier = lidar_cnn
    
    def forward(self, imgs):
        img_encodings = self.img_embedder(imgs)
        proj_lidar_embs = self.projector(img_encodings)
        return self.shape_classifier(data_embs=proj_lidar_embs)

In [19]:
my_classifier = RGB2LiDARClassifier()

In [20]:
def get_correct(output, y):
    zero_tensor = torch.tensor([0]).to(device)
    pred = torch.gt(output, zero_tensor)
    correct = pred.eq(y.view_as(pred)).sum().item()
    return correct

In [21]:
def get_valid_metrics():
    my_classifier.eval()
    correct = 0
    batch_correct = 0
    total_loss = 0
    for step, batch in enumerate(valid_dataloader):
        rbg_img, _, class_idx = batch
        output = my_classifier(rbg_img)
        loss = nn.BCEWithLogitsLoss()(output, class_idx)
        batch_correct = get_correct(output, class_idx)
        correct += batch_correct
        total_loss += loss.item()
    
    avg_loss = total_loss / (step + 1)
    accuracy = correct / valid_N
    print(f"Valid Loss: {avg_loss:2.4f} | Accuracy {accuracy:2.4f}")
    return avg_loss, accuracy

get_valid_metrics()

Valid Loss: 1.6064 | Accuracy 0.7953


(1.6063885599374772, 0.7953125)

In [22]:
epochs = 5
CLASSIFIER_LR = 0.001
optimizer = torch.optim.AdamW(my_classifier.parameters(), lr=CLASSIFIER_LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1)

wandb.init(
    project="cilp-extended-assessment",
    group=TASK_NAME,
    name="04_final_classifier",
    tags=WANDB_TAGS + ["final_classifier"],
    config={
        "learning_rate": CLASSIFIER_LR,
        "architecture": "RGB2LiDARClassifier",
        "embedding_size": CILP_EMB_SIZE,
        "batch_size": BATCH_SIZE,
        "epochs": epochs,
        "optimizer": optimizer.__class__.__name__,
        "scheduler": "ReduceLROnPlateau",
        "fusion_strategy": "contrastive",
        "num_params": sum(p.numel() for p in my_classifier.parameters() if p.requires_grad),
    }
)

my_classifier.train()
for epoch in range(epochs):
    correct = 0
    batch_correct = 0
    total_train_loss = 0
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        rbg_img, _, class_idx = batch
        output = my_classifier(rbg_img)
        loss = nn.BCEWithLogitsLoss()(output, class_idx)
        batch_correct = get_correct(output, class_idx)
        correct += batch_correct
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_train_loss = total_train_loss / (step + 1)
    train_accuracy = correct / train_N
    print(f"Train Loss: {avg_train_loss:2.4f} | Accuracy {train_accuracy:2.4f}")
    valid_loss, valid_acc = get_valid_metrics()
    
    # Step the scheduler based on validation loss
    scheduler.step(valid_loss)
    current_lr = optimizer.param_groups[0]['lr']
    
    wandb.log({
        "train/loss": avg_train_loss,
        "train/accuracy": train_accuracy,
        "valid/loss": valid_loss,
        "valid/accuracy": valid_acc,
        "learning_rate": current_lr,

        "epoch": epoch,    })

Train Loss: 0.3383 | Accuracy 0.8544
Valid Loss: 0.0105 | Accuracy 0.9953
Train Loss: 0.0205 | Accuracy 0.9913
Valid Loss: 0.0100 | Accuracy 0.9953
Train Loss: 0.0138 | Accuracy 0.9949
Valid Loss: 0.1297 | Accuracy 0.9453
Train Loss: 0.0211 | Accuracy 0.9930
Valid Loss: 0.0022 | Accuracy 0.9984
Train Loss: 0.0085 | Accuracy 0.9958
Valid Loss: 0.0198 | Accuracy 0.9938


In [23]:
# Save final classifier checkpoint
torch.save(my_classifier.state_dict(), "models/04_final_classifier.pth")
print("Saved final classifier to models/04_final_classifier.pth")

Saved final classifier to models/04_final_classifier.pth


Sample 5 predictions and log them to Weights & Biases (wandb) for visualization.

In [24]:
my_classifier.eval()
for step, batch in enumerate(valid_dataloader):
    rbg_img, _, class_idx = batch
    output = my_classifier(rbg_img)
    wandb.log({"predictions": wandb.Table(data=[[rbg_img[i].cpu().numpy(), torch.sigmoid(output[i]).item(), class_idx[i].item()] for i in range(5)],
                                           columns=["rgb_image", "predicted_class", "true_class"])})
    break
wandb.finish()

0,1
epoch,▁▃▅▆█
learning_rate,▁▁▁▁▁
train/accuracy,▁████
train/loss,█▁▁▁▁
valid/accuracy,██▁█▇
valid/loss,▁▁█▁▂

0,1
epoch,4.0
learning_rate,0.001
train/accuracy,0.99576
train/loss,0.00853
valid/accuracy,0.99375
valid/loss,0.01985
