In [None]:
!git clone https://github.com/MaxSpeer/applied-computer-vision-assignment2.git
%cd applied-computer-vision-assignment2
!pip install -r requirements.txt
!pip install -e .


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

In [19]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Trailing expression so the notebook displays whether CUDA is available.
torch.cuda.is_available()

False

In [20]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

# Load the shapes dataset from the Hugging Face Hub into FiftyOne under a
# fixed local name.
# To remove a previously loaded copy by hand, run:
#   fo.delete_dataset("multimodal-shapes-subset")
dataset = load_from_hub(
    "maxspeer/assessment2_spheres_and_cube_2k_2",
    name="multimodal-shapes-subset",
    num_workers=1,
    batch_size=500,
    # max_samples=3000,  # uncomment to cap the number of samples loaded
    overwrite=True,
)

Downloading config file fiftyone.yml from maxspeer/assessment2_spheres_and_cube_2k_2
Loading dataset
Importing samples...
 100% |███████████████| 6000/6000 [48.9ms elapsed, 0s remaining, 122.7K samples/s]  


In [None]:
from training import MultimodalDataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


# Size passed to the resize transform.
IMG_SIZE = 64
# Batch size of the default loaders built in this cell.
BATCH_SIZE = 32

# Image preprocessing: resize, then convert to a tensor.
img_transforms = transforms.Compose(
    [
        transforms.Resize(IMG_SIZE),
        # ToTensor scales the data into [0, 1].
        # TODO: replace with the current, non-deprecated torchvision equivalent.
        transforms.ToTensor(),
    ]
)

# Training split and its loader.
train_dataset = MultimodalDataset(dataset, "train", img_transforms)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

# Validation split and its loader.
# NOTE(review): the validation loader also shuffles and drops the last partial
# batch, so the evaluated subset can differ between passes — confirm this is
# intended before changing it.
val_dataset = MultimodalDataset(dataset, "val", img_transforms)
valid_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

## Models
1. One-Modal Classifier for RGB or Lidar
2. Late Fusion Model
3. Intermediate Fusion model ("concat", "add" and "hadamard")

### Log into wandb

In [17]:
import os
import wandb

# Authenticate with Weights & Biases before any run is created.
# For a non-interactive login, the API key can be supplied through the
# environment instead (never commit a real key to the notebook):
#os.environ['WANDB_API_KEY'] = ''
# NOTE(review): force=True presumably requests a fresh login even when
# credentials are already cached — confirm against the wandb documentation.
wandb.login(force=True)

True

In [28]:
def get_outputs_fusion(model, batch):
    """Run a two-input fusion model on one batch during training.

    Args:
        model: Callable taking the two modality inputs positionally.
        batch: Indexable batch; element 0 is the RGB input, element 1 the
            lidar (xyza) input and the last element the label.

    Returns:
        Tuple ``(outputs, target)``: the model outputs and the labels, both
        computed from inputs moved to the notebook-level ``device``.
    """
    # Move RGB, lidar and label to the device, in that order.
    rgb, lidar, target = (batch[index].to(device) for index in (0, 1, -1))
    return model(rgb, lidar), target

### Hyperparameter optimization
For simplicity, only the fusion models are trained, not the baseline RGB classifier.

In [None]:
from models import IntermediateFusionNet, LateFusionModel
# Fusion architectures to compare, as (label, factory) pairs.
# Each factory takes no arguments and builds a fresh, untrained model, so every
# training run starts from new weights.
# The two leading 4s are presumably the channel counts of the RGB and lidar
# inputs — TODO confirm against IntermediateFusionNet.
fusionModels = [
    # `fusion=fusion` binds the current mode to each lambda at definition time.
    (fusion, lambda fusion=fusion: IntermediateFusionNet(4, 4, fusion))
    for fusion in ("concat", "add", "hadamard")
]
fusionModels.append(("late", lambda: LateFusionModel()))

In [None]:
from src.training import getWandbRun, train_model

# Hyperparameter grid. Only one value per axis is active; the commented-out
# values are the wider sweep.
batchSizes = [16]  # , 32, 64]
learningRates = [0.001]  # , 0.0001, 0.00001]
epochs = 3

# Train every fusion model once per (batch size, learning rate) combination,
# each combination in its own wandb run.
for mode, modelBlueprint in fusionModels:
    # NOTE(review): these two are initialised but never read or updated in
    # this cell — best-model tracking looks unfinished.
    best_valid_loss = float('inf')
    best_model_state = None

    # Flattened grid: batch size varies slowest, learning rate fastest.
    for batch_size, lr in ((bs, rate) for bs in batchSizes for rate in learningRates):
        with getWandbRun(mode, "rgb/lidar", batch_size=batch_size, epochs=epochs) as run:
            # Build a fresh model for this run and move it to the device.
            model = modelBlueprint().to(device)

            print(f"Training fusion mode: {mode}")
            # Count only the parameters that receive gradients.
            model_param_count = sum(
                param.numel() for param in model.parameters() if param.requires_grad
            )
            print(f"Number of trainable parameters: {model_param_count}")
            run.config.update({"number_of_parameters": model_param_count})

            # Rebuild the loaders because the batch size is part of the grid.
            train_dataloader = DataLoader(
                train_dataset,
                batch_size=batch_size,
                shuffle=True,
                drop_last=True,
            )
            valid_dataloader = DataLoader(
                val_dataset,
                batch_size=batch_size,
                shuffle=True,
                drop_last=True,
            )

            opt = Adam(model.parameters(), lr=lr)
            loss_func = nn.BCEWithLogitsLoss()
            train_loss, valid_loss = train_model(
                model=model,
                optimizer=opt,
                train_dataloader=train_dataloader,
                valid_dataloader=valid_dataloader,
                loss_func=loss_func,
                get_outputs=get_outputs_fusion,
                wandbrun=run,
                epochs=epochs,
            )
            # NOTE(review): leaving the `with` block may already close the
            # run — confirm against getWandbRun before removing this call.
            run.finish()

wandb.init fertig
Training fusion mode: concat
Number of trainable parameters: 13015851
Epoch 0: train loss = 0.6845
Epoch 0: valid loss = 0.4935 | F1=0.7994 P=0.7197 R=0.8990
Logging predictions table...
Batch moved to device
Inside _get_outputs_fusion
target:  tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.]])
y_int: tensor([1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0])
Wandb Table created
RGB moved to CPU
Processing sample 0
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 1
Adding row to Wandb Table for sample 0: gt=sphere pred=sphere
Processing sample 1
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 1
Adding row to Wandb Table for sample 1: gt=sphere pred=sphere
Processing sample 2
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 1
Adding row to Wandb Table for sample 2: gt=sphere pred=sphere
Processing s

0,1
learning_rate,▁▁▁
train_loss,█▂▁
val_f1,▁▆█
val_precision,▁▃█
val_recall,▁▇█
valid_loss,█▄▁

0,1
learning_rate,0.001
train_loss,0.46605
val_precision,0.74937
val_recall,0.99329


wandb.init fertig
Training fusion mode: add
Number of trainable parameters: 6615851
Epoch 0: train loss = 0.7141
Epoch 0: valid loss = 0.5649 | F1=0.8125 P=0.7027 R=0.9630
Logging predictions table...
Batch moved to device
Inside _get_outputs_fusion
target:  tensor([[1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.]])
y_int: tensor([1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1])
Wandb Table created
RGB moved to CPU
Processing sample 0
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 1
Adding row to Wandb Table for sample 0: gt=sphere pred=sphere
Processing sample 1
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 0
pi: 0
Adding row to Wandb Table for sample 1: gt=cube pred=cube
Processing sample 2
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 1
Adding row to Wandb Table for sample 2: gt=sphere pred=sphere
Processing sample 3


0,1
learning_rate,▁▁▁
train_loss,█▄▁
val_f1,▁▅█
val_precision,▁▄█
val_recall,▆█▁
valid_loss,█▄▁

0,1
learning_rate,0.001
train_loss,0.32552
val_precision,0.90228
val_recall,0.92953


wandb.init fertig
Training fusion mode: hadamard
Number of trainable parameters: 6615851
Epoch 0: train loss = 0.6056
Epoch 0: valid loss = 0.4337 | F1=0.8343 P=0.7507 R=0.9390
Logging predictions table...
Batch moved to device
Inside _get_outputs_fusion
target:  tensor([[0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.]])
y_int: tensor([0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0])
Wandb Table created
RGB moved to CPU
Processing sample 0
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 0
pi: 1
Adding row to Wandb Table for sample 0: gt=cube pred=sphere
Processing sample 1
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 1
Adding row to Wandb Table for sample 1: gt=sphere pred=sphere
Processing sample 2
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 1
Adding row to Wandb Table for sample 2: gt=sphere pred=sphere
Processing sa

0,1
learning_rate,▁▁▁
train_loss,█▄▁
val_f1,▁▇█
val_precision,▁▆█
val_recall,▁█▆
valid_loss,█▃▁

0,1
learning_rate,0.001
train_loss,0.24989
val_precision,0.89908
val_recall,0.98


wandb.init fertig
Training fusion mode: late
Number of trainable parameters: 26256743
Epoch 0: train loss = 0.7139
Epoch 0: valid loss = 0.6816 | F1=0.5697 P=0.6908 R=0.4847
Logging predictions table...
Batch moved to device
Inside _get_outputs_fusion
target:  tensor([[0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.]])
y_int: tensor([0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1])
Wandb Table created
RGB moved to CPU
Processing sample 0
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 0
pi: 0
Adding row to Wandb Table for sample 0: gt=cube pred=cube
Processing sample 1
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 0
pi: 0
Adding row to Wandb Table for sample 1: gt=cube pred=cube
Processing sample 2
rgb_cpu shape: torch.Size([4, 64, 64])
yi: 1
pi: 0
Adding row to Wandb Table for sample 2: gt=sphere pred=cube
Processing sample 3
rgb_

0,1
learning_rate,▁▁▁
train_loss,█▄▁
val_f1,▁▇█
val_precision,▁▆█
val_recall,▁██
valid_loss,█▄▁

0,1
learning_rate,0.001
train_loss,0.45613
val_precision,0.81967
val_recall,0.99338
