In [16]:
def in_colab() -> bool:
    try:
        import google.colab  # noqa: F401
        return True
    except Exception:
        return False

In [17]:
if in_colab():
    !git clone https://github.com/MaxSpeer/applied-computer-vision-assignment2.git
    %cd applied-computer-vision-assignment2
    !pip install -r requirements.txt
    !pip install -e .
else:
    from pathlib import Path
    import sys
    project_root = Path("..").resolve()
    sys.path.append(str(project_root))

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.is_available()

False

In [20]:
import fiftyone as fo
from fiftyone.utils.huggingface import load_from_hub

#fo.delete_dataset("multimodal-shapes-subset")
dataset = load_from_hub("maxspeer/assessment2_spheres_and_cube_2k_2",
                         name="multimodal-shapes-subset",
                         num_workers=1,
                         batch_size=500,
                        #max_samples=3000,
                           #overwrite=True
                        )

Downloading config file fiftyone.yml from maxspeer/assessment2_spheres_and_cube_2k_2
Loading dataset
Importing samples...
 100% |███████████████| 6000/6000 [49.3ms elapsed, 0s remaining, 121.7K samples/s]  


In [23]:
from src.datasets import MultimodalDataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader


IMG_SIZE = 64

img_transforms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),  # Scales data into [0,1] TODO correct non deprecated version
])

BATCH_SIZE = 32
train_dataset = MultimodalDataset(dataset,"train",img_transforms)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)


val_dataset = MultimodalDataset(dataset,"val",img_transforms)
valid_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

## Models
1. One-Modal Classifier for RGB or Lidar
2. Late Fusion Model
3. Intermediate Fuision model ("concat", "add" and "hardman")

### Log into wandb

In [24]:
import os
import wandb

#os.environ['WANDB_API_KEY'] = ''
wandb.login(force=True)

[34m[1mwandb[0m: Currently logged in as: [33mmaximilian-speer[0m ([33mmaximilian-speer-hasso-plattner-institut[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [25]:
# This function is used to get the outputs from the model during training
def get_outputs_fusion(model, batch):
    input1 = batch[0].to(device) # rgb
    input2 = batch[1].to(device) # lidar (xyza)
    target = batch[-1].to(device) # label
    outputs = model(input1,
                   input2)

    return outputs, target

### Hyperparameter optimiziation
For simplification only the fusion models are trained, not the baseline rgb Classifier

In [26]:
from src.models import IntermediateFusionNet, LateFusionModel
# Fusion models to compare
fusionModels = [
    ("concat", lambda: IntermediateFusionNet(4,4, "concat")),
    ("add", lambda: IntermediateFusionNet(4,4, "add")),
    ("hadamard", lambda: IntermediateFusionNet(4,4, "hadamard")),
    ("late", lambda: LateFusionModel()),
]

In [27]:
from src.training import getWandbRun, train_model

batchSizes = [16]# ,32,64]
learningRates = [0.001]#, 0.0001, 0.00001]
epochs = 3

for mode, modelBlueprint in fusionModels:
    best_valid_loss = float('inf')
    best_model_state = None

    for batch_size in batchSizes:
        for lr in learningRates:
            with getWandbRun(mode, "rgb/lidar", batch_size=batch_size, epochs=epochs) as run:
                model = modelBlueprint().to(device)

                print(f"Training fusion mode: {mode}")
                model_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
                print(f"Number of trainable parameters: {model_param_count}")
                run.config.update({"number_of_parameters": model_param_count})

                train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                              shuffle=True, drop_last=True)
                valid_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                                              shuffle=True, drop_last=True)

                opt = Adam(model.parameters(), lr=lr)
                loss_func = nn.BCEWithLogitsLoss()
                train_loss, valid_loss = train_model(model=model,
                                                    optimizer=opt,
                                                    train_dataloader=train_dataloader,
                                                    valid_dataloader=valid_dataloader,
                                                    loss_func=loss_func,
                                                    get_outputs=get_outputs_fusion,
                                                    wandbrun=run,
                                                    epochs=epochs)
                run.finish()

wandb.init fertig
Training fusion mode: concat
Number of trainable parameters: 13015851
Epoch 0: train loss = 0.7078
Epoch 0: valid loss = 0.5070 | F1=0.7442 P=0.7393 R=0.7492
Epoch 1: train loss = 0.4774
Epoch 1: valid loss = 0.4897 | F1=0.8271 P=0.7052 R=1.0000
Epoch 2: train loss = 0.4201
Epoch 2: valid loss = 0.4833 | F1=0.8081 P=0.6780 R=1.0000


0,1
learning_rate,▁▁▁
train_loss,█▂▁
val_f1,▁█▆
val_precision,█▄▁
val_recall,▁██
valid_loss,█▃▁

0,1
learning_rate,0.001
train_loss,0.42008
val_precision,0.678
val_recall,1.0


wandb.init fertig
Training fusion mode: add
Number of trainable parameters: 6615851
Epoch 0: train loss = 0.8047
Epoch 0: valid loss = 0.5818 | F1=0.7313 P=0.6970 R=0.7692
Epoch 1: train loss = 0.5227
Epoch 1: valid loss = 0.3959 | F1=0.8538 P=0.7565 R=0.9799
Epoch 2: train loss = 0.3573
Epoch 2: valid loss = 0.2255 | F1=0.9108 P=0.8362 R=1.0000


0,1
learning_rate,▁▁▁
train_loss,█▄▁
val_f1,▁▆█
val_precision,▁▄█
val_recall,▁▇█
valid_loss,█▄▁

0,1
learning_rate,0.001
train_loss,0.35733
val_precision,0.83616
val_recall,1.0


wandb.init fertig
Training fusion mode: hadamard
Number of trainable parameters: 6615851
Epoch 0: train loss = 0.6654
Epoch 0: valid loss = 0.6418 | F1=0.7431 P=0.5913 R=1.0000
Epoch 1: train loss = 0.5404
Epoch 1: valid loss = 0.5012 | F1=0.7471 P=0.7433 R=0.7508
Epoch 2: train loss = 0.4799
Epoch 2: valid loss = 0.4195 | F1=0.8182 P=0.7925 R=0.8456


0,1
learning_rate,▁▁▁
train_loss,█▃▁
val_f1,▁▁█
val_precision,▁▆█
val_recall,█▁▄
valid_loss,█▄▁

0,1
learning_rate,0.001
train_loss,0.47989
val_precision,0.79245
val_recall,0.84564


wandb.init fertig
Training fusion mode: late
Number of trainable parameters: 13117093
Epoch 0: train loss = 0.6915
Epoch 0: valid loss = 0.6006 | F1=0.5498 P=0.6832 R=0.4600
Epoch 1: train loss = 0.5227
Epoch 1: valid loss = 0.4466 | F1=0.7857 P=0.7586 R=0.8148
Epoch 2: train loss = 0.3978
Epoch 2: valid loss = 0.3107 | F1=0.8882 P=0.8343 R=0.9495


0,1
learning_rate,▁▁▁
train_loss,█▄▁
val_f1,▁▆█
val_precision,▁▄█
val_recall,▁▆█
valid_loss,█▄▁

0,1
learning_rate,0.001
train_loss,0.39775
val_precision,0.83432
val_recall,0.94949


| Metric | Late Fusion | Intermediate (Concat) | Intermediate (Add) | Intermediate (Hadamard) |
|---|---|---|---|---|
| Validation Loss (best) | 0.016224 | 0.0058308 | 0.020876 | 0.025468 |
| F1 score  (best)| 0.99833 | 0.99666 | 0.99666 | 0.99324 |
| Parameters (count) | 13117093 | 13015851 | 6615851 | 6615851 |
| Training Time (seconds/epoch) * | 50s | 47s | 42s | 44s |


All models perform similarly well, only small differencen in the training time and performence can be observed. With 13M parameters the Late Fusion and Intermediate (concat) models are double the size of the other two models. They both achieve better validation loss (late fusion) and F1 Score (Intermediate (concat)). However the models size increases the time and computation needs during training. Overall the results only differ slightly and could also be influenced by other factors (like weight initialzation). 