# Setup

In [None]:
%%capture
%pip install wandb weave

In [None]:
%%capture
%pip install fiftyone==1.10.0 sympy==1.12 torch torchvision numpy open-clip-torch

In [None]:
import os
from pathlib import Path
from google.colab import userdata
import time

import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import pandas as pd

import wandb

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Global experiment settings shared by the cells below.
SEED = 51
# os.cpu_count() may return None when the core count cannot be determined;
# fall back to 0 so a DataLoader loads in the main process instead of
# receiving a non-integer num_workers.
NUM_WORKERS = os.cpu_count() or 0
BATCH_SIZE = 64
IMG_SIZE = 64
EPOCHS = 15
LR = 0.0001

# Integrate Wandb

In [None]:
# Load W&B API key from .env file and make it available as env variable
from dotenv import load_dotenv
load_dotenv()  # loads .env automatically

# Only report whether the key is present. Leaving os.environ["WANDB_API_KEY"]
# as the last expression of the cell would echo the secret into the notebook
# output, where it gets saved and shared together with the notebook.
if "WANDB_API_KEY" in os.environ:
    print("WANDB_API_KEY is set")
else:
    print("WANDB_API_KEY is not set - provide it via .env or Colab Secrets")

In [None]:
# Load W&B API key from Colab Secrets and make it available as env variable
# (Colab alternative to reading the key from a .env file).
wandb_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_key

In [None]:
wandb.login()

In [None]:
# Hyperparameters and run metadata attached to the W&B run.
# NOTE(review): "optimizer_type" says AdamW, but the training cell builds an
# `Adam` optimizer - confirm which one is intended.
config = {
        "embedding_size": 128,
        "optimizer_type": "AdamW",
        "fusion_strategy": "late",
        "model_architecture": "CILP_LateFusion",
        "lr": LR,  # open question from the author: why log this every epoch as a hyperparameter?
        "batch_size": BATCH_SIZE,
        "num_epochs": EPOCHS,
        # TODO: add the number of model parameters
    }

In [None]:
# Placeholder values illustrating a per-epoch metrics record.
# NOTE: `metrics` is reassigned to an empty dict in the training cell.
metrics = {
        "epoch": 1,
        "train_loss": 0.5,
        "val_loss": 0.5,
    }

In [None]:
# Start the W&B run for this notebook and attach the hyperparameter config.
wandb.init(
    project="clip-extended-assessment",
    config=config
)

# Reproducibility

In [None]:
def set_seeds(seed=SEED):
    """
    Seed every random number generator used in this notebook.

    Covers Python's `random`, NumPy, PyTorch (CPU and all CUDA devices) and,
    when they are installed, OpenCV and Albumentations. Also switches PyTorch
    to deterministic algorithms in warn-only mode.

    Args:
        seed (int): Random seed value
    """
    # Imported locally: the notebook's import cell does not provide these.
    import random

    import numpy as np

    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # Python processes started after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Required by cuBLAS for deterministic behaviour (see PyTorch
    # reproducibility notes).
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

    # Python random module
    random.seed(seed)

    # NumPy
    np.random.seed(seed)

    # PyTorch CPU
    torch.manual_seed(seed)

    # PyTorch GPU (all devices)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU setups

        # CUDA deterministic operations
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # OpenCV is optional: seed it only when it is installed.
    try:
        import cv2
    except ImportError:
        cv2 = None
    if cv2 is not None:
        cv2.setRNGSeed(seed)

    # Albumentations is optional too, and not every version exposes
    # `seed_everything`.
    try:
        import albumentations as A
    except ImportError:
        A = None
    seed_everything = getattr(A, "seed_everything", None)
    if seed_everything is not None:
        seed_everything(seed)

    # PyTorch deterministic algorithms (may impact performance).
    # Non-deterministic ops raise only when they are executed, not here, so
    # warn_only=True is what turns those errors into warnings.
    try:
        torch.use_deterministic_algorithms(True, warn_only=True)
    except TypeError:
        # Older PyTorch releases do not accept `warn_only`.
        torch.use_deterministic_algorithms(True)

    print(f"All random seeds set to {seed} for reproducibility")



# Usage: Call this function at the beginning and before each training phase
set_seeds(SEED)

# Additional reproducibility considerations:

def create_deterministic_training_dataloader(dataset, batch_size, shuffle=True, *, seed=51, **kwargs):
    """
    Create a DataLoader with deterministic behavior.

    Args:
        dataset: PyTorch Dataset instance
        batch_size: Batch size
        shuffle: Whether to shuffle data
        seed: Seed for the generator that drives shuffling (keyword-only).
            Defaults to 51, the value that was previously hard-coded.
        **kwargs: Additional DataLoader arguments

    Returns:
        Training DataLoader with reproducible behavior
    """
    # A dedicated, freshly seeded generator makes the shuffle order
    # independent of whatever consumed the global RNG beforehand.
    generator = None
    if shuffle:
        generator = torch.Generator()
        generator.manual_seed(seed)

    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        generator=generator,
        **kwargs
    )



# Utility Functions

In [None]:
# Pick the compute device once; every later cell reads `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Last expression of the cell: shows whether a GPU was found.
torch.cuda.is_available()

In [1]:
loss_func = nn.MSELoss()

def train_model(model, optimizer, input_fn, epochs, loss_fn, train_dataloader, valid_dataloader, target_idx=-1, log_to_wandb=False, model_name=None):
    """
    Train `model` for `epochs` epochs, validating after every epoch.

    Args:
        model: Module called as `model(*input_fn(batch))`.
        optimizer: Optimizer stepped once per training batch.
        input_fn: Callable mapping a batch to the tuple of model inputs.
        epochs: Number of epochs to run.
        loss_fn: Loss called as `loss_fn(outputs, target)`; used for both
            training and validation.
        train_dataloader: Iterable of training batches.
        valid_dataloader: Iterable of validation batches.
        target_idx: Index of the target inside each batch (default: last).
        log_to_wandb: When True, log per-epoch metrics with `wandb.log`.
        model_name: Label used in console and W&B logs ("model" if None).

    Returns:
        Tuple `(train_losses, valid_losses, epoch_times, max_gpu_mem_mb)`:
        per-epoch mean losses, per-epoch wall-clock seconds, and the highest
        per-epoch peak of allocated GPU memory in MB (0.0 without CUDA).

    Raises:
        ValueError: If either dataloader yields no batches.
    """
    train_losses = []
    valid_losses = []
    epoch_times = []

    # for GPU memory tracking
    max_gpu_mem_mb = 0.0
    use_cuda = torch.cuda.is_available()
    run_name = model_name or "model"

    for epoch in range(epochs):
        # Reset every epoch so the logged value really is the peak of this
        # epoch and not the running peak since training started.
        if use_cuda:
            torch.cuda.reset_peak_memory_stats()

        start_time = time.time()                  # to track the train time per model

        # ----- training -----
        model.train()
        train_loss = 0.0
        num_batches = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            target = batch[target_idx].to(device)  # labels: 0/1 for cube/sphere
            outputs = model(*input_fn(batch))      # e.g. model(rgb, lidar)

            loss = loss_fn(outputs, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_dataloader yielded no batches")
        train_loss /= num_batches
        train_losses.append(train_loss)
        print_loss(epoch, train_loss, outputs, target, is_train=True)

        # ----- validation -----
        model.eval()
        valid_loss = 0.0
        num_batches = 0
        with torch.no_grad():
            for batch in valid_dataloader:
                target = batch[target_idx].to(device)
                outputs = model(*input_fn(batch))
                # Use the loss passed in, not a module-level global, so the
                # training and validation losses are comparable.
                valid_loss += loss_fn(outputs, target).item()
                num_batches += 1

        if num_batches == 0:
            raise ValueError("valid_dataloader yielded no batches")
        valid_loss /= num_batches
        valid_losses.append(valid_loss)
        print_loss(epoch, valid_loss, outputs, target, is_train=False)

        # timing
        epoch_time = time.time() - start_time
        epoch_times.append(epoch_time)

        # GPU memory
        gpu_mem_mb = 0.0
        if use_cuda:
            gpu_mem_mb = torch.cuda.max_memory_allocated() / (1024 ** 2)
            max_gpu_mem_mb = max(max_gpu_mem_mb, gpu_mem_mb)

        # console logging
        print(
            f"[{run_name}] Epoch {epoch+1}/{epochs} "
            f"- train_loss: {train_loss:.4f}  valid_loss: {valid_loss:.4f}  "
            f"time: {epoch_time:.2f}s"
        )

        # wandb logging
        if log_to_wandb:
            wandb.log(
                {
                    "model": run_name,
                    "epoch": epoch + 1,
                    "train_loss": train_loss,
                    "valid_loss": valid_loss,
                    "epoch_time_sec": epoch_time,
                    "max_gpu_mem_mb_epoch": gpu_mem_mb,
                }
            )

    return train_losses, valid_losses, epoch_times, max_gpu_mem_mb

NameError: name 'nn' is not defined

In [None]:
def print_loss(epoch, loss, outputs, target, is_train=True, is_debug=False):
    """
    Print the loss of one epoch, labelled as train or valid loss.

    With `is_debug` set, the first prediction and the first target of the
    batch are printed as well, formatted by `format_positions`.
    """
    if is_train:
        label = "train loss:"
    else:
        label = "valid loss:"
    print(" ".join(("epoch", str(epoch), label, str(loss))))

    if not is_debug:
        return

    for caption, values in (("example pred:", outputs), ("example real:", target)):
        print(caption, format_positions(values[0].tolist()))

In [None]:
def plot_losses(losses, title="Training & Validation Loss Comparison", figsize=(10,6)):
    """
    Plot train and validation loss curves of several models in one figure.

    `losses` maps a model name to a dict holding the lists "train_losses"
    and "valid_losses". Train curves are solid, validation curves dashed.
    """
    plt.figure(figsize=figsize)

    for model_name in losses:
        curves = losses[model_name]
        train_curve, valid_curve = curves["train_losses"], curves["valid_losses"]

        plt.plot(train_curve, label=f"{model_name} - train", linewidth=2)
        plt.plot(
            valid_curve,
            label=f"{model_name} - valid",
            linestyle="--",
            linewidth=2,
        )

    # Figure decoration
    plt.title(title, fontsize=16)
    plt.xlabel("Epochs", fontsize=14)
    plt.ylabel("Loss", fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Preprocessing pipeline: resize to IMG_SIZE, then convert to a tensor.
# NOTE(review): `transforms` is not imported in any visible cell - presumably
# `from torchvision import transforms` is missing; confirm and add it.
img_transforms = transforms.Compose([
    transforms.Resize(IMG_SIZE),
    transforms.ToTensor(),  # Scales data into [0,1]
])

# TODO: Load and prepare Data - or import it from Notebook 01

# Create Models

Take the Net architecture from the workshop and turn it into an encoder that outputs an embedding instead of 9 positions.

In [None]:
class ConvEncoder(nn.Module):
    """
    Convolutional encoder mapping one image modality to a unit-length embedding.

    Three conv + ReLU + max-pool stages are followed by two fully connected
    layers; the result is L2-normalized so embeddings can be compared with
    cosine similarity. The first FC layer is sized for 64x64 inputs.
    """

    def __init__(self, in_ch: int, emb_dim: int = 128):
        """
        Args:
            in_ch: number of input channels (4 for RGB, 4 for LiDAR)
            emb_dim: dimensionality of the output embedding (e.g. 128)
        """
        super().__init__()
        kernel = 3
        widths = (in_ch, 50, 100, 200)

        # Layers are created in a fixed order so seeded initialisation and
        # state_dict keys stay stable.
        self.conv1 = nn.Conv2d(widths[0], widths[1], kernel, padding=1)
        self.conv2 = nn.Conv2d(widths[1], widths[2], kernel, padding=1)
        self.conv3 = nn.Conv2d(widths[2], widths[3], kernel, padding=1)
        self.pool = nn.MaxPool2d(2)

        # Three 2x poolings shrink 64x64 to 8x8, with 200 channels left.
        flat_features = widths[3] * 8 * 8
        self.fc1 = nn.Linear(flat_features, 1000)
        self.fc2 = nn.Linear(1000, emb_dim)

    def forward(self, x):
        # Conv -> ReLU -> pool, three times:
        # (B,50,32,32) -> (B,100,16,16) -> (B,200,8,8) for 64x64 input
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        # Collapse channels and spatial dims into one feature vector
        features = x.flatten(start_dim=1)          # (B,200*8*8)

        hidden = F.relu(self.fc1(features))        # (B,1000)
        embedding = self.fc2(hidden)               # (B,emb_dim)

        # Unit length, so dot products are cosine similarities
        return F.normalize(embedding, dim=-1)


In [None]:
### TODO careful: Antonio additionally pre-trained the ConvEncoder beforehand and froze the gradients - chatty does not do that here??
### but in the next lab Antonio does not pre-train either, to allow a better comparison with early fusion

class LateFusionModel(nn.Module):
    """
    Late fusion:
    - RGB and LiDAR are encoded completely separately.
    - Only at the very end are their embeddings concatenated and passed
      through a small two-layer head.

    This matches the idea:
        "Separate encoders → final embeddings → fusion → prediction"
    """

    def __init__(self, emb_dim=128, hidden_dim=256, out_dim=2):
        """
        Args:
            emb_dim: size of each individual modality embedding
            hidden_dim: width of the hidden layer of the fusion head
            out_dim: size of the model output. The default of 2 assumes the
                two-class (cube/sphere) setup - TODO confirm. The parameter
                previously had no default, which is a SyntaxError after
                defaulted parameters.
        """
        super().__init__()

        # Separate encoders for each modality
        self.rgb_enc = ConvEncoder(in_ch=4, emb_dim=emb_dim)      ## TODO: cross-check with Antonio's version
        self.lidar_enc = ConvEncoder(in_ch=4, emb_dim=emb_dim)    ## TODO: cross-check with Antonio's version

        # Fusion head: mixes the concatenated embeddings
        # Input is [rgb_emb, lidar_emb] of size 2 * emb_dim
        self.fusion_fc1 = nn.Linear(2 * emb_dim, hidden_dim)
        self.fusion_fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, rgb, lidar):
        """
        Args:
            rgb:   (B, 4, 64, 64)
            lidar: (B, 4, 64, 64)

        Returns:
            out: (B, out_dim) output of the fusion head
        """
        # 1) Encode each modality with its own ConvEncoder
        rgb_emb = self.rgb_enc(rgb)           # (B, emb_dim)
        lidar_emb = self.lidar_enc(lidar)     # (B, emb_dim)

        # 2) Late fusion: concatenate embeddings
        x = torch.cat((rgb_emb, lidar_emb), dim=1)  # (B, 2*emb_dim)

        # 3) Fusion head   ## TODO: deviation from Antonio here as well - has ReLU + the second linear layer
        x = F.relu(self.fusion_fc1(x))                 # (B, hidden_dim)
        out = self.fusion_fc2(x)                       # (B, out_dim)

        return out


In [None]:
class IntermediateFusionModel(nn.Module):
    """
    Intermediate fusion:
    - Each modality has its own early conv layers (conv1, conv2).
    - Their feature maps are then concatenated.
    - Shared later layers (conv3 + FCs) operate on the fused feature maps.

    This lets RGB and LiDAR interact earlier and at a more local spatial level.
    """

    def __init__(self, emb_dim=128, hidden_dim=256, out_dim=2, rgb_ch=4, lidar_ch=4):
        """
        Args:
            emb_dim: unused by this model; kept so the signature stays
                parallel to LateFusionModel.
            hidden_dim: width of the hidden fully connected layer
            out_dim: size of the model output. The default of 2 assumes the
                two-class (cube/sphere) setup - TODO confirm. The parameter
                previously had no default, which is a SyntaxError after
                defaulted parameters.
            rgb_ch: number of channels of the RGB input
            lidar_ch: number of channels of the LiDAR input
                (original TODO: Antonio passes the channels in as parameters)
        """
        super().__init__()
        k = 3
        # this downsampling can be done with convolutions of stride 2
        self.pool = nn.MaxPool2d(2)

        # --- Modality-specific early convolutions ---

        # RGB branch: takes the RGB input and produces feature maps
        self.rgb_conv1 = nn.Conv2d(rgb_ch, 50, k, padding=1)
        self.rgb_conv2 = nn.Conv2d(50, 100, k, padding=1)     ## TODO: Antonio has one more Conv2d; is his first one only 4 -> 25?

        # LiDAR branch: same structure but separate weights
        self.lidar_conv1 = nn.Conv2d(lidar_ch, 50, k, padding=1)
        self.lidar_conv2 = nn.Conv2d(50, 100, k, padding=1)

        # --- Shared later layers ---
        ## TODO: this is structured completely differently from Antonio's version; moreover matmul
        ## (i.e. multiplication instead of concatenation) has the best val_loss

        # After conv2+pool in each branch (64x64 input):
        # RGB feature maps:   (B, 100, 16, 16)
        # LiDAR feature maps: (B, 100, 16, 16)
        # Concatenated:       (B, 200, 16, 16)
        self.shared_conv3 = nn.Conv2d(200, 200, k, padding=1)

        # After another pool: (B, 200, 8, 8)
        self.fc1 = nn.Linear(200 * 8 * 8, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, rgb, lidar):
        """
        Args:
            rgb:   (B, rgb_ch, 64, 64)
            lidar: (B, lidar_ch, 64, 64)

        Returns:
            out: (B, out_dim) output computed from the fused feature maps
        """
        # --- RGB early branch ---
        x_rgb = self.pool(F.relu(self.rgb_conv1(rgb)))     # (B, 50, 32, 32)
        x_rgb = self.pool(F.relu(self.rgb_conv2(x_rgb)))   # (B, 100, 16, 16)

        # --- LiDAR early branch ---
        x_lid = self.pool(F.relu(self.lidar_conv1(lidar))) # (B, 50, 32, 32)
        x_lid = self.pool(F.relu(self.lidar_conv2(x_lid))) # (B, 100, 16, 16)

        # --- Intermediate fusion on feature maps ---
        # Concatenate along channel dimension
        x = torch.cat([x_rgb, x_lid], dim=1)               # (B, 200, 16, 16)

        # Shared conv and pooling
        x = self.pool(F.relu(self.shared_conv3(x)))        # (B, 200, 8, 8)

        # Flatten and project to the output
        x = torch.flatten(x, 1)                            # (B, 200*8*8)
        x = F.relu(self.fc1(x))                            # (B, hidden_dim)
        out = self.fc2(x)                                  # (B, out_dim)

        return out


# Fusion

In [None]:
def get_rgb_lidar_inputs(batch):
    """
    Take the first two entries of a batch and move them to `device`.

    batch: something like (rgb, lidar, label_idx)
    Returns the tuple (rgb, lidar).
    """
    return tuple(batch[position].to(device) for position in (0, 1))

In [None]:
# TODO: `torch_train_set` and `torch_val_set` must be created by the
# data-loading step before this cell runs.
train_dataloader = create_deterministic_training_dataloader(
  torch_train_set,
  batch_size=BATCH_SIZE,
  shuffle=True,
  num_workers=NUM_WORKERS,
  pin_memory=True
)

valid_dataloader = torch.utils.data.DataLoader(
  torch_val_set,
  batch_size=BATCH_SIZE,
  shuffle=False, # No need to shuffle validation data
  num_workers=NUM_WORKERS,  # was the undefined lowercase name `num_workers`
  pin_memory=True
)

In [None]:
loss_func = nn.CrossEntropyLoss()

metrics = {}   # store losses for each model

# out_dim=2 assumes two classes (labels 0/1 for cube/sphere) - TODO confirm.
models_to_train = {
    "late_fusion": LateFusionModel(out_dim=2).to(device),
    "intermediate_fusion": IntermediateFusionModel(out_dim=2).to(device),
}

for name, model in models_to_train.items():
  # Optimise the model being compared; building a separate network here
  # would leave the fusion models untrained.
  # NOTE(review): `config` records "AdamW" as optimizer_type - confirm
  # whether Adam or AdamW is intended.
  opt = torch.optim.Adam(model.parameters(), lr=LR)

  train_losses, valid_losses, epoch_times, max_gpu_mem_mb = train_model(
    model=model,
    optimizer=opt,
    input_fn=get_rgb_lidar_inputs,     ## TODO
    epochs=EPOCHS,
    loss_fn=loss_func,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    target_idx=-1,   # last element in batch is target
    log_to_wandb=True,
    model_name=name.replace(" ", "_").lower(),
  )

  # metrics for comparison table
  num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

  metrics[name] = {
      "train_losses": train_losses,
      "valid_losses": valid_losses,
      "epoch_times": epoch_times,
      "avg_epoch_time": sum(epoch_times) / len(epoch_times),
      "best_valid_loss": min(valid_losses),
      "max_gpu_mem_mb": max_gpu_mem_mb,
      "num_params": num_params,
  }


In [None]:
# The per-model loss histories are collected in `metrics` by the training cell.
plot_losses(metrics)

In [None]:
# The training cell stores its results in `metrics`, keyed by
# "late_fusion" / "intermediate_fusion".
row_late = metrics["late_fusion"]
row_inter = metrics["intermediate_fusion"]


def _comparison_column(row):
    """Return one model's values in the order of the "Metric" column."""
    epoch_times = row["epoch_times"]
    return [
        row["best_valid_loss"],
        row["num_params"],
        # Averaged here from the raw per-epoch timings.
        sum(epoch_times) / len(epoch_times),
        row["max_gpu_mem_mb"],
    ]


comparison_df = pd.DataFrame(
    {
        "Metric": [
            "Validation Loss (best)",
            "Parameters (count)",
            "Training Time (s/epoch, avg)",
            "GPU Memory (MB, peak)",
        ],
        "Late Fusion": _comparison_column(row_late),
        "Intermediate Fusion": _comparison_column(row_inter),
    }
)

comparison_df

In [None]:
# logs the comparison table to wandb
table = wandb.Table(dataframe=comparison_df)
wandb.log({"fusion_comparison": table})