# Setup
This section installs required dependencies, mounts Google Drive, defines data paths, and sets global settings such as the random seed and device. It prepares the environment so that all subsequent loading, training, and evaluation steps run consistently and reproducibly, both in Colab and locally.

In [None]:
# Install dependencies
# NOTE(review): `requirements.txt` is resolved relative to the current working
# directory. The Drive mount and the `%cd` into the project folder only happen
# in later cells, so confirm the file is reachable from here on a fresh kernel.
%pip install -r requirements.txt

In [None]:
# Commented out IPython magic to ensure Python compatibility.
%%capture
%pip install fiftyone==1.10.0 sympy==1.12 torch torchvision numpy open-clip-torch

## Imports

Here we import all libraries and modules needed for the final assessment: PyTorch, torchvision transforms, W&B, dataset utilities, training utilities, and the model classes used across all three stages. Centralizing imports keeps the notebook organized and ensures that each component is available when needed.

In [None]:
import os
from pathlib import Path
from google.colab import userdata

import wandb
import fiftyone as fo
from PIL import Image

import numpy as np
import matplotlib.pyplot as plt

import torch
import torchvision.transforms.v2 as transforms
from torch.utils.data import Dataset

In [None]:
# NOTE(review): this import lives outside the central imports cell.
from google.colab import drive
# Mount Google Drive so the project folder below becomes accessible.
drive.mount('/content/drive')

# Project root on the mounted Drive.
STORAGE_PATH = Path("/content/drive/MyDrive/Colab Notebooks/Applied Computer Vision/Applied-Computer-Vision-Projects/Multimodal_Learning_02/")
# Local Colab directory. Note that this is a plain `str`, not a `Path`.
TMP_STORAGE_PATH = "/content"

# The assessment data is read from the Drive copy by default.
DATA_PATH = STORAGE_PATH / "data/assessment"
# Alternative: read from the local copy instead. TMP_STORAGE_PATH is a str, so
# it has to be wrapped in Path(...) before the `/` operator can be used.
# DATA_PATH = Path(TMP_STORAGE_PATH) / "data/assessment"

In [None]:
!cp -r "/content/drive/MyDrive/Colab Notebooks/Applied Computer Vision/Applied-Computer-Vision-Projects/Multimodal_Learning_02/data" /content/data

In [None]:
# Change the working directory to the project folder on Drive.
# NOTE(review): presumably needed so that the `src` package imported in the
# next cell can be resolved — confirm.
%cd "/content/drive/MyDrive/Colab Notebooks/Applied Computer Vision/Applied-Computer-Vision-Projects/Multimodal_Learning_02/"

In [None]:
from src.utility import set_seeds

## Constants

We define key configuration values such as the seed, batch size, image size, number of workers, and label mappings. These constants ensure consistent behavior across all stages and make the hyperparameters easy to adjust or reference later.

In [None]:
# Target size handed to the Resize transform of the image pipeline.
IMG_SIZE = 64
# Seed passed to set_seeds() for reproducibility.
SEED = 51

# Name under which the FiftyOne dataset is created (and deleted on re-runs).
FIFTYONE_DATASET_NAME = "cilp_assessment"
# Class names; they double as the sub-directory names below DATA_PATH.
CLASSES = ["cubes", "spheres"]

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare last expression: the cell output shows whether CUDA was detected.
torch.cuda.is_available()

In [None]:
# Usage: Call this function at the beginning and before each training phase
# (set_seeds is imported from src.utility; its implementation is not part of
# this notebook).
set_seeds(SEED)

# Integration of Wandb

This section authenticates with Weights & Biases using the API key stored in Colab Secrets. Initializing W&B enables automatic logging of losses, metrics, hyperparameters, and summary statistics for all training stages. This satisfies the experiment-tracking requirement of the assessment.

In [None]:
# Load W&B API key from Colab Secrets and make it available as env variable.
# wandb.login() is called without arguments below, so the key has to be
# present in the environment before that call.
wandb_key = userdata.get('WANDB_API_KEY')
os.environ["WANDB_API_KEY"] = wandb_key
wandb.login()

In [None]:
# Disabled helper: the function below is wrapped in a string literal, so it is
# neither defined nor executed. It would plot a similarity matrix with
# matplotlib and log the figure to W&B under the key "similarity_matrix".
# NOTE(review): as the only expression in the cell, the string is echoed as the
# cell output — either re-enable the function or remove the cell.
'''
def log_similarity_matrix(sim_matrix, title="Similarity Matrix"):
    # sim_matrix: (N, N) tensor or ndarray
    sim = sim_matrix.detach().cpu().numpy() if hasattr(sim_matrix, "detach") else np.array(sim_matrix)

    fig, ax = plt.subplots()
    im = ax.imshow(sim, aspect="auto")
    plt.colorbar(im, ax=ax)
    ax.set_title(title)

    wandb.log({"similarity_matrix": wandb.Image(fig)})
    plt.close(fig)
'''

In [None]:
# Disabled helper: the function below is wrapped in a string literal, so it is
# neither defined nor executed. It would log up to `max_samples` images with a
# "true/pred" caption to W&B under the key "sample_predictions".
# NOTE(review): the images are converted to channel-first numpy arrays before
# being handed to wandb.Image — confirm that layout is accepted before
# re-enabling this function.
'''
def log_sample_predictions(images, true_labels, pred_labels, max_samples=5):
    samples = []
    for img, t, p in list(zip(images, true_labels, pred_labels))[:max_samples]:
        # assuming img is a tensor [C,H,W] in 0–1 range
        img_np = img.detach().cpu().numpy()
        caption = f"true: {t}, pred: {p}"
        samples.append(wandb.Image(img_np, caption=caption))

    wandb.log({"sample_predictions": samples})
'''

# Loading and preparation of Data

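This section defines the image preprocessing pipeline (`img_transforms`) and the `MyDataset` class, which loads the paired RGB images and LiDAR depth maps of every class into memory. It then verifies that each RGB file has a matching LiDAR file before the data is used.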

In [None]:
## Final: dynamic (normalisation statistics taken from the assessment dataset)
img_transforms = transforms.Compose(
    [
        # Wrap the input as an image tensor; no value scaling happens here
        transforms.ToImage(),
        transforms.Resize(IMG_SIZE),
        # Cast to float32; scale=True is the step that rescales the values
        transforms.ToDtype(dtype=torch.float32, scale=True),
        # Four entries per statistic, i.e. one per channel of a 4-channel image
        transforms.Normalize(
            mean=[0.0051, 0.0052, 0.0051, 1.0000],
            std=[5.8023e-02, 5.8933e-02, 5.8108e-02, 2.4509e-07],
        ),  ## assessment dataset
        # transforms.Normalize(mean.tolist(), std.tolist())     ## assessment dataset
    ]
)

In [None]:
class MyDataset(Dataset):
    """In-memory dataset of paired RGB images and LiDAR depth maps.

    Every sample whose file number lies in ``[start_idx, stop_idx)`` is read
    for each class in ``self.classes`` at construction time, moved to the
    notebook-level ``device`` and kept in Python lists.

    Expected layout on disk::

        <root_dir>/<class_name>/rgb/<NNNN>.png
        <root_dir>/<class_name>/lidar/<NNNN>.npy

    Args:
        root_dir: Dataset root, either a ``str`` or a ``pathlib.Path``. A
            trailing slash is optional.
        start_idx: First file number to load (inclusive).
        stop_idx: File number at which loading stops (exclusive).
    """

    def __init__(self, root_dir, start_idx, stop_idx):
        self.classes = ["cubes", "spheres"]
        self.root_dir = root_dir
        self.rgb = []
        self.lidar = []
        self.class_idxs = []

        for class_idx, class_name in enumerate(self.classes):
            for idx in range(start_idx, stop_idx):
                rgb_path, lidar_path = self._sample_paths(root_dir, class_name, idx)

                # Context manager closes the file handle once the pixels have
                # been converted to a tensor by the transform pipeline.
                with Image.open(rgb_path) as rgb_img:
                    self.rgb.append(img_transforms(rgb_img).to(device))

                # Prepend a channel axis to the 2-D depth map.
                lidar_depth = np.load(lidar_path)
                lidar_depth = torch.from_numpy(lidar_depth[None, :, :]).to(torch.float32).to(device)
                self.lidar.append(lidar_depth)

                # Label is stored as a float32 tensor of shape (1,).
                self.class_idxs.append(torch.tensor(class_idx, dtype=torch.float32)[None].to(device))

    @staticmethod
    def _sample_paths(root_dir, class_name, idx):
        """Return ``(rgb_path, lidar_path)`` for one sample.

        Paths are joined with ``pathlib`` so that ``root_dir`` may be a
        ``Path`` or a ``str`` with or without a trailing separator.
        """
        class_dir = Path(root_dir) / class_name
        file_number = "{:04d}".format(idx)
        return (
            class_dir / "rgb" / (file_number + ".png"),
            class_dir / "lidar" / (file_number + ".npy"),
        )

    def __len__(self):
        """Number of loaded samples across all classes."""
        return len(self.class_idxs)

    def __getitem__(self, idx):
        """Return the ``(rgb, lidar, class_idx)`` triple stored at ``idx``."""
        return self.rgb[idx], self.lidar[idx], self.class_idxs[idx]

# Verify Data Files

In [None]:
pairs = {}

# Walk through every class folder and pair up the RGB / LiDAR files that
# share the same file stem.
for class_name in CLASSES:
    class_dir = DATA_PATH / class_name
    RGB_DIR, LIDAR_DIR = class_dir / "rgb", class_dir / "lidar"

    # Both modality folders have to be present before going any further
    assert RGB_DIR.exists(), f"RGB directory not found: {RGB_DIR}"
    assert LIDAR_DIR.exists(), f"LIDAR directory not found: {LIDAR_DIR}"

    # Collect and report the files of each modality
    rgb_files = sorted(RGB_DIR.glob("*.png"))
    npy_files = sorted(LIDAR_DIR.glob("*.npy"))
    print(f"Found {len(rgb_files)} RGB images")
    print(f"Found {len(npy_files)} NPY LiDAR")

    # Stems that occur in both modalities form the usable pairs
    rgb_stems = {f.stem for f in rgb_files}
    npy_stems = {f.stem for f in npy_files}
    matching = rgb_stems.intersection(npy_stems)

    # One record per matched stem, holding the full path of both files
    pairs[class_name] = [
        dict(stem=stem, rgb=RGB_DIR / f"{stem}.png", lidar=LIDAR_DIR / f"{stem}.npy")
        for stem in sorted(matching)
    ]

    print(f"Matching pairs: {len(matching)}")

    if matching:
        print(f"\n✅ Ready to create dataset with {len(matching)} samples")
    else:
        print("\n⚠️  ERROR: No matching RGB/LIDAR pairs found!")

# Create FiftyOne Grouped Dataset
A grouped dataset allows us to associate each RGB image with its corresponding LiDAR depth map (stored as a `.npy` file).

In [None]:
# Start from a clean slate: drop a dataset of the same name left by an earlier run
if fo.dataset_exists(FIFTYONE_DATASET_NAME):
    print(f"Deleting existing dataset: {FIFTYONE_DATASET_NAME}")
    fo.delete_dataset(FIFTYONE_DATASET_NAME)

# Persistent dataset with a group field named "group"; "rgb" is the default slice
print(f"Creating new dataset: {FIFTYONE_DATASET_NAME}")
dataset = fo.Dataset(name=FIFTYONE_DATASET_NAME, persistent=True)
dataset.add_group_field("group", default="rgb")

print(f"✅ Created grouped dataset: {FIFTYONE_DATASET_NAME}")

# Add Samples to Dataset

For each matching RGB/LiDAR pair, we create a group with two slices:
- `rgb`: The camera image (`.png`)
- `lidar`: The LiDAR depth map (`.npy`)

In [None]:
samples = []

for class_name, class_pairs in pairs.items():
    # Singular label stored on the samples of this class
    label_str = "sphere" if class_name != "cubes" else "cube"

    for item in class_pairs:
        # One group per RGB/LiDAR pair; each slice gets its own sample and
        # its own Classification instance (RGB first, then LiDAR).
        group = fo.Group()
        samples += [
            fo.Sample(
                filepath=str(item[slice_name]),
                group=group.element(slice_name),
                label=fo.Classification(label=label_str),
            )
            for slice_name in ("rgb", "lidar")
        ]

# Insert everything into the dataset in a single call
dataset.add_samples(samples)

print(f"✅ Created dataset '{dataset.name}' with {len(dataset)} samples")
print("Group field:", dataset.group_field)
print("Group slices:", dataset.group_slices)

# Launch FiftyOne App

This will start the FiftyOne App and print its URL. In the App you can:
- Browse the RGB images together with their paired LiDAR samples
- Use the group slices dropdown to switch between modalities
- Filter samples by their `label` field (`cube` / `sphere`)
- Navigate through the dataset interactively

In [None]:
# Launch the App with auto=False and print the session URL so it can be
# opened manually.
# NOTE(review): presumably auto=False keeps the App from being embedded in the
# notebook output automatically — confirm against the FiftyOne docs.
session = fo.launch_app(dataset, auto=False)
print(session.url)

In [None]:
# Number of matched pairs per class, plus the grand total
total_per_class = dict((name, len(entries)) for name, entries in pairs.items())
total_samples = sum(total_per_class[name] for name in total_per_class)

print("Total samples per class:")
for cls, n in total_per_class.items():
    print("  " + f"{cls}: {n}")
print(f"\nTotal samples: {total_samples}")

In [None]:
# Inspect one example pair: the first entry of the first class in `pairs`
any_class = CLASSES[0]
sample = pairs[any_class][0]

sample_rgb_path, sample_lidar_path = sample["rgb"], sample["lidar"]

# RGB image: report its basic properties
rgb_img = Image.open(sample_rgb_path)
print(
    "RGB image:",
    f"  size (width, height): {rgb_img.size}",
    f"  mode: {rgb_img.mode}",
    f"  format: {rgb_img.format}",
    sep="\n",
)

# LiDAR depth map: report array shape and dtype
lidar = np.load(sample_lidar_path)
print(
    "\nLiDAR depth map:",
    f"  shape: {lidar.shape}",
    f"  dtype: {lidar.dtype}",
    sep="\n",
)

In [None]:
set_seeds(SEED)
train_ratio = 0.8

splits = {"train": {}, "val": {}}

# Sequential cut per class: the leading `train_ratio` share of the (sorted)
# pairs goes to train, the remainder to validation.
for cls, items in pairs.items():
    n = len(items)
    n_train = int(n * train_ratio)
    splits["train"][cls], splits["val"][cls] = items[:n_train], items[n_train:]

train_size = sum(map(len, splits["train"].values()))
val_size = sum(map(len, splits["val"].values()))

print("Train/validation sizes:")
for cls in CLASSES:
    print(f"  {cls}: train={len(splits['train'][cls])}, val={len(splits['val'][cls])}")
print(f"\nTotal train: {train_size}")
print(f"Total val:   {val_size}")

In [None]:
# --- Data ---
class_names = [*total_per_class]
counts_full = [total_per_class[c] for c in class_names]

train_counts = [len(splits["train"][c]) for c in class_names]
val_counts = [len(splits["val"][c]) for c in class_names]

# --- Plot ---
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 6))

# Left panel: class counts over the complete dataset
axes[0].bar(class_names, counts_full, color="steelblue")
axes[0].set(title="Class distribution (full dataset)", xlabel="Class", ylabel="Count")

# Right panel: grouped bars comparing the train and validation counts
x = range(len(class_names))
width = 0.35

axes[1].bar([pos - width / 2 for pos in x], train_counts, width=width, label="Train")
axes[1].bar([pos + width / 2 for pos in x], val_counts, width=width, label="Val")
axes[1].set_xticks(x)
axes[1].set_xticklabels(class_names)
axes[1].set(title="Train vs Validation", xlabel="Class")
axes[1].legend()

# Tidy up the layout, widen the gap between the panels and render
fig.tight_layout()
fig.subplots_adjust(wspace=0.4)
plt.show()

# Cleanup
To delete the dataset and free up space:

In [None]:
# Uncomment to delete the dataset
# (the dataset name constant in this notebook is FIFTYONE_DATASET_NAME)
# fo.delete_dataset(FIFTYONE_DATASET_NAME)
# print(f"Deleted dataset: {FIFTYONE_DATASET_NAME}")