<a href="https://colab.research.google.com/github/Mechanics-Mechatronics-and-Robotics/CV-2025/blob/main/Week_10/Contrastive_Learning_Tutorial_in_PyTorch_with_Point_Clouds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Contrastive Learning Tutorial in PyTorch with Point Clouds



## Installation / Setup

In [63]:
import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.version.cuda}")
print(f"CUDA available: {torch.cuda.is_available()}")

import sys
print(f"Python: {sys.version}")

PyTorch: 2.6.0+cu124
CUDA: 12.4
CUDA available: True
Python: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]


In [64]:
# Install PyG and its dependencies (for PyTorch 2.6.0 + CUDA 12.4)
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.6.0+cu124.html
!pip install torch_geometric==2.6.0  # Match PyG version to PyTorch

Looking in links: https://data.pyg.org/whl/torch-2.6.0+cu124.html
[31mERROR: Could not find a version that satisfies the requirement pyg_lib (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pyg_lib[0m[31m


In [65]:
import torch_geometric
from torch_geometric.data import Data
# from torch_geometric.datasets import ShapeNet

print(f"PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}")
print(f"PyG: {torch_geometric.__version__}")

PyTorch: 2.6.0+cu124, CUDA: True
PyG: 2.6.0


In [66]:
# # First install PyTorch with CUDA 11.8 (current stable for Colab)
# !pip install torch==2.3.0+cu118 torchvision==0.18.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118

# # Install PyG dependencies
# !pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cu118.html

# # Finally install torch-geometric
# !pip install torch_geometric==2.5.3

# # Verify installation
# import torch
# from torch_geometric.data import Data

# print(f"Torch version: {torch.__version__}")
# print(f"CUDA available: {torch.cuda.is_available()}")

In [67]:
# # First install PyTorch with CUDA 12.1
# !pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121

# # Install PyG dependencies with explicit CUDA 12.1 wheels
# !pip install -q torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.6.0+cu121.html

# # Verify installation
# try:
#     import torch_geometric
#     print("PyG successfully installed!")
# except ImportError:
#     print("Installation failed")

In [68]:
# # We will use conda for easier installation of PyG
# # If only using pip, it somehow takes forever to install on colab
# !pip install -q condacolab
# import condacolab
# condacolab.install()

In [69]:
# # Install torch geometric for point-cloud layers
# import torch
# version = f"https://data.pyg.org/whl/torch-{torch.__version__}.html"
# try:
#     import torch_geometric
# except:
#     !echo $version
#     !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f $version
#     import torch_geometric

✋ **Note: You'll need to restart your runtime and execute the two cells again.** ✋

## Dataset

Goal:
- Self-Supervised Representation Learning of Shapes
- Can be used for downstream tasks like clustering, fine-tuning, outlier-detection, ...
- Pointcloud = Set of unconnected nodes --> PyG
- [ShapeNet Dataset](https://paperswithcode.com/dataset/shapenet) - we just use a subset of classes and act like we didn't have labels
- I select 5k data points as otherwise I run out of memory on Colab



In [70]:
# from torch_geometric.datasets import ModelNet
# dataset = ModelNet(root='./modelnet10', name='10', train=True)  # 10 categories, 100 samples
# print(dataset[0])  # Each sample has ~2,500 points
# print("Number of Samples: ", len(dataset))

In [71]:
from torch_geometric.datasets import ModelNet
from torch_geometric.transforms import SamplePoints
import numpy as np

# Load full dataset (no [:] slicing yet)
full_dataset = ModelNet(
    root='./modelnet10',
    name='10',
    train=True,
    transform=SamplePoints(1024)
)

# Get balanced subset (1 sample per class)
balanced_indices = []
for class_idx in range(10):
    class_mask = np.array([data.y.item() == class_idx for data in full_dataset])
    indices = np.where(class_mask)[0]
    balanced_indices.append(indices[0])  # Take first sample per class

dataset = full_dataset[balanced_indices]  # Now has all 10 classes

# Verify distribution
CLASS_NAMES = ['bathtub', 'bed', 'chair', 'desk', 'dresser',
               'monitor', 'night_stand', 'sofa', 'table', 'toilet']
cat_dict = {name: 0 for name in CLASS_NAMES}
for data in dataset:
    cat_dict[CLASS_NAMES[data.y.item()]] += 1

print("Balanced class distribution:")
print(cat_dict)

Balanced class distribution:
{'bathtub': 1, 'bed': 1, 'chair': 1, 'desk': 1, 'dresser': 1, 'monitor': 1, 'night_stand': 1, 'sofa': 1, 'table': 1, 'toilet': 1}


In [72]:
# from huggingface_hub import notebook_login
# notebook_login()  # Follow the link to get your token from https://huggingface.co/settings/tokens

In [73]:
# from datasets import load_dataset
# dataset = load_dataset("ShapeNet/ShapeNetCore", data_files="02773838.zip/")

# # # Requires accepting terms at https://huggingface.co/datasets/ShapeNet/ShapeNetCore
# dataset = load_dataset("ShapeNet/ShapeNetCore",
#                       use_auth_token=True)  # Uses your logged-in token

In [74]:
# # First fix torch CUDA mismatches (critical!)
# !pip install torch==2.6.0+cu124 torchvision==0.17.0+cu124 --index-url https://download.pytorch.org/whl/cu124

# # Then install datasets with compatible fsspec
# !pip install datasets==2.15.0 fsspec==2023.9.0  # Version pin to avoid conflicts

In [75]:
# from torch_geometric.datasets import ShapeNet
# # Limit to 5000 samples, due to RAM restrictions
# dataset = ShapeNet(root=".", categories=["Table", "Lamp", "Guitar", "Motorbike"]).shuffle()[:5000]
# print("Number of Samples: ", len(dataset))
# print("Sample: ", dataset[0])

Attribute Name  | Description
-------------------|------------------
Pos       | Normalized positions as 3D coordinates
X       |  Normal vectors
Y       | Class label

In [76]:
dataset[0].pos

tensor([[ 29.1622, -27.5397,   2.3750],
        [ 14.4805,   7.3108, -21.6250],
        [  1.2347, -17.7123, -18.0499],
        ...,
        [ 25.2320,  -2.4805, -10.2863],
        [ 32.7812, -37.9010, -10.4150],
        [ -6.7688, -23.7728,  -6.4172]])

Let's use plotly to inspect the data ...

In [2]:
!pip install -q plotly
import plotly.graph_objects as go
from torch_geometric.datasets import ModelNet
from torch_geometric.transforms import SamplePoints

# ModelNet10 class names (in order)
CLASS_NAMES = [
    'bathtub', 'bed', 'chair', 'desk', 'dresser',
    'monitor', 'night_stand', 'sofa', 'table', 'toilet'
]

# Load data with point sampling
dataset = ModelNet(
    root='./modelnet10',
    name='10',
    train=True,
    transform=SamplePoints(1024)  # 1024-point cloud
)[:10]

def plot_interactive(shape, class_names=CLASS_NAMES):
    pos = shape.pos.numpy()
    class_label = class_names[shape.y.item()]  # Get class name

    fig = go.Figure(data=[
        go.Scatter3d(
            x=pos[:, 0], y=pos[:, 1], z=pos[:, 2],
            mode='markers',
            marker=dict(size=3, opacity=0.8),
            name=f'Class: {class_label}'  # Show label in hover
        )
    ])

    fig.update_layout(
        title=f'ModelNet10: {class_label} (Points: {len(pos)})',
        scene=dict(
            xaxis_title='X',
            yaxis_title='Y',
            zaxis_title='Z'
        ),
        width=800,
        height=600
    )

    fig.show()

# Plot first sample (with title)
plot_interactive(dataset[0])

In [78]:
# #!pip install plotly --quiet
# import plotly.express as px

# def plot_3d_shape(shape):
#     print("Number of data points: ", shape.x.shape[0])
#     x = shape.pos[:, 0]
#     y = shape.pos[:, 1]
#     z = shape.pos[:, 2]
#     fig = px.scatter_3d(x=x, y=y, z=z, opacity=0.3)
#     fig.show()

# # Pick a sample
# sample_idx = 3
# plot_3d_shape(dataset[sample_idx])

In [79]:
# # Let's check the distribution of classes
# cat_dict = {key: 0 for key in dataset.categories}
# for datapoint in dataset: cat_dict[dataset.categories[datapoint.category.int()]]+=1
# cat_dict

In [3]:
# ModelNet10 class names (ordered by label index)
CLASS_NAMES = [
    'bathtub', 'bed', 'chair', 'desk', 'dresser',
    'monitor', 'night_stand', 'sofa', 'table', 'toilet'
]

# Initialize dictionary with class names
cat_dict = {class_name: 0 for class_name in CLASS_NAMES}

# Count occurrences
for data in dataset:
    class_idx = data.y.item()  # Get numeric label (0-9)
    class_name = CLASS_NAMES[class_idx]
    cat_dict[class_name] += 1

print("Class distribution:")
print(cat_dict)

Class distribution:
{'bathtub': 10, 'bed': 0, 'chair': 0, 'desk': 0, 'dresser': 0, 'monitor': 0, 'night_stand': 0, 'sofa': 0, 'table': 0, 'toilet': 0}


In [82]:
plot_interactive(dataset[10])

IndexError: range object index out of range

## Data Preparation

- In some scenarios it makes sense to pre-compute the augmentations (for example if heavy computations are involved)
- This would require to store multiple Data Points in one Data Object, which is possible in PyTorch
- Here we will compute the augmentations on the fly and use the below transformations for this
- Later, for each data point we will need 2 augmentations (positive pair)
- What are good augmentations for Point Clouds?
    - Rotation (if the used layer is not rotation invariant)
    - Jittering (can be seen as adding noise to the coordinates)
    - Shifting / Shearing
    - ... many more




In [None]:
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T

data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# We're lucky and pytorch geometric helps us with pre-implemented transforms
# which can also be applied on the whole batch directly
augmentation = T.Compose([T.RandomJitter(0.03), T.RandomFlip(1), T.RandomShear(0.2)])

Let's have a look at some samples ...

In [None]:
# Original data point
sample = next(iter(data_loader))
plot_3d_shape(sample[0])

In [None]:
# Augmented data point
transformered = augmentation(sample)
plot_3d_shape(transformered[0])

## Model

- Different choices for Point Cloud Feature-Learning layers (PointNet, PointNet++, EdgeConv, PointTransformer, ...)
- In PyTorch geometric we find an implementation of DynamicEdgeConv
- It uses the parameter k to detect the nearest neighbors which form a subgraph
- If you have many points, you can also sample a subset
- In the paper they use 4 layers, here we just have 2
- Implementation is inspired by [this PyG example](https://github.com/pyg-team/pytorch_geometric/blob/a6e349621d4caf8b381fe58f8e57109b2d0947ed/examples/dgcnn_segmentation.py)
- We only apply augmentations during training






In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import MLP, DynamicEdgeConv, global_max_pool


class Model(torch.nn.Module):
    def __init__(self, k=20, aggr='max'):
        super().__init__()
        # Feature extraction
        self.conv1 = DynamicEdgeConv(MLP([2 * 3, 64, 64]), k, aggr)
        self.conv2 = DynamicEdgeConv(MLP([2 * 64, 128]), k, aggr)
        # Encoder head
        self.lin1 = Linear(128 + 64, 128)
        # Projection head (See explanation in SimCLRv2)
        self.mlp = MLP([128, 256, 32], norm=None)

    def forward(self, data, train=True):
        if train:
            # Get 2 augmentations of the batch
            augm_1 = augmentation(data)
            augm_2 = augmentation(data)

            # Extract properties
            pos_1, batch_1 = augm_1.pos, augm_1.batch
            pos_2, batch_2 = augm_2.pos, augm_2.batch

            # Get representations for first augmented view
            x1 = self.conv1(pos_1, batch_1)
            x2 = self.conv2(x1, batch_1)
            h_points_1 = self.lin1(torch.cat([x1, x2], dim=1))

            # Get representations for second augmented view
            x1 = self.conv1(pos_2, batch_2)
            x2 = self.conv2(x1, batch_2)
            h_points_2 = self.lin1(torch.cat([x1, x2], dim=1))

            # Global representation
            h_1 = global_max_pool(h_points_1, batch_1)
            h_2 = global_max_pool(h_points_2, batch_2)
        else:
            x1 = self.conv1(data.pos, data.batch)
            x2 = self.conv2(x1, data.batch)
            h_points = self.lin1(torch.cat([x1, x2], dim=1))
            return global_max_pool(h_points, data.batch)

        # Transformation for loss function
        compact_h_1 = self.mlp(h_1)
        compact_h_2 = self.mlp(h_2)
        return h_1, h_2, compact_h_1, compact_h_2

Possible improvement: Only pass once through model by stacking augmentations

## Training

- We use InfoNCE / NT-Xent Loss implemented in pytorch metric learning library
- Temperature allows to balance the similarity measure (make it more peaked)
- Typical values are around 0.1 / 0.2

In [None]:
# See https://kevinmusgrave.github.io/pytorch-metric-learning/losses/#ntxentloss
!pip install pytorch-metric-learning -q

from pytorch_metric_learning.losses import NTXentLoss
loss_func = NTXentLoss(temperature=0.10)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

# Use a large batch size (might lead to RAM issues)
# Free Colab Version has ~ 12 GB of RAM
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)


- No test dataset, as the evaluation can be done "downstream"
- The compact representations go into the loss function
- During test time no augmentations are applied and we can use the output representations

In [None]:
import tqdm

def train():
    model.train()
    total_loss = 0
    for _, data in enumerate(tqdm.tqdm(data_loader)):
        data = data.to(device)
        optimizer.zero_grad()
        # Get data representations
        h_1, h_2, compact_h_1, compact_h_2 = model(data)
        # Prepare for loss
        embeddings = torch.cat((compact_h_1, compact_h_2))
        # The same index corresponds to a positive pair
        indices = torch.arange(0, compact_h_1.size(0), device=compact_h_2.device)
        labels = torch.cat((indices, indices))
        loss = loss_func(embeddings, labels)
        loss.backward()
        total_loss += loss.item() * data.num_graphs
        optimizer.step()
    return total_loss / len(dataset)

for epoch in range(1, 4):
    loss = train()
    print(f'Epoch {epoch:03d}, Loss: {loss:.4f}')
    scheduler.step()

## Evaluation of the Embeddings

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Get sample batch
sample = next(iter(data_loader))

# Get representations
h = model(sample.to(device), train=False)
h = h.cpu().detach()
labels = sample.category.cpu().detach().numpy()

# Get low-dimensional t-SNE Embeddings
h_embedded = TSNE(n_components=2, learning_rate='auto',
                   init='random').fit_transform(h.numpy())

# Plot
ax = sns.scatterplot(x=h_embedded[:,0], y=h_embedded[:,1], hue=labels,
                    alpha=0.5, palette="tab10")

# Add labels to be able to identify the data points
annotations = list(range(len(h_embedded[:,0])))

def label_points(x, y, val, ax):
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for i, point in a.iterrows():
        ax.text(point['x']+.02, point['y'], str(int(point['val'])))

label_points(pd.Series(h_embedded[:,0]),
            pd.Series(h_embedded[:,1]),
            pd.Series(annotations),
            plt.gca())

Let's find the most similar and most different data points ...

[Source](https://stackoverflow.com/questions/50411191/how-to-compute-the-cosine-similarity-in-pytorch-for-all-rows-in-a-matrix-with-re)

In [None]:
import numpy as np

def sim_matrix(a, b, eps=1e-8):
    """
    Eps for numerical stability
    """
    a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None]
    a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n))
    b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n))
    sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1))
    return sim_mt

similarity = sim_matrix(h, h)
max_indices = torch.topk(similarity, k=2)[1][:, 1]
max_vals  = torch.topk(similarity, k=2)[0][:, 1]

# Select index
idx = 17
similar_idx = max_indices[idx]
print(f"Most similar data point in the embedding space for {idx} is {similar_idx}")

Categories are: "Table", "Lamp", "Guitar", "Motorbike", "Skateboard"

**Note**: This is only based on the data in the current batch!

In [None]:
plot_3d_shape(sample[idx].cpu())

In [None]:
plot_3d_shape(sample[similar_idx].cpu())

This confirms that our embedding space has a proper arrangement and that our contrastive loss separated different entities successfully.