<a href="https://colab.research.google.com/github/KoniHD/hw2/blob/main/notebooks/hw2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

## Clone Project

In [None]:
import os

if "google.colab" in str(get_ipython()):
    if not os.path.exists("hw2/pyproject.toml"):
        print("Repo doesn't exist yet. Cloning from github ...")
        !git clone --quiet --depth 1 https://github.com/KoniHD/hw2.git
        os.chdir("hw2")
        !uv pip install --quiet -r requirements.txt --system
        os.chdir("..")
        print("Cloned Repo successfully!")
        os.kill(os.getpid(), 9)  # Restart kernel to make modules available
else:
    os.chdir("..")
    !uv sync

Requires a **restart** of the notebook kernel so that the newly installed packages become active.

In [None]:
import os

# Report whether the project files are reachable from the working directory.
# On Colab the clone lives in ./hw2; on a local run the setup cell has already
# moved into the project root, so pyproject.toml sits directly in the cwd.
repo_found = os.path.exists("hw2/pyproject.toml") or os.path.exists("pyproject.toml")
print(f"{'Success!' if repo_found else 'Failed to clone Repo!'}")

## Download Dataset

In [None]:
# Fetch data
if not os.path.exists("data"):
    !mkdir -p data
    !wget -q -P data/ https://s3.amazonaws.com/video.udacity-data.com/topher/2018/May/5aea1b91_train-test-data/train-test-data.zip
    !unzip -q -n data/train-test-data.zip -d data

## Import libraries

In [None]:
import glob

import matplotlib.pyplot as plt
from huggingface_hub import upload_file
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import (
    BackboneFinetuning,
    EarlyStopping,
    ModelCheckpoint,
)
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
from torch.utils.data import DataLoader
from torchvision import transforms

from data.custom_transforms import (
    Normalize,
    RandomCrop,
    Rescale,
    ToTensor,
)
from data.facial_keypoints_dataset import (
    FacialKeypointsDataset,
    FacialKeypointsHeatmapDataset,
)
from keypoint_task import KeypointDetection
from models.resnet18 import ResNetKeypointDetector
from models.simple_cnn import Simple_CNN
from models.unet import UNetKeypointDetector
from utils import visualize_batch, visualize_heatmaps, visualize_loss_curve

# Hugging Face access token used by the optional upload cells further down.
if "google.colab" in str(get_ipython()):
    from google.colab import userdata

    hf_token = userdata.get("HF_TOKEN")
else:
    # Outside Colab the name was never bound, so the upload cells raised a
    # NameError. Fall back to the HF_TOKEN environment variable; if it is unset
    # this is None and huggingface_hub falls back to a locally stored login.
    hf_token = os.environ.get("HF_TOKEN")

## Set Hyperparameters

In [None]:
# Hyperparameters for Part 1 (Simple CNN), grouped by concern.
config = dict(
    # --- data ---
    batch_size=16,
    img_size=224,
    # --- model ---
    out_dim=136,
    activation="relu",
    dropout=0.3,
    batch_norm=True,
    # --- training ---
    lr=4e-3,
    max_epochs=30,
    criterion="mse",
    random_seed=42,
    patience=5,
    optimizer="adam",
)

## Load Data and visualize

In [None]:
# Seed the random number generators (and DataLoader workers) so runs are as
# reproducible as possible.
seed_everything(config["random_seed"], workers=True)

# Transform pipeline. Order matters: rescale first, then take the smaller crop.
data_transform = transforms.Compose(
    [Rescale(250), RandomCrop(config["img_size"]), Normalize(), ToTensor()]
)

training_keypoints_csv_path = os.path.join("data", "training_frames_keypoints.csv")
training_data_dir = os.path.join("data", "training")
test_keypoints_csv_path = os.path.join("data", "test_frames_keypoints.csv")
test_data_dir = os.path.join("data", "test")


# create the transformed dataset
transformed_dataset = FacialKeypointsDataset(
    csv_file=training_keypoints_csv_path,
    root_dir=training_data_dir,
    transform=data_transform,
)

# load training data in batches
train_loader = DataLoader(
    transformed_dataset, batch_size=config["batch_size"], shuffle=True, num_workers=2
)  # num_workers changed to Colab recommended number

# creating the test dataset
# NOTE(review): the test set shares the training transform, including
# RandomCrop, so test samples are not pixel-identical between iterations.
test_dataset = FacialKeypointsDataset(
    csv_file=test_keypoints_csv_path, root_dir=test_data_dir, transform=data_transform
)

# loading test data in batches
test_loader = DataLoader(
    test_dataset, batch_size=config["batch_size"], shuffle=False, num_workers=2
)  # num_workers changed to Colab recommended number, shuffle changed to False

# One fixed test batch, reused by the visualisation cells further down.
test_batch = next(iter(test_loader))

# Preview the first sample of that batch. Reusing `test_batch` avoids starting a
# second set of loader workers just to read one element, and shows exactly the
# data the later visualisations operate on.
image = test_batch["image"][0]
keypoints = test_batch["keypoints"][0]
_, h, w = image.shape
# plot the image black and white (channel-first -> channel-last for matplotlib)
plt.imshow(image.numpy().transpose(1, 2, 0), cmap="gray")
# Map keypoints back to pixel coordinates; assumes they are normalised to
# roughly [-1, 1] around the image centre — TODO confirm against Normalize().
plt.scatter(
    keypoints[:, 0] * (w / 2) + (w / 2),
    keypoints[:, 1] * (h / 2) + (h / 2),
    c="r",
    s=20,
)
plt.show()
print(f"Image min/max:   {image.min():.4f} / {image.max():.4f}")

# Data Exploration & Sanity Checks

Observe basic dataset characteristics and sanity check via **model overfitting**.

In [None]:
# Quick range check on one training batch: image intensities and keypoints.
print("===Metrics of first batch===")
batch = next(iter(train_loader))
images = batch["image"]
keypoints = batch["keypoints"]

img_lo, img_hi = images.min(), images.max()
kp_lo, kp_hi = keypoints.min(), keypoints.max()

# Bounds are compared after rounding to one decimal, i.e. with a small tolerance.
images_in_range = bool(img_lo.round(decimals=1) >= 0) and bool(
    img_hi.round(decimals=1) <= 1
)
keypoints_in_range = bool(kp_lo.round(decimals=1) >= -1) and bool(
    kp_hi.round(decimals=1) <= 1
)

print(f"Image shape:\t\t{images.shape}")
print(
    f"Image min/max:\t\t{img_lo:.4f} / {img_hi:.4f}"
    f"\t\twithin [0, 1]: {images_in_range}"
)
print(
    f"Keypoints min/max:\t{kp_lo:.4f} / {kp_hi:.4f}"
    f"\twithin [-1, 1]: {keypoints_in_range}"
)

# Training

## Part 1: Direct Coordinate Regression

**Optional:** Load existing model

In [None]:
# Optional shortcut: pull a previously trained Simple_CNN from the Hugging Face
# Hub and wrap it in the Lightning task instead of training from scratch.
simple_cnn = Simple_CNN.from_pretrained("KoniHD/Simple_CNN")
keypoint_task = KeypointDetection(
    model=simple_cnn,
    lr=config["lr"],
    criterion=config["criterion"],
    patience=config["patience"],
    optimizer=config["optimizer"],
    activation=config["activation"],
    # Keyword was misspelled `droput`; spelled `dropout` to match the ResNet18
    # and U-Net cells of this notebook.
    dropout=config["dropout"],
    batch_norm=config["batch_norm"],
)

### Overfitting

Evaluate whether the model architecture is sufficient.
(Can be skipped when loading existing model.)

In [None]:
# Sanity check: a sufficiently expressive model should drive the loss on one
# single batch towards zero.
default_exp_dir = "exp/simple_cnn/"

# Model
simple_cnn = Simple_CNN(
    out_dim=config["out_dim"],
    activation=config["activation"],
    dropout=0.0,  # No dropout when overfitting
    batch_norm=False,  # No batch_norm while overfitting
)

# Lightning Wrapper
keypoint_task = KeypointDetection(
    model=simple_cnn,
    lr=config["lr"],
    criterion=config["criterion"],
    patience=config["patience"],
    optimizer=config["optimizer"],
    activation=config["activation"],
    # Keyword was misspelled `droput`; spelled `dropout` to match the ResNet18
    # and U-Net cells of this notebook.
    dropout=0.0,
    batch_norm=False,
)

# overfit_batches=1 trains on one and the same batch every epoch; logging and
# checkpointing are disabled because this run is only a diagnostic.
trainer = Trainer(
    max_epochs=200,
    accelerator="auto",
    deterministic="warn",
    logger=False,
    default_root_dir=default_exp_dir,
    detect_anomaly=True,
    overfit_batches=1,
    enable_autolog_hparams=False,
    enable_checkpointing=False,
)
trainer.fit(keypoint_task, train_dataloaders=train_loader)

metrics = trainer.callback_metrics
print(f"\n\n=============\nFinal train loss: {metrics['train_loss']:.4f}")

Visualize overfitting results

In [None]:
# Run the current `simple_cnn` on the fixed test batch via the project's
# `visualize_batch` helper (presumably plots predictions — see utils).
visualize_batch(model=simple_cnn, batch=test_batch)

### Real training

Start with a clean model. (Can be skipped when loading an existing model.)

In [None]:
# Fresh, untrained model for the real training run, using the regularisation
# settings from `config` (the overfitting run above disabled them).
# Model
simple_cnn = Simple_CNN(
    out_dim=config["out_dim"],
    activation=config["activation"],
    dropout=config["dropout"],
    batch_norm=config["batch_norm"],
)

# Lightning Wrapper
keypoint_task = KeypointDetection(
    model=simple_cnn,
    lr=config["lr"],
    criterion=config["criterion"],
    patience=config["patience"],
    optimizer=config["optimizer"],
    activation=config["activation"],
    # Keyword was misspelled `droput`; spelled `dropout` to match the ResNet18
    # and U-Net cells of this notebook.
    dropout=config["dropout"],
    batch_norm=config["batch_norm"],
)

In [None]:
# Part 1 training run: fit the Simple CNN, reload the best checkpoint, then
# report metrics and plot predictions plus the loss curve.
version = 0

# Declared here as well so this cell does not depend on the overfitting cell,
# which the notebook says may be skipped when loading an existing model.
default_exp_dir = "exp/simple_cnn/"

# keypoint_task = torch.compile(keypoint_task, mode="reduce-overhead") # Can only be used when TensorBoard log_graph=False

# Keep the best checkpoint (lowest training loss) plus the last one.
checkpoint_callback = ModelCheckpoint(
    dirpath=default_exp_dir + f"version_{version}",
    filename="simple-cnn",
    monitor="train_loss",  # Do not monitor on test set!!
    mode="min",
    save_top_k=1,
    save_last=True,
    save_weights_only=True,
    enable_version_counter=True,
)

# Stop once the training loss has not improved by at least `min_delta` for
# `patience` consecutive checks.
earlystopping_callback = EarlyStopping(
    monitor="train_loss",  # Do not monitor on test set!!
    patience=config["patience"],
    mode="min",
    min_delta=0.001,
)

trainer = Trainer(
    accelerator="auto",
    logger=[
        TensorBoardLogger(
            default_exp_dir,
            name="",  # necessary to save logs to the same directory as CSVLogger
            version=f"version_{version}",
            log_graph=True,
            default_hp_metric=False,
        ),
        CSVLogger(default_exp_dir, name="", version=f"version_{version}"),
    ],
    max_epochs=config["max_epochs"],
    callbacks=[checkpoint_callback, earlystopping_callback],
    deterministic="warn",  # Used for attempted reproducibility
    default_root_dir=default_exp_dir,
    num_sanity_val_steps=0,
    enable_checkpointing=True,
)

# The test loader doubles as the validation loader; it is only reported, never
# used for checkpoint selection or early stopping (both monitor train_loss).
trainer.fit(keypoint_task, train_dataloaders=train_loader, val_dataloaders=test_loader)

# Reload the best checkpoint, passing the existing `simple_cnn` instance as the
# task's model.
keypoint_task = KeypointDetection.load_from_checkpoint(
    checkpoint_callback.best_model_path, weights_only=True, model=simple_cnn
)

# Metrics are those recorded by the trainer at the end of fitting.
metrics = trainer.callback_metrics
print(
    f"\n\n=============\nFinal train loss: {metrics['train_loss']:.4f}\nFinal val loss: {metrics['val_loss']:.4f}"
)

visualize_batch(model=simple_cnn, batch=test_batch)
visualize_loss_curve(
    logs=f"{default_exp_dir}version_{version}/metrics.csv",
    title=f"Part 1: Simple CNN Training Curve Version {version}",
)

### Tensorboard

In [None]:
# (Re)load the TensorBoard notebook extension and open it inline on the
# directory currently stored in `default_exp_dir`.
%reload_ext tensorboard
%tensorboard --logdir {default_exp_dir}

**Optional:** Save model weights and training log to huggingface for reproducibility.

In [None]:
# Push the trained Simple CNN and its TensorBoard log to the Hugging Face Hub.
model_to_save = getattr(
    simple_cnn, "_orig_mod", simple_cnn
)  # make sure to upload the non-compiled model
model_to_save.push_to_hub(
    "KoniHD/Simple_CNN",
    config=config,
    commit_message=f"Training run version: {version}",
    private=True,
    token=hf_token,
)

# Locate the event file(s) of this run. Fail with a clear message when there
# is none (instead of an IndexError), and take the most recently written file
# when several exist, since glob returns matches in arbitrary order.
event_files = glob.glob(f"{default_exp_dir}version_{version}/*.tfevents.*")
if not event_files:
    raise FileNotFoundError(
        f"No TensorBoard event file found in {default_exp_dir}version_{version}; "
        "run the training cell first."
    )
tfevents_file = max(event_files, key=os.path.getmtime)

upload_file(
    path_or_fileobj=tfevents_file,
    path_in_repo=f"logs/run{version + 1}/events.out.tfevents",
    repo_id="KoniHD/Simple_CNN",
    token=hf_token,
    repo_type="model",
    commit_message=f"Logs from run no. {version}",
)

## Part 2: Transfer Learning for Keypoint Detection

### Set new hyperparameters

In [None]:
# Experiment directory and hyperparameters for Part 2 (ResNet18 fine-tuning).
# The directory spelling ("restnet") is kept exactly as it is.
default_exp_dir = "exp/restnet/"

config = dict(
    # --- data ---
    batch_size=16,
    img_size=224,
    # --- model ---
    grayScale=True,
    out_dim=136,
    # --- training ---
    lr=4e-3,
    max_epochs=50,
    criterion="mse",
    random_seed=42,
    patience=10,
    optimizer="adam",
    pretrained_backbone=True,
)

### Fine-tuning ResNet18

In [None]:
# Part 2 training run: fine-tune ResNet18, reload the best checkpoint, then
# report metrics and plot predictions plus the loss curve.
version = 0
run_name = f"version_{version}"
run_dir = default_exp_dir + run_name

resnet18 = ResNetKeypointDetector(
    out_dim=config["out_dim"],
    grayScale=config["grayScale"],
)

task_settings = dict(
    lr=config["lr"],
    criterion=config["criterion"],
    patience=config["patience"],
    optimizer=config["optimizer"],
    pretrained_backbone=config["pretrained_backbone"],
    activation="relu",  # Predefined by ResNet18 architecture
    dropout=0.3,  # Predefined by ResNet18 architecture
    batch_norm=True,  # Predefined by ResNet18 architecture
)
keypoint_task = KeypointDetection(model=resnet18, **task_settings)

# Checkpointing and early stopping both track the training loss (lower is better).
monitored = dict(monitor="train_loss", mode="min")

checkpoint_callback = ModelCheckpoint(
    dirpath=run_dir,
    filename="resnet18",
    save_top_k=1,
    save_last=True,
    save_weights_only=True,
    enable_version_counter=True,
    **monitored,
)

earlystopping_callback = EarlyStopping(
    patience=config["patience"],
    min_delta=0.001,
    **monitored,
)

# Backbone fine-tuning schedule; the backbone is unfrozen at epoch 8.
finetuning_callback = BackboneFinetuning(
    unfreeze_backbone_at_epoch=8,
    lambda_func=lambda epoch: 1.05,
    backbone_initial_ratio_lr=0.1,
    should_align=True,
)

tensorboard_logger = TensorBoardLogger(
    default_exp_dir,
    name="",  # necessary to save logs to the same directory as CSVLogger
    version=run_name,
    log_graph=True,
    default_hp_metric=False,
)
csv_logger = CSVLogger(default_exp_dir, name="", version=run_name)

trainer = Trainer(
    accelerator="auto",
    logger=[tensorboard_logger, csv_logger],
    max_epochs=config["max_epochs"],
    callbacks=[checkpoint_callback, earlystopping_callback, finetuning_callback],
    deterministic="warn",
    default_root_dir=default_exp_dir,
    enable_checkpointing=True,
)

trainer.fit(
    keypoint_task,
    train_dataloaders=train_loader,
    val_dataloaders=test_loader,
)

# Reload the best checkpoint, passing the existing `resnet18` instance as the
# task's model.
best_checkpoint = checkpoint_callback.best_model_path
keypoint_task = KeypointDetection.load_from_checkpoint(
    best_checkpoint, weights_only=True, model=resnet18
)

metrics = trainer.callback_metrics
print(
    "\n\n=============\n"
    f"Final train loss: {metrics['train_loss']:.4f}\n"
    f"Final val loss: {metrics['val_loss']:.4f}"
)

visualize_batch(model=resnet18, batch=test_batch)
visualize_loss_curve(
    logs=f"{run_dir}/metrics.csv",
    title=f"Part 2: ResNet18 Fine-tuning Training Curve Version {version}",
)


### Tensorboard

In [None]:
# (Re)load the TensorBoard notebook extension and open it inline on the
# directory currently stored in `default_exp_dir`.
%reload_ext tensorboard
%tensorboard --logdir {default_exp_dir}

**Optional:** Save results to huggingface

In [None]:
# Push the fine-tuned ResNet18 and its TensorBoard log to the Hugging Face Hub.
model_to_save = getattr(
    resnet18, "_orig_mod", resnet18
)  # make sure to upload the non-compiled model
model_to_save.push_to_hub(
    "KoniHD/Fine-Tuned-ResNet18",
    config=config,
    commit_message=f"Training run version: {version}",
    private=True,
    token=hf_token,
)

# Locate the event file(s) of this run. Fail with a clear message when there
# is none (instead of an IndexError), and take the most recently written file
# when several exist, since glob returns matches in arbitrary order.
event_files = glob.glob(f"{default_exp_dir}version_{version}/*.tfevents.*")
if not event_files:
    raise FileNotFoundError(
        f"No TensorBoard event file found in {default_exp_dir}version_{version}; "
        "run the training cell first."
    )
tfevents_file = max(event_files, key=os.path.getmtime)

upload_file(
    path_or_fileobj=tfevents_file,
    path_in_repo=f"logs/run{version + 1}/events.out.tfevents",
    repo_id="KoniHD/Fine-Tuned-ResNet18",
    token=hf_token,
    repo_type="model",
    commit_message=f"Logs from run no. {version}",
)

## Part 3: Heatmap-based Keypoint Detection

### Prepare new dataset

In [None]:
# Experiment directory and hyperparameters for Part 3 (U-Net heatmap regression).
default_exp_dir = "exp/unet/"

config = dict(
    # --- data ---
    batch_size=16,
    img_size=224,
    output_size=128,
    # --- model ---
    num_predictions=68,
    base_features=64,
    activation="relu",
    batch_norm=True,
    # --- training ---
    lr=4e-3,
    max_epochs=50,
    criterion="bce",
    random_seed=42,
    patience=10,
    optimizer="adam",
)

In [None]:
# create the transformed dataset
transformed_heatmap_dataset = FacialKeypointsHeatmapDataset(
    csv_file=training_keypoints_csv_path,
    root_dir=training_data_dir,
    transform=data_transform,
    output_size=config["output_size"],
    sigma=3.0,
)

# load training data in batches
train_heatmap_loader = DataLoader(
    transformed_heatmap_dataset,
    batch_size=config["batch_size"],
    shuffle=True,
    num_workers=2,
)  # num_workers changed to Colab recommended number

# creating the test dataset
test_heatmap_dataset = FacialKeypointsHeatmapDataset(
    csv_file=test_keypoints_csv_path,
    root_dir=test_data_dir,
    transform=data_transform,
    output_size=config["output_size"],
    sigma=3.0,
)

# loading test data in batches
test_heatmap_loader = DataLoader(
    test_heatmap_dataset, batch_size=config["batch_size"], shuffle=False, num_workers=2
)  # num_workers changed to Colab recommended number, shuffle changed to False

test_batch = next(iter(test_heatmap_loader))

In [None]:
# ── Heatmap Sanity Check ──────────────────────────────────────────────────
# Inspect one training batch of ground-truth heatmaps: print summary
# statistics, then plot three example keypoint channels of the first sample.
# `plt` is already imported in the notebook's import cell, so the redundant
# mid-notebook re-import was dropped.
sample_hm = next(iter(train_heatmap_loader))["heatmaps"]
print(f"Heatmap shape:       {sample_hm.shape}")        # expect (B, 68, 128, 128)
print(f"Heatmap max:         {sample_hm.max():.4f}")    # should be ~1.0
print(f"Heatmap mean:        {sample_hm.mean():.6f}")   # should be very small
print(f"Fraction > 0.01:     {(sample_hm > 0.01).float().mean():.4f}")

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for i, ax in enumerate(axes):
    # Channels 0, 20 and 40 of the first sample in the batch.
    im = ax.imshow(sample_hm[0, i * 20].numpy(), cmap="jet")
    ax.set_title(f"GT heatmap kp {i * 20}")
    plt.colorbar(im, ax=ax)
plt.suptitle("Ground-Truth Heatmaps — should show visible Gaussian peaks")
plt.tight_layout()
plt.show()

In [None]:
# ── UNet Overfit Test ─────────────────────────────────────────────────────
# Architecture sanity check on one single batch for 50 epochs (= 50 steps).
# Pass: loss well below 0.01. Fail: stuck above 0.05 → deeper issue.
unet_overfit = UNetKeypointDetector(
    **{
        key: config[key]
        for key in ("num_predictions", "activation", "base_features", "output_size")
    }
)

keypoint_task_overfit = KeypointDetection(
    model=unet_overfit,
    **{
        key: config[key]
        for key in ("lr", "criterion", "patience", "optimizer", "batch_norm")
    },
)

# Diagnostic run only: no logger, no checkpoints, one batch repeated each epoch.
overfit_settings = dict(
    max_epochs=50,
    accelerator="auto",
    deterministic="warn",
    logger=False,
    default_root_dir=default_exp_dir,
    detect_anomaly=True,
    overfit_batches=1,
    enable_checkpointing=False,
)
trainer_overfit = Trainer(**overfit_settings)
trainer_overfit.fit(keypoint_task_overfit, train_dataloaders=train_heatmap_loader)

metrics = trainer_overfit.callback_metrics
final_overfit_loss = metrics["train_loss"]
print(f"\n=== Overfit Test ===\nFinal loss: {final_overfit_loss:.6f}")

### Training U-Net

In [None]:
# Part 3 training run: fit the U-Net on heatmap targets, reload the best
# checkpoint, then report metrics and plot heatmaps plus the loss curve.
version = 0

unet = UNetKeypointDetector(
    num_predictions=config["num_predictions"],
    activation=config["activation"],
    base_features=config["base_features"],
    output_size=config["output_size"],
)

keypoint_task = KeypointDetection(
    model=unet,
    lr=config["lr"],
    criterion=config["criterion"],
    patience=config["patience"],
    optimizer=config["optimizer"],
    pretrained_backbone=None,
    dropout=None,
    batch_norm=config["batch_norm"],
)

# Keep the best checkpoint (lowest training loss) plus the last one.
checkpoint_callback = ModelCheckpoint(
    dirpath=default_exp_dir + f"version_{version}",
    filename="unet",
    monitor="train_loss",
    mode="min",
    save_top_k=1,
    save_last=True,
    save_weights_only=True,
    enable_version_counter=True,
)

# Stop once the training loss has not improved by at least `min_delta` for
# `patience` consecutive checks.
earlystopping_callback = EarlyStopping(
    monitor="train_loss",
    patience=config["patience"],
    mode="min",
    min_delta=0.001,
)

trainer = Trainer(
    accelerator="auto",
    logger=[
        TensorBoardLogger(
            default_exp_dir,
            name="",  # necessary to save logs to the same directory as CSVLogger
            version=f"version_{version}",
            log_graph=True,
            default_hp_metric=False,
        ),
        CSVLogger(default_exp_dir, name="", version=f"version_{version}"),
    ],
    max_epochs=config["max_epochs"],
    callbacks=[checkpoint_callback, earlystopping_callback],
    deterministic="warn",
    default_root_dir=default_exp_dir,
    enable_checkpointing=True,
)

trainer.fit(
    keypoint_task,
    train_dataloaders=train_heatmap_loader,
    val_dataloaders=test_heatmap_loader,
)

# Reload the best checkpoint, passing the existing `unet` instance as the
# task's model.
keypoint_task = KeypointDetection.load_from_checkpoint(
    checkpoint_callback.best_model_path, weights_only=True, model=unet
)

# Metrics are those recorded by the trainer at the end of fitting.
metrics = trainer.callback_metrics
print(
    f"\n\n=============\nFinal train loss: {metrics['train_loss']:.4f}\nFinal val loss: {metrics['val_loss']:.4f}"
)

visualize_heatmaps(unet, test_batch)
# Title now says "Part 3" to match this section's heading; it previously read
# "Part 4" although the notebook only has Parts 1-3.
visualize_loss_curve(
    logs=f"{default_exp_dir}version_{version}/metrics.csv",
    title=f"Part 3: UNet Training Curve Version {version}",
)

### Tensorboard

In [None]:
# (Re)load the TensorBoard notebook extension and open it inline on the
# directory currently stored in `default_exp_dir`.
%reload_ext tensorboard
%tensorboard --logdir {default_exp_dir}