[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](
https://colab.research.google.com/github/Supervised-Soup/supervised-soup-project/blob/main/notebooks/colab_training_notebook_updated.ipynb
)


In [None]:
# Set the cuBLAS workspace config so that deterministic runs are possible.
# This is the first cell of the notebook, so the variable is in place before
# torch is imported further down.
import os

os.environ.update({"CUBLAS_WORKSPACE_CONFIG": ":4096:8"})


In [None]:
# @title Setup Code
# @markdown This cell is to set up paths and dependencies and clone our repo.
# This cell can be copied and pasted to the start of every new Colab notebook.
# It mounts Google Drive, sets a default DATA_PATH, clones the project repo,
# installs the dependencies from the wheel cache on the shared Drive and runs a
# short import test.
# Note on the latest changes: a ColabCache folder on the shared Drive stores the dependency files,
# so installing the requirements should be much faster, even across sessions.

# Mount Google Drive to access the training data
from google.colab import drive
drive.mount('/content/drive')

# Default location of the shared dataset, published through the DATA_PATH
# environment variable (the "Dataset Selection" cell below sets DATA_PATH again)
import os
os.environ["DATA_PATH"] = '/content/drive/MyDrive/SupervisedSoupData/ImageNetSubset'
DATA_PATH = os.getenv("DATA_PATH")

# Verify the path. A missing dataset is only reported here; the cell keeps running.
if os.path.exists(DATA_PATH):
    print("Dataset found at:", DATA_PATH)
    print("Contents:", os.listdir(DATA_PATH))
else:
    print("Dataset path not found. Please check if you have setup your Drive shortcut properly (see guide on confluence: https://stud-team-rn9zsvdn.atlassian.net/wiki/pages/resumedraft.action?draftId=6586396&draftShareId=6aea0c7c-2591-45b1-a0f8-f3db9e25e222).")

# Integrate GitHub by cloning our repo, then work from inside the checkout
# NOTE(review): the Colab badge at the top of this notebook links to the
# "Supervised-Soup" GitHub org while this clone uses "NeuralSpiral" - confirm which is current.
!git clone https://github.com/NeuralSpiral/supervised-soup-project.git
%cd /content/supervised-soup-project

# Install the dependencies from the wheel cache on Drive instead of the index
# (the plain index install is kept below for reference)
# !pip install -r requirements.txt
CACHE_PATH = "/content/drive/MyDrive/SupervisedSoupData/ColabCache/pip"

# --no-index restricts pip to the wheels found under CACHE_PATH, so every
# requirement has to be present in the cache (see the caching cell further down)
!pip install --no-index --find-links={CACHE_PATH} -r requirements.txt
# Editable install of the project package itself
!pip install -e .

# Run a short import test
!python tests/setup_test.py

# Add the path of our source folder to the module search path
import sys
sys.path.append('/content/supervised-soup-project/supervised_soup')

# Now the code from our main folder can be imported if it is needed in the notebook (e.g. dataloader, model), e.g.:
# from supervised_soup import dataloader


In [None]:
# @title Dataset Selection

# Pick the dataset variant to train on: "original" or "cleaned".
# Switching this value is the only change needed to run on the other dataset.
DATASET_VERSION = "original"

# Root folder on the shared Drive that holds every dataset variant
BASE_DATA_PATH = '/content/drive/MyDrive/SupervisedSoupData'

# Folder name of each supported dataset variant
_DATASET_FOLDERS = {
    "original": "ImageNetSubset",
    "cleaned": "ImageNetSubset_cleaned",
}

# Reject anything that is not a known variant
if DATASET_VERSION not in _DATASET_FOLDERS:
    raise ValueError(f"Unknown DATASET_VERSION={DATASET_VERSION}. Choose 'original' or 'cleaned'.")
dataset_folder = _DATASET_FOLDERS[DATASET_VERSION]

# Publish the selected dataset location through the DATA_PATH environment variable
DATA_PATH = os.path.join(BASE_DATA_PATH, dataset_folder)
os.environ["DATA_PATH"] = DATA_PATH

# Fail early when the dataset folder is missing, otherwise show a few entries
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset path not found: {DATA_PATH}")

print(f"Dataset ({DATASET_VERSION}) found at: {DATA_PATH}")
print("Contents:", os.listdir(DATA_PATH)[:5])


In [None]:
# @title How to cache dependencies on drive for faster install time

# Only run this cell when requirements.txt has changed: it refreshes the cached
# wheels on Drive that the setup cell installs from (its CACHE_PATH points to
# the same folder as the -d target below).

# pip resolves requirements.txt relative to the repo checkout
%cd /content/supervised-soup-project

# Download all wheel files into the cache folder on Drive
!pip download -r requirements.txt -d "/content/drive/MyDrive/SupervisedSoupData/ColabCache/pip"

In [None]:
# Copy the dataset from Drive to the Colab local VM for faster training.
# Without this, training ran at roughly CPU speed even with a GPU attached.
#
# Safeguards:
#   * a clear error when DATA_PATH is not set (instead of a TypeError from copytree)
#   * the copy goes to a temporary sibling folder and is renamed once complete,
#     so an interrupted copy is never mistaken for a finished dataset
#   * a sibling marker file records which source folder was copied; when
#     DATA_PATH points to a different source (e.g. DATASET_VERSION was switched),
#     the stale local copy is replaced instead of being silently reused
#   * re-running the cell after DATA_PATH already points to the local copy is a no-op

import shutil
import os

DRIVE_DATA_PATH = os.getenv("DATA_PATH")
LOCAL_DATA_PATH = "/content/data"

# Bookkeeping paths, kept next to (not inside) the dataset folder
_PARTIAL_COPY_PATH = LOCAL_DATA_PATH + ".partial"
_SOURCE_MARKER_PATH = LOCAL_DATA_PATH + ".source"

if not DRIVE_DATA_PATH:
    raise RuntimeError("DATA_PATH is not set. Run the setup and dataset selection cells first.")

# True when this cell is re-run after DATA_PATH was already redirected to the local copy
_already_local = os.path.abspath(DRIVE_DATA_PATH) == os.path.abspath(LOCAL_DATA_PATH)

# Drop a local copy that is known to come from a different source folder.
# A local copy without a marker is kept, since its origin cannot be determined.
if not _already_local and os.path.exists(LOCAL_DATA_PATH):
    try:
        with open(_SOURCE_MARKER_PATH, encoding="utf-8") as _marker:
            _copied_from = _marker.read().strip()
    except FileNotFoundError:
        _copied_from = None
    if _copied_from is not None and _copied_from != DRIVE_DATA_PATH:
        print(f"Local dataset was copied from {_copied_from}; replacing it with {DRIVE_DATA_PATH}.")
        shutil.rmtree(LOCAL_DATA_PATH)

if os.path.exists(LOCAL_DATA_PATH):
    print("Local dataset already exists, skipping copy.")
elif _already_local:
    # DATA_PATH points to the local folder, but nothing was ever copied there
    raise FileNotFoundError(
        f"DATA_PATH points to {LOCAL_DATA_PATH}, which does not exist. "
        "Re-run the dataset selection cell, then this cell."
    )
else:
    print("Copying dataset from Drive → local VM...")
    # Remove the old marker first so it can never describe the wrong copy
    if os.path.exists(_SOURCE_MARKER_PATH):
        os.remove(_SOURCE_MARKER_PATH)
    # Clear leftovers of an interrupted copy, then copy and publish in one rename
    shutil.rmtree(_PARTIAL_COPY_PATH, ignore_errors=True)
    shutil.copytree(DRIVE_DATA_PATH, _PARTIAL_COPY_PATH)
    os.rename(_PARTIAL_COPY_PATH, LOCAL_DATA_PATH)
    with open(_SOURCE_MARKER_PATH, "w", encoding="utf-8") as _marker:
        _marker.write(DRIVE_DATA_PATH)

# Override DATA_PATH for faster training
os.environ["DATA_PATH"] = LOCAL_DATA_PATH


In [None]:
# Verify the dataset exists and show a sample image.
# Class folders and image files are sorted so the same sample is shown on every
# run; hidden entries (e.g. ".ipynb_checkpoints") and stray files in the train
# folder are ignored when picking the sample class.

import os
from PIL import Image
import matplotlib.pyplot as plt

DATA_PATH = os.getenv("DATA_PATH")
if not DATA_PATH:
    raise RuntimeError("Train directory not found. Check DATA_PATH.")
train_dir = os.path.join(DATA_PATH, "train")

if not os.path.isdir(train_dir):
    raise RuntimeError("Train directory not found. Check DATA_PATH.")

# Only real, non-hidden sub-folders count as classes
class_names = sorted(
    entry
    for entry in os.listdir(train_dir)
    if not entry.startswith(".") and os.path.isdir(os.path.join(train_dir, entry))
)
if not class_names:
    raise RuntimeError(f"No class folders found in {train_dir}.")

print("Sample classes:", class_names[:5])

sample_class = class_names[0]
image_names = sorted(
    entry
    for entry in os.listdir(os.path.join(train_dir, sample_class))
    if not entry.startswith(".")
)
if not image_names:
    raise RuntimeError(f"No images found in class folder '{sample_class}'.")
sample_image = image_names[0]

# The context manager closes the image file once the figure has been drawn
with Image.open(os.path.join(train_dir, sample_class, sample_image)) as img:
    plt.imshow(img)
    plt.title(sample_class)
    plt.axis("off")
    plt.show()


In [None]:
# Report whether CUDA is usable and which device training will run on

import torch

_cuda_ok = torch.cuda.is_available()
print("CUDA available:", _cuda_ok)

# Name of GPU 0 when CUDA is available, otherwise the literal "CPU"
if _cuda_ok:
    _device_label = torch.cuda.get_device_name(0)
else:
    _device_label = "CPU"
print("Device:", _device_label)


In [None]:
# Experiment configuration
# One place for every experiment-related setting, which keeps multiple
# experiments reproducible. Edit the values below for your run; the current
# values describe a baseline training run.
# The W&B naming conventions are described in the doc file on Confluence.


EXPERIMENT_CONFIG = {
    # replaced by the auto-generated name at the end of this cell
    "experiment_name": "baseline_resnet18_frozen",

    # Dataset
    "dataset_version": DATASET_VERSION,
    "data_path": os.environ["DATA_PATH"],

    # Training
    "epochs": 30,
    "learning_rate": 1e-3,
    "batch_size": 64,
    "seed": 42,

    # Model
    "model_name": "resnet18",
    "pretrained": True,
    "freeze_layers": True,
    "freeze_until": None,

    # Augmentation
    "with_augmentation": False,

    # Optimizer and its hyperparameters
    "optimizer": "sgd",
    "momentum": 0.9,
    "weight_decay": 0.0,

    # Learning-rate scheduler
    "scheduler": "cosine",
    "min_lr": 1e-6,

    # Loss
    "loss": "cross_entropy",

    # Device
    "device": "cuda" if torch.cuda.is_available() else "cpu",

    # Weights & Biases ("wandb_name" is generated below, unique per run configuration)
    "wandb_project": "x-AI-Proj-ImageClassification",
    "wandb_group": "baseline_frozen",
}

# The experiment and W&B names are derived from the config values, so a run
# cannot be mislabelled by accident.
_cfg = EXPERIMENT_CONFIG

# Freeze tag: "frozen" takes precedence whenever freeze_layers is set.
# NOTE(review): freeze_until is only consulted when freeze_layers is falsy -
# confirm this matches how run_training combines the two options.
if _cfg["freeze_layers"]:
    freeze_tag = "frozen"
elif _cfg["freeze_until"] is None:
    freeze_tag = "full_finetune"
else:
    freeze_tag = f"partial_{_cfg['freeze_until']}"

# wandb_name: model, freeze mode, augmentation flag (0/1), seed, dataset version
_cfg["wandb_name"] = "_".join([
    f"{_cfg['model_name']}",
    freeze_tag,
    f"aug{int(_cfg['with_augmentation'])}",
    f"seed{_cfg['seed']}",
    f"ds{_cfg['dataset_version']}",
])

# experiment_name (less detailed): model, dataset version, augmentation flag
_cfg["experiment_name"] = "_".join([
    f"{_cfg['model_name']}",
    f"ds{_cfg['dataset_version']}",
    f"aug{int(_cfg['with_augmentation'])}",
])




In [None]:
# Log in to Weights & Biases with your API key for experiment tracking

import wandb

# Clear any entity/project left in the environment, then pin the team entity
for _stale_var in ("WANDB_ENTITY", "WANDB_PROJECT"):
    os.environ.pop(_stale_var, None)
os.environ.update({"WANDB_ENTITY": "neural-spi-university"})

wandb.login()


In [None]:
# @title General training run cell
# Launches a full training experiment; every argument is read from EXPERIMENT_CONFIG.

from supervised_soup.train import run_training

_cfg = EXPERIMENT_CONFIG

# Arguments of the full run, collected first so they can be inspected before launching
_full_run_kwargs = dict(
    epochs=_cfg["epochs"],
    lr=_cfg["learning_rate"],
    with_augmentation=_cfg["with_augmentation"],
    model_name=_cfg.get("model_name"),
    pretrained=_cfg["pretrained"],
    freeze_layers=_cfg["freeze_layers"],
    freeze_until=_cfg.get("freeze_until"),
    seed=_cfg["seed"],
    wandb_group=_cfg["wandb_group"],
    wandb_name=_cfg["wandb_name"],
    run_type=_cfg["experiment_name"],
    experiment_config=_cfg,
)

model, history = run_training(**_full_run_kwargs)

In [None]:
# @title Short test run (3 epochs)
# Smoke test: checks that training works and that metrics are logged to W&B.
# Use it for debugging and validation before starting a long run.
# NOTE(review): experiment_config still carries the full "epochs" value from
# EXPERIMENT_CONFIG while epochs=3 is passed explicitly - confirm how
# run_training uses experiment_config.

from supervised_soup.train import run_training

_cfg = EXPERIMENT_CONFIG

# Same settings as the full run, but 3 epochs and "_test"-suffixed W&B names
_test_run_kwargs = dict(
    epochs=3,
    lr=_cfg["learning_rate"],
    with_augmentation=_cfg["with_augmentation"],
    model_name=_cfg.get("model_name"),
    pretrained=_cfg["pretrained"],
    freeze_layers=_cfg["freeze_layers"],
    freeze_until=_cfg.get("freeze_until"),
    seed=_cfg["seed"],
    wandb_group=f"{_cfg['wandb_group']}_test",
    wandb_name=f"{_cfg['wandb_name']}_3ep_test",
    run_type=f"{_cfg['experiment_name']}_test",
    experiment_config=_cfg,
)

model, history = run_training(**_test_run_kwargs)



In [None]:
# This was for debugging stuff

# Print W&B run info, GPU memory usage, and sample local dataset folders for debugging
print("W&B entity:", wandb.run.entity)
print("W&B project:", wandb.run.project)

!nvidia-smi

import os
print("Local dataset folders:", os.listdir(LOCAL_DATA_PATH)[:5])


In [None]:
# Resume training from the last saved checkpoint if it exists
# Run this cell if a previous run was interrupted
#
# model_name and freeze_until are passed exactly as in the other training cells,
# so the resumed run is configured like the run that is being resumed instead of
# falling back to run_training's defaults for these two arguments.

import os
from supervised_soup.train import run_training

model, history = run_training(
    epochs=EXPERIMENT_CONFIG["epochs"],
    lr=EXPERIMENT_CONFIG["learning_rate"],
    with_augmentation=EXPERIMENT_CONFIG["with_augmentation"],
    model_name=EXPERIMENT_CONFIG.get("model_name"),
    pretrained=EXPERIMENT_CONFIG["pretrained"],
    freeze_layers=EXPERIMENT_CONFIG["freeze_layers"],
    freeze_until=EXPERIMENT_CONFIG.get("freeze_until"),
    seed=EXPERIMENT_CONFIG["seed"],
    wandb_group=EXPERIMENT_CONFIG["wandb_group"],
    # "_resume" suffix keeps the resumed run distinguishable in W&B
    wandb_name=f"{EXPERIMENT_CONFIG['wandb_name']}_resume",
    run_type=f"{EXPERIMENT_CONFIG['experiment_name']}_resume",
    resume=True,
    experiment_config=EXPERIMENT_CONFIG,
)


In [None]:
# Print the logged history of a W&B run as plain text
# Set RUN_PATH to the run to inspect, in the form "entity/project/run_id"
import wandb

RUN_PATH = "neural-spi-university/x-AI-Proj-ImageClassification/f8wn0wc1"

# Metric columns to pull from the run history
cols = [
    "epoch",
    "train/loss", "train/accuracy",
    "val/loss", "val/accuracy",
    "val/f1_macro", "val/roc_auc_macro",
]

api = wandb.Api()
run = api.run(RUN_PATH)

# Fetch the selected columns as a DataFrame and print it without the index column
df = run.history(keys=cols)
print(df.to_string(index=False))