In [None]:
# Load IPython's autoreload extension and enable mode 2, so that edits to the
# project's imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch, sys
from torchvision.transforms import v2
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from pathlib import Path
from tqdm import tqdm

# Check if running in Google Colab
IN_COLAB = 'google.colab' in sys.modules

# Set dataset path accordingly
if IN_COLAB:
    ! git clone --branch refactored https://github.com/MrKiwix/IAPR-project.git
    %cd IAPR-project
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_DIR = Path('/content/drive/MyDrive/IAPR')
else:
    ROOT_DIR = Path('./')

# Training Notebook

This notebook contains the function calls needed to start a training run.

This assumes the following:
- Training data is present at the desired location
- The CSV file containing the label information has also been created

> Warning: this notebook exports the best model weights and will therefore overwrite any previously saved file with the same name.

We start by setting up the constants and the transformations:

In [None]:
# ---- Hyper-parameters ------------------------------------------------------
NUM_CLASSES = 13
IMG_SIZE = (120, 180)  # (height, width) -- must match the size used in main.py
BATCH_SIZE = 16
NUM_EPOCHS = 100
SCHEDULER = True  # toggle the learning rate scheduler

# ---- Input locations (all relative to ROOT_DIR) ----------------------------
label_csv = ROOT_DIR / "data" / "train.csv"
images_dir = ROOT_DIR / "data" / "train"
alpha_reference = ROOT_DIR / "data" / "alpha_references"

# ---- Output locations, created on demand -----------------------------------
synth_dir = ROOT_DIR / "data" / "synthetic_data"
synth_dir.mkdir(parents=True, exist_ok=True)

best_model_dir = ROOT_DIR / "model"
best_model_dir.mkdir(parents=True, exist_ok=True)

# ---- Per-channel normalisation statistics ----------------------------------
# Computed on the whole training set:
means = [0.6887134909629822, 0.666830837726593, 0.6608285307884216]
stds = [0.15740245580673218, 0.1555258184671402, 0.17858198285102844]

# ---- Transforms ------------------------------------------------------------
# Evaluation pipeline: convert, resize, scale to float32 and normalise.
val_tf = v2.Compose([
    v2.ToImage(),
    v2.Resize(size=IMG_SIZE, antialias=True),
    v2.ToDtype(dtype=torch.float32, scale=True),
    v2.Normalize(mean=means, std=stds),
])

# Training pipeline: same steps, with random horizontal/vertical flips
# inserted right after the resize.
train_tf = v2.Compose([
    v2.ToImage(),
    v2.Resize(size=IMG_SIZE, antialias=True),
    v2.RandomHorizontalFlip(p=0.5),
    v2.RandomVerticalFlip(p=0.5),
    v2.ToDtype(dtype=torch.float32, scale=True),
    v2.Normalize(mean=means, std=stds),
])

Now, we can load our dataset to create a training and validation split

In [None]:
from src.data.TrainChocolateDataset import * 
from src.data.SyntheticChocolateDataset import SyntheticChocolateDataset

# Transform-free dataset over all labelled images. The two splits need
# different transforms, so this instance is only used to obtain the sample count.
train_eval_dataset = ChocolateDataset(
    data_dir=images_dir,
    label_csv=label_csv,
    transform=None,
    target_transform=LabelToTensor(),
)

# 80/20 split performed on the indices, with a fixed seed.
n_samples = len(train_eval_dataset)
train_len = int(0.8 * n_samples)
test_len = n_samples - train_len
split_rng = torch.Generator().manual_seed(42)
train_idxs, test_idxs = random_split(
    range(n_samples), [train_len, test_len], generator=split_rng
)

# One dataset instance per split, each with its own transform,
# restricted to the indices of that split.
training_dataset = Subset(
    ChocolateDataset(
        data_dir=images_dir,
        label_csv=label_csv,
        transform=train_tf,
        target_transform=LabelToTensor(),
    ),
    train_idxs,
)
val_dataset = Subset(
    ChocolateDataset(
        data_dir=images_dir,
        label_csv=label_csv,
        transform=val_tf,
        target_transform=LabelToTensor(),
    ),
    test_idxs,
)

# Synthetic dataset, this will take some time to generate.
# It receives the training indices and uses the training transform.
synth_dataset = SyntheticChocolateDataset(
    background_dir=images_dir,
    alpha_reference_dir=alpha_reference,
    synth_dir=synth_dir,
    original_label_csv=label_csv,
    train_idx=train_idxs,
    per_background=10,
    transform=train_tf,
    target_transform=LabelToTensor(),
)

# Real training samples followed by the synthetic ones.
merged_training_dataset = torch.utils.data.ConcatDataset(
    [training_dataset, synth_dataset]
)

# DataLoaders: only the training loader shuffles.
num_workers = 0  # Increase if possible to get good speedup
loader_options = dict(
    batch_size=BATCH_SIZE, num_workers=num_workers, pin_memory=True
)
train_loader = DataLoader(merged_training_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# Report the size of each dataset
print(f"Training dataset size: {len(training_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Synthetic dataset size: {len(synth_dataset)}")
print(f"Merged training dataset size: {len(merged_training_dataset)}")

The data is now ready in our loaders; let's instantiate the model and train it.

In [None]:
import csv # for logging

# Import the model and the training settings
from src.model.ChocoNetwork import ChocoNetwork
from src.training.training import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = ChocoNetwork().to(device)
loss = torch.nn.SmoothL1Loss()
kaggle_loss = ChocolateCountF1Loss() # Custom loss to match the one on the leaderboard (1-F1 score); not used in this cell
optimizer = get_optimizer(model) # AdamW with weight decay

if SCHEDULER:
    from torch.optim.lr_scheduler import ReduceLROnPlateau
    # NOTE: `verbose=True` is intentionally not passed. The argument is
    # deprecated in PyTorch and may be rejected by recent releases; learning
    # rate changes are logged explicitly in the training loop instead.
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='max',         # we want val_f1 to go UP
        factor=0.8,         # reduce LR by 20% when F1 plateaus
        patience=5,         # wait 5 epochs with no F1 gain
        threshold=1e-3,     # require at least 0.001 improvement
    )
else:
    scheduler = None

# Best value seen so far for each tracked metric
best_val_f1 = 0.0
best_val_loss = float("inf")

# Let's train this model; one CSV row is logged per epoch
with open(best_model_dir / "run_model.csv", "w", newline="") as csv_file:

    # csv
    writer = csv.writer(csv_file)
    writer.writerow(["epoch", "train_loss", "val_loss", "val_f1", "val_mae"])

    # training loop
    for epoch in tqdm(range(1, NUM_EPOCHS + 1)):

        # Computing the losses
        train_loss = train_epoch(train_loader, model, loss, optimizer, device)
        val_loss, val_f1, val_mae = eval_epoch(val_loader, model, loss, NUM_CLASSES, device)

        # Update the scheduler and report any learning rate change
        if scheduler is not None:
            lr_before = optimizer.param_groups[0]["lr"]
            scheduler.step(val_f1)
            lr_after = optimizer.param_groups[0]["lr"]
            if lr_after != lr_before:
                print(f"Epoch {epoch:02d} | learning rate changed from {lr_before:.3e} to {lr_after:.3e}")

        mae_str = "; ".join([f"{m:.2f}" for m in val_mae])
        print(f"Epoch {epoch:02d} | "
            f"train loss L1: {train_loss:.4f} | "
            f"val loss l1: {val_loss:.4f} | "
            f"val custom F1: {val_f1:.4f} | "
            f"val MAE/class: [{mae_str}]")

        # log to csv (flushed every epoch so the file is usable if the run is interrupted)
        writer.writerow([epoch,train_loss, val_loss, val_f1, mae_str])
        csv_file.flush()

        # Save the model if one of the metrics improved. Each "best" value is
        # updated independently: previously, when both metrics improved in the
        # same epoch, only best_val_f1 was updated, so best_val_loss went stale
        # and later epochs with a worse loss were still saved as "new best".
        f1_improved = val_f1 > best_val_f1
        loss_improved = val_loss < best_val_loss
        if f1_improved or loss_improved:
            if f1_improved:
                best_val_f1 = val_f1
                print(f"New best model found at epoch {epoch} with val F1: {val_f1:.4f}")
            else:
                print(f"New best model found at epoch {epoch} with val loss: {val_loss:.4f}")
            if loss_improved:
                best_val_loss = val_loss
            # both metrics are embedded in the file name so the checkpoint can be identified later
            best_model_path = best_model_dir / f"best_model_{epoch:02d}_f1_{val_f1:.4f}_loss_{val_loss:.4f}.pth"
            torch.save(model.state_dict(), best_model_path)

print(f"The training finished at epoch {epoch} with a best val F1 of {best_val_f1:.4f}")
