In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn

import torchvision
from torchvision import datasets, models, transforms
from torchvision.io import read_image

import time
import copy

# Seed PyTorch's default random number generator with a fixed value so repeated runs start from the same RNG state.
torch.manual_seed(0)

class train_stages():
    """Progressive, multi-stage trainer for a torchvision classification model.

    Training is split into several stages. For every stage the image size,
    dropout rate and RandAugment magnitude are interpolated linearly between
    the ``*_min`` and ``*_max`` values in ``stage_params``. A new model is built
    for each stage with that stage's dropout, initialised from the weights
    saved by the previous stage, trained, and saved to
    ``Training/model_weights_stage{N}.pth``.

    Args:
        general_params: dict-like with the keys ``"model"``, ``"batch_size"``,
            ``"learning_rate"``, ``"learning_rate_decay"`` and ``"momentum"``.
            Optional keys: ``"stages"`` (default 4), ``"data_dir"``
            (default ``'/Workdir/Data/imagenette2'``) and ``"num_workers"``
            (default 8).
        stage_params: dict-like with ``"image_size_min"``/``"image_size_max"``,
            ``"dropout_min"``/``"dropout_max"``,
            ``"randaug_magnitude_min"``/``"randaug_magnitude_max"`` and
            ``"epochs"`` (epochs trained per stage).
        logger: optional object exposing ``report_scalar(title, series,
            iteration=..., value=...)`` (e.g. a ClearML logger); ``None``
            disables scalar reporting.
    """

    def __init__(self, general_params, stage_params, logger=None) -> None:
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.general_params = general_params
        self.stage_params = stage_params
        self.logger = logger # in case a ClearML logger can be used
        # Zero-based index of the stage being trained; updated by run_stage_manager.
        self.stage = 0

    def stage_params_func(self, stage_params, num_stages=4):
        """Compute the hyper parameters of the current stage into ``self.params``.

        Every value is interpolated linearly from its ``*_min`` (stage 0) to
        its ``*_max`` (last stage) using ``self.stage``. Image size and
        RandAugment magnitude are truncated to ``int``.

        Args:
            stage_params: dict-like holding the min/max values and ``"epochs"``.
            num_stages: total number of stages. With a single stage there is
                nothing to interpolate, so the ``*_min`` values are used
                (previously this raised ``ZeroDivisionError``).
        """
        stage = self.stage
        steps = num_stages - 1  # number of increments between first and last stage

        def interpolate(low, high):
            # Guard the single-stage case, where the step count is zero.
            if steps <= 0:
                return low
            # Multiply before dividing so the last stage lands exactly on `high`
            # for integer ranges instead of risking truncation to high - 1.
            return low + (high - low) * stage / steps

        image_size = interpolate(stage_params["image_size_min"], stage_params["image_size_max"])
        dropout = interpolate(stage_params["dropout_min"], stage_params["dropout_max"])
        randaug = interpolate(stage_params["randaug_magnitude_min"], stage_params["randaug_magnitude_max"])
        self.params = {
            "epochs": stage_params["epochs"],
            "image_size": int(image_size),
            "dropout": dropout,
            "randaug_magnitude": int(randaug),
        }

        print(f"Parameters for the current stage: \n{self.params}")

    def data_loader(self):
        """Build the datasets and data loaders for the current stage.

        Reads ``self.params`` (so ``stage_params_func`` must have run) and sets
        ``self.dataloaders``, ``self.dataset_sizes`` and ``self.class_names``.
        The dataset root is ``general_params["data_dir"]`` when present and
        must contain ``training`` and ``validation`` sub-folders in
        ``ImageFolder`` layout.
        """
        phases = ['training', 'validation']
        size = (self.params["image_size"], self.params["image_size"])
        mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]

        data_transforms = {
            # RandAugment is applied to the training images only.
            'training': transforms.Compose([
                transforms.Resize(size),
                transforms.RandAugment(magnitude=self.params["randaug_magnitude"]),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ]),
            'validation': transforms.Compose([
                transforms.Resize(size),
                transforms.ToTensor(),
                transforms.Normalize(mean, std)
            ]),
        }

        # Configurable, falling back to the values that used to be hard-coded.
        data_dir = self.general_params.get("data_dir", '/Workdir/Data/imagenette2')
        num_workers = self.general_params.get("num_workers", 8)

        image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x), data_transforms[x]) for x in phases}
        self.dataloaders = {
            x: torch.utils.data.DataLoader(
                image_datasets[x],
                batch_size=self.general_params["batch_size"],
                shuffle=(x == 'training'),  # the order of validation samples does not affect the metrics
                num_workers=num_workers,
            )
            for x in phases
        }
        self.dataset_sizes = {x: len(image_datasets[x]) for x in phases}
        self.class_names = ["tench", "English springer", "cassette player", "chain saw", "church", "French horn", "garbage truck", "gas pump", "golf ball", "parachute"]

    def train_model(self, model, criterion, optimizer, scheduler, num_epochs=25):
        """Train ``model`` for ``num_epochs`` epochs, validating after each one.

        Uses ``self.dataloaders`` / ``self.dataset_sizes``. Per-epoch loss and
        accuracy are printed and, when a logger is set, reported as scalars.
        The scheduler is stepped once per epoch after the training phase.

        Returns:
            The trained model (same object that was passed in).
        """
        since = time.time()

        for epoch in range(num_epochs):
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['training', 'validation']:
                is_training = phase == 'training'
                if is_training:
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for inputs, labels in self.dataloaders[phase]:
                    inputs = inputs.to(self.device)
                    labels = labels.to(self.device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(is_training):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if is_training:
                            loss.backward()
                            optimizer.step()

                    # statistics (kept as plain Python numbers, not tensors)
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += int(torch.sum(preds == labels).item())
                if is_training:
                    scheduler.step()

                epoch_loss = running_loss / self.dataset_sizes[phase]
                # A Python float, so it can be handed to the logger directly.
                epoch_acc = running_corrects / self.dataset_sizes[phase]

                # Add ClearML Graphs
                if self.logger is not None:
                    self.logger.report_scalar("Loss", f"Stage {self.stage + 1}: {phase}", iteration=(epoch), value=epoch_loss)
                    self.logger.report_scalar("Accuracy", f"Stage {self.stage + 1}: {phase}", iteration=(epoch), value=epoch_acc)

                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        time_elapsed = time.time() - since
        print(f'Training stage completed in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')

        return model


    def model_setup(self, model_ft):
        """Resize the classification head to ``len(self.class_names)`` outputs.

        Expects ``model_ft.classifier`` to be indexable with the final linear
        layer at index 1. Only that layer is swapped, so the rest of the
        classifier (including the dropout configured for the stage) stays in
        place. Replacing the whole ``classifier`` with a bare ``nn.Linear``,
        as was done before, discarded it.

        Returns:
            The same model, modified in place.
        """
        num_ftrs = model_ft.classifier[1].in_features
        model_ft.classifier[1] = nn.Linear(num_ftrs, len(self.class_names))
        return model_ft

    def train_model_setup(self, model):
        """Create loss, optimizer and LR scheduler, then train ``model``.

        Uses cross-entropy loss, RMSprop with the configured learning rate and
        momentum, and an exponential learning-rate decay. Trains for
        ``stage_params["epochs"]`` epochs.

        Returns:
            The trained model, placed on ``self.device``.
        """
        model_ft = model.to(self.device)

        criterion = nn.CrossEntropyLoss()

        # Observe that all parameters are being optimized
        optimizer_ft = optim.RMSprop(model_ft.parameters(), lr=self.general_params["learning_rate"], momentum=self.general_params["momentum"]) #, weight_decay=self.general_params["weight_decay"])

        exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer_ft, gamma=self.general_params["learning_rate_decay"])

        model_ft = self.train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=self.stage_params["epochs"])
        return model_ft


    def run_stage_manager(self):
        """Run all training stages in order.

        For each stage: compute the stage parameters, rebuild the data
        loaders, create a new model with the stage's dropout, load the weights
        saved by the previous stage (from stage 2 onwards), train, and save the
        weights to ``Training/model_weights_stage{index}.pth``.
        """
        print("Starting training")

        os.makedirs("Training", exist_ok=True)

        num_stages = self.general_params.get("stages", 4)
        for stage in range(num_stages):
            self.stage = stage
            print(f"Starting stage {self.stage + 1}")
            self.stage_params_func(self.stage_params, num_stages)
            self.data_loader()

            model = torchvision.models.get_model(self.general_params["model"], dropout=self.params["dropout"])
            model = self.model_setup(model)
            if stage > 0:
                # Load the weights of the previous stage into the current network, which has a
                # different dropout value. The model is still on the CPU at this point, so the
                # checkpoint is mapped there too; train_model_setup moves it to the device.
                model.load_state_dict(torch.load(f'Training/model_weights_stage{stage-1}.pth', map_location="cpu"))
            model = self.train_model_setup(model)
            torch.save(model.state_dict(), f'Training/model_weights_stage{stage}.pth') # save the current weights to file

            del model, self.dataloaders
            torch.cuda.empty_cache() # Clearing the GPU memory for the next stage
            print(f"Finished stage {self.stage + 1}\n\n\n")
            time.sleep(5)

        print("Finished all stages")


In [None]:
# Switch for experiment tracking: when False, no ClearML code runs and `logger` is set to None.
ClearML = False
if ClearML:
    # ClearML setup (imported here so the package is only needed when tracking is enabled)
    from clearml import Task, Logger
    # Set the ClearML host URLs as environment variables through the IPython %env magic.
    %env CLEARML_WEB_HOST=https://app.clear.ml
    %env CLEARML_API_HOST=https://api.clear.ml
    %env CLEARML_FILES_HOST=https://files.clear.ml
    # API credentials. API_KEY / API_SECRET look like placeholders.
    # NOTE(review): supply real credentials outside the notebook rather than committing them here.
    %env CLEARML_API_ACCESS_KEY=API_KEY
    %env CLEARML_API_SECRET_KEY=API_SECRET

    # Create the task for this run; this happens after the environment variables above are set.
    task = Task.init(project_name='ImageNetTE', task_name='ImageNetTE - EfficientNetV2-S')

    # Record the docker image and arguments on the task.
    # NOTE(review): presumably used when the task is executed remotely by an agent; confirm.
    task.set_base_docker(docker_image="nvcr.io/nvidia/pytorch:23.03-py3", docker_arguments="--rm -e LOCAL_PYTHON=/usr/bin/python3 --ipc=host")
    logger = task.logger
else:
    # Tracking disabled: `task` is never defined in this case, only `logger`.
    logger = None

In [None]:
# Hyper parameters that apply to the whole run; registered with ClearML when tracking is on.
general_params = dict(
    model="efficientnet_v2_s",
    batch_size=1,
    learning_rate=0.01,
    learning_rate_decay=0.99,
    momentum=0.9,
    # weight_decay=0,
    stages=4,
)
if ClearML:
    # enabling configuration override by clearml
    general_params = task.connect(general_params, name="general")

# Per-stage ranges: each min/max pair is interpolated across the stages by the trainer.
stage_params = dict(
    image_size_min=128,
    image_size_max=300,
    dropout_min=0.1,
    dropout_max=0.3,
    randaug_magnitude_min=5,
    randaug_magnitude_max=15,
    epochs=1,
)
if ClearML:
    # enabling configuration override by clearml
    stage_params = task.connect(stage_params, name="Stage Params")



In [None]:
# Build the trainer, run every stage and report the total wall-clock time.
train = train_stages(general_params=general_params, stage_params=stage_params, logger=logger)
start = time.time()
train.run_stage_manager()
# Measure once so the printed and the reported value agree.
elapsed_minutes = (time.time() - start) / 60
print(f"Finished training in {elapsed_minutes} minutes")
# `task` only exists when ClearML is enabled and `logger` is None otherwise,
# so the report has to be guarded to avoid a NameError in the default configuration.
if logger is not None:
    logger.report_single_value("Training Time, minutes", elapsed_minutes)

In [None]:
# Close the ClearML task; it only exists when tracking is enabled.
if ClearML:
    task.close()