# MAE pretraining script

> Training script for MAE (masked autoencoder) pretraining

In [1]:
#| default_exp mae.pretraining_training

In [2]:
#| hide
%load_ext autoreload
%autoreload 2

In [3]:
#| export
import argparse
import datetime
import json
import numpy as np
from typing import Iterable
from types import SimpleNamespace
import os
import time
from pathlib import Path

import torch
import torch.backends.cudnn as cudnn
from torch.utils.tensorboard import SummaryWriter
import torchvision.transforms as transforms
import torchvision.datasets as datasets

import timm

2024-09-22 06:52:48.497568: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-22 06:52:48.724540: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-22 06:52:48.724564: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-22 06:52:48.724572: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-22 06:52:48.876008: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: S

In [4]:
timm.__version__

'1.0.9'

In [5]:
torch.__version__

'2.4.1'

In [8]:
torch.cuda.is_available()

True

In [7]:
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [9]:
#| exporti
import segmentation_test.mae.misc as misc
import segmentation_test.mae.model_development as models_mae

In [10]:
#| export
import timm.optim.optim_factory as optim_factory
import math
from fastcore.test import *

In [11]:
torch.cuda.is_available()

True

### Gradient Scaling and Norm Counting: 


Mixed precision training uses both 16-bit and 32-bit floating-point numbers to speed up computation and reduce memory usage, but it can lead to some tricky numerical issues. The `NativeScalerWithGradNormCount` class is there to handle them.

## What's this class all about?

This nifty little class is a wrapper around PyTorch's `GradScaler`. It's designed to handle the intricacies of mixed precision training while also giving us some extra goodies like gradient norm calculation.

## Let's break it down:

1. **Initialization**: We start by creating a `GradScaler` object. This is PyTorch's built-in tool for automatic mixed precision training.

2. **The `__call__` method**: This is where the magic happens!
   - It scales the loss and performs backpropagation.
   - If we're updating gradients, it handles gradient unscaling and clipping.
   - It also calculates the gradient norm, which is super useful for monitoring training stability.

3. **State management**: The `state_dict` and `load_state_dict` methods allow us to save and load the scaler's state. This is crucial for resuming training from checkpoints.

## Why is this so cool?

- It seamlessly integrates mixed precision training into our workflow.
- It provides gradient clipping out of the box, which helps prevent exploding gradients.
- The gradient norm calculation gives us valuable insights into our training process.

By using this class, we're not just training our model - we're training it smartly and efficiently. It's like having a personal trainer for your neural network!


In [12]:
#| export
class NativeScalerWithGradNormCount:
    """Loss scaler for mixed-precision training that also reports the gradient norm.

    Wraps PyTorch's CUDA ``GradScaler``. Calling the instance scales the loss,
    back-propagates, and (when ``update_grad`` is true) unscales the gradients,
    optionally clips them, steps the optimizer and updates the scale factor.
    """
    # Name under which callers may store this scaler's state; not read inside this class.
    state_dict_key = "amp_scaler"

    def __init__(self):
        # Newer PyTorch releases emit a FutureWarning for torch.cuda.amp.GradScaler()
        # and ask for torch.amp.GradScaler("cuda"); fall back to the old spelling
        # when the new one does not exist.
        grad_scaler_cls = getattr(getattr(torch, "amp", None), "GradScaler", None)
        if grad_scaler_cls is not None:
            self._scaler = grad_scaler_cls("cuda")
        else:
            self._scaler = torch.cuda.amp.GradScaler()

    @staticmethod
    def _grad_norm(parameters, norm_type=2.0):
        """Return the total ``norm_type``-norm of the gradients of ``parameters``.

        ``parameters`` may be a single tensor or any iterable of tensors
        (including a one-shot generator such as ``model.parameters()``).
        Tensors without a gradient are skipped; with no gradients at all the
        result is a zero tensor.
        """
        if isinstance(parameters, torch.Tensor):
            parameters = [parameters]
        grads = [p.grad.detach() for p in parameters if p.grad is not None]
        if not grads:
            return torch.tensor(0.)
        device = grads[0].device
        per_tensor = torch.stack([torch.norm(g, norm_type).to(device) for g in grads])
        return torch.norm(per_tensor, norm_type)

    def __call__(
        self,
        loss,
        optimizer,
        clip_grad=None,
        parameters=None,
        create_graph=False,
        update_grad=True):
        """Back-propagate ``loss`` and optionally take an optimizer step.

        Args:
            loss: scalar loss tensor to scale and back-propagate.
            optimizer: optimizer whose gradients are unscaled and stepped.
            clip_grad: maximum gradient norm; ``None`` disables clipping.
            parameters: tensors whose gradient norm is measured (and clipped
                when ``clip_grad`` is given). Required when clipping; otherwise
                defaults to every parameter held by ``optimizer``.
            create_graph: forwarded to ``backward``.
            update_grad: when false only the backward pass runs (gradient
                accumulation) and ``None`` is returned.

        Returns:
            The gradient norm measured after unscaling (before clipping, as
            returned by ``clip_grad_norm_``), or ``None`` when ``update_grad``
            is false.
        """
        self._scaler.scale(loss).backward(create_graph=create_graph)
        if not update_grad:
            return None

        if clip_grad is not None:
            assert parameters is not None
        # Unscale the gradients of the optimizer's params in place so the norm
        # below is measured on true gradient values.
        self._scaler.unscale_(optimizer)
        if clip_grad is not None:
            norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad)
        else:
            if parameters is None:
                parameters = [p for group in optimizer.param_groups for p in group["params"]]
            # The original code called an undefined `get_grad_norm_` here.
            norm = self._grad_norm(parameters)
        self._scaler.step(optimizer)
        self._scaler.update()
        return norm

    def state_dict(self):
        """Return the wrapped scaler's state for checkpointing."""
        return self._scaler.state_dict()

    def load_state_dict(self, state_dict):
        """Restore the wrapped scaler's state from ``state_dict``."""
        self._scaler.load_state_dict(state_dict)

In [13]:
#| export
def adjust_learning_rate(optimizer, epoch, args):
    """Apply a linear-warmup, half-cycle-cosine schedule to ``optimizer``.

    During the first ``args.warmup_epochs`` epochs the rate grows linearly from
    0 to ``args.lr``; afterwards it follows half a cosine period down to
    ``args.min_lr`` at ``args.epochs``. ``epoch`` may be fractional. Each param
    group receives the rate, multiplied by its own ``"lr_scale"`` when present.

    Returns the unscaled learning rate.
    """
    warmup = args.warmup_epochs
    if epoch < warmup:
        lr = args.lr * epoch / warmup
    else:
        angle = math.pi * (epoch - warmup) / (args.epochs - warmup)
        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * (1. + math.cos(angle))

    for group in optimizer.param_groups:
        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
    return lr


In [14]:
#| export
def test_adjust_learning_rate():
    """Check `adjust_learning_rate` during warmup and on the cosine part of the schedule."""
    # Minimal stand-ins: only `param_groups` and the four schedule fields are read.
    optimizer = SimpleNamespace(param_groups=[{"lr": 0.1}, {"lr": 0.2, "lr_scale": 2}])
    args = SimpleNamespace(warmup_epochs=5, epochs=100, lr=0.1, min_lr=0.001)
    plain_group, scaled_group = optimizer.param_groups

    # Warmup: epoch 2 of 5 gives 2/5 of the peak rate.
    warmup_lr = adjust_learning_rate(optimizer, 2, args)
    test_eq(warmup_lr, 0.04)
    test_eq(plain_group["lr"], 0.04)
    test_eq(scaled_group["lr"], 0.08)

    # Cosine decay: epoch 50 is 45 epochs into the 95-epoch decay phase.
    decayed_lr = adjust_learning_rate(optimizer, 50, args)
    expected_lr = 0.001 + (0.1 - 0.001) * 0.5 * (1 + math.cos(math.pi * 45 / 95))
    test_close(decayed_lr, expected_lr, eps=1e-6)
    test_close(plain_group["lr"], expected_lr, eps=1e-6)
    test_close(scaled_group["lr"], expected_lr * 2, eps=1e-6)
    print("All tests passed!")


In [15]:
test_adjust_learning_rate()

All tests passed!


In [55]:
#| export
def train_one_epoch(
                model: torch.nn.Module,
                data_loader: Iterable,
                optimizer: torch.optim.Optimizer,
                device: torch.device,
                epoch: int,
                loss_scaler: NativeScalerWithGradNormCount,
                log_writer=None,
                args=None):
    """Train ``model`` for one epoch with mixed precision and gradient accumulation.

    Args:
        model: module called as ``model(samples, mask_ratio=...)``; the first
            element of its return value is used as the loss.
        data_loader: iterable of ``(samples, target)`` pairs; targets are ignored.
        optimizer: optimizer whose learning rate is adjusted per iteration.
        device: device the samples are moved to.
        epoch: index of the current epoch (used for the LR schedule and logging).
        loss_scaler: callable performing backward + optimizer step.
        log_writer: optional TensorBoard-style writer (``add_scalar``, ``log_dir``).
        args: namespace providing ``accum_iter``, ``mask_ratio`` and the fields
            read by ``adjust_learning_rate``.

    Returns:
        Dict mapping each meter name to its global average for the epoch.

    Raises:
        SystemExit: with code 1 when the loss becomes non-finite.
    """
    model.train(True)
    metric_logger = misc.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 20

    accum_iter = args.accum_iter

    optimizer.zero_grad()

    if log_writer is not None:
        print('log_dir: {}'.format(log_writer.log_dir))

    for data_iter_step, (samples, _) in enumerate(metric_logger.log_every(data_loader, print_freq, header)):

        # we use a per iteration (instead of per epoch) lr scheduler
        if data_iter_step % accum_iter == 0:
            adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args)

        samples = samples.to(device, non_blocking=True)

        with torch.cuda.amp.autocast():
            loss, _, _ = model(samples, mask_ratio=args.mask_ratio)

        loss_value = loss.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            # Same effect as sys.exit(1); `sys` is not imported in this module,
            # so the original call raised NameError instead of exiting.
            raise SystemExit(1)

        # Only every accum_iter-th iteration actually steps the optimizer.
        is_update_step = (data_iter_step + 1) % accum_iter == 0

        loss /= accum_iter
        loss_scaler(
            loss,
            optimizer,
            parameters=model.parameters(),
            update_grad=is_update_step
        )
        if is_update_step:
            optimizer.zero_grad()
        # Synchronize only when CUDA exists so the loop also runs on CPU-only hosts.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

        metric_logger.update(loss=loss_value)

        lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=lr)

        loss_value_reduce = misc.all_reduce_mean(loss_value)
        if log_writer is not None and is_update_step:
            # We use epoch_1000x as the x-axis in tensorboard.
            # This calibrates different curves when batch size changes.
            epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000)
            log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x)
            log_writer.add_scalar('lr', lr, epoch_1000x)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}

# main_pretrain.py script will be implemented here

> engine_pretrain.py needs to be implemented here, and call main function

In [17]:
# fastai is used here only to fetch the sample dataset.
from fastai.vision.all import untar_data, URLs

# Download and extract the ImageWang dataset (a subset of ImageNet)
# `path` is reused by the configuration cell further down (DATA_PATH = str(path)).
path = untar_data(URLs.IMAGEWANG_160)

# Update the DATA_PATH to point to the downloaded dataset
DATA_PATH = str(path)

print(f"ImageWang dataset downloaded and extracted to: {DATA_PATH}")


ImageWang dataset downloaded and extracted to: /home/user/.fastai/data/imagewang-160


In [18]:
Path(DATA_PATH).ls()
# MAE pre-training arguments

(#3) [Path('/home/user/.fastai/data/imagewang-160/val'),Path('/home/user/.fastai/data/imagewang-160/train'),Path('/home/user/.fastai/data/imagewang-160/unsup')]

In [57]:
#| export
# MAE pre-training arguments
BATCH_SIZE = 2  # Batch size per GPU (effective batch size is BATCH_SIZE * ACCUM_ITER * # gpus)
EPOCHS = 400  # total number of training epochs (end of the cosine LR schedule)
ACCUM_ITER = 1  # Accumulate gradient iterations (for increasing the effective batch size under memory constraints)

# Model parameters
MODEL = 'mae_vit_large_patch16'  # Name of model to train
INPUT_SIZE = 224  # images input size
MASK_RATIO = 0.75  # Masking ratio (percentage of removed patches)
NORM_PIX_LOSS = False  # Use (per-patch) normalized pixels as targets for computing loss

# Optimizer parameters
WEIGHT_DECAY = 0.05
LR = None  # learning rate (absolute lr); when None it is derived from BLR below
BLR = 1e-3  # base learning rate: absolute_lr = base_lr * total_batch_size / 256
MIN_LR = 0.  # lower lr bound for cyclic schedulers that hit 0
WARMUP_EPOCHS = 40  # epochs to warmup LR

# Dataset parameters
# NOTE(review): `path` comes from the dataset-download cell, which is not marked
# for export — confirm the exported module can resolve it.
DATA_PATH = str(path)  # dataset path
# The downloaded ImageWang folder contains 'train', 'val' and 'unsup'
# sub-folders; only 'train' is read by the training code in this notebook.
OUTPUT_DIR = './output_dir'  # path where to save, empty for no saving
LOG_DIR = './output_dir'  # path where to tensorboard log
DEVICE = 'cuda'  # device to use for training / testing
SEED = 0  # base random seed; the process rank is added to it
RESUME = ''  # resume from checkpoint
START_EPOCH = 0  # first epoch index of the training loop
NUM_WORKERS = 4  # 10 was too much for my GPU
PIN_MEM = True  # Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.

# distributed training parameters
WORLD_SIZE = 1  # number of distributed processes
LOCAL_RANK = -1
DIST_ON_ITP = False
DIST_URL = 'env://'  # url used to set up distributed training


In [58]:
#| export
# Create a SimpleNamespace object to mimic argparse.Namespace
# Every field mirrors one of the upper-case constants defined above.
# Later cells also read `args.distributed` and `args.gpu`, which are not set
# here — presumably `misc.init_distributed_mode(args)` adds them; confirm.
args = SimpleNamespace(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    accum_iter=ACCUM_ITER,
    model=MODEL,
    input_size=INPUT_SIZE,
    mask_ratio=MASK_RATIO,
    norm_pix_loss=NORM_PIX_LOSS,
    weight_decay=WEIGHT_DECAY,
    lr=LR,
    blr=BLR,
    min_lr=MIN_LR,
    warmup_epochs=WARMUP_EPOCHS,
    data_path=DATA_PATH,
    output_dir=OUTPUT_DIR,
    log_dir=LOG_DIR,
    device=DEVICE,
    seed=SEED,
    resume=RESUME,
    start_epoch=START_EPOCH,
    num_workers=NUM_WORKERS,
    pin_mem=PIN_MEM,
    world_size=WORLD_SIZE,
    local_rank=LOCAL_RANK,
    dist_on_itp=DIST_ON_ITP,
    dist_url=DIST_URL
)


In [59]:
misc.init_distributed_mode(args)

[07:50:04.040791] [07:50:04.040777] [07:50:04.041363] [07:50:04.040767] [07:50:04.041411] [07:50:04.041405] [07:50:04.041431] [07:50:04.040686] [07:50:04.041469] [07:50:04.041466] [07:50:04.041489] [07:50:04.041460] [07:50:04.041512] [07:50:04.041508] [07:50:04.041535] Not using distributed mode


In [60]:
print('job dir: {}'.format(os.getcwd()))

[07:50:04.335558] [07:50:04.335556] [07:50:04.335717] [07:50:04.335553] [07:50:04.335735] [07:50:04.335733] [07:50:04.335747] [07:50:04.335550] [07:50:04.335763] [07:50:04.335761] [07:50:04.335774] [07:50:04.335759] [07:50:04.335788] [07:50:04.335786] [07:50:04.335799] [07:50:04.335536] [07:50:04.335818] [07:50:04.335816] [07:50:04.335829] [07:50:04.335814] [07:50:04.335843] [07:50:04.335840] [07:50:04.335853] [07:50:04.335811] [07:50:04.335869] [07:50:04.335867] [07:50:04.335880] [07:50:04.335865] [07:50:04.335894] [07:50:04.335892] [07:50:04.335905] job dir: /home/user/Schreibtisch/projects/git_data/segmentation_test/nbs


In [61]:
device = torch.device(args.device)
device

device(type='cuda')

In [62]:
# fix the seed for reproducibility
seed = args.seed + misc.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
cudnn.benchmark = True

In [63]:
# simple augmentation
transform_train = transforms.Compose([
            transforms.RandomResizedCrop(
                args.input_size, 
                scale=(0.2, 1.0), 
                interpolation=3),  # 3 is bicubic
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406], 
                std=[0.229, 0.224, 0.225])])

In [64]:
transform_train

Compose(
    RandomResizedCrop(size=(224, 224), scale=(0.2, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)

In [65]:
dataset_train = datasets.ImageFolder(
    os.path.join(
        args.data_path, 'train'), 
        transform=transform_train)

In [28]:
dataset_train

Dataset ImageFolder
    Number of datapoints: 14669
    Root location: /home/user/.fastai/data/imagewang-160/train
    StandardTransform
Transform: Compose(
               RandomResizedCrop(size=(224, 224), scale=(0.2, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )

In [29]:
# Build the training sampler. The condition is hard-coded to True, so the
# DistributedSampler is always used and the RandomSampler branch below is
# never reached; world size and rank are passed to the sampler explicitly.
if True:  # args.distributed:
    num_tasks = misc.get_world_size()
    global_rank = misc.get_rank()
    sampler_train = torch.utils.data.DistributedSampler(
        dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
    )
    print("Sampler_train = %s" % str(sampler_train))
else:
    sampler_train = torch.utils.data.RandomSampler(dataset_train)

[06:53:19.034733] Sampler_train = <torch.utils.data.distributed.DistributedSampler object at 0x72483b1ef150>


In [30]:
if global_rank == 0 and args.log_dir is not None:
    os.makedirs(args.log_dir, exist_ok=True)
    log_writer = SummaryWriter(log_dir=args.log_dir)
else:
    log_writer = None

In [31]:
data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

In [32]:
args.model

'mae_vit_large_patch16'

In [33]:
%%capture
# define the model
model = models_mae.__dict__[args.model](norm_pix_loss=args.norm_pix_loss)
model

MaskedAutoencoderViT(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-23): 24 x Block(
      (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=1024, out_features=3072, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=1024, out_features=1024, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
      

In [34]:
torch.cuda.is_available()

True

In [35]:
model.to(device)

model_without_ddp = model

In [67]:
%%capture
print("Model = %s" % str(model_without_ddp))

In [37]:
eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()
eff_batch_size

64

In [38]:
if args.lr is None:  # only base_lr is specified
	args.lr = args.blr * eff_batch_size / 256
args.lr

0.00025

In [39]:
print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
print("actual lr: %.2e" % args.lr)


[06:53:33.447557] base lr: 1.00e-03
[06:53:33.447843] actual lr: 2.50e-04


In [40]:
print("accumulate grad iterations: %d" % args.accum_iter)
print("effective batch size: %d" % eff_batch_size)

[06:53:33.974881] accumulate grad iterations: 1
[06:53:33.975095] effective batch size: 64


In [41]:
if args.distributed:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
    model_without_ddp = model.module
    

In [42]:
%%capture
# following timm: set wd as 0 for bias and norm layers
# The original repo used the call below, which may target an older timm version:
#param_groups = optim_factory.add_weight_decay(model_without_ddp, args.weight_decay)
# It was not working here, so it was changed to param_groups_weight_decay.
param_groups = optim_factory.param_groups_weight_decay(model_without_ddp, args.weight_decay)
param_groups

[{'params': [Parameter containing:
   tensor([ 0.0350, -0.0317, -0.0275,  ...,  0.0315,  0.0151, -0.0012],
          device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([1., 1., 1.,  ..., 1., 1., 1.], device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([1., 1., 1.,  ..., 1., 1., 1.], device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0', requires_grad=True),
   Pa

In [43]:
optimizer = torch.optim.AdamW(
    param_groups, 
    lr=args.lr, 
    betas=(0.9, 0.95))

In [45]:
#| export
NativeScaler = NativeScalerWithGradNormCount

In [46]:
print(optimizer)
loss_scaler = NativeScaler()

[06:55:54.553497] AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.00025
    maximize: False
    weight_decay: 0.0

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.00025
    maximize: False
    weight_decay: 0.05
)


  self._scaler = torch.cuda.amp.GradScaler()


In [49]:
misc.load_model(
    args=args, 
    model_without_ddp=model_without_ddp, 
    optimizer=optimizer, 
    loss_scaler=loss_scaler)


This call hands the freshly built model, optimizer, and loss scaler to `misc.load_model`, which presumably restores their state from the checkpoint named by `args.resume` (see `segmentation_test.mae.misc` for the exact behaviour).
It takes the following arguments:
- `args`: The configuration namespace built above (a `SimpleNamespace` standing in for arguments parsed from the command line or a configuration file).
- `model_without_ddp`: The model instance without DistributedDataParallel (DDP) wrapping.
- `optimizer`: The optimizer instance to be used for training.
- `loss_scaler`: The loss scaler instance for automatic mixed precision training.
Because `RESUME = ''` in the configuration above, no checkpoint is specified here, so training is expected to start from scratch at `START_EPOCH = 0`.
This step matters when resuming an interrupted run: the model weights, the optimizer state, and the loss-scaler state need to be restored together before the training loop starts.

In [53]:
def main(args):
    """Run MAE pretraining end to end with the settings in ``args``.

    Steps: initialise (optional) distributed mode, seed the RNGs, build the
    augmented ``ImageFolder`` dataset from ``<data_path>/train``, create the
    model, optimizer and loss scaler, optionally load a checkpoint through
    ``misc.load_model``, then train for ``args.epochs`` epochs. A checkpoint is
    saved every 20 epochs and after the last one, and per-epoch statistics are
    appended as JSON lines to ``<output_dir>/log.txt``.

    Args:
        args: namespace with the fields assembled in the configuration cell
            (``batch_size``, ``epochs``, ``lr``/``blr``, ``output_dir``, ...).
            ``args.lr`` is filled in from ``args.blr`` when it is ``None``.
    """
    misc.init_distributed_mode(args)
    print('job dir: {}'.format(os.getcwd()))
    print("{}".format(args).replace(', ', ',\n'))

    # Checkpoints and log.txt are written into output_dir below. Create it up
    # front so those writes do not rely on log_dir pointing at the same folder.
    if args.output_dir:
        Path(args.output_dir).mkdir(parents=True, exist_ok=True)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + misc.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    cudnn.benchmark = True

    # simple augmentation
    transform_train = transforms.Compose([
            transforms.RandomResizedCrop(args.input_size, scale=(0.2, 1.0), interpolation=3),  # 3 is bicubic
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
    dataset_train = datasets.ImageFolder(os.path.join(args.data_path, 'train'), transform=transform_train)
    print(dataset_train)

    # The distributed sampler is used unconditionally (the original condition
    # was hard-coded to True); world size and rank are passed explicitly.
    num_tasks = misc.get_world_size()
    global_rank = misc.get_rank()
    sampler_train = torch.utils.data.DistributedSampler(
        dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True
    )
    print("Sampler_train = %s" % str(sampler_train))

    # Only rank 0 writes TensorBoard logs.
    if global_rank == 0 and args.log_dir is not None:
        os.makedirs(args.log_dir, exist_ok=True)
        log_writer = SummaryWriter(log_dir=args.log_dir)
    else:
        log_writer = None

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train, sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )
    # define the model
    model = models_mae.__dict__[args.model](norm_pix_loss=args.norm_pix_loss)

    model.to(device)

    model_without_ddp = model
    print("Model = %s" % str(model_without_ddp))

    eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()

    if args.lr is None:  # only base_lr is specified
        args.lr = args.blr * eff_batch_size / 256

    print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
    print("actual lr: %.2e" % args.lr)

    print("accumulate grad iterations: %d" % args.accum_iter)
    print("effective batch size: %d" % eff_batch_size)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    # following timm: set wd as 0 for bias and norm layers
    param_groups = optim_factory.param_groups_weight_decay(
        model_without_ddp,
        args.weight_decay)
    optimizer = torch.optim.AdamW(
        param_groups,
        lr=args.lr,
        betas=(0.9, 0.95))
    print(optimizer)
    loss_scaler = NativeScaler()

    misc.load_model(
        args=args,
        model_without_ddp=model_without_ddp,
        optimizer=optimizer,
        loss_scaler=loss_scaler)

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Lets the distributed sampler reshuffle differently each epoch.
            data_loader_train.sampler.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, data_loader_train,
            optimizer, device, epoch, loss_scaler,
            log_writer=log_writer, args=args
        )
        if args.output_dir and (epoch % 20 == 0 or epoch + 1 == args.epochs):
            misc.save_model(
                args=args,
                model=model,
                model_without_ddp=model_without_ddp,
                optimizer=optimizer,
                loss_scaler=loss_scaler,
                epoch=epoch)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                        'epoch': epoch,}

        # On the main process, flush pending TensorBoard events and append this
        # epoch's statistics to <output_dir>/log.txt as one JSON line.
        if args.output_dir and misc.is_main_process():
            if log_writer is not None:
                log_writer.flush()
            with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
                f.write(json.dumps(log_stats) + "\n")

    # Release the TensorBoard writer now that training is over.
    if log_writer is not None:
        log_writer.close()

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))

In [66]:
%%capture
main(args)

[07:50:19.022964] [07:50:19.022961] [07:50:19.023056] [07:50:19.022959] [07:50:19.023073] [07:50:19.023071] [07:50:19.023086] [07:50:19.022955] [07:50:19.023104] [07:50:19.023102] [07:50:19.023115] [07:50:19.023099] [07:50:19.023130] [07:50:19.023127] [07:50:19.023141] [07:50:19.022940] [07:50:19.023160] [07:50:19.023158] [07:50:19.023172] [07:50:19.023156] [07:50:19.023186] [07:50:19.023183] [07:50:19.023197] [07:50:19.023153] [07:50:19.023214] [07:50:19.023211] [07:50:19.023236] [07:50:19.023209] [07:50:19.023268] [07:50:19.023265] [07:50:19.023279] Not using distributed mode
[07:50:19.023312] [07:50:19.023310] [07:50:19.023325] [07:50:19.023308] [07:50:19.023339] [07:50:19.023336] [07:50:19.023349] [07:50:19.023306] [07:50:19.023366] [07:50:19.023364] [07:50:19.023377] [07:50:19.023362] [07:50:19.023392] [07:50:19.023389] [07:50:19.023402] [07:50:19.023304] [07:50:19.023421] [07:50:19.023418] [07:50:19.023432] [07:50:19.023416] [07:50:19.023445] [07:50:19.023443] [07:50:19.023456] [

OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB. GPU 0 has a total capacity of 3.95 GiB of which 6.12 MiB is free. Including non-PyTorch memory, this process has 3.93 GiB memory in use. Of the allocated memory 3.86 GiB is allocated by PyTorch, and 21.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)