# Set PYTHONPATH and cd into the appropriate directory

In [None]:
# Export PYTHONPATH into this kernel's process environment; processes started from the kernel inherit it.
# NOTE(review): an already-running interpreter does not re-read PYTHONPATH for its own sys.path —
# confirm that the imports in the later cells resolve via the working directory set by the next cell.
%set_env PYTHONPATH=/home/ubuntu/work/Model-References/PyTorch/computer_vision/classification/torchvision:/root/examples/models:/usr/lib/habanalabs/:/root

In [None]:
# Make the torchvision classification reference directory the kernel's working directory.
%cd /home/ubuntu/work/Model-References/PyTorch/computer_vision/classification/torchvision

# Import libraries for PyTorch training

In [None]:
# Copyright (c) 2021, Habana Labs Ltd.  All rights reserved.
from __future__ import print_function

#Import local copy of the model only for ResNext101_32x4d
#which is not part of standard torchvision package.
import datetime
import os
import time
import sys

import torch
import torch.utils.data
from torch import nn
import torchvision
import torchvision.datasets as datasets
from torchvision import transforms
import random
import utils
from resnet50_notebook_utils import *


# Main training function

Insert the following code block in the appropriate places (after backward loss computation and after optimizer step).

```
if args.run_lazy_mode:
    import habana_frameworks.torch.core as htcore
    htcore.mark_step()
```

In [None]:
def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, print_freq):
    """Run one training epoch of ``model`` over ``data_loader``.

    Besides its parameters, this function reads the notebook-level ``args``
    namespace: ``channels_last``, ``run_lazy_mode``, ``dl_time_exclude`` and
    ``num_train_steps``.

    Args:
        model: network to train; it is switched to training mode first.
        criterion: callable producing the loss from ``(output, target)``.
        optimizer: optimizer whose gradients are cleared and stepped every batch.
        data_loader: iterable yielding ``(image, target)`` batches.
        device: device each batch is moved to before the forward pass.
        epoch: epoch index, used only for the log header.
        print_freq: metrics are refreshed every ``print_freq`` steps; also
            forwarded to ``metric_logger.log_every``.

    Returns:
        None. Metrics are reported through ``utils.MetricLogger``.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ",device=device)
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
    metric_logger.add_meter('img/s', utils.SmoothedValue(window_size=10, fmt='{value}'))

    header = 'Epoch: [{}]'.format(epoch)
    step_count = 0
    last_print_time= time.time()

    # Import the Habana core module once per call instead of on every iteration.
    htcore = None
    if args.run_lazy_mode:
        import habana_frameworks.torch.core as htcore

    def _mark_step():
        """Call htcore.mark_step() when lazy mode is enabled; otherwise do nothing."""
        if htcore is not None:
            htcore.mark_step()

    for image, target in metric_logger.log_every(data_loader, print_freq, header):
        image, target = image.to(device, non_blocking=True), target.to(device, non_blocking=True)

        # Timestamp taken after the batch is fetched; used for img/s when
        # args.dl_time_exclude is set, so data-loading time is left out.
        dl_ex_start_time=time.time()

        if args.channels_last:
            image = image.contiguous(memory_format=torch.channels_last)

        _mark_step()

        output = model(image)
        loss = criterion(output, target)
        optimizer.zero_grad(set_to_none=True)

        # performance gain seen for these models using this mark_step.
        _mark_step()

        loss.backward()

        # Per the notebook instructions: mark_step after the backward loss computation.
        _mark_step()

        optimizer.step()

        # Per the notebook instructions: mark_step after the optimizer step.
        _mark_step()

        if step_count % print_freq == 0:
            # Accuracy is computed from a detached CPU copy of the model output.
            output_cpu = output.detach().to('cpu')
            acc1, acc5 = utils.accuracy(output_cpu, target, topk=(1, 5))
            batch_size = image.shape[0]
            metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size*print_freq)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size*print_freq)
            current_time = time.time()
            last_print_time = dl_ex_start_time if args.dl_time_exclude else last_print_time
            metric_logger.meters['img/s'].update(batch_size*print_freq / (current_time - last_print_time))
            last_print_time = time.time()

        step_count = step_count + 1
        # Stop early once the configured number of training steps is reached.
        if step_count >= args.num_train_steps:
            break

# Replicate command line args for single HPU resnet50 training

In [None]:
# Environment settings for this run. Their exact meaning is defined by the
# Habana software stack, which is not visible in this notebook.
os.environ.update({
    "MAX_WAIT_ATTEMPTS": "50",
    'HCL_CPU_AFFINITY': '1',
    'PT_HPU_ENABLE_SYNC_OUTPUT_HOST': 'false',
})

# Build the ResNet50 argument parser and feed it the option list that would
# otherwise be typed on the command line (one option per line for readability).
parser = get_resnet50_argparser()
args = parser.parse_args([
    "--batch-size", "256",
    "--epochs", "20",
    "--workers", "12",
    "--dl-time-exclude", "False",
    "--print-freq", "20",
    "--channels-last", "True",
    "--seed", "123",
    "--run-lazy-mode",
    "--hmp",
    "--hmp-bf16", "/home/ubuntu/work/Model-References/PyTorch/computer_vision/classification/torchvision/ops_bf16_Resnet.txt",
    "--hmp-fp32", "/home/ubuntu/work/Model-References/PyTorch/computer_vision/classification/torchvision/ops_fp32_Resnet.txt",
    "--deterministic",
])

Main training loop for single-node training. Fake data is used for training.

In [None]:
# Process setup for HPU training: load the Habana module, choose the
# multiprocessing start method, apply the lazy-mode and hmp settings from
# `args`, seed the random number generators and select the device.
from habana_frameworks.torch.utils.library_loader import load_habana_module
load_habana_module()

try:
    # Default 'fork' doesn't work with synapse. Use 'forkserver' or 'spawn'
    torch.multiprocessing.set_start_method('spawn')
except RuntimeError:
    # Deliberately ignored: raised e.g. when a start method has already been
    # set, which happens when this cell is run again in the same kernel.
    pass

if args.run_lazy_mode:
    # Removed again at the end of this cell, after the training loop.
    os.environ["PT_HPU_LAZY_MODE"] = "1"
if args.is_hmp:
    # hmp conversion is configured with the bf16/fp32 op-list files given in `args`.
    from habana_frameworks.torch.hpex import hmp
    hmp.convert(opt_level=args.hmp_opt_level, bf16_file_path=args.hmp_bf16,
                fp32_file_path=args.hmp_fp32, isVerbose=args.hmp_verbose)

# The torch RNG is always seeded; Python's `random` only in deterministic mode.
torch.manual_seed(args.seed)

if args.deterministic:
    seed = args.seed
    random.seed(seed)

else:
    # `seed` is not read again anywhere in this cell.
    seed = None

device = torch.device('hpu')

# NOTE(review): cuDNN setting while the device is 'hpu' — presumably carried
# over from the GPU version of the script; confirm whether it is still needed.
torch.backends.cudnn.benchmark = True

# Per-channel mean/std normalisation shared by the training and test pipelines.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Training images: random resized crop and horizontal flip, then tensor + normalise.
_train_pipeline = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])
dataset = datasets.FakeData(transform=_train_pipeline)

# Test images: resize and centre crop, then tensor + normalise.
_test_pipeline = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])
dataset_test = datasets.FakeData(transform=_test_pipeline)

# Random order for training, sequential order for the test set.
train_sampler = torch.utils.data.RandomSampler(dataset)
test_sampler = torch.utils.data.SequentialSampler(dataset_test)

if args.workers > 0:
    # patch torch cuda functions that are being unconditionally invoked
    # in the multiprocessing data loader
    torch.cuda.current_device = lambda: None
    torch.cuda.set_device = lambda x: None

data_loader_type = torch.utils.data.DataLoader

# Options common to both loaders; only the dataset and sampler differ.
_loader_options = dict(
    batch_size=args.batch_size,
    num_workers=args.workers,
    pin_memory=True,
)

data_loader = data_loader_type(dataset, sampler=train_sampler, **_loader_options)

data_loader_test = data_loader_type(dataset_test, sampler=test_sampler, **_loader_options)

print("Creating model")
# Look up the ResNet50 constructor by name and build an untrained model on the device.
model = vars(torchvision.models)['resnet50'](pretrained=False)
model.to(device)

criterion = nn.CrossEntropyLoss()

# Plain torch SGD outside lazy mode; Habana's FusedSGD when lazy mode is on.
if not args.run_lazy_mode:
    sgd_optimizer = torch.optim.SGD
else:
    from habana_frameworks.torch.hpex.optimizers import FusedSGD
    sgd_optimizer = FusedSGD

optimizer = sgd_optimizer(
    model.parameters(),
    lr=args.lr,
    momentum=args.momentum,
    weight_decay=args.weight_decay,
)

# permute_params / permute_momentum are helpers defined outside this cell
# (presumably provided by the resnet50_notebook_utils star import — verify).
permute_params(model, True, args.run_lazy_mode)
permute_momentum(optimizer, True, args.run_lazy_mode)

# The training loop below refers to the model under this name.
model_for_train = model

# Run every epoch and report the total wall-clock training time.
print("Start training")
start_time = time.time()
try:
    for epoch in range(args.start_epoch, args.epochs):
        train_one_epoch(model_for_train, criterion, optimizer, data_loader,
                device, epoch, print_freq=args.print_freq)
finally:
    # Always remove the lazy-mode flag set earlier in this cell, even when
    # training raises or the cell is interrupted, so it does not linger in the
    # kernel's environment. The default of None keeps a missing key from
    # masking the original exception.
    if args.run_lazy_mode:
        os.environ.pop("PT_HPU_LAZY_MODE", None)

total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))

# Distributed Training

**Restart the kernel before running the next section of the notebook**

We will use the Model-References repo command line to demo multinode training. 

Multinode training differs in the following ways.

1. Initialization with hccl
```
import habana_frameworks.torch.core.hccl    
dist.init_process_group(backend='hccl', rank=rank, world_size=world_size)
```
2. Omit mark steps in lazy mode 
3. Use the torch distributed data sampler. ex:
```
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
```
4. Distributed data parallel PyTorch model initialization. ex:
```
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=bucket_size_mb, broadcast_buffers=False,
                    gradient_as_bucket_view=is_grad_view)
```

__Note__: Regarding steps 3 and 4, you must use the DistributedDataParallel API; the DataParallel API is not supported by Habana.

In [None]:
# Set PYTHONPATH again for the restarted kernel; commands launched from this kernel inherit it.
%set_env PYTHONPATH=/home/ubuntu/work/Model-References/PyTorch/computer_vision/classification/torchvision:/root/examples/models:/usr/lib/habanalabs/:/root

In [None]:
# Work from the torchvision classification reference directory, where the shell commands below are run.
%cd /home/ubuntu/work/Model-References/PyTorch/computer_vision/classification/torchvision

Apply the following patch to use fake data and remove evaluation

In [None]:
! git apply ~/fake_data_no_eval.patch

In [None]:
# Launch the Model-References demo script from the shell (unbuffered output via -u).
# NOTE(review): all flags are forwarded to demo_resnet.py, which is not shown here — their exact
# semantics (e.g. --world-size 8, --mode lazy, --data-type bf16) should be confirmed against that script.
! python3 -u demo_resnet.py  --world-size 8 --batch-size 256 --model resnet50 --device hpu --print-freq 1 \
  --channels-last True --deterministic --data-path $HOME --mode lazy \
  --epochs 30 --data-type bf16  --custom-lr-values 0.275,0.45,0.625,0.8,0.08,0.008,0.0008 \
  --custom-lr-milestones 1,2,3,4,30,60,80 --dl-time-exclude=False --dl-worker-type=MP