# SageMaker Model Profiling

Here we will see how to use SageMaker profiling to view our training system metrics and to generate a profiler report.

## `pytorch_cifar_profiling.py`
<details>
  <summary> Click here to see the full code for the script </summary>

```python

import argparse
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms

from smdebug import modes
from smdebug.profiler.utils import str2bool
from smdebug.pytorch import get_hook

def train(args, net, device):
    """Train and validate ``net`` on CIFAR-10 and return the median epoch time.

    Args:
        args: Parsed options; ``args.batch_size`` and ``args.epoch`` are read.
        net: Model to optimize; its parameters are updated by SGD.
        device: Device every batch is moved to before the forward pass.

    Returns:
        The 50th percentile of the per-epoch wall-clock durations in seconds.
        Each duration covers one training pass plus one validation pass.
    """
    hook = get_hook(create_if_not_exists=True)
    batch_size = args.batch_size
    num_epochs = args.epoch

    # Per-channel values handed to Normalize in both pipelines.
    channel_mean = (0.4914, 0.4822, 0.4465)
    channel_std = (0.2023, 0.1994, 0.2010)

    # Training images get random crop/flip augmentation before normalization.
    augment_and_normalize = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])
    # Validation images are only converted and normalized.
    normalize_only = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    train_data = torchvision.datasets.CIFAR10(
        root="./data", train=True, download=True, transform=augment_and_normalize
    )
    train_batches = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size, shuffle=True
    )

    valid_data = torchvision.datasets.CIFAR10(
        root="./data", train=False, download=True, transform=normalize_only
    )
    valid_batches = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size, shuffle=False
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=1.0, momentum=0.9)

    # Let the debugger hook (when one exists) observe the loss module.
    if hook:
        hook.register_loss(criterion)

    durations = []
    for epoch_idx in range(num_epochs):
        # ---- training pass ----
        print("START TRAINING")
        if hook:
            hook.set_mode(modes.TRAIN)
        started_at = time.time()
        net.train()
        train_total = 0  # sum of batch losses, not an average
        for inputs, targets in train_batches:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(net(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            train_total += batch_loss.item()

        # ---- validation pass (no gradients) ----
        print("START VALIDATING")
        if hook:
            hook.set_mode(modes.EVAL)
        net.eval()
        valid_total = 0  # sum of batch losses, not an average
        with torch.no_grad():
            for inputs, targets in valid_batches:
                inputs = inputs.to(device)
                targets = targets.to(device)
                valid_total += criterion(net(inputs), targets).item()

        elapsed = time.time() - started_at
        durations.append(elapsed)
        print(
            "Epoch %d: train loss %.3f, val loss %.3f, in %.1f sec"
            % (epoch_idx, train_total, valid_total, elapsed)
        )

    # Median of the recorded epoch durations.
    return np.percentile(durations, 50)


def main():
    """Parse the command-line options, build the model and run training.

    Prints every parsed option as ``key:value``, instantiates the torchvision
    model named by ``--model`` with pretrained weights, moves it to the
    selected device and prints the median epoch time returned by ``train``.
    """
    parser = argparse.ArgumentParser()
    # (flag, converter, default) for each supported option.
    option_table = (
        ("--batch_size", int, 128),
        ("--epoch", int, 1),
        ("--gpu", str2bool, True),
        ("--model", str, "resnet50"),
    )
    for flag, converter, fallback in option_table:
        parser.add_argument(flag, type=converter, default=fallback)

    opt = parser.parse_args()

    for name, setting in vars(opt).items():
        print(f"{name}:{setting}")

    # Look the model constructor up by name in torchvision.models.
    build_model = models.__dict__[opt.model]
    net = build_model(pretrained=True)

    device = torch.device("cuda" if opt.gpu == 1 else "cpu")
    net.to(device)

    # Run training and report the median epoch duration.
    median_time = train(opt, net, device)
    print("Median training time per Epoch=%.1f sec" % median_time)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
```

</details>

First we will need to install `smdebug`.

In [3]:
# install dependencies
!pip install smdebug

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
[0m

Next we need to specify the rules we want SageMaker to evaluate and configure the profiler. Below I have specified three rules: loss not decreasing, low GPU utilization, and generation of the profiler report. In the profiler configuration that follows, I have also specified that system metrics should be collected every 500 milliseconds.

In [4]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

# One debugger rule on the loss plus two profiler rules
# (low GPU utilization and the profiler report).
loss_rule = Rule.sagemaker(rule_configs.loss_not_decreasing())
gpu_rule = ProfilerRule.sagemaker(rule_configs.LowGPUUtilization())
report_rule = ProfilerRule.sagemaker(rule_configs.ProfilerReport())

rules = [loss_rule, gpu_rule, report_rule]

In [5]:
from sagemaker.debugger import ProfilerConfig, FrameworkProfile

# Framework profiling is configured with num_steps=10; system metrics are
# monitored at a 500 ms interval.
framework_profile = FrameworkProfile(num_steps=10)
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

Now that we have specified our profiler rules, we can create our hyperparameter dict and estimator to perform training. We will also need to specify our profiler rules and configs in the estimator.

In [6]:
# Keys match the command-line flags parsed by pytorch_cifar_profiling.py.
hyperparameters = dict(
    batch_size=2048,
    gpu=True,
    epoch=2,
    model="resnet50",
)

In [11]:
import sagemaker
from sagemaker.pytorch import PyTorch

# Execution role, session and the default bucket used for job artifacts.
role = sagemaker.get_execution_role()
session = sagemaker.Session()
region = session.boto_region_name
bucket = session.default_bucket()
output_path = f's3://{bucket}/jobs'
job_name = "smdebugger-profiler-cifar-pytorch"

# Spot-instance settings; checkpoints go under the job's prefix in the bucket.
spot_settings = dict(
    use_spot_instances=True,
    checkpoint_s3_uri=f'{output_path}/{job_name}/checkpoints',
    max_run=600,
    max_wait=1200,
)

# Debugger rules and profiler configuration created in the earlier cells.
profiling_settings = dict(
    rules=rules,
    profiler_config=profiler_config,
)

estimator = PyTorch(
    base_job_name=job_name,
    role=role,
    source_dir="scripts",
    entry_point="pytorch_cifar_profiling.py",
    hyperparameters=hyperparameters,
    framework_version="1.8",
    py_version="py36",
    instance_type="ml.p2.xlarge",
    instance_count=1,
    **spot_settings,
    **profiling_settings,
)

In [None]:
# Start the training job; wait=True keeps this cell attached to the job,
# so its status and log lines are printed below.
estimator.fit(wait=True)

2022-05-07 08:15:54 Starting - Starting the training job...
2022-05-07 08:15:55 Starting - Launching requested ML instancesLossNotDecreasing: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
......
2022-05-07 08:17:24 Starting - Preparing the instances for training......
2022-05-07 08:18:17 Downloading - Downloading input data...
2022-05-07 08:18:54 Training - Downloading the training image................................
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2022-05-07 08:23:59,490 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2022-05-07 08:23:59,512 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2022-05-07 08:24:02,554 sagemaker_pytorch_container.training INFO     Invoking user training script.
2022-05-07 08:24:03,131 sagemaker-training-toolkit INFO     Invok

In [17]:
import boto3

# Read the region from a default boto3 session and the name of the job
# the estimator ran most recently, then print both.
session = boto3.session.Session()
region = session.region_name
training_job_name = estimator.latest_training_job.name

print(
    f"Training jobname: {training_job_name}",
    f"Region: {region}",
    sep="\n",
)

Training jobname: smdebugger-profiler-cifar-pytorch-2022-05-07-08-15-53-574
Region: eu-west-1


## Checking System Utilization
Below is some boilerplate code to get the training job object using the training job name and display the system metrics.

In [24]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

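ImportError: cannot import name 'Markup' from 'jinja2' (/opt/conda/lib/python3.7/site-packages/jinja2/__init__.py)

> **Note:** This error is caused by the installed Jinja2 version. Jinja2 3.1 removed `Markup` from its top-level namespace (it now lives in `markupsafe`), while a package in this import chain still imports it from `jinja2`. A common workaround is to pin an older release, e.g. `!pip install "jinja2<3.1"`, then restart the kernel and re-run the cell.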