# Debugger and Profiler

In [None]:
# install dependencies
!pip install smdebug

In [7]:
hyperparameters = {
    "batch_size": 2048,
    "gpu": True,
    "epoch": 2,
    "model": "resnet50",
}

In [8]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
#TODO: Can you add the rules you want to track
rules = [ 
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.overtraining()),
    Rule.sagemaker(rule_configs.poor_weight_initialization()),
]

In [9]:
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile

#TODO: Can you create the profiler and debugger configs
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500, 
    framework_profile_params=FrameworkProfile(
        num_steps=10
    )
    )
debugger_config = DebuggerHookConfig(
    hook_parameters={
        "train.save_interval": "100",
        "eval.save_interval": "10"
    }
)

Framework profiling will be deprecated from tensorflow 2.12 and pytorch 2.0 in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [10]:
import sagemaker
from sagemaker.pytorch import PyTorch
#TODO: Create the estimator to train your model
estimator = PyTorch(
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    source_dir='scripts',
    entry_point='pytorch_cifar_profiling.py',
    framework_version='1.8',
    py_version='py36',
    hyperparameters=hyperparameters, 
    profiler_config=profiler_config,
    debugger_hook_config=debugger_config,
    rules=rules
)

In [11]:
estimator.fit(wait=True)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2024-07-30-20-32-57-006


2024-07-30 20:32:57 Starting - Starting the training job...
2024-07-30 20:33:17 Starting - Preparing the instances for trainingLossNotDecreasing: InProgress
VanishingGradient: InProgress
Overfit: InProgress
Overtraining: InProgress
PoorWeightInitialization: InProgress
ProfilerReport: InProgress
...
2024-07-30 20:33:54 Downloading - Downloading the training image......
2024-07-30 20:34:55 Training - Training image download completed. Training in progress...bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-07-30 20:35:01,741 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-07-30 20:35:01,744 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-30 20:35:01,752 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-07-30 20:35:01,754 sagemaker_pytorch_container.training INFO     Invoking user training 

UnexpectedStatusException: Error for Training job pytorch-training-2024-07-30-20-32-57-006: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 pytorch_cifar_profiling.py --batch_size 2048 --epoch 2 --gpu True --model resnet50"
Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth
  0%|          | 0.00/97.8M [00:00<?, ?B/s]  9%|â         | 8.50M/97.8M [00:00<00:01, 89.1MB/s] 17%|ââ        | 17.1M/97.8M [00:00<00:00, 89.6MB/s] 27%|âââ       | 26.1M/97.8M [00:00<00:00, 91.7MB/s] 36%|ââââ      | 35.6M/97.8M [00:00<00:00, 94.9MB/s] 46%|âââââ     | 45.1M/97.8M [00:00<00:00, 96.8MB/s] 56%|ââââââ    | 54.7M/97.8M [00:00<00:00, 97.9MB/s] 66%|âââââââ   | 64.3M/97.8M [00:00<00:00, 98.9MB/s] 76%|ââââââââ  | 74.0M/97.8M [00:00<00:00, 99.6MB/s] 85%|âââââââââ | 83.5M/97.8M [00:00<00:00, 99.6MB/s] 9, exit code: 1

In [None]:
import boto3

session = boto3.session.Session()
region = session.region_name

training_job_name = estimator.latest_training_job.name
print(f"Training jobname: {training_job_name}")
print(f"Region: {region}")

In [None]:
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys

trial = create_trial(estimator.latest_job_debugger_artifacts_path())

In [None]:
print(trial.tensor_names())


In [None]:
# TODO: Can you print the names of all the tensors that were tracked
print((trial.tensor('CrossEntropyLoss_output_0').steps(mode)
# TODO: Can you print the number of datapoints for one of those tensors
# for both train and eval mode

In [None]:
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob

tj = TrainingJob(training_job_name, region)
tj.wait_for_sys_profiling_data_to_be_available()

In [None]:
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts

system_metrics_reader = tj.get_systems_metrics_reader()
system_metrics_reader.refresh_event_file_list()

view_timeline_charts = TimelineCharts(
    system_metrics_reader,
    framework_metrics_reader=None,
    select_dimensions=["CPU", "GPU"],
    select_events=["total"],
)

In [None]:
rule_output_path = estimator.output_path + estimator.latest_training_job.job_name + "/rule-output"
print(f"You will find the profiler report in {rule_output_path}")

In [None]:
! aws s3 ls {rule_output_path} --recursive

In [None]:
! aws s3 cp {rule_output_path} ./ --recursive

In [None]:
import os

# get the autogenerated folder name of profiler report
profiler_report_name = [
    rule["RuleConfigurationName"]
    for rule in estimator.latest_training_job.rule_job_summary()
    if "Profiler" in rule["RuleConfigurationName"]
][0]

In [None]:
import IPython

IPython.display.HTML(filename=profiler_report_name + "/profiler-output/profiler-report.html")