In [1]:
# install dependencies
!pip install smdebug



In [2]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.debugger import DebuggerHookConfig, ProfilerConfig, FrameworkProfile
import sagemaker
from sagemaker.pytorch import PyTorch
import boto3
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys
from smdebug.profiler.analysis.notebook_utils.training_job import TrainingJob
from smdebug.profiler.analysis.notebook_utils.timeline_charts import TimelineCharts
import os
import IPython

[2022-05-07 12:33:14.817 ip-172-16-17-202:32741 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None


In [3]:
# Each entry pairs the rule wrapper class with the built-in rule-config factory
# it should wrap. The order here is the order of the resulting `rules` list.
rule_specs = [
    (Rule, rule_configs.loss_not_decreasing),
    (ProfilerRule, rule_configs.LowGPUUtilization),
    (ProfilerRule, rule_configs.ProfilerReport),
    (Rule, rule_configs.vanishing_gradient),
    (Rule, rule_configs.overfit),
    (Rule, rule_configs.overtraining),
    (Rule, rule_configs.poor_weight_initialization),
]

# Instantiate every built-in rule with its default configuration.
rules = [rule_cls.sagemaker(make_config()) for rule_cls, make_config in rule_specs]

In [4]:
# Profiler: system monitoring interval of 500 ms plus framework-level
# profiling configured with num_steps=10.
framework_profile = FrameworkProfile(num_steps=10)
profiler_config = ProfilerConfig(
    system_monitor_interval_millis=500,
    framework_profile_params=framework_profile,
)

# Debugger hook: separate save intervals for the train and eval modes.
# The values are passed as strings, exactly as written here.
hook_parameters = {"train.save_interval": "100", "eval.save_interval": "10"}
debugger_config = DebuggerHookConfig(hook_parameters=hook_parameters)

In [5]:
# Hyperparameters handed to the training entry point.
# Built from ordered (name, value) pairs; the resulting dict has the same keys,
# values and insertion order as a plain literal.
hyperparameters = dict(
    [
        ("batch_size", 2048),
        ("gpu", True),
        ("epoch", 2),
        ("model", "resnet50"),
    ]
)

In [11]:
# --- SageMaker context -------------------------------------------------------
session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name
bucket = session.default_bucket()

# --- Job naming and S3 locations ---------------------------------------------
job_name = "smdebugger-debugger-and-profiler-cifar-pytorch"
# In this cell `output_path` is only used to build the checkpoint URI below;
# it is not passed to the estimator itself.
output_path = f's3://{bucket}/jobs'

# --- Estimator settings ------------------------------------------------------
estimator_settings = {
    # What to run
    "base_job_name": job_name,
    "role": role,
    "source_dir": "scripts",
    "entry_point": "pytorch_cifar_profiling.py",
    "hyperparameters": hyperparameters,
    "framework_version": "1.8",
    "py_version": "py36",
    # Where to run it
    "instance_type": "ml.p2.xlarge",
    "instance_count": 1,
    # Managed spot training with checkpointing.
    # NOTE(review): the recorded run of this job was interrupted and ended as
    # 'Stopped'; confirm max_run / max_wait leave enough headroom.
    "use_spot_instances": True,
    "checkpoint_s3_uri": f'{output_path}/{job_name}/checkpoints',
    "max_run": 1200,
    "max_wait": 1800,
    # Profile and Debug parameters
    "rules": rules,
    "profiler_config": profiler_config,
    "debugger_hook_config": debugger_config,
}

estimator = PyTorch(**estimator_settings)

In [14]:
# Start the training job on the estimator configured above.
# `wait=True` is passed explicitly; no `inputs` argument is given here.
# NOTE(review): presumably the entry-point script obtains its dataset itself — confirm.
estimator.fit(wait=True)

2022-05-07 12:49:57 Starting - Starting the training job...
2022-05-07 12:49:59 Starting - Launching requested ML instancesLossNotDecreasing: InProgress
VanishingGradient: InProgress
Overfit: InProgress
Overtraining: InProgress
PoorWeightInitialization: InProgress
LowGPUUtilization: InProgress
ProfilerReport: InProgress
......
2022-05-07 12:51:24 Starting - Preparing the instances for training...............
2022-05-07 12:53:57 Downloading - Downloading input data............
2022-05-07 12:55:57 Training - Downloading the training image...............
2022-05-07 12:58:18 Training - Training image download completed. Training in progress...................
2022-05-07 13:01:19 Interrupted - Training job interrupted...........................
2022-05-07 13:06:02 Starting - Starting the training job......
2022-05-07 13:07:01 Starting - Launching requested ML instances.........
2022-05-07 13:08:21 Starting - Insufficient capacity error from EC2 while launching instances, retrying!..........

Job ended with status 'Stopped' rather than 'Completed'. This could mean the job timed out or stopped early for some other reason: Consider checking whether it completed as you expect.


Training seconds: 409
Billable seconds: 123
Managed Spot Training savings: 69.9%


In [15]:
# Name of the most recent training job launched from `estimator`.
training_job_name = estimator.latest_training_job.name

# Region as reported by a fresh boto3 session.
# NOTE(review): this rebinds `session`, which the estimator-setup cell bound to
# a sagemaker.Session, to a boto3 session; from here on `session` is the boto3 one.
session = boto3.session.Session()
region = session.region_name

print(f"Training jobname: {training_job_name}")
print(f"Region: {region}")

Training jobname: smdebugger-debugger-and-profiler-cifar--2022-05-07-12-49-57-160
Region: eu-west-1


In [16]:
# Load the Debugger output of the latest training job as an smdebug trial.
# NOTE(review): in the recorded run the job ended as 'Stopped' and this cell
# raised MissingCollectionFiles; confirm the job completed before loading.
debug_artifacts_path = estimator.latest_job_debugger_artifacts_path()
trial = create_trial(debug_artifacts_path)

# Names of all tensors the trial knows about.
print(trial.tensor_names())

# Number of recorded steps of the loss tensor, first for TRAIN, then for EVAL.
loss_tensor_name = "CrossEntropyLoss_output_0"

train_loss_steps = trial.tensor(loss_tensor_name).steps(mode=ModeKeys.TRAIN)
print(len(train_loss_steps))

eval_loss_steps = trial.tensor(loss_tensor_name).steps(mode=ModeKeys.EVAL)
print(len(eval_loss_steps))

[2022-05-07 13:25:07.307 ip-172-16-17-202:32741 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-eu-west-1-663084464644/smdebugger-debugger-and-profiler-cifar--2022-05-07-12-49-57-160/debug-output




MissingCollectionFiles: Training job has ended. All the collection files could not be loaded