# 1. SageMaker Training for Diffusion model
---

본 모듈에서는 Amzaon SageMaker API을 효과적으로 이용하기 위해 multigpu-distributed 학습을 위한 PyTorch 프레임워크 자체 구현만으로 모델 훈련을 수행해 봅니다.

In [1]:
install_needed = True  # should only be True once
# install_needed = False

In [2]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
#     !{sys.executable} -m pip install -U split-folders tqdm albumentations crc32c wget
    !{sys.executable} -m pip install 'sagemaker[local]' --upgrade
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker
    !/bin/bash ./local/local_mode_setup.sh
    !/bin/bash ./local/local_change_setting.sh
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
nvidia-docker2 already installed. We are good to go!
SageMaker instance route table setup is ok. We are good to go.
SageMaker instance routing for Docker is ok. We are good to go!
Already revised


## 2. 환경 설정

<p>Sagemaker 학습에 필요한 기본적인 package를 import 합니다. </p>
<p>boto3는 HTTP API 호출을 숨기는 편한 추상화 모델을 가지고 있고, Amazon EC2 인스턴스 및 S3 버켓과 같은 AWS 리소스와 동작하는 파이선 클래스를 제공합니다. </p>
<p>sagemaker python sdk는 Amazon SageMaker에서 기계 학습 모델을 교육 및 배포하기 위한 오픈 소스 라이브러리입니다.</p>

In [1]:
import joblib
import matplotlib.pyplot as plt
import sagemaker
# import splitfolders

import datetime
import glob
import os
import time
import warnings

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial

# import wget
# import tarfile
import shutil

import boto3
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision

# from tqdm import tqdm
from time import strftime
from PIL import Image
from torch.utils.data import Dataset
from torchvision import datasets, transforms

from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch

from sagemaker.debugger import (Rule,
                                rule_configs,
                                ProfilerConfig, 
                                FrameworkProfile, 
                                DetailedProfilingConfig, 
                                DataloaderProfilingConfig, 
                                PythonProfilingConfig)

warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

In [2]:
role = get_execution_role()

In [3]:
sagemaker.__version__

'2.75.0'

In [4]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name,
                                          tags=[
                                              {
                                                  'Key': 'multigpu',
                                                  'Value': 'yes'
                                              },
                                              {
                                                  'Key': 'multinode',
                                                  'Value': 'yes'
                                              },
                                          ])

In [5]:
def create_trial(experiment_name, set_param, i_type, i_cnt, spot):
    create_date = strftime("%m%d-%H%M%s")
    
    if set_param['sagemakerdp']:
        algo = 'sdp'
#     elif set_param['sagemakermp']:
#         algo = 'smp'
    else:
        algo = 'ds'
    
    spot = 's' if spot else 'd'
    i_tag = 'test'
    if i_type == 'ml.p3.16xlarge':
        i_tag = 'p3'
    elif i_type == 'ml.p3dn.24xlarge':
        i_tag = 'p3dn'
    elif i_type == 'ml.p4d.24xlarge':
        i_tag = 'p4d'    
        
    trial = "-".join([i_tag,str(i_cnt),algo, spot])
       
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{trial}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

In [6]:
bucket = 'diffusion-sagemaker-211011'
code_location = f's3://{bucket}/sm_codes'
output_path = f's3://{bucket}/poc_diffusion/output' 
s3_log_path = f's3://{bucket}/tf_logs' 
# s3_log_path = f'{bucket}/tf_logs' 

In [7]:
metric_definitions=[
     {'Name': 'train:lr', 'Regex': 'lr - (.*?),'},
     {'Name': 'train:Loss', 'Regex': 'loss -(.*?),'},
]

In [8]:
from sagemaker.debugger import Rule, ProfilerRule, rule_configs

rules=[ 
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.overfit()),
    ProfilerRule.sagemaker(rule_configs.ProfilerReport()),
]

In [10]:
hyperparameters = {
    'attention_resolutions': '32,16,8',
    'class_cond': False,
    'diffusion_steps': 1000,
    'image_size': 128,
    'channel_mult': '1,1,2,4,4',
#     'channel_mult': '1,1.5,2,4,5',
    'learn_sigma': True,
    'noise_schedule': 'linear',
    'num_channels': 256,
    'num_heads': 4,
    'num_res_blocks': 3,
    'resblock_updown': True,
    'use_fp16': True,
    'use_scale_shift_norm': True,
#     'schedule_sampler' : 'uniform',
    'lr': 1e-4,
    'weight_decay': 0.0,
    'lr_anneal_steps' : 1000,
    'batch_size' : 16,
#     'microbatch' : -1,
    'ema_rate' : '0.9999',
    'log_interval' : 10,
    'save_interval' : 200,
#     'resume_checkpoint' : "/opt/ml/code/resume_ckt/model000100.pt",
#     'fp16_scale_growth' : 1e-3,
    's3_log_path' : s3_log_path,   ### 로그를 위한 s3_log_path 추가
    'sagemakerdp' : True,
#     'eps': 1e-8,
    }


experiment_name = 'diffusion-poc-exp3'
instance_type = 'ml.p4d.24xlarge'  # 'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'local_gpu'
# instance_type = 'local_gpu'
instance_count = 2
# instance_type = 'ml.g4dn.4xlarge'  # 'ml.p3.16xlarge', 'ml.p3dn.24xlarge', 'ml.p4d.24xlarge', 'local_gpu'
# instance_count = 2
do_spot_training = False
max_wait = None
max_run = 1*60*60

In [11]:
if instance_type =='local_gpu':
    from sagemaker.local import LocalSession
    from pathlib import Path

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    s3_data_path = 'file:///home/ec2-user/SageMaker/dataset/dataset'
    source_dir = f'{Path.cwd()}/scripts'
    checkpoint_s3_bucket = None
else:
    sess = boto3.Session()
    sagemaker_session = sagemaker.Session()
    sm = sess.client('sagemaker')
    s3_data_path = 's3://dataset-us-west-2-cyj/cifar10'
    source_dir = 'scripts'
    checkpoint_s3_bucket = f's3://{bucket}/checkpoints'

In [12]:
image_uri = None
distribution = None
train_job_name = 'sagemaker'


train_job_name = 'smp-dist'
distribution = {}

if hyperparameters['sagemakerdp']:
    distribution["smdistributed"]={ 
                        "dataparallel": {
                            "enabled": True
                        }
                }
else:
    distribution["mpi"]={
                        "enabled": True,
    #                     "processes_per_host": 8, # Pick your processes_per_host
    #                     "custom_mpi_options": "-verbose -x orte_base_help_aggregate=0 "
                  }

if do_spot_training:
    max_wait = max_run

print("train_job_name : {} \ntrain_instance_type : {} \ntrain_instance_count : {} \nimage_uri : {} \ndistribution : {}".format(train_job_name, instance_type, instance_count, image_uri, distribution))    

train_job_name : smp-dist 
train_instance_type : ml.p4d.24xlarge 
train_instance_count : 2 
image_uri : None 
distribution : {'smdistributed': {'dataparallel': {'enabled': True}}}


In [13]:
image_uri='322537213286.dkr.ecr.us-west-2.amazonaws.com/smddp-private-preview:smddp-v1.4.0-rc2-pt1.10'

In [14]:
# all input configurations, parameters, and metrics specified in estimator 
# definition are automatically tracked
estimator = PyTorch(
    entry_point='image_train.py',
    source_dir=source_dir,
    role=role,
    sagemaker_session=sagemaker_session,
    framework_version='1.10',
    py_version='py38',
    image_uri=image_uri,
    instance_count=instance_count,
    instance_type=instance_type,
    volume_size=1024,
    code_location = code_location,
    output_path=output_path,
    hyperparameters=hyperparameters,
    distribution=distribution,
    disable_profiler=True,
    debugger_hook_config=False,
    metric_definitions=metric_definitions,
#     rules=rules,
    max_run=max_run,
    use_spot_instances=do_spot_training,  # spot instance 활용
    max_wait=max_wait,
    subnets=['subnet-02e36c042e58264e6'],   ## 	subnet-05c77affac40aa7f3 (2b)  subnet-02e36c042e58264e6 (2c)
    security_group_ids=['sg-0bc738570daec9015'],
    checkpoint_s3_uri=checkpoint_s3_bucket
)

In [18]:
# Configure FSx Input for your SageMaker Training job
### 1200 MB/s 1.2TiB 1000MB/s/TiB

from sagemaker.inputs import FileSystemInput

file_system_directory_path= '/pzdyvbmv/dataset'  # '/5n6znbmv'    g4ljfbmv
 
file_system_id='fs-05d92d4916e54e652'  # fs-0849611d06d289065  063be12d6ca6d7862

file_system_access_mode='rw'
file_system_type='FSxLustre'
train_fs = FileSystemInput(file_system_id=file_system_id,
                                    file_system_type=file_system_type,
                                    directory_path=file_system_directory_path,
                                    file_system_access_mode=file_system_access_mode)

In [20]:
if instance_type =='local_gpu':
    inputs = s3_data_path
else:
    inputs = train_fs

In [25]:
%%time
create_experiment(experiment_name)
job_name = create_trial(experiment_name, hyperparameters, instance_type, instance_count, do_spot_training)

estimator.fit(
    inputs={'training': inputs},
    job_name=job_name,
    experiment_config={
      'TrialName': job_name,
      'TrialComponentDisplayName': job_name,
    },
    wait=False,
)

INFO:sagemaker:Creating training-job with name: diffusion-poc-exp3-p4d-2-sdp-d-0206-06541644130482


CPU times: user 356 ms, sys: 20.5 ms, total: 376 ms
Wall time: 1.01 s


In [26]:
job_name=estimator.latest_training_job.name

In [27]:
sagemaker_session.logs_for_job(job_name=job_name, wait=True)

2022-02-06 06:54:45 Starting - Starting the training job...
2022-02-06 06:54:47 Starting - Launching requested ML instances.........
2022-02-06 06:56:22 Starting - Preparing the instances for training........................
2022-02-06 07:00:36 Downloading - Downloading input data
2022-02-06 07:00:36 Training - Downloading the training image................................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-02-06 07:06:00,298 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-02-06 07:06:00,378 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m

2022-02-06 07:06:04 Training - Training image download completed. Training in progress.[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[35m2022-02-06 07:06:05,062 sa

UnexpectedStatusException: Error for Training job diffusion-poc-exp3-p4d-2-sdp-d-0206-06541644130482: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "mpirun --host algo-1:8,algo-2:8 -np 16 --allow-run-as-root --tag-output --oversubscribe -mca btl_tcp_if_include eth0 -mca oob_tcp_if_include eth0 -mca plm_rsh_no_tree_spawn 1 -mca pml ob1 -mca btl ^openib -mca orte_abort_on_non_zero_status 1 -mca btl_vader_single_copy_mechanism none -mca plm_rsh_num_concurrent 2 -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH -x PATH -x SMDATAPARALLEL_USE_HOMOGENEOUS=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 -x LD_PRELOAD=/opt/conda/lib/python3.8/site-packages/gethostname.cpython-38-x86_64-linux-gnu.so -x SMDATAPARALLEL_SERVER_ADDR=algo-1 -x SMDATAPARALLEL_SERVER_PORT=7592 -x SAGEMAKER_INSTANCE_TYPE=ml.p4d.24xlarge smddprun /opt/conda/bin/python3.8 -m mpi4py image_train.py --attention_resolutions 32,16,8 --batch_size 8 --channel_mult 1,1,2,4,4 --class_cond False --diffusion_steps 1000 --ema_rate 0.9999 --image_size 128 --learn_sigma True --log_interval 10 --lr 0.0001 --lr_anneal_steps