# **Training**
* Container: codna_pytorch_p310

## AutoReload

In [2]:
%load_ext autoreload
%autoreload 2

## 1. parameter store 설정

In [3]:
import boto3
from time import strftime
from utils.ssm import parameter_store

In [4]:
region = boto3.Session().region_name
pm = parameter_store(region)
prefix = pm.get_params(key="PREFIX")  ## 수정이 필요합니다.

## 2. Training-job for preprocessing

In [5]:
import os
import sagemaker
from sagemaker.huggingface import HuggingFace
from sagemaker.workflow.execution_variables import ExecutionVariables



* params for training job

In [6]:
# Set to True to enable SageMaker to run locally
local_mode = False

if local_mode:
    instance_type = "local_gpu"
    
    from sagemaker.local import LocalSession
    import os
    
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    
    local_data_dir = os.getcwd() + '/data/preprocessing'
    data_channels = {
        "train": f"file://{local_data_dir}/train",
        "test": f"file://{local_data_dir}/test",
    }
    
else:
    
    instance_type = "ml.p3.2xlarge" #"ml.p3.2xlarge"#"ml.g4dn.8xlarge"#"ml.p3.2xlarge", 'ml.p3.16xlarge' , ml.g4dn.8xlarge
    
    sagemaker_session = sagemaker.Session()
    data_channels = {
        "train": os.path.join(pm.get_params(key="-".join([prefix, "PREP-DATA-PATH"])), "train"), 
        "test": os.path.join(pm.get_params(key="-".join([prefix, "PREP-DATA-PATH"])), "test"),
    }

instance_count = 1
do_spot_training = False
max_wait = None
max_run = 1*60*60   

resume = True
    
role = pm.get_params(key="-".join([prefix, "SAGEMAKER-ROLE-ARN"]))
proc_prefix = "/opt/ml/processing"
bucket_name = pm.get_params(key="-".join([prefix, "BUCKET-NAME"]))


output_path = os.path.join(
    "s3://{}".format(bucket_name),
    prefix,
    "training",
    "model-output"
)

code_location = os.path.join(
    "s3://{}".format(bucket_name),
    prefix,
    "training",
    "backup_codes"
)

git_config = {
    'repo': f'https://{pm.get_params(key="-".join([prefix, "CODE_REPO"]))}',
    'branch': 'main',
    'username': pm.get_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), enc=True),
    'password': pm.get_params(key="-".join([prefix, "CODECOMMIT-PWD"]), enc=True)
}  

model_name = 'distilbert-base-uncased'
tokenizer_name = 'distilbert-base-uncased'
label_cnt=2

create_date = strftime("%m%d-%H%M%s")
training_job_name=f'finetune-{model_name}-{create_date}'

kwargs = {}

In [7]:
print (f'instance-type: {instance_type}')
print (f'role: {role}')
print (f'bucket: {bucket_name}')
print (f'data_channels: {data_channels}')
print (f'sagemaker_session: {sagemaker_session}')
print (f'git_config: {git_config}')
print (f'training_job_name: {training_job_name}')

instance-type: ml.p3.2xlarge
role: AmazonSageMaker-ExecutionRole-20221004T162466
bucket: sm-bert-ramp
data_channels: {'train': 's3://sm-bert-ramp/ramp-mlops/preprocessing/data/train', 'test': 's3://sm-bert-ramp/ramp-mlops/preprocessing/data/test'}
sagemaker_session: <sagemaker.session.Session object at 0x7fcbdc61ef50>
git_config: {'repo': 'https://git-codecommit.ap-northeast-2.amazonaws.com/v1/repos/bert-code', 'branch': 'main', 'username': 'dongjin-at-419974056037', 'password': 'n1h2OES6ZiHws5kGNt0TJxtoLaAGxjLkOxtmlzc5YWg='}
training_job_name: finetune-distilbert-base-uncased-0807-05461691387181


In [8]:
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 64,
    'model_name': model_name,
    'tokenizer_name': tokenizer_name,
    'output_dir':'/opt/ml/checkpoints',
    'label_size':label_cnt
}

In [9]:
est = HuggingFace(
    entry_point='train.py',
    source_dir='./code',
    git_config=git_config,
    instance_type=instance_type,
    instance_count=instance_count,
    role=role,
    volume_size=256,
    code_location = code_location,
    output_path=output_path,
    transformers_version='4.28.1', 
    pytorch_version='2.0.0',
    py_version='py310',
    hyperparameters = hyperparameters,
    max_run=36000, # expected max run in seconds
    sagemaker_session=sagemaker_session, 
)


In [10]:
est.fit(
    data_channels,
    wait=False,
    job_name=training_job_name
)

Cloning into '/tmp/tmpct1i0x8y'...
remote: Counting objects: 9, done.        
Already on 'main'


Your branch is up to date with 'origin/main'.
Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: finetune-distilbert-base-uncased-0807-05461691387181


In [11]:
sagemaker_session.wait_for_job(training_job_name)


2023-08-07 05:46:30 Starting - Starting the training job...
2023-08-07 05:46:55 Starting - Preparing the instances for training............
2023-08-07 05:48:00 Downloading - Downloading input data...
2023-08-07 05:48:20 Training - Downloading the training image....................................................
2023-08-07 05:52:47 Training - Training image download completed. Training in progress......................
2023-08-07 05:54:37 Uploading - Uploading generated training model.....
2023-08-07 05:55:08 Completed - Training job completed


{'TrainingJobName': 'finetune-distilbert-base-uncased-0807-05461691387181',
 'TrainingJobArn': 'arn:aws:sagemaker:ap-northeast-2:419974056037:training-job/finetune-distilbert-base-uncased-0807-05461691387181',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sm-bert-ramp/ramp-mlops/training/model-output/finetune-distilbert-base-uncased-0807-05461691387181/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'epochs': '1',
  'label_size': '2',
  'model_name': '"distilbert-base-uncased"',
  'output_dir': '"/opt/ml/checkpoints"',
  'sagemaker_container_log_level': '20',
  'sagemaker_job_name': '"finetune-distilbert-base-uncased-0807-05461691387181"',
  'sagemaker_program': '"train.py"',
  'sagemaker_region': '"ap-northeast-2"',
  'sagemaker_submit_directory': '"s3://sm-bert-ramp/ramp-mlops/training/backup_codes/finetune-distilbert-base-uncased-0807-05461691387181/source/sourcedir.tar.gz"',
  'tokenizer_name': '"distilbert-base-uncase

## 3. parameter store에 Processing output 추가

In [12]:
pm.put_params(key="-".join([prefix, "MODEL-PATH"]), value=est.model_data, overwrite=True)

'Store suceess'