# <B> Training </B>
* Container: codna_pytorch_p39

## AutoReload

In [3]:
%load_ext autoreload
%autoreload 2

## 1. parameter store 셋팅

In [4]:
import boto3
from utils.ssm import parameter_store

In [5]:
strRegionName=boto3.Session().region_name
pm = parameter_store(strRegionName)
prefix = pm.get_params(key="PREFIX")

## 2.Training-job for preprocessing

In [7]:
import os
import sagemaker
from omegaconf import OmegaConf
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial

* **Set Up SageMaker Experiment**
    - Create or load [SageMaker Experiment](https://docs.aws.amazon.com/sagemaker/latest/dg/experiments.html) for the example training job. This will create an experiment trial object in SageMaker.

In [17]:
from time import strftime

In [8]:
def create_experiment(experiment_name):
    try:
        sm_experiment = Experiment.load(experiment_name)
    except:
        sm_experiment = Experiment.create(experiment_name=experiment_name)

In [9]:
def create_trial(experiment_name):
    create_date = strftime("%m%d-%H%M%s")
    sm_trial = Trial.create(trial_name=f'{experiment_name}-{create_date}',
                            experiment_name=experiment_name)

    job_name = f'{sm_trial.trial_name}'
    return job_name

* **Configure the training job**

    - Now we configure the training job, by modifying the `config.yaml` file that is stored in our source code directory.
    - We pass relative directory paths for the data based on the SageMaker mount directory on the remote instance.

In [10]:
code_dir = os.path.join("./an4_nemo_sagemaker", "code", "training")
config_dir = os.path.join("./an4_nemo_sagemaker", "code", "training", "conf")
config_path = os.path.join(config_dir, "config.yaml")

* params for training job

In [61]:
#data_dir = os.getcwd() + "/data"

# Set to True to enable SageMaker to run locally
local_mode = False

if local_mode:
    instance_type = "local_gpu"
else:
    instance_type = "ml.p3.2xlarge"#"ml.g4dn.8xlarge"#"ml.p3.2xlarge", 'ml.p3.16xlarge'
    
if instance_type =='local_gpu':
    from sagemaker.local import LocalSession
    import os
    
    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}
    
    data_channels = {"training": f"file://{data_dir}", "testing": f"file://{data_dir}"}
    
else:
    sagemaker_session = sagemaker.Session()
    data_channels = {"training": pm.get_params(key="".join([prefix, "PREP-DATA-PATH"])), "testing": pm.get_params(key="".join([prefix, "PREP-DATA-PATH"]))}

instance_count = 1

do_spot_training = False
max_wait = None
max_run = 1*60*60   

resume = False
    
proc_prefix = "/opt/ml/processing"

output_path = os.path.join(
    "s3://{}".format(pm.get_params(key=prefix + "BUCKET")),
    "training",
    "model-output"
)

checkpoint_s3_uri = os.path.join(
    "s3://{}".format(pm.get_params(key=prefix + "BUCKET")),
    "training",
    "ckpt"
)

code_location = os.path.join(
    "s3://{}".format(pm.get_params(key=prefix + "BUCKET")),
    "training",
    "backup_codes"
)

experiment_name = ''.join([prefix, "nemo-exp1"])

metric_definitions=[
     {"Name": "train_loss", "Regex": "loss.*=\D*(.*?)$"}
]

kwargs = {}

In [62]:
print("experiment_name : {} \ntrain_instance_type : {} \ntrain_instance_count : {}\n data_channels : {}\n data_dir : {}".format(experiment_name, instance_type, instance_count, data_channels, data_dir))    

experiment_name : SM-NeMo-nemo-exp1 
train_instance_type : ml.p3.2xlarge 
train_instance_count : 1
 data_channels : {'training': 's3://sm-nemo-bucket/preprocessing/data', 'testing': 's3://sm-nemo-bucket/preprocessing/data'}
 data_dir : ./dataset-tmp


* config

In [63]:
conf = OmegaConf.load(config_path)

# Sampling
conf.model.sample_rate = 16000 #12800 ## for finetuing

# Set Data Locations based on the mounted directory in the SageMaker instance
conf.model.train_ds.manifest_filepath = "/opt/ml/input/data/training/an4/train_manifest.json"
conf.model.validation_ds.manifest_filepath = "/opt/ml/input/data/testing/an4/test_manifest.json"
# training setup
conf.trainer.accelerator = "gpu"
conf.trainer.num_nodes = instance_count
# enable SageMaker DDP
conf.trainer.strategy = None #"ddp"
conf.trainer.max_epochs = 2

# Output directory for our experiment within the SageMaker instance
conf.exp_manager.exp_dir="/opt/ml/model/"

# Create a Small Variant of the Conformer Model
conf.model.encoder.n_layers = 8
conf.model.n_heads = 4
conf.model.spec_augment.time_masks = 5

# Set Optimizer parameters
conf.model.optim.lr = 2.0 # by default we using Noam scheduling, the LR is a multiplier


if resume == True:
    # resume flags if crashes occur
    conf.exp_manager.resume_if_exists=True 
    conf.exp_manager.resume_ignore_no_checkpoint=True
    # the pre-trained model we want to fine-tune
    conf.init_from_nemo_model = "CTC.nemo"
else:
    # resume flags if crashes occur
    conf.exp_manager.resume_if_exists=False 
    conf.exp_manager.resume_ignore_no_checkpoint=True
    

OmegaConf.save(conf, config_path)

* Define processing job

In [64]:
est = PyTorch(
    entry_point="speech_to_text_ctc.py", # the script we want to run
    source_dir=code_dir, # where our conf/script is
    role=pm.get_params(key=prefix + "SAGEMAKER-ROLE-ARN"),
    instance_type=instance_type,
    instance_count=instance_count,
    image_uri=pm.get_params(key=''.join([prefix, "IMAGE-URI"])),
    # framework_version="1.13.1", # version of PyTorch
    # py_version="py39",
    volume_size=1024,
    code_location = code_location,
    output_path=output_path,
    disable_profiler=True,
    debugger_hook_config=False,
    hyperparameters={'config-path': 'conf'},
    #distribution={"smdistributed":{"dataparallel":{"enabled":True, "fp16": True}}},
    sagemaker_session=sagemaker_session,
    checkpoint_s3_uri=checkpoint_s3_uri,
    metric_definitions=metric_definitions,
    max_run=max_run,
    **kwargs
)

* run

In [None]:
if instance_type =='local_gpu': est.checkpoint_s3_uri = None

create_experiment(experiment_name)
job_name = create_trial(experiment_name)

est.fit(
    inputs=data_channels, 
    job_name=job_name,
    experiment_config={
      'TrialName': job_name,
      'TrialComponentDisplayName': job_name,
    },
    wait=True,
)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker:Creating training-job with name: SM-NeMo-nemo-exp1-0316-10011678960900


2023-03-16 10:01:49 Starting - Starting the training job...
2023-03-16 10:02:14 Starting - Preparing the instances for training...

In [51]:
data_dir = "./dataset-tmp"

In [2]:

download_an4(
    data_dir=data_dir,
    train_mount_dir="/opt/ml/input/data/training/",
    test_mount_dir="/opt/ml/input/data/testing/",
)

******
Dataset downloaded at: ./dataset-tmp/an4_sphere.tar.gz
Converting .sph to .wav...
Finished conversion.
******
******
Training manifest created.
Test manifest created.
***Done***


'/home/ec2-user/SageMaker/nemo-on-sagemaker'