# <B> Setup for Nvidai NeMo with SageMaker </B>
* Container: codna_python3

## AutoReload

In [1]:
%load_ext autoreload
%autoreload 2

## 0. Install packages

In [2]:
install_needed = True  # should only be True once
# install_needed = False

In [3]:
%%bash
#!/bin/bash

DAEMON_PATH="/etc/docker"
MEMORY_SIZE=10G

FLAG=$(cat $DAEMON_PATH/daemon.json | jq 'has("data-root")')
# echo $FLAG

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak
    sudo cat $DAEMON_PATH/daemon.json.bak | jq '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":"'$MEMORY_SIZE'"}' | sudo tee $DAEMON_PATH/daemon.json > /dev/null
    sudo service docker restart
    echo "Docker Restart"
fi

Already revised


In [3]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker
    !{sys.executable} -m pip install -U datasets transformers
    !{sys.executable} -m pip install -U wget omegaconf text-unidecode sox
    
    ## Install NeMo
    !sudo yum install sox -y
    !sudo yum install libsndfile
    !pip install --upgrade --force-reinstall llvmlite
    BRANCH = 'main'
    !{sys.executable} -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
    
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.3.1
    Uninstalling pip-22.3.1:
      Successfully uninstalled pip-22.3.1
Successfully installed pip-23.0.1
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting sagemaker
  Downloading sagemaker-2.140.0.tar.gz (684 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m684.5/684.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collecte

## 1. Set roles

In [1]:
from sagemaker import get_execution_role

In [2]:
strSageMakerRoleName = get_execution_role().rsplit('/', 1)[-1]
print (f"SageMaker Execution Role Name: {strSageMakerRoleName}")

SageMaker Execution Role Name: AmazonSageMaker-ExecutionRole-20210401T133000


## 1.1 Attach IAM polich to sagemaker execution role (<b>with console</b>)
> **EC2ContainerRegistry**: "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess" <BR>
> **S3**: "arn:aws:iam::aws:policy/AmazonS3FullAccess"

## 2. Set default parameters

In [5]:
import boto3
import sagemaker

### Bucket / Prefix 설정

In [78]:
strRegionName = boto3.Session().region_name
strAccountId = boto3.client("sts").get_caller_identity().get("Account")
bucket_name = '  ' # <-- 사용할 bucket 명을 추가해 주세요. ex) sagemaker-us-east-1-123456789123
prefix = '  ' ## <-- 작업할 prefix 명을 추가해 주세요. ex) nemo-test

## 3. Create custom docker image for preprocessing

* docker build

In [28]:
!pygmentize custom-docker/Dockerfile

[37m## docker build -f Dockerfile -t 322537213286.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.13.1-gpu-py39-nemo-main .[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# From 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.13.1-gpu-py39[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# RUN pip install --no-cache-dir --upgrade pip \[39;49;00m[37m[39;49;00m
[37m#  && pip install --no-cache-dir -U omegaconf hydra-core librosa sentencepiece youtokentome inflect \[39;49;00m[37m[39;49;00m
[37m#  && pip install --no-cache-dir -U braceexpand webdataset editdistance jiwer \[39;49;00m[37m[39;49;00m
[37m#  && pip install --no-cache-dir -U pytorch-lightning \ [39;49;00m[37m[39;49;00m
[37m#  && pip install --no-cache-dir -qq https://github.com/pyannote/pyannote-audio/archive/develop.zip \ [39;49;00m[37m[39;49;00m
[37m#  && pip install --no-cache-dir git+https://github.com/huggingface/transformers \ [39;49;00m[37m[39;49;00m
[37m#  && pip install --no

* Base 이미지의 region, account-id 확인 후 아래 파라미터 입력

### 1) AWS CLI 를 이용한 방식

In [56]:
%%bash
strRepositoryName="nemo-test"
strDockerDir="./custom-docker/"
strTag="latest"

cd ${strDockerDir}
echo $(pwd)
container_name=${strRepositoryName}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
# region=${region:-us-west-2}

fullname="${account}.dkr.ecr.${region}.amazonaws.com/${container_name}:${strTag}"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --repository-names "${container_name}" > /dev/null 2>&1
if [ $? -ne 0 ]
then
    aws ecr create-repository --repository-name "${container_name}" > /dev/null
fi

# # Get the login command from ECR and execute it directly
# $(aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin "763104351884.dkr.ecr.us-west-2.amazonaws.com")

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
docker build -f Dockerfile -t ${fullname} .
# docker tag ${container_name} ${fullname}

# Get the login command from ECR and execute it directly
$(aws ecr get-login --region ${region} --no-include-email)
docker push ${fullname}
echo ${fullname}

/home/ec2-user/SageMaker/2023/training-code/Nvidia-NeMo/nemo-on-sagemaker/custom-docker
Sending build context to Docker daemon  10.24kB
Step 1/5 : FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.13.1-gpu-py39
 ---> f0cd3f7ded0e
Step 2/5 : RUN apt -y update && apt -y install sox
 ---> Using cache
 ---> 367197b752c7
Step 3/5 : RUN apt-get -y install libsox-fmt-all
 ---> Using cache
 ---> 1bdb2a0134ff
Step 4/5 : RUN pip install --no-cache-dir --upgrade pip  && pip install --no-cache-dir -U omegaconf hydra-core librosa sentencepiece youtokentome inflect sox  && pip install --no-cache-dir -U braceexpand webdataset editdistance jiwer  && pip install --no-cache-dir -U pytorch-lightning  && pip install --no-cache-dir -qq https://github.com/pyannote/pyannote-audio/archive/develop.zip  && pip install --no-cache-dir git+https://github.com/huggingface/transformers  && pip install --no-cache-dir git+https://github.com/NVIDIA/NeMo.git@main
 ---> Using cache
 ---> 06ea2b3fb1e6
St

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



### 2) AWS BOTO3 SDK를 이용한 방식

In [None]:
from utils.ecr import ecr_handler
ecr = ecr_handler()

In [None]:
strRepositoryName = "NeMo-Image"
strRepositoryName = strRepositoryName.lower()
strDockerDir = "./custom-docker/"
strTag = "latest"

In [16]:
ecr.build_docker(strDockerDir, strRepositoryName, strRegionName=region_name, strAccountId=account_id)

/home/ec2-user/SageMaker/nemo-on-sagemaker
/home/ec2-user/SageMaker/nemo-on-sagemaker/custom-docker
aws ecr get-login --region 'us-west-2' --registry-ids '763104351884' --no-include-email


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded

Sending build context to Docker daemon   12.8kB

Step 1/5 : FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.13.1-gpu-py39
 ---> f0cd3f7ded0e
Step 2/5 : RUN apt -y update && apt -y install sox
 ---> Using cache
 ---> 03ea9e49dca9
Step 3/5 : RUN apt-get -y install libsox-fmt-all
 ---> Using cache
 ---> e10716b824bd
Step 4/5 : RUN pip install --no-cache-dir --upgrade pip  && pip install --no-cache-dir -U omegaconf hydra-core librosa sentencepiece youtokentome inflect sox  && pip install --no-cache-dir -U braceexpand webdataset editdistance jiwer  && pip install --no-cache-dir -U pytorch-lightning  && pip install --no-cache-dir -qq https://github.com/pyannote/pyannote-audio/archive/develop.zip  && pip install --no-cache-dir git+https://github.com/huggingface/transformers  && pip install --no-cache-dir git+https://github.com/NVIDIA/NeMo.git@main
 ---> Running in 50e1789ca22a
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
     ━━━━━━━━

* Push the image to ECR

In [17]:
strEcrRepositoryUri = ecr.register_image_to_ecr(strRegionName, strAccountId, strRepositoryName, strTag)

== REGISTER AN IMAGE TO ECR ==
  processing_repository_uri: 419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/nemo-image:latest
aws ecr get-login --region 'ap-northeast-2' --registry-ids '419974056037' --no-include-email


https://docs.docker.com/engine/reference/commandline/login/#credentials-store



Login Succeeded

aws ecr create-repository --repository-name 'nemo-image'
docker tag 'nemo-image:latest' '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/nemo-image:latest'
docker push '419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/nemo-image:latest'
== REGISTER AN IMAGE TO ECR ==


In [18]:
#strEcrRepositoryUri = "419974056037.dkr.ecr.ap-northeast-2.amazonaws.com/nemo-image"

* Save image-uri to parameter store

In [None]:
strEcrRepositoryUri = ' '  ## <-- 생성된 ECR의 URI를 넣어주세요.. ex) 123456789123.dkr.ecr.us-west-2.amazonaws.com/nemo-test:latest

## 4. Download & Upload dataset

In [57]:
import os
import wget

In [58]:
data_dir = "./data"

In [59]:
print("******")
os.makedirs(data_dir, exist_ok=True)
if not os.path.exists(data_dir + '/an4_sphere.tar.gz'):
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'
    an4_path = wget.download(an4_url, data_dir)
    print(f"Dataset downloaded at: {an4_path}")
else:
    print("Tarfile already exists.")
    an4_path = data_dir + '/an4_sphere.tar.gz'

******
Tarfile already exists.


* updoad data to s3

### 1) AWS CLI 를 이용한 방식

In [70]:
!aws s3 sync $data_dir s3://$bucket_name/$prefix/data

upload: data/an4_sphere.tar.gz to s3://sagemaker-us-west-2-322537213286/nemo-prod/data/an4_sphere.tar.gz


### 2) AWS BOTO3 SDK를 이용한 방식

In [22]:
from utils.s3 import s3_handler

In [23]:
s3 = s3_handler()

This is a S3 handler with [None] region.


In [24]:
source_dir, target_bucket, target_dir = data_dir, pm.get_params(key=prefix+"BUCKET"), "data"
s3.upload_dir(source_dir, target_bucket, target_dir)

Upload:[./data] was uploaded to [s3://sm-nemo-bucket/data]successfully


## 5. CodeCommit 생성
- Attach IAM polich to sagemaker execution role (<b>with console</b>)
> **CodeCommit**: "arn:aws:iam::aws:policy/AWSCodeCommitFullAccess"<BR>
> **SecretsManager**: "arn:aws:iam::aws:policy/SecretsManagerReadWrite"<BR>

### 5.1 CodeCommit 관련 Credentials 생성 및 Secret Manager에 저장하기
- CodeCommit Credentials

In [99]:
user_name = ' ' ## ==> IAM에서 사용자 아이디 확인합니다.
codecommit_cred = 'codecommit-cred-'+user_name
codecommit_cred

'codecommit-cred-napkin'

In [100]:
iam_client = boto3.client('iam')

In [101]:
try:
    response = iam_client.list_service_specific_credentials(
        UserName=user_name,
        ServiceName='codecommit.amazonaws.com'
    )
    if len(response['ServiceSpecificCredentials']) > 0:
        response = iam_client.delete_service_specific_credential(
            UserName=user_name,
            ServiceSpecificCredentialId=response['ServiceSpecificCredentials'][-1]['ServiceSpecificCredentialId']
        )
except:
    print("Create new codecommit crendentials")
    pass
finally:
    response = iam_client.create_service_specific_credential(
        UserName=user_name,
        ServiceName='codecommit.amazonaws.com'
    )
    ServiceUserName = response['ServiceSpecificCredential']['ServiceUserName']
    ServicePassword = response['ServiceSpecificCredential']['ServicePassword']
print(f"ServiceUserName : {ServiceUserName} \nServicePassword : {ServicePassword}")

ServiceUserName : napkin-at-322537213286 
ServicePassword : lTRH1S9iPZiC3kzKcQgwqMpziwyVnK13wZgo/ivDFZc=


In [111]:
code_repository_name = ' ' ## ==> 사용할 code repository 폴더 명을 넣습니다. ex) model_code
local_code_dir = ' ' ## ==> 생성한 local의 code repository 폴더 명을 넣습니다. ex) code

In [113]:
codecommit = boto3.client('codecommit')

try:
    response = codecommit.create_repository(
        repositoryName=code_repository_name,
        repositoryDescription='Data Scientists share their training code using this Repository'
    )
except:
    
    print("Repository already exists")
    response = codecommit.get_repository(
        repositoryName=code_repository_name
    )

In [114]:
codecommit_repo = response['repositoryMetadata']['cloneUrlHttp']
codecommit_repo

'https://git-codecommit.us-west-2.amazonaws.com/v1/repos/model_code'

In [None]:
!git init
!git remote add repo_codecommit $codecommit_repo
!git checkout -b main
!git add $local_code_dir
!git commit -m "code-update"
!git push --set-upstream repo_codecommit main

## 6. [Optional] AWS Systems Manager Parameter Store 를 이용한 파라미터 저장/활용
- [AWS Systems Manager Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-parameter-store.html)
- Attach IAM polich to sagemaker execution role (<b>with console</b>)
> **SSM**: "arn:aws:iam::aws:policy/AmazonSSMFullAccess"<BR>

In [87]:
from utils.ssm import parameter_store

In [88]:
pm = parameter_store(strRegionName)

In [128]:
pm.put_params(key="PREFIX", value=prefix, overwrite=True)
pm.put_params(key="-".join([prefix, "REGION"]), value=strRegionName, overwrite=True)
pm.put_params(key="-".join([prefix, "BUCKET"]), value=bucket_name, overwrite=True)
pm.put_params(key="-".join([prefix, "SAGEMAKER-ROLE-ARN"]), value=get_execution_role(), overwrite=True)
pm.put_params(key="-".join([prefix, "ACCOUNT-ID"]), value=strAccountId, overwrite=True)
pm.put_params(key="-".join([prefix, "IMAGE-URI"]), value=strEcrRepositoryUri, overwrite=True)
pm.put_params(key="-".join([prefix, "S3-DATA-PATH"]), value=f"s3://{bucket_name}/{prefix}/data", overwrite=True)
pm.put_params(key="-".join([prefix, "CODE_REPO"]), value=codecommit_repo.replace('https://',''), overwrite=True)  ## https:// 있을 경우 입력 못함
pm.put_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), value=ServiceUserName, overwrite=True, enc=True)
pm.put_params(key="-".join([prefix, "CODECOMMIT-PWD"]), value=ServicePassword, overwrite=True, enc=True)

'Store suceess'

In [127]:
print (f'PREFIX: {pm.get_params(key="PREFIX")}')
print (f'REGION: {pm.get_params(key="-".join([prefix, "REGION"]))}')
print (f'BUCKET: {pm.get_params(key="-".join([prefix, "BUCKET"]))}')
print (f'AGEMAKER-ROLE-ARN: {pm.get_params(key="-".join([prefix, "SAGEMAKER-ROLE-ARN"]))}')
print (f'ACCOUNT-ID: {pm.get_params(key="-".join([prefix, "ACCOUNT-ID"]))}')
print (f'IMAGE-URI: {pm.get_params(key="-".join([prefix, "IMAGE-URI"]))}')
print (f'S3-DATA-PATH: {pm.get_params(key="-".join([prefix, "S3-DATA-PATH"]))}')
print (f'CODE_REPO: {pm.get_params(key="-".join([prefix, "CODE_REPO"]))}')
print (f'CODECOMMIT-USERNAME: {pm.get_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]))}')
print (f'CODECOMMIT-PWD: {pm.get_params(key="-".join([prefix, "CODECOMMIT-PWD"]))}')

PREFIX: nemo-prod
REGION: us-west-2
BUCKET: sagemaker-us-west-2-322537213286
AGEMAKER-ROLE-ARN: arn:aws:iam::322537213286:role/service-role/AmazonSageMaker-ExecutionRole-20210401T133000
ACCOUNT-ID: 322537213286
IMAGE-URI: 322537213286.dkr.ecr.us-west-2.amazonaws.com/nemo-test:latest
S3-DATA-PATH: s3://sagemaker-us-west-2-322537213286/nemo-prod/data
CODE_REPO: git-codecommit.us-west-2.amazonaws.com/v1/repos/model_code
CODECOMMIT-USERNAME: napkin-at-322537213286
CODECOMMIT-PWD: lTRH1S9iPZiC3kzKcQgwqMpziwyVnK13wZgo/ivDFZc=
