# **Setup for BERT for classificaion with SageMaker**
* Container: codna_pytorch_p310

## AutoReload

In [3]:
%load_ext autoreload
%autoreload 2

## 0. Install packages

In [4]:
install_needed = True  # should only be True once
# install_needed = False

In [None]:
%%bash
#!/bin/bash

DAEMON_PATH="/etc/docker"
MEMORY_SIZE=10G

FLAG=$(cat $DAEMON_PATH/daemon.json | jq 'has("data-root")')
# echo $FLAG

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak
    sudo cat $DAEMON_PATH/daemon.json.bak | jq '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":"'$MEMORY_SIZE'"}' | sudo tee $DAEMON_PATH/daemon.json > /dev/null
    sudo service docker restart
    echo "Docker Restart"
fi

In [4]:
import sys
import IPython

if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U sagemaker
    !{sys.executable} -m pip install -U torch
    !{sys.executable} -m pip install -U sagemaker-experiments
    
    !{sys.executable} -m pip install -U transformers
    #!{sys.executable} -m pip install -U ipywidgets
    
    #!{sys.executable} -m pip install -U watermark
    #!{sys.executable} -m pip install -U seaborn
    !{sys.executable} -m pip install -U datasets[s3]==1.18.4
    !sudo curl -L "https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
    !sudo chmod +x /usr/local/bin/docker-compose
    
    IPython.Application.instance().kernel.do_shutdown(True)

installing deps and restarting kernel
Collecting sagemaker
  Downloading sagemaker-2.175.0.tar.gz (857 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m857.4/857.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting urllib3<1.27,>=1.25.4 (from botocore<1.32.0,>=1.31.14->boto3<2.0,>=1.26.131->sagemaker)
  Obtaining dependency information for urllib3<1.27,>=1.25.4 from https://files.pythonhosted.org/packages/c5/05/c214b32d21c0b465506f95c4f28ccbcba15022e000b043b72b3df7728471/urllib3-1.26.16-py2.py3-none-any.whl.metadata
  Downloading urllib3-1.26.16-py2.py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.1/143.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0mta

## 1. Set roles

In [5]:
from sagemaker import get_execution_role



In [6]:
sagemaker_role = get_execution_role().rsplit('/', 1)[-1]
print (f"SageMaker Execution Role Name: {sagemaker_role}")

SageMaker Execution Role Name: AmazonSageMaker-ExecutionRole-20221004T162466


## 2. Set default parameters

In [7]:
import boto3
import sagemaker

In [8]:
region_name = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")
bucket_name = 'sm-bert-ramp' # <-- 사용할 bucket 명을 추가해 주세요. ex) sagemaker-us-east-1-123456789123, sm-bert-bucket
prefix = 'ramp-mlops' ## <-- 작업할 prefix 명을 추가해 주세요. ex) bert-test, bert-mlops

## 3. Upload dataset 
* amazon_polarity dataset
* The label being set at 1 denotes a positive review, and 0 means a negative review.

In [33]:
from utils.s3 import s3_handler

In [34]:
s3 = s3_handler(region_name=region_name)

This is a S3 handler with [ap-northeast-2] region.


In [35]:
data_dir = "./data"

In [36]:
s3.create_bucket(bucket_name)
source_dir, target_bucket, target_dir = data_dir, bucket_name, prefix+"/data"
s3.upload_dir(source_dir, target_bucket, target_dir)
data_path_s3 =f's3://{bucket_name}/{prefix}/data/amazon_polarity.csv'

ERROR:root:An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


ERROR: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.
Upload:[./data] was uploaded to [s3://sm-bert-ramp/ramp-mlops/data]successfully


## 4. CodeCommit 생성
* Attach IAM polich to sagemaker execution role (**with console**)
>**CodeCommit**: "arn:aws:iam::aws:policy/AWSCodeCommitFullAccess" <BR>
>**SecretsManager**: "arn:aws:iam::aws:policy/SecretsManagerReadWrite"

In [20]:
user_name = 'dongjin' ## ==> IAM에서 사용자 아이디 확인합니다.
codecommit_cred = 'codecommit-cred-' + user_name
codecommit_cred

'codecommit-cred-dongjin'

In [21]:
iam_client = boto3.client('iam')

In [22]:
try:
    response = iam_client.list_service_specific_credentials(
        UserName=user_name,
        ServiceName='codecommit.amazonaws.com'
    )
    if len(response['ServiceSpecificCredentials']) > 0:
        response = iam_client.delete_service_specific_credential(
            UserName=user_name,
            ServiceSpecificCredentialId=response['ServiceSpecificCredentials'][-1]['ServiceSpecificCredentialId']
        )
except:
    print("Create new codecommit crendentials")
    pass
finally:
    response = iam_client.create_service_specific_credential(
        UserName=user_name,
        ServiceName='codecommit.amazonaws.com'
    )
    ServiceUserName = response['ServiceSpecificCredential']['ServiceUserName']
    ServicePassword = response['ServiceSpecificCredential']['ServicePassword']
print(f"ServiceUserName : {ServiceUserName} \nServicePassword : {ServicePassword}")

ServiceUserName : dongjin-at-419974056037 
ServicePassword : PHXzA4SA0aTKb8WRgCZt1VHD5qwS6VwzpxjruoWqpRo=


In [23]:
code_repository_name = "bert-code" ## ==> 사용할 code repository 폴더 명을 넣습니다. ex) model_code
local_code_dir = "./code" ## ==> 생성한 local의 code repository 폴더 명을 넣습니다. ex) code

In [24]:
codecommit = boto3.client('codecommit')

try:
    response = codecommit.create_repository(
        repositoryName=code_repository_name,
        repositoryDescription='Data Scientists share their training code using this Repository'
    )
except:
    
    print("Repository already exists")
    response = codecommit.get_repository(
        repositoryName=code_repository_name
    )

In [25]:
codecommit_repo = response['repositoryMetadata']['cloneUrlHttp']
codecommit_repo

'https://git-codecommit.ap-northeast-2.amazonaws.com/v1/repos/bert-code'

In [26]:
!rm -rf .git/

* for SageMaker Notebook

In [27]:
username = ServiceUserName.replace("@", "%40")
password = ServicePassword.replace("/", "%2F")
password = ServicePassword.replace("+", "%20")
commit_url = codecommit_repo.split("//")[1]

In [28]:
!git init
!git remote add origin $codecommit_repo ## codecommit_repo path 복사
!git checkout -b main
!git add $local_code_dir
!git commit -m "code-update"
!git push https://$username:$password@$commit_url --all


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /root/bert-mlops-sagemaker/1.building-component/.git/
Switched to a new branch 'main'
[main (root-commit) 82cf6e0] code-update
 5 files changed, 408 insertions(+)
 create mode 100644 code/evaluation.py
 create mode 100644 code/inference.py
 create mode 100644 code/preprocessing.py
 create mode 100644 code/requirements.txt
 create mode 100644 code/train.py
fatal: unable to access 'https://git-codecommit.ap-northeast-2.amazonaws.com/v1/repos/bert-code/': The requested URL retu

* for SageMaker Studio
    - path: bert-mlops-sagemaker/1.building-component/

In [29]:
print ("git config --global user.email " "dongjinj@amazom.com") 
print ("git config --global user.name " "dongjin jang") 
print ("git init")
print (f'git remote add origin "{codecommit_repo}"')
print ("git checkout -b main")
print (f'git add "{local_code_dir}"')
print ('git commit -m "code-update"')
print ('git push --set-upstream origin main')

git config --global user.email dongjinj@amazom.com
git config --global user.name dongjin jang
git init
git remote add origin "https://git-codecommit.ap-northeast-2.amazonaws.com/v1/repos/bert-code"
git checkout -b main
git add "./code"
git commit -m "code-update"
git push --set-upstream origin main


## 5. [Optional] AWS Systems Manager Parameter Store 를 이용한 파라미터 저장/활용
* AWS Systems Manager Parameter Store <BR>
* Attach IAM polich to sagemaker execution role (**with console**)
>**SSM**: "arn:aws:iam::aws:policy/AmazonSSMFullAccess"

In [30]:
from utils.ssm import parameter_store

In [31]:
pm = parameter_store(region_name)

In [37]:
pm.put_params(key="-".join(["PREFIX"]), value=prefix, overwrite=True)
pm.put_params(key="-".join([prefix, "REGION"]), value=region_name, overwrite=True)
pm.put_params(key="-".join([prefix, "ACCOUNT-ID"]), value=account_id, overwrite=True)
pm.put_params(key="-".join([prefix, "BUCKET-NAME"]), value=bucket_name, overwrite=True)
pm.put_params(key="-".join([prefix, "DATA-PATH-S3"]), value=data_path_s3, overwrite=True)
pm.put_params(key="-".join([prefix, "SAGEMAKER-ROLE-ARN"]), value=sagemaker_role, overwrite=True)
pm.put_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), value=ServiceUserName, overwrite=True, enc=True)
pm.put_params(key="-".join([prefix, "CODECOMMIT-PWD"]), value=ServicePassword, overwrite=True, enc=True)
pm.put_params(key="-".join([prefix, "CODE_REPO"]), value=codecommit_repo.replace('https://',''), overwrite=True)

'Store suceess'

In [38]:
print (f'PREFIX: {pm.get_params(key="PREFIX")}')
print (f'REGION: {pm.get_params(key="-".join([prefix, "REGION"]))}')
print (f'BUCKET-NAME: {pm.get_params(key="-".join([prefix, "BUCKET-NAME"]))}')
print (f'ACCOUNT-ID: {pm.get_params(key="-".join([prefix, "ACCOUNT-ID"]))}')
print (f'DATA-PATH-S3: {pm.get_params(key="-".join([prefix, "DATA-PATH-S3"]))}')
print (f'SAGEMAKER-ROLE-ARN: {pm.get_params(key="-".join([prefix, "SAGEMAKER-ROLE-ARN"]))}')
print (f'CODE_REPO: {pm.get_params(key="-".join([prefix, "CODE_REPO"]))}')
print (f'ServiceUserName: {pm.get_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), enc=True)}')
print (f'ServicePassword: {pm.get_params(key="-".join([prefix, "CODECOMMIT-PWD"]), enc=False)}')

PREFIX: ramp-mlops
REGION: ap-northeast-2
BUCKET-NAME: sm-bert-ramp
ACCOUNT-ID: 419974056037
DATA-PATH-S3: s3://sm-bert-ramp/ramp-mlops/data/amazon_polarity.csv
SAGEMAKER-ROLE-ARN: AmazonSageMaker-ExecutionRole-20221004T162466
CODE_REPO: git-codecommit.ap-northeast-2.amazonaws.com/v1/repos/bert-code
ServiceUserName: dongjin-at-419974056037
ServicePassword: AQICAHjFzhQy64RiT2kLIfWwY+jSgA3nnCm2+lidS9TmTKjDHQEsrS1C1NQKfbPbmpIYtwX4AAAAizCBiAYJKoZIhvcNAQcGoHsweQIBADB0BgkqhkiG9w0BBwEwHgYJYIZIAWUDBAEuMBEEDIrXEGC0lLkb9craFwIBEIBH+oF/IQwtAvskjXKsGD9Z2Rez4HVhQCwO9Uqm/mibG38FVMgvXH4raXf76mkOkNMV5nhgf8goh9OJ6ccyAqUasdIoOSmJ4+k=
